From ceb744eb2fa0895db1526110462745962fdf43c0 Mon Sep 17 00:00:00 2001 From: Harald van Dijk Date: Wed, 13 Mar 2024 12:08:39 +0000 Subject: [AMDGPU] Fix canonicalization of truncated values. (#83054) We were relying on roundings to implicitly canonicalize, which is generally safe, except with roundings that may be optimized away. Fixes #82937. --- llvm/test/CodeGen/AMDGPU/bf16.ll | 911 +++++---------------- llvm/test/CodeGen/AMDGPU/clamp.ll | 64 +- .../CodeGen/AMDGPU/fcanonicalize-elimination.ll | 103 +-- llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 728 +++++++++------- llvm/test/CodeGen/AMDGPU/fcanonicalize.ll | 22 +- llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll | 22 +- llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll | 2 - llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll | 122 ++- llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll | 122 ++- llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll | 191 +++-- 10 files changed, 914 insertions(+), 1373 deletions(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index ebb77c1..9865883 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -16968,7 +16968,7 @@ define bfloat @v_fabs_bf16(bfloat %a) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0| +; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -16977,7 +16977,7 @@ define bfloat @v_fabs_bf16(bfloat %a) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0| +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -17163,9 +17163,9 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) { ; GCN-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0| +; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0 +; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -17174,9 +17174,9 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0| +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, -1.0, v0 +; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -17280,8 +17280,6 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v0, v0, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -17293,8 +17291,6 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -17375,10 +17371,6 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 
v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v1, v1, v3 ; GCN-NEXT: v_min_f32_e32 v0, v0, v2 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -17396,10 +17388,6 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -17522,12 +17510,6 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v2, v2, v5 ; GCN-NEXT: v_min_f32_e32 v1, v1, v4 ; GCN-NEXT: v_min_f32_e32 v0, v0, v3 @@ -17551,12 +17533,6 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v4 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 @@ -17688,14 +17664,6 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 
x bfloat> %a, <4 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v3, v3, v7 ; GCN-NEXT: v_min_f32_e32 v2, v2, v6 ; GCN-NEXT: v_min_f32_e32 v1, v1, v5 @@ -17725,14 +17693,6 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v3, v3, v7 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v5 @@ -17951,22 +17911,6 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: 
v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v7, v7, v15 ; GCN-NEXT: v_min_f32_e32 v6, v6, v14 ; GCN-NEXT: v_min_f32_e32 v5, v5, v13 @@ -18020,22 +17964,6 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v15 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v14 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v13 @@ -18382,71 +18310,51 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_min_f32_e32 v14, v14, v30 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_min_f32_e32 v13, v13, v29 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; 
GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_min_f32_e32 v12, v12, v28 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_min_f32_e32 v11, v11, v27 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_min_f32_e32 v10, v10, v26 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_min_f32_e32 v9, v9, v25 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_min_f32_e32 v8, v8, v24 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_min_f32_e32 v7, v7, v23 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_min_f32_e32 v6, v6, v22 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; 
GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_min_f32_e32 v5, v5, v21 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 @@ -18461,8 +18369,6 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_min_f32_e32 v4, v4, v20 ; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 @@ -18474,21 +18380,10 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v3, v3, v19 ; GCN-NEXT: v_min_f32_e32 v2, v2, v18 ; GCN-NEXT: v_min_f32_e32 v1, v1, v17 ; GCN-NEXT: v_min_f32_e32 v0, v0, v16 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -18503,8 +18398,9 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: 
s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_min_f32_e32 v15, v15, v16 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 @@ -18513,14 +18409,12 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-LABEL: v_minnum_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 @@ -18531,13 +18425,13 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ 
-18560,13 +18454,13 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 @@ -18579,48 +18473,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: 
s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v14, v14, v30 ; GFX7-NEXT: v_min_f32_e32 v13, v13, v29 ; GFX7-NEXT: v_min_f32_e32 v12, v12, v28 ; GFX7-NEXT: v_min_f32_e32 v11, v11, v27 ; GFX7-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX7-NEXT: v_min_f32_e32 v15, v15, v25 +; GFX7-NEXT: v_min_f32_e32 v9, v9, v25 ; GFX7-NEXT: v_min_f32_e32 v8, v8, v24 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v22 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v21 ; GFX7-NEXT: v_min_f32_e32 v4, v4, v20 ; GFX7-NEXT: v_min_f32_e32 v3, v3, v19 @@ -18634,6 +18494,10 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_min_f32_e32 v15, v15, v22 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 @@ -19267,287 +19131,223 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 ; GCN-NEXT: v_min_f32_e32 v31, v31, v32 -; GCN-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:124 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 ; GCN-NEXT: v_min_f32_e32 v30, v30, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 ; GCN-NEXT: v_min_f32_e32 v29, v29, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 ; GCN-NEXT: v_min_f32_e32 v28, v28, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 ; GCN-NEXT: v_min_f32_e32 v27, v27, v32 -; GCN-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:108 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 ; GCN-NEXT: v_min_f32_e32 v26, v26, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 ; GCN-NEXT: v_min_f32_e32 v25, v25, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 ; GCN-NEXT: v_min_f32_e32 v24, v24, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 ; GCN-NEXT: v_min_f32_e32 v23, v23, v32 -; GCN-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:92 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 ; GCN-NEXT: v_min_f32_e32 v22, v22, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 ; GCN-NEXT: v_min_f32_e32 v21, v21, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 ; GCN-NEXT: v_min_f32_e32 v20, v20, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 ; GCN-NEXT: v_min_f32_e32 v19, v19, v32 -; GCN-NEXT: buffer_load_dword 
v32, off, s[0:3], s32 offset:76 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 ; GCN-NEXT: v_min_f32_e32 v18, v18, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 ; GCN-NEXT: v_min_f32_e32 v17, v17, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 ; GCN-NEXT: v_min_f32_e32 v16, v16, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 ; GCN-NEXT: v_min_f32_e32 v15, v15, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], 
s32 offset:60 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 ; GCN-NEXT: v_min_f32_e32 v14, v14, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 ; GCN-NEXT: v_min_f32_e32 v13, v13, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 ; GCN-NEXT: v_min_f32_e32 v12, v12, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44 ; GCN-NEXT: v_min_f32_e32 v11, v11, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 ; 
GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 ; GCN-NEXT: v_min_f32_e32 v10, v10, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 ; GCN-NEXT: v_min_f32_e32 v9, v9, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 ; GCN-NEXT: v_min_f32_e32 v8, v8, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 ; GCN-NEXT: v_min_f32_e32 v7, v7, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; 
GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 ; GCN-NEXT: v_min_f32_e32 v6, v6, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 ; GCN-NEXT: v_min_f32_e32 v5, v5, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 ; GCN-NEXT: v_min_f32_e32 v4, v4, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 ; GCN-NEXT: v_min_f32_e32 v3, v3, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; 
GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GCN-NEXT: v_min_f32_e32 v2, v2, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; GCN-NEXT: v_min_f32_e32 v1, v1, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v0, v0, v32 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 @@ -19590,322 +19390,258 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: 
v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; 
GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_min_f32_e32 v31, v31, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 
+; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: v_min_f32_e32 v31, v31, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_min_f32_e32 v30, v30, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v29, v29, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v28, v28, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v27, v27, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v26, v26, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 
v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v25, v25, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v24, v24, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v23, v23, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v22, v22, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v21, v21, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v20, v20, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; 
GFX7-NEXT: v_min_f32_e32 v19, v19, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v18, v18, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v17, v17, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v16, v16, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v15, v15, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v14, v14, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v13, v13, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 ; 
GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v12, v12, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v11, v11, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v10, v10, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v9, v9, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v8, v8, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; 
GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v4, v4, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v3, v3, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v32 ; 
GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -21097,8 +20833,6 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_max_f32_e32 v0, v0, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -21110,8 +20844,6 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -21192,10 +20924,6 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_max_f32_e32 v1, v1, v3 ; GCN-NEXT: v_max_f32_e32 v0, v0, v2 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -21213,10 +20941,6 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -21339,12 +21063,6 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> 
%b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_max_f32_e32 v2, v2, v5 ; GCN-NEXT: v_max_f32_e32 v1, v1, v4 ; GCN-NEXT: v_max_f32_e32 v0, v0, v3 @@ -21368,12 +21086,6 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v4 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 @@ -21505,14 +21217,6 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_max_f32_e32 v3, v3, v7 ; GCN-NEXT: v_max_f32_e32 v2, v2, v6 ; GCN-NEXT: v_max_f32_e32 v1, v1, v5 @@ -21542,14 +21246,6 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v7, 
1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v5 @@ -21768,22 +21464,6 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_max_f32_e32 v7, v7, v15 ; GCN-NEXT: v_max_f32_e32 v6, v6, v14 ; GCN-NEXT: v_max_f32_e32 v5, v5, v13 @@ -21837,22 +21517,6 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; 
GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v15 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v14 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v13 @@ -22199,71 +21863,51 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_max_f32_e32 v14, v14, v30 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_max_f32_e32 v13, v13, v29 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_max_f32_e32 v12, v12, v28 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_max_f32_e32 v11, v11, v27 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_max_f32_e32 v10, v10, v26 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: 
v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_max_f32_e32 v9, v9, v25 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_max_f32_e32 v8, v8, v24 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_max_f32_e32 v7, v7, v23 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_max_f32_e32 v6, v6, v22 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_max_f32_e32 v5, v5, v21 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 @@ -22278,8 +21922,6 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_max_f32_e32 v4, v4, v20 ; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 @@ -22291,21 +21933,10 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> 
%a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_max_f32_e32 v3, v3, v19 ; GCN-NEXT: v_max_f32_e32 v2, v2, v18 ; GCN-NEXT: v_max_f32_e32 v1, v1, v17 ; GCN-NEXT: v_max_f32_e32 v0, v0, v16 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -22320,8 +21951,9 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_max_f32_e32 v15, v15, v16 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 @@ -22330,14 +21962,12 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-LABEL: v_maxnum_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 +; GFX7-NEXT: 
v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 @@ -22348,13 +21978,13 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -22377,13 +22007,13 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, 
v20 @@ -22392,52 +22022,18 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; 
GFX7-NEXT: v_max_f32_e32 v14, v14, v30 ; GFX7-NEXT: v_max_f32_e32 v13, v13, v29 ; GFX7-NEXT: v_max_f32_e32 v12, v12, v28 ; GFX7-NEXT: v_max_f32_e32 v11, v11, v27 ; GFX7-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX7-NEXT: v_max_f32_e32 v15, v15, v25 +; GFX7-NEXT: v_max_f32_e32 v9, v9, v25 ; GFX7-NEXT: v_max_f32_e32 v8, v8, v24 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v22 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v21 ; GFX7-NEXT: v_max_f32_e32 v4, v4, v20 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v19 @@ -22451,6 +22047,10 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_max_f32_e32 v15, v15, v22 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 @@ -23084,287 +22684,223 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 ; GCN-NEXT: v_max_f32_e32 v31, v31, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 ; GCN-NEXT: v_max_f32_e32 v30, v30, v32 -; GCN-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:120 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 ; GCN-NEXT: v_max_f32_e32 v29, v29, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 ; GCN-NEXT: v_max_f32_e32 v28, v28, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 ; GCN-NEXT: v_max_f32_e32 v27, v27, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 ; GCN-NEXT: v_max_f32_e32 v26, v26, v32 -; GCN-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:104 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 ; GCN-NEXT: v_max_f32_e32 v25, v25, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 ; GCN-NEXT: v_max_f32_e32 v24, v24, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 ; GCN-NEXT: v_max_f32_e32 v23, v23, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 ; GCN-NEXT: v_max_f32_e32 v22, v22, v32 -; GCN-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:88 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 ; GCN-NEXT: v_max_f32_e32 v21, v21, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 ; GCN-NEXT: v_max_f32_e32 v20, v20, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 ; GCN-NEXT: v_max_f32_e32 v19, v19, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 ; GCN-NEXT: v_max_f32_e32 v18, v18, v32 -; GCN-NEXT: buffer_load_dword 
v32, off, s[0:3], s32 offset:72 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 ; GCN-NEXT: v_max_f32_e32 v17, v17, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 ; GCN-NEXT: v_max_f32_e32 v16, v16, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 ; GCN-NEXT: v_max_f32_e32 v15, v15, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 ; GCN-NEXT: v_max_f32_e32 v14, v14, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], 
s32 offset:56 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 ; GCN-NEXT: v_max_f32_e32 v13, v13, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 ; GCN-NEXT: v_max_f32_e32 v12, v12, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44 ; GCN-NEXT: v_max_f32_e32 v11, v11, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 ; GCN-NEXT: v_max_f32_e32 v10, v10, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 ; 
GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 ; GCN-NEXT: v_max_f32_e32 v9, v9, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 ; GCN-NEXT: v_max_f32_e32 v8, v8, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 ; GCN-NEXT: v_max_f32_e32 v7, v7, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 ; GCN-NEXT: v_max_f32_e32 v6, v6, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 ; GCN-NEXT: v_max_f32_e32 v5, v5, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 ; GCN-NEXT: v_max_f32_e32 v4, v4, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 ; GCN-NEXT: v_max_f32_e32 v3, v3, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GCN-NEXT: v_max_f32_e32 v2, v2, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: 
v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; GCN-NEXT: v_max_f32_e32 v1, v1, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v0, v0, v32 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 @@ -23407,322 +22943,258 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_and_b32_e32 v23, 
0xffff0000, v23 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_mul_f32_e32 
v8, 1.0, v8 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_max_f32_e32 v31, v31, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: v_max_f32_e32 v31, v31, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: 
v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_max_f32_e32 v30, v30, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v29, v29, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v28, v28, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v27, v27, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v26, v26, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v25, v25, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v24, v24, v32 ; GFX7-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:96 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v23, v23, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v22, v22, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v21, v21, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v20, v20, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v19, v19, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v18, v18, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; 
GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v17, v17, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v16, v16, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v15, v15, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v14, v14, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v13, v13, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v12, v12, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 
0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v11, v11, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v10, v10, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v9, v9, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v8, v8, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v32 ; GFX7-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:20 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v4, v4, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v32 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -25176,7 +24648,6 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) { ; GCN-NEXT: v_frexp_exp_i32_f32_e32 v2, v0 ; GCN-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ 
-26818,11 +26289,17 @@ define bfloat @v_canonicalize_bf16(bfloat %a) { ; GCN-LABEL: v_canonicalize_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_canonicalize_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_canonicalize_bf16: diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll index dfadd8d..9472845 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp.ll @@ -2996,18 +2996,16 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad ; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_med3_f32 v2, v2, 0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_max_f32_e32 v2, 0x7fc00000, v2 +; GFX6-NEXT: v_med3_f32 v3, v3, 0, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm ; @@ -3095,16 +3093,15 @@ define 
amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_max_f32_e32 v3, 2.0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp +; GFX6-NEXT: v_max_f32_e32 v2, 2.0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm ; @@ -3198,9 +3195,8 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_med3_f32 v2, v2, 0, 0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -3760,19 +3756,17 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: s_mov_b32 s2, 0x7fc00000 ; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: 
v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_med3_f32 v3, v3, s2, 1.0 +; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3 +; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_med3_f32 v2, v2, 0, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 @@ -3863,18 +3857,16 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out ; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_med3_f32 v2, v2, 0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_max_f32_e32 v2, 0x7fc00000, v2 +; GFX6-NEXT: v_med3_f32 v3, v3, 0, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll index 4ed1b8a..e198197 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll @@ -471,25 +471,15 @@ define amdgpu_kernel void 
@test_fold_canonicalize_minnum_value_from_load_f32_iee ret void } -; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode: -; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} -; GCN-DENORM-NOT: v_max -; GCN-DENORM-NOT: v_mul - -; GCN: v_min_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; GCN-DENORM-NOT: v_max -; GCN-DENORM-NOT: v_mul - -; GFX9: {{flat|global}}_store_dword -define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode(ptr addrspace(1) %arg) #1 { - %id = tail call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id - %load = load float, ptr addrspace(1) %gep, align 4 - %v = tail call float @llvm.minnum.f32(float %load, float 0.0) - %canonicalized = tail call float @llvm.canonicalize.f32(float %v) - store float %canonicalized, ptr addrspace(1) %gep, align 4 - ret void -} +; define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode(ptr addrspace(1) %arg) #1 { +; %id = tail call i32 @llvm.amdgcn.workitem.id.x() +; %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id +; %load = load float, ptr addrspace(1) %gep, align 4 +; %v = tail call float @llvm.minnum.f32(float %load, float 0.0) +; %canonicalized = tail call float @llvm.canonicalize.f32(float %v) +; store float %canonicalized, ptr addrspace(1) %gep, align 4 +; ret void +; } ; GCN-LABEL: test_fold_canonicalize_minnum_value_f32: ; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}} @@ -523,32 +513,15 @@ define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(ptr addrspace(1 ret void } -; GCN-LABEL: test_fold_canonicalize_denorm_value_f32: -; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]] - -; GFX9-DENORM: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]] -; GFX9-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[QUIET]] - -; GFX9-FLUSH: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]] -; GFX9-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, 
[[QUIET]] - -; VI-FLUSH: v_mul_f32_e32 [[QUIET_V0:v[0-9]+]], 1.0, [[VAL]] -; VI-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_V0]] - -; VI-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[VAL]] - -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v{{.+}}, [[RESULT]] -define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(ptr addrspace(1) %arg) { - %id = tail call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id - %load = load float, ptr addrspace(1) %gep, align 4 - %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float)) - %canonicalized = tail call float @llvm.canonicalize.f32(float %v) - store float %canonicalized, ptr addrspace(1) %gep, align 4 - ret void -} +; define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(ptr addrspace(1) %arg) { +; %id = tail call i32 @llvm.amdgcn.workitem.id.x() +; %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id +; %load = load float, ptr addrspace(1) %gep, align 4 +; %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float)) +; %canonicalized = tail call float @llvm.canonicalize.f32(float %v) +; store float %canonicalized, ptr addrspace(1) %gep, align 4 +; ret void +; } ; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode: ; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]] @@ -674,10 +647,9 @@ define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(ptr addrsp } ; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f16 -; GCN: {{flat|global}}_load_ushort [[V:v[0-9]+]], -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_short v{{.+}}, [[V]] +; GCN: {{flat|global}}_load_ushort [[V1:v[0-9]+]], +; GCN: v_max_f16_e32 [[V2:v[0-9]+]], [[V1]], [[V1]] +; GCN: {{flat|global}}_store_short v{{.+}}, [[V2]] define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(ptr addrspace(1) %arg, ptr 
addrspace(1) %out) #1 { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id @@ -807,18 +779,13 @@ define half @v_test_canonicalize_extract_element_v2f16(<2 x half> %vec) { ret half %canonicalized } -; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_v2f16: -; GFX9: v_mul_f16_e32 -; GFX9: v_pk_mul_f16 -; GFX9-NOT: v_max -; GFX9-NOT: v_pk_max -define <2 x half> @v_test_canonicalize_insertelement_v2f16(<2 x half> %vec, half %val, i32 %idx) { - %vec.op = fmul <2 x half> %vec, - %ins.op = fmul half %val, 8.0 - %ins = insertelement <2 x half> %vec.op, half %ins.op, i32 %idx - %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins) - ret <2 x half> %canonicalized -} +; define <2 x half> @v_test_canonicalize_insertelement_v2f16(<2 x half> %vec, half %val, i32 %idx) { +; %vec.op = fmul <2 x half> %vec, +; %ins.op = fmul half %val, 8.0 +; %ins = insertelement <2 x half> %vec.op, half %ins.op, i32 %idx +; %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins) +; ret <2 x half> %canonicalized +; } ; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_noncanon_vec_v2f16: ; GFX9: v_mul_f16 @@ -842,15 +809,11 @@ define <2 x half> @v_test_canonicalize_insertelement_noncanon_insval_v2f16(<2 x ret <2 x half> %canonicalized } -; GCN-LABEL: {{^}}v_test_canonicalize_cvt_pkrtz: -; GCN: s_waitcnt -; GCN-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1 -; GCN-NEXT: s_setpc_b64 -define <2 x half> @v_test_canonicalize_cvt_pkrtz(float %a, float %b) { - %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b) - %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %cvt) - ret <2 x half> %canonicalized -} +; define <2 x half> @v_test_canonicalize_cvt_pkrtz(float %a, float %b) { +; %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b) +; %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %cvt) +; ret <2 x half> %canonicalized +; } ; GCN-LABEL: 
{{^}}v_test_canonicalize_cubeid: ; GCN: s_waitcnt diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 27462130..581b7b4 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -94,7 +94,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 ; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm @@ -147,7 +146,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm @@ -170,6 +168,35 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1 ret void } +define half @s_test_canonicalize_arg(half %x) #1 { +; VI-LABEL: s_test_canonicalize_arg: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_test_canonicalize_arg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: s_test_canonicalize_arg: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_test_canonicalize_arg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %canonicalized = call half @llvm.canonicalize.f16(half %x) + ret 
half %canonicalized +} + define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 { ; VI-LABEL: v_test_canonicalize_build_vector_v2f16: ; VI: ; %bb.0: @@ -242,7 +269,6 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou ; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm @@ -299,7 +325,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1 ; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm @@ -357,7 +382,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou ; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm @@ -414,7 +438,6 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add ; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm @@ -471,7 +494,6 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt ; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm @@ -1246,9 +1268,7 
@@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1323,9 +1343,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) % ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| ; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1404,9 +1422,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1485,9 +1501,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) % ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1551,9 +1565,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out, ; CI-NEXT: v_cvt_f32_f16_e32 v1, s2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: 
v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_or_b32_e32 v0, v1, v0 @@ -2424,7 +2436,6 @@ define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 { ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_reg_undef_v2f16: @@ -2456,8 +2467,7 @@ define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2738,7 +2748,6 @@ define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1 ; CI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; CI-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16: @@ -2782,8 +2791,6 @@ define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, hal ; CI-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16: @@ -2826,13 +2833,10 @@ define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 
-; CI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; CI-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; CI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2878,18 +2882,18 @@ define <6 x half> @v_test_canonicalize_var_v6f16(<6 x half> %val) #1 { ; CI-LABEL: v_test_canonicalize_var_v6f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_var_v6f16: @@ -2933,22 +2937,22 @@ define <8 x half> @v_test_canonicalize_var_v8f16(<8 x half> %val) #1 { ; CI-LABEL: v_test_canonicalize_var_v8f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; 
CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_var_v8f16: @@ -3001,30 +3005,30 @@ define <12 x half> @v_test_canonicalize_var_v12f16(<12 x half> %val) #1 { ; CI-LABEL: v_test_canonicalize_var_v12f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; 
CI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_var_v12f16: @@ -3087,38 +3091,38 @@ define <16 x half> @v_test_canonicalize_var_v16f16(<16 x half> %val) #1 { ; CI-LABEL: v_test_canonicalize_var_v16f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: 
v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_var_v16f16: @@ -3216,68 +3220,68 @@ define <32 x half> @v_test_canonicalize_var_v32f16(<32 x half> %val) #1 { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; CI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 
-; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; CI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; CI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; CI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; CI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; CI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; CI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; CI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; CI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; CI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; CI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: 
v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; CI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; CI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; CI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; CI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; CI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; CI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; CI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; CI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 @@ -3456,228 +3460,354 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; CI-NEXT: v_or_b32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v4 ; CI-NEXT: 
v_cvt_f16_f32_e32 v4, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v5, v7 ; CI-NEXT: v_cvt_f16_f32_e32 v7, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; CI-NEXT: v_or_b32_e32 v2, v3, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v6 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v10 ; CI-NEXT: v_cvt_f16_f32_e32 v9, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v18 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v16 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v3, v4, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v8 ; CI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v21 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v26 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 +; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v23 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; CI-NEXT: v_or_b32_e32 v4, v5, v4 ; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v12 ; CI-NEXT: v_or_b32_e32 v5, v7, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v21 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: 
v_cvt_f16_f32_e32 v7, v7 ; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v22 ; CI-NEXT: v_or_b32_e32 v6, v7, v6 ; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v19 ; CI-NEXT: v_or_b32_e32 v7, v9, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v25 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v20 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v18 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v25, v29 -; CI-NEXT: v_or_b32_e32 v8, v9, v8 +; CI-NEXT: v_or_b32_e32 v8, v10, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v20 ; CI-NEXT: v_or_b32_e32 v9, v11, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v19 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8 -; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; CI-NEXT: v_or_b32_e32 v10, v11, v10 -; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v24 +; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v25 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v22 +; CI-NEXT: v_or_b32_e32 v10, v12, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v26 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v11 
+; CI-NEXT: v_cvt_f32_f16_e32 v11, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v24 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v30 +; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; CI-NEXT: v_or_b32_e32 v11, v13, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v23 -; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 -; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:28 -; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:24 -; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; CI-NEXT: v_cvt_f16_f32_e32 v24, v30 -; CI-NEXT: v_or_b32_e32 v12, v13, v12 -; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; CI-NEXT: v_or_b32_e32 v13, v15, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v28 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v25, v29 +; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; CI-NEXT: v_or_b32_e32 v12, v15, v12 +; CI-NEXT: s_waitcnt vmcnt(6) +; CI-NEXT: v_cvt_f16_f32_e32 v15, v31 +; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:128 +; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132 +; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:120 +; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; CI-NEXT: v_cvt_f32_f16_e32 v23, v15 ; CI-NEXT: v_cvt_f16_f32_e32 v15, v27 -; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:36 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:44 -; CI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:40 +; CI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; CI-NEXT: s_waitcnt vmcnt(7) +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: s_waitcnt vmcnt(6) +; CI-NEXT: v_cvt_f16_f32_e32 
v21, v33 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; CI-NEXT: v_cvt_f32_f16_e32 v24, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v28 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; CI-NEXT: v_cvt_f16_f32_e32 v28, v23 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_or_b32_e32 v13, v16, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v32 +; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:12 ; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; CI-NEXT: v_or_b32_e32 v14, v15, v14 -; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v24 +; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v22 ; CI-NEXT: v_or_b32_e32 v15, v25, v15 -; CI-NEXT: s_waitcnt vmcnt(11) -; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; CI-NEXT: s_waitcnt vmcnt(10) -; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v25, v21 +; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 +; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 +; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; CI-NEXT: v_cvt_f16_f32_e32 v27, v16 +; CI-NEXT: v_or_b32_e32 v16, v24, v25 +; CI-NEXT: v_lshlrev_b32_e32 v24, 16, v27 +; CI-NEXT: v_or_b32_e32 v25, v28, v24 ; CI-NEXT: s_waitcnt vmcnt(9) ; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; CI-NEXT: s_waitcnt vmcnt(8) ; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; CI-NEXT: v_or_b32_e32 v16, v17, v16 -; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; CI-NEXT: v_or_b32_e32 v17, v19, v17 ; CI-NEXT: s_waitcnt vmcnt(7) -; CI-NEXT: v_cvt_f16_f32_e32 v18, v20 +; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; CI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; 
CI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; CI-NEXT: v_or_b32_e32 v20, v19, v20 +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:8 +; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; CI-NEXT: s_waitcnt vmcnt(7) +; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; CI-NEXT: s_waitcnt vmcnt(6) -; CI-NEXT: v_cvt_f16_f32_e32 v19, v21 -; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: v_cvt_f16_f32_e32 v27, v34 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; CI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; CI-NEXT: v_or_b32_e32 v17, v17, v26 +; CI-NEXT: v_add_i32_e32 v26, vcc, 0x7c, v0 +; CI-NEXT: v_or_b32_e32 v18, v27, v18 +; CI-NEXT: buffer_store_dword v17, v26, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x78, v0 +; CI-NEXT: buffer_store_dword v18, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x74, v0 +; CI-NEXT: buffer_store_dword v20, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x70, v0 +; CI-NEXT: buffer_store_dword v25, v17, s[0:3], 0 offen +; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; CI-NEXT: s_waitcnt vmcnt(7) ; CI-NEXT: v_cvt_f16_f32_e32 v20, v22 -; CI-NEXT: s_waitcnt vmcnt(4) -; CI-NEXT: v_cvt_f16_f32_e32 v21, v23 -; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; CI-NEXT: v_or_b32_e32 v18, v19, v18 -; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; CI-NEXT: v_or_b32_e32 v19, v21, v19 -; CI-NEXT: s_waitcnt vmcnt(3) -; CI-NEXT: v_cvt_f16_f32_e32 v20, v26 -; CI-NEXT: s_waitcnt vmcnt(2) -; CI-NEXT: v_cvt_f16_f32_e32 v21, v27 -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v26, v28 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: 
v_cvt_f16_f32_e32 v27, v29 +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 +; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:92 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 +; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 +; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; CI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; CI-NEXT: s_waitcnt vmcnt(12) +; CI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; CI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; CI-NEXT: v_or_b32_e32 v20, v21, v20 -; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 -; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:48 -; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:60 -; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; CI-NEXT: v_or_b32_e32 v21, v27, v21 -; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 -; CI-NEXT: s_waitcnt vmcnt(5) -; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; CI-NEXT: s_waitcnt vmcnt(4) -; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_add_i32_e32 v21, vcc, 0x6c, v0 +; CI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24 +; CI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; CI-NEXT: s_waitcnt vmcnt(13) +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; CI-NEXT: s_waitcnt vmcnt(12) +; CI-NEXT: v_cvt_f16_f32_e32 v23, v24 +; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 
offset:16 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; CI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; CI-NEXT: s_waitcnt vmcnt(2) -; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; CI-NEXT: v_or_b32_e32 v20, v23, v20 +; CI-NEXT: s_waitcnt vmcnt(9) +; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: v_cvt_f16_f32_e32 v23, v28 +; CI-NEXT: s_waitcnt vmcnt(7) +; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; CI-NEXT: s_waitcnt vmcnt(6) +; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; CI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; CI-NEXT: s_waitcnt vmcnt(4) ; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; CI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; CI-NEXT: v_or_b32_e32 v24, v25, v24 -; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; CI-NEXT: v_or_b32_e32 v26, v27, v26 -; CI-NEXT: v_add_i32_e32 v27, vcc, 0x7c, v0 -; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 +; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; CI-NEXT: v_or_b32_e32 v22, v22, v23 -; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 -; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: v_or_b32_e32 v23, v27, v23 +; CI-NEXT: v_add_i32_e32 v27, vcc, 0x68, v0 +; CI-NEXT: buffer_store_dword v23, v27, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36 +; CI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; CI-NEXT: 
s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; CI-NEXT: v_or_b32_e32 v26, v27, v26 -; CI-NEXT: v_add_i32_e32 v27, vcc, 0x78, v0 -; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 -; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_or_b32_e32 v17, v17, v18 +; CI-NEXT: v_add_i32_e32 v18, vcc, 0x64, v0 +; CI-NEXT: v_or_b32_e32 v25, v25, v26 +; CI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x60, v0 +; CI-NEXT: buffer_store_dword v25, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x5c, v0 +; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; CI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; CI-NEXT: v_or_b32_e32 v19, v24, v19 +; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; CI-NEXT: v_or_b32_e32 v21, v22, v21 +; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 +; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; CI-NEXT: s_waitcnt vmcnt(4) +; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; CI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; 
CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; CI-NEXT: v_cvt_f16_f32_e32 v28, v22 +; CI-NEXT: v_or_b32_e32 v22, v23, v27 +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:52 +; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; CI-NEXT: v_or_b32_e32 v23, v28, v23 +; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 +; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:48 +; CI-NEXT: s_waitcnt vmcnt(2) ; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; CI-NEXT: v_or_b32_e32 v26, v27, v26 -; CI-NEXT: v_add_i32_e32 v27, vcc, 0x74, v0 -; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v25, v26 +; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v26, v27 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 -; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; CI-NEXT: v_or_b32_e32 v25, v26, v25 -; CI-NEXT: v_add_i32_e32 v26, vcc, 0x70, v0 -; CI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 -; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 -; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; CI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; CI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; CI-NEXT: v_or_b32_e32 v23, v23, v27 -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; CI-NEXT: v_or_b32_e32 v24, v24, v27 +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v26, 
v26 -; CI-NEXT: v_add_i32_e32 v27, vcc, 0x68, v0 -; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; CI-NEXT: v_or_b32_e32 v25, v26, v25 -; CI-NEXT: v_add_i32_e32 v26, vcc, 0x6c, v0 -; CI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 -; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 -; CI-NEXT: buffer_store_dword v23, v27, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 -; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 -; CI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:80 -; CI-NEXT: s_waitcnt vmcnt(3) -; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; CI-NEXT: v_or_b32_e32 v25, v26, v25 -; CI-NEXT: s_waitcnt vmcnt(2) -; CI-NEXT: v_cvt_f16_f32_e32 v26, v27 +; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; CI-NEXT: v_or_b32_e32 v27, v28, v27 +; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v27, v29 -; CI-NEXT: v_or_b32_e32 v23, v26, v23 -; CI-NEXT: v_cvt_f16_f32_e32 v26, v28 -; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; CI-NEXT: v_or_b32_e32 v26, v27, v26 -; CI-NEXT: v_add_i32_e32 v27, vcc, 0x64, v0 -; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v26, vcc, 0x60, v0 -; CI-NEXT: buffer_store_dword v23, v26, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v23, vcc, 0x5c, v0 -; CI-NEXT: buffer_store_dword v25, v23, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v23, vcc, 0x58, v0 -; CI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v22, vcc, 0x54, v0 -; CI-NEXT: buffer_store_dword v24, v22, s[0:3], 0 
offen -; CI-NEXT: v_add_i32_e32 v22, vcc, 0x50, v0 -; CI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v21, vcc, 0x4c, v0 -; CI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v20, vcc, 0x48, v0 -; CI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v19, vcc, 0x44, v0 -; CI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v18, vcc, 64, v0 -; CI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; CI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; CI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; CI-NEXT: v_or_b32_e32 v28, v29, v28 +; CI-NEXT: buffer_store_dword v28, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x58, v0 +; CI-NEXT: buffer_store_dword v27, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x54, v0 +; CI-NEXT: buffer_store_dword v24, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x50, v0 +; CI-NEXT: buffer_store_dword v23, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x4c, v0 +; CI-NEXT: buffer_store_dword v22, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x48, v0 +; CI-NEXT: buffer_store_dword v21, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x44, v0 +; CI-NEXT: buffer_store_dword v19, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 64, v0 +; CI-NEXT: buffer_store_dword v20, v17, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v17, vcc, 60, v0 ; CI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v16, vcc, 56, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll index c1093a1..d53c041 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -2389,7 +2389,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: 
s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -2471,15 +2470,13 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; GFX6-NEXT: flat_load_dword v0, v[0:1] ; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v4, v1, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v4, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX6-NEXT: flat_store_dword v[0:1], v4 @@ -2724,7 +2721,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -2807,15 +2803,13 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; GFX6-NEXT: flat_load_dword v0, v[0:1] ; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; 
GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v4, v1, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v4, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX6-NEXT: flat_store_dword v[0:1], v4 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll index 78fb89c..b32630a 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll @@ -951,8 +951,6 @@ define half @v_fneg_minnum_f16_ieee(half %a, half %b) #0 { ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1056,7 +1054,6 @@ define half @v_fneg_posk_minnum_f16_ieee(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, -4.0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1110,7 +1107,6 @@ define half @v_fneg_negk_minnum_f16_ieee(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 4.0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1193,7 +1189,6 @@ define half @v_fneg_neg0_minnum_f16_ieee(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1222,7 +1217,6 @@ define half @v_fneg_inv2pi_minnum_f16(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; 
SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1253,7 +1247,6 @@ define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1311,7 +1304,6 @@ define half @v_fneg_0_minnum_foldable_use_f16_ieee(half %a, half %b) #0 { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_min_f32_e32 v0, 0, v0 ; SI-NEXT: v_mul_f32_e64 v0, -v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1346,7 +1338,6 @@ define half @v_fneg_inv2pi_minnum_foldable_use_f16(half %a, half %b) #0 { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1413,8 +1404,6 @@ define { half, half } @v_fneg_minnum_multi_use_minnum_f16_ieee(half %a, half %b) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 ; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1494,8 +1483,6 @@ define half @v_fneg_maxnum_f16_ieee(half %a, half %b) #0 { ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1599,7 +1586,6 @@ define half @v_fneg_posk_maxnum_f16_ieee(half %a) #0 { ; SI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_min_f32_e32 v0, -4.0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1653,7 +1639,6 @@ define half @v_fneg_negk_maxnum_f16_ieee(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_min_f32_e32 v0, 4.0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1736,7 +1721,6 @@ define half @v_fneg_neg0_maxnum_f16_ieee(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_min_f32_e32 v0, 0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1792,7 +1776,6 @@ define half @v_fneg_0_maxnum_foldable_use_f16_ieee(half %a, half %b) #0 { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 0, v0 ; SI-NEXT: v_mul_f32_e64 v0, -v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1859,8 +1842,6 @@ define { half, half } @v_fneg_maxnum_multi_use_maxnum_f16_ieee(half %a, half %b) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 ; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3980,7 +3961,8 @@ define half @v_fneg_canonicalize_f16(half %a) #0 { ; SI-LABEL: v_fneg_canonicalize_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_canonicalize_f16: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll 
b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index 17f6761..b5440b9 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -1021,7 +1021,6 @@ define half @v_fneg_inv2pi_minnum_f16(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1043,7 +1042,6 @@ define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 0x3e230000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll index ab7ab4d..d056a97 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -32,8 +32,6 @@ define amdgpu_kernel void @maxnum_f16( ; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -170,7 +168,6 @@ define amdgpu_kernel void @maxnum_f16_imm_a( ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -279,7 +276,6 @@ define amdgpu_kernel void @maxnum_f16_imm_b( ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; 
SI-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -384,21 +380,17 @@ define amdgpu_kernel void @maxnum_v2f16( ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s1, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: s_lshr_b32 s0, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_max_f32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: s_lshr_b32 s3, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s0 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_max_f32_e32 v1, v2, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -497,20 +489,18 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a( ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: s_lshr_b32 s2, s2, 16 +; SI-NEXT: s_lshr_b32 s3, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_max_f32_e32 v1, 4.0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_max_f32_e32 v0, 4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, 
v1 +; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -589,20 +579,18 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b( ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: s_lshr_b32 s2, s2, 16 +; SI-NEXT: s_lshr_b32 s3, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_max_f32_e32 v0, 4.0, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_max_f32_e32 v1, 4.0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -688,27 +676,21 @@ define amdgpu_kernel void @maxnum_v3f16( ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 -; SI-NEXT: s_lshr_b32 s2, s2, 16 -; SI-NEXT: s_lshr_b32 s3, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s3 +; SI-NEXT: s_lshr_b32 s3, s2, 16 +; SI-NEXT: s_lshr_b32 s8, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_max_f32_e32 v2, v3, v2 -; SI-NEXT: v_mul_f32_e32 v3, 
1.0, v5 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_max_f32_e32 v1, v1, v3 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_max_f32_e32 v0, v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s1 +; SI-NEXT: v_max_f32_e32 v1, v1, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_max_f32_e32 v2, v3, v4 +; SI-NEXT: v_max_f32_e32 v0, v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -837,25 +819,17 @@ define amdgpu_kernel void @maxnum_v4f16( ; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 ; SI-NEXT: s_lshr_b32 s6, s7, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s6, s5, 16 +; SI-NEXT: s_lshr_b32 s4, s4, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 ; SI-NEXT: v_max_f32_e32 v3, v3, v5 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_max_f32_e32 v1, v1, v5 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_max_f32_e32 v2, v2, v5 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_max_f32_e32 v2, v2, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_max_f32_e32 v1, v1, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_max_f32_e32 v0, v0, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v1, 
v1 @@ -986,20 +960,16 @@ define amdgpu_kernel void @fmax_v4f16_imm_a( ; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; SI-NEXT: s_lshr_b32 s5, s5, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 ; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_max_f32_e32 v2, 4.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1 +; SI-NEXT: v_max_f32_e32 v0, 0x41000000, v0 +; SI-NEXT: v_max_f32_e32 v2, 4.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_max_f32_e32 v3, 2.0, v3 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_max_f32_e32 v0, 0x41000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll index b7370ce..f934a2d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -32,8 +32,6 @@ define amdgpu_kernel void @minnum_f16_ieee( ; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -197,7 +195,6 @@ define amdgpu_kernel void @minnum_f16_imm_a( ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -305,7 +302,6 @@ define amdgpu_kernel void @minnum_f16_imm_b( ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: 
v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_min_f32_e32 v0, 4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -409,21 +405,17 @@ define amdgpu_kernel void @minnum_v2f16_ieee( ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s1, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: s_lshr_b32 s0, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_min_f32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: s_lshr_b32 s3, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s0 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_min_f32_e32 v1, v2, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -556,20 +548,18 @@ define amdgpu_kernel void @minnum_v2f16_imm_a( ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: s_lshr_b32 s2, s2, 16 +; SI-NEXT: s_lshr_b32 s3, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_min_f32_e32 v1, 4.0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; 
SI-NEXT: v_min_f32_e32 v0, 4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -647,20 +637,18 @@ define amdgpu_kernel void @minnum_v2f16_imm_b( ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: s_lshr_b32 s2, s2, 16 +; SI-NEXT: s_lshr_b32 s3, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_min_f32_e32 v0, 4.0, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_min_f32_e32 v1, 4.0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -745,27 +733,21 @@ define amdgpu_kernel void @minnum_v3f16( ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 -; SI-NEXT: s_lshr_b32 s2, s2, 16 -; SI-NEXT: s_lshr_b32 s3, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s3 +; SI-NEXT: s_lshr_b32 s3, s2, 16 +; SI-NEXT: s_lshr_b32 s8, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s1 
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_min_f32_e32 v2, v3, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_min_f32_e32 v1, v1, v3 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_min_f32_e32 v0, v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s1 +; SI-NEXT: v_min_f32_e32 v1, v1, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_min_f32_e32 v2, v3, v4 +; SI-NEXT: v_min_f32_e32 v0, v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -893,25 +875,17 @@ define amdgpu_kernel void @minnum_v4f16( ; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 ; SI-NEXT: s_lshr_b32 s6, s7, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s6, s5, 16 +; SI-NEXT: s_lshr_b32 s4, s4, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 ; SI-NEXT: v_min_f32_e32 v3, v3, v5 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_min_f32_e32 v1, v1, v5 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_min_f32_e32 v2, v2, v5 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_min_f32_e32 v2, v2, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v3, 
v3 +; SI-NEXT: v_min_f32_e32 v1, v1, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_min_f32_e32 v0, v0, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -1041,20 +1015,16 @@ define amdgpu_kernel void @fmin_v4f16_imm_a( ; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; SI-NEXT: s_lshr_b32 s5, s5, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 ; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_min_f32_e32 v2, 4.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1 +; SI-NEXT: v_min_f32_e32 v0, 0x41000000, v0 +; SI-NEXT: v_min_f32_e32 v2, 4.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_min_f32_e32 v3, 2.0, v3 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_min_f32_e32 v0, 0x41000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll index fb3e79b..5b7f0e7 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -951,56 +951,70 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s ; SDAG-GFX1100-LABEL: v_mad_mix_v3f32_clamp_postcvt: ; SDAG-GFX1100: ; %bb.0: ; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] ; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] -; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp ; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX1100-NEXT: v_pack_b32_f16 v1, v1, 0 -; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; 
SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-NEXT: v_pack_b32_f16 v0, v1, 0 ; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX1100-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; SDAG-GFX1100-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-NEXT: v_pk_max_f16 v1, v6, 0 +; SDAG-GFX1100-NEXT: v_pk_max_f16 v2, v0, 0 +; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-NEXT: v_pk_min_f16 v0, v1, 1.0 op_sel_hi:[1,0] +; SDAG-GFX1100-NEXT: v_pk_min_f16 v1, v2, 1.0 op_sel_hi:[1,0] ; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt: ; SDAG-GFX900: ; %bb.0: ; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] ; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] -; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp ; SDAG-GFX900-NEXT: v_pack_b32_f16 v1, v1, 0 -; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; SDAG-GFX900-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; SDAG-GFX900-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX900-NEXT: v_pk_max_f16 v1, v1, 0 +; SDAG-GFX900-NEXT: v_pk_max_f16 v0, v6, 0 +; SDAG-GFX900-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; SDAG-GFX900-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt: ; SDAG-GFX906: ; %bb.0: ; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] ; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] -; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp ; SDAG-GFX906-NEXT: v_pack_b32_f16 v1, v1, 0 -; 
SDAG-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; SDAG-GFX906-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; SDAG-GFX906-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX906-NEXT: v_pk_max_f16 v1, v1, 0 +; SDAG-GFX906-NEXT: v_pk_max_f16 v0, v6, 0 +; SDAG-GFX906-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; SDAG-GFX906-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-VI-LABEL: v_mad_mix_v3f32_clamp_postcvt: ; SDAG-VI: ; %bb.0: ; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SDAG-VI-NEXT: v_mac_f32_e32 v8, v6, v7 ; SDAG-VI-NEXT: v_mac_f32_e32 v4, v0, v2 -; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v8 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v2, v4 clamp ; SDAG-VI-NEXT: v_mac_f32_e32 v5, v1, v3 -; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v1, v5 clamp +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SDAG-VI-NEXT: v_max_f16_e32 v0, 0, v0 +; SDAG-VI-NEXT: v_max_f16_e32 v3, 0, v1 +; SDAG-VI-NEXT: v_max_f16_e32 v1, 0, v2 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0x3c00 +; SDAG-VI-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: 
v_min_f16_e32 v2, 1.0, v3 +; SDAG-VI-NEXT: v_min_f16_e32 v1, 1.0, v1 ; SDAG-VI-NEXT: v_or_b32_e32 v0, v2, v0 ; SDAG-VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1139,63 +1153,80 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s } define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 { -; GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; SDAG-GFX1100: ; %bb.0: +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] +; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-NEXT: v_pk_max_f16 v0, v6, 0 +; SDAG-GFX1100-NEXT: v_pk_max_f16 v1, v7, 0 +; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; SDAG-GFX1100-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] 
+; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] +; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX900-NEXT: v_pk_max_f16 v1, v7, 0 +; SDAG-GFX900-NEXT: v_pk_max_f16 v0, v6, 0 +; SDAG-GFX900-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; SDAG-GFX900-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_fma_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_fma_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_mov_b32_e32 v0, v6 -; GFX906-NEXT: v_mov_b32_e32 v1, v2 -; GFX906-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; SDAG-GFX906: ; %bb.0: +; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
SDAG-GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] +; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX906-NEXT: v_pk_max_f16 v1, v7, 0 +; SDAG-GFX906-NEXT: v_pk_max_f16 v0, v6, 0 +; SDAG-GFX906-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; SDAG-GFX906-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-VI-LABEL: v_mad_mix_v4f32_clamp_postcvt: ; SDAG-VI: ; %bb.0: ; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v10, v4 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v11, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SDAG-VI-NEXT: v_mac_f32_e32 v10, v7, v9 ; SDAG-VI-NEXT: v_mac_f32_e32 v11, v6, v8 -; SDAG-VI-NEXT: v_mac_f32_e32 v5, v1, v3 ; SDAG-VI-NEXT: v_mac_f32_e32 v4, v0, v2 -; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v11 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v1, v10 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v2, v4 clamp -; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v3, v5 clamp +; SDAG-VI-NEXT: v_mac_f32_e32 v5, v1, v3 +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v3, v5 +; SDAG-VI-NEXT: v_max_f16_e32 v0, 0, v0 +; SDAG-VI-NEXT: v_max_f16_e32 v1, 0, v1 +; SDAG-VI-NEXT: v_max_f16_e32 v2, 0, v2 +; SDAG-VI-NEXT: v_max_f16_e32 v3, 0, v3 +; SDAG-VI-NEXT: v_mov_b32_e32 v4, 0x3c00 +; SDAG-VI-NEXT: v_min_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_min_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_min_f16_e32 v3, 1.0, v3 +; SDAG-VI-NEXT: v_min_f16_e32 v2, 1.0, v2 ; SDAG-VI-NEXT: v_or_b32_e32 v0, v2, v0 ; SDAG-VI-NEXT: v_or_b32_e32 v1, v3, v1 ; SDAG-VI-NEXT: s_setpc_b64 s[30:31] @@ -1241,6 +1272,40 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 
op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GISEL-GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v6 +; GISEL-GFX906-NEXT: v_mov_b32_e32 v1, v2 +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-VI-LABEL: v_mad_mix_v4f32_clamp_postcvt: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -- cgit v1.1 From c29b265eb9b7b3b6dc44d87fe6fec8a52485847d Mon Sep 17 00:00:00 2001 From: Carl Ritson Date: Thu, 14 Mar 2024 10:28:24 +0900 Subject: Reapply "[AMDGPU] Add 
pal metadata 3.0 support to callable pal funcs (#67104)" This reverts commit 7d508eb5d38f4bbbab4230a666d9e742e271af61. --- .../CodeGen/AMDGPU/pal-metadata-3.0-callable.ll | 305 +++++++++++++++++++++ 1 file changed, 305 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll new file mode 100644 index 0000000..538ce15 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll @@ -0,0 +1,305 @@ +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s + +; CHECK: .amdgpu_pal_metadata +; CHECK-NEXT: --- +; CHECK-NEXT: amdpal.pipelines: +; CHECK-NEXT: - .api: Vulkan +; CHECK-NEXT: .compute_registers: +; CHECK-NEXT: .tg_size_en: true +; CHECK-NEXT: .tgid_x_en: false +; CHECK-NEXT: .tgid_y_en: false +; CHECK-NEXT: .tgid_z_en: false +; CHECK-NEXT: .tidig_comp_cnt: 0x1 +; CHECK-NEXT: .hardware_stages: +; CHECK-NEXT: .cs: +; CHECK-NEXT: .checksum_value: 0x9444d7d0 +; CHECK-NEXT: .debug_mode: 0 +; CHECK-NEXT: .excp_en: 0 +; CHECK-NEXT: .float_mode: 0xc0 +; CHECK-NEXT: .ieee_mode: true +; CHECK-NEXT: .image_op: false +; CHECK-NEXT: .lds_size: 0x200 +; CHECK-NEXT: .mem_ordered: true +; CHECK-NEXT: .sgpr_limit: 0x6a +; CHECK-NEXT: .threadgroup_dimensions: +; CHECK-NEXT: - 0x1 +; CHECK-NEXT: - 0x400 +; CHECK-NEXT: - 0x1 +; CHECK-NEXT: .trap_present: false +; CHECK-NEXT: .user_data_reg_map: +; CHECK-NEXT: - 0x10000000 +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0 +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 
0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: .user_sgprs: 0x3 +; CHECK-NEXT: .vgpr_limit: 0x100 +; CHECK-NEXT: .wavefront_size: 0x40 +; CHECK-NEXT: .wgp_mode: true +; CHECK: .registers: {} +; CHECK-NEXT: .shader_functions: +; CHECK-NEXT: dynamic_stack: +; CHECK-NEXT: .backend_stack_size: 0x10 +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .sgpr_count: 0x22 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10 +; CHECK-NEXT: .vgpr_count: 0x2 +; CHECK-NEXT: dynamic_stack_loop: +; CHECK-NEXT: .backend_stack_size: 0x10 +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .sgpr_count: 0x22 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10 +; CHECK-NEXT: .vgpr_count: 0x3 +; CHECK-NEXT: multiple_stack: +; CHECK-NEXT: .backend_stack_size: 0x24 +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .sgpr_count: 0x21 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0x24 +; CHECK-NEXT: .vgpr_count: 0x3 +; CHECK-NEXT: no_stack: +; CHECK-NEXT: .backend_stack_size: 0 +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .sgpr_count: 0x20 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0 +; CHECK-NEXT: .vgpr_count: 0x1 +; CHECK-NEXT: no_stack_call: +; CHECK-NEXT: .backend_stack_size: 0x10 +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .sgpr_count: 0x22 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10 +; CHECK-NEXT: .vgpr_count: 0x3 +; CHECK-NEXT: no_stack_extern_call: +; CHECK-NEXT: .backend_stack_size: 0x10 +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .sgpr_count: 0x29 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10 +; CHECK-NEXT: .vgpr_count: 0x58 +; CHECK-NEXT: no_stack_extern_call_many_args: +; CHECK-NEXT: .backend_stack_size: 0x90 +; 
CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .sgpr_count: 0x29 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0x90 +; CHECK-NEXT: .vgpr_count: 0x58 +; CHECK-NEXT: no_stack_indirect_call: +; CHECK-NEXT: .backend_stack_size: 0x10 +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .sgpr_count: 0x29 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10 +; CHECK-NEXT: .vgpr_count: 0x58 +; CHECK-NEXT: simple_lds: +; CHECK-NEXT: .backend_stack_size: 0 +; CHECK-NEXT: .lds_size: 0x100 +; CHECK-NEXT: .sgpr_count: 0x20 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0 +; CHECK-NEXT: .vgpr_count: 0x1 +; CHECK-NEXT: simple_lds_recurse: +; CHECK-NEXT: .backend_stack_size: 0x10 +; CHECK-NEXT: .lds_size: 0x100 +; CHECK-NEXT: .sgpr_count: 0x24 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10 +; CHECK-NEXT: .vgpr_count: 0x29 +; CHECK-NEXT: simple_stack: +; CHECK-NEXT: .backend_stack_size: 0x14 +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .sgpr_count: 0x21 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0x14 +; CHECK-NEXT: .vgpr_count: 0x2 +; CHECK-NEXT: simple_stack_call: +; CHECK-NEXT: .backend_stack_size: 0x20 +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .sgpr_count: 0x22 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20 +; CHECK-NEXT: .vgpr_count: 0x4 +; CHECK-NEXT: simple_stack_extern_call: +; CHECK-NEXT: .backend_stack_size: 0x20 +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .sgpr_count: 0x29 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20 +; CHECK-NEXT: .vgpr_count: 0x58 +; CHECK-NEXT: simple_stack_indirect_call: +; CHECK-NEXT: .backend_stack_size: 0x20 +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .sgpr_count: 0x29 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20 +; CHECK-NEXT: .vgpr_count: 0x58 +; CHECK-NEXT: simple_stack_recurse: +; CHECK-NEXT: .backend_stack_size: 0x20 +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .sgpr_count: 0x24 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20 +; CHECK-NEXT: .vgpr_count: 0x2a +; CHECK:amdpal.version: +; CHECK-NEXT: - 0x3 +; CHECK-NEXT: - 0 +; CHECK-NEXT:... 
+; CHECK-NEXT: .end_amdgpu_pal_metadata + +declare amdgpu_gfx float @extern_func(float) #0 +declare amdgpu_gfx float @extern_func_many_args(<64 x float>) #0 + +@funcptr = external hidden unnamed_addr addrspace(4) constant ptr, align 4 + +define amdgpu_gfx float @no_stack(float %arg0) #0 { + %add = fadd float %arg0, 1.0 + ret float %add +} + +define amdgpu_gfx float @simple_stack(float %arg0) #0 { + %stack = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, ptr addrspace(5) %stack + %val = load volatile float, ptr addrspace(5) %stack + %add = fadd float %arg0, %val + ret float %add +} + +define amdgpu_gfx float @multiple_stack(float %arg0) #0 { + %stack = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, ptr addrspace(5) %stack + %val = load volatile float, ptr addrspace(5) %stack + %add = fadd float %arg0, %val + %stack2 = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, ptr addrspace(5) %stack2 + %val2 = load volatile float, ptr addrspace(5) %stack2 + %add2 = fadd float %add, %val2 + ret float %add2 +} + +define amdgpu_gfx float @dynamic_stack(float %arg0) #0 { +bb0: + %cmp = fcmp ogt float %arg0, 0.0 + br i1 %cmp, label %bb1, label %bb2 + +bb1: + %stack = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, ptr addrspace(5) %stack + %val = load volatile float, ptr addrspace(5) %stack + %add = fadd float %arg0, %val + br label %bb2 + +bb2: + %res = phi float [ 0.0, %bb0 ], [ %add, %bb1 ] + ret float %res +} + +define amdgpu_gfx float @dynamic_stack_loop(float %arg0) #0 { +bb0: + br label %bb1 + +bb1: + %ctr = phi i32 [ 0, %bb0 ], [ %newctr, %bb1 ] + %stack = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, ptr addrspace(5) %stack + %val = load volatile float, ptr addrspace(5) %stack + %add = fadd float %arg0, %val + %cmp = icmp sgt i32 %ctr, 0 + %newctr = sub i32 %ctr, 1 + br i1 %cmp, label %bb1, label %bb2 + +bb2: + ret float %add +} + +define amdgpu_gfx float 
@no_stack_call(float %arg0) #0 { + %res = call amdgpu_gfx float @simple_stack(float %arg0) + ret float %res +} + +define amdgpu_gfx float @simple_stack_call(float %arg0) #0 { + %stack = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, ptr addrspace(5) %stack + %val = load volatile float, ptr addrspace(5) %stack + %res = call amdgpu_gfx float @simple_stack(float %arg0) + %add = fadd float %res, %val + ret float %add +} + +define amdgpu_gfx float @no_stack_extern_call(float %arg0) #0 { + %res = call amdgpu_gfx float @extern_func(float %arg0) + ret float %res +} + +define amdgpu_gfx float @simple_stack_extern_call(float %arg0) #0 { + %stack = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, ptr addrspace(5) %stack + %val = load volatile float, ptr addrspace(5) %stack + %res = call amdgpu_gfx float @extern_func(float %arg0) + %add = fadd float %res, %val + ret float %add +} + +define amdgpu_gfx float @no_stack_extern_call_many_args(<64 x float> %arg0) #0 { + %res = call amdgpu_gfx float @extern_func_many_args(<64 x float> %arg0) + ret float %res +} + +define amdgpu_gfx float @no_stack_indirect_call(float %arg0) #0 { + %fptr = load ptr, ptr addrspace(4) @funcptr + call amdgpu_gfx void %fptr() + ret float %arg0 +} + +define amdgpu_gfx float @simple_stack_indirect_call(float %arg0) #0 { + %stack = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, ptr addrspace(5) %stack + %val = load volatile float, ptr addrspace(5) %stack + %fptr = load ptr, ptr addrspace(4) @funcptr + call amdgpu_gfx void %fptr() + %add = fadd float %arg0, %val + ret float %add +} + +define amdgpu_gfx float @simple_stack_recurse(float %arg0) #0 { + %stack = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, ptr addrspace(5) %stack + %val = load volatile float, ptr addrspace(5) %stack + %res = call amdgpu_gfx float @simple_stack_recurse(float %arg0) + %add = fadd float %res, %val + ret float %add +} + +@lds = internal 
addrspace(3) global [64 x float] undef + +define amdgpu_gfx float @simple_lds(float %arg0) #0 { + %val = load float, ptr addrspace(3) @lds + ret float %val +} + +define amdgpu_gfx float @simple_lds_recurse(float %arg0) #0 { + %val = load float, ptr addrspace(3) @lds + %res = call amdgpu_gfx float @simple_lds_recurse(float %val) + ret float %res +} + +attributes #0 = { nounwind } + +!amdgpu.pal.metadata.msgpack = !{!0} + +!0 = !{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\00"} +!1 = !{i32 7} -- cgit v1.1 From 5f774619eac5db73398225a4c924a9c1d437fb40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= Date: Thu, 14 Mar 2024 12:45:19 +0100 Subject: [GlobalIsel] Combine ADDO (#82927) Perform the requested 
arithmetic and produce a carry output in addition to the normal result. Clang has them as builtins (__builtin_add_overflow_p). The middle end has intrinsics for them (sadd_with_overflow). AArch64: ADDS Add and set flags On Neoverse V2, they run at half the throughput of basic arithmetic and have a limited set of pipelines. --- llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 232 ++++++------- llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll | 441 ++++++++++++------------ llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll | 408 +++++++++++----------- llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 229 ++++++------ llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll | 8 - llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll | 8 - 6 files changed, 647 insertions(+), 679 deletions(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index d36f5c0..a6f9bb7e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -4142,11 +4142,11 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) { ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4162,7 +4162,7 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) { ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] ; GFX10-NEXT: 
v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 ; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo @@ -4179,7 +4179,7 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) { ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 ; GFX11-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4202,7 +4202,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 ; GFX6-NEXT: s_ashr_i32 s2, s7, 31 ; GFX6-NEXT: s_ashr_i32 s5, s7, 15 -; GFX6-NEXT: s_add_u32 s2, s2, 0xffff8000 +; GFX6-NEXT: s_addk_i32 s2, 0x8000 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 @@ -4227,7 +4227,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 ; GFX8-NEXT: s_ashr_i32 s2, s7, 31 ; GFX8-NEXT: s_ashr_i32 s5, s7, 15 -; GFX8-NEXT: s_add_u32 s2, s2, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s2, 0x8000 ; GFX8-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 @@ -4250,7 +4250,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 ; GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 
v2, s4 @@ -4274,7 +4274,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: s_ashr_i32 s2, s5, 31 -; GFX10-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX10-NEXT: s_xor_b32 s0, s1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 @@ -4293,7 +4293,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1] ; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 ; GFX11-NEXT: s_ashr_i32 s2, s5, 31 -; GFX11-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX11-NEXT: s_xor_b32 s0, s1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 @@ -4351,11 +4351,11 @@ define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[0:1] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4371,7 +4371,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] ; GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 
0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4388,7 +4388,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4442,15 +4442,15 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX9-LABEL: saddsat_i48_vs: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] -; GFX9-NEXT: s_lshl_b64 s[2:3], s[0:1], 16 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], 0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4466,7 +4466,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX10-NEXT: v_add_co_u32 
v1, s1, 0x80000000, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4483,7 +4483,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4529,11 +4529,11 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4546,7 +4546,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[2:3] ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo @@ -4560,7 +4560,7 @@ define 
i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3] ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4578,7 +4578,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 ; GFX6-NEXT: s_ashr_i32 s2, s5, 31 -; GFX6-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX6-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 @@ -4599,7 +4599,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 ; GFX8-NEXT: s_ashr_i32 s2, s5, 31 -; GFX8-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX8-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 @@ -4620,7 +4620,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 ; GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 @@ -4641,7 +4641,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: s_ashr_i32 s2, s5, 31 -; GFX10-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX10-NEXT: 
s_xor_b32 s0, s1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 @@ -4657,7 +4657,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1] ; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 ; GFX11-NEXT: s_ashr_i32 s2, s5, 31 -; GFX11-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX11-NEXT: s_xor_b32 s0, s1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 @@ -4702,11 +4702,11 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[0:1] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: ; return to shader part epilog @@ -4718,7 +4718,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] ; GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4731,7 +4731,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX11-NEXT: 
v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: ; return to shader part epilog @@ -4774,11 +4774,11 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: ; return to shader part epilog @@ -4790,7 +4790,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4803,7 +4803,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: 
v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: ; return to shader part epilog @@ -4866,21 +4866,20 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v1, v5, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[4:5] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v9 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1 -; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v7, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[6:7] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7] ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 0x80000000, v2 -; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 +; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4896,10 +4895,10 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[4:5] ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v11 -; GFX10-NEXT: v_cmp_gt_i64_e64 s6, 0, v[6:7] -; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v12 ; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3] -; GFX10-NEXT: v_add_co_u32 v3, s7, 0x80000000, v4 
+; GFX10-NEXT: v_cmp_gt_i64_e64 s6, 0, v[6:7] +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v12 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v12, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo @@ -4921,8 +4920,8 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v11 ; GFX11-NEXT: v_cmp_lt_i64_e64 s1, v[10:11], v[2:3] ; GFX11-NEXT: v_cmp_gt_i64_e64 s2, 0, v[6:7] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v12 -; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v12 :: v_dual_cndmask_b32 v1, v9, v1 ; GFX11-NEXT: s_xor_b32 vcc_lo, s2, s1 @@ -4942,7 +4941,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX6-NEXT: s_ashr_i32 s4, s9, 31 -; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 @@ -4957,7 +4956,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 ; GFX6-NEXT: s_ashr_i32 s4, s1, 31 -; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v4, s0 @@ -4980,7 +4979,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX8-NEXT: s_ashr_i32 s4, s9, 31 -; GFX8-NEXT: 
s_add_u32 s5, s4, 0x80000000 +; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 @@ -4995,7 +4994,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 ; GFX8-NEXT: s_ashr_i32 s4, s1, 31 -; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 @@ -5018,7 +5017,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX9-NEXT: s_ashr_i32 s4, s9, 31 -; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 @@ -5033,7 +5032,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 ; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v4, s0 @@ -5056,7 +5055,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], 0 ; GFX10-NEXT: s_ashr_i32 s4, s9, 31 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX10-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX10-NEXT: s_xor_b32 s8, s1, s0 ; GFX10-NEXT: s_add_u32 s0, s2, s6 ; GFX10-NEXT: s_addc_u32 s1, s3, s7 @@ -5067,7 +5066,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX10-NEXT: v_cndmask_b32_e64 v0, 
v0, s4, s8 ; GFX10-NEXT: s_ashr_i32 s4, s1, 31 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, s8 -; GFX10-NEXT: s_add_u32 s0, s4, 0x80000000 +; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000 ; GFX10-NEXT: s_xor_b32 s1, s3, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1 @@ -5085,7 +5084,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1] ; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], 0 ; GFX11-NEXT: s_ashr_i32 s4, s9, 31 -; GFX11-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX11-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX11-NEXT: s_xor_b32 s8, s1, s0 ; GFX11-NEXT: s_add_u32 s0, s2, s6 ; GFX11-NEXT: s_addc_u32 s1, s3, s7 @@ -5095,7 +5094,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s8 ; GFX11-NEXT: s_ashr_i32 s4, s1, 31 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s5, s8 -; GFX11-NEXT: s_add_u32 s0, s4, 0x80000000 +; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000 ; GFX11-NEXT: s_xor_b32 s1, s3, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1 @@ -5132,7 +5131,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_ashr_i32 s0, s9, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 @@ -5179,7 +5178,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_ashr_i32 s0, s9, 31 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 @@ -5226,7 +5225,7 @@ 
define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_ashr_i32 s0, s9, 31 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 @@ -5269,7 +5268,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: s_ashr_i32 s0, s9, 31 -; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX10-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 @@ -5310,7 +5309,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 ; GFX11-NEXT: v_mov_b32_e32 v2, s5 ; GFX11-NEXT: s_ashr_i32 s0, s9, 31 -; GFX11-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX11-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 1, v0 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 @@ -5412,9 +5411,8 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, 0, vcc ; GFX9-NEXT: v_xor_b32_e32 v2, v2, v6 -; GFX9-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v3, v6 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX9-NEXT: v_add_u32_e32 v6, 0x80000000, v3 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc @@ -5440,7 +5438,7 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc_lo ; GFX10-NEXT: v_xor_b32_e32 v2, v2, v6 -; 
GFX10-NEXT: v_add_co_u32 v6, s0, 0x80000000, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 0x80000000, v3 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo @@ -5467,7 +5465,7 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v5 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc_lo ; GFX11-NEXT: v_xor_b32_e32 v2, v2, v6 -; GFX11-NEXT: v_add_co_u32 v6, null, 0x80000000, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x80000000, v3 ; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo @@ -5569,9 +5567,8 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc @@ -5597,9 +5594,9 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] ; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, 0, s0 -; GFX10-NEXT: v_add_co_u32 v3, s0, 0x80000000, v2 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 @@ -5627,15 +5624,14 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] ; 
GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7 -; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v8, 0, s0 ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v2 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs) %cast = bitcast i128 %result to <4 x float> @@ -5762,12 +5758,11 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v17 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11] +; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc @@ -5786,11 +5781,11 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15] +; GFX9-NEXT: v_add_u32_e32 v7, 0x80000000, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc ; GFX9-NEXT: v_xor_b32_e32 v4, v5, v4 -; GFX9-NEXT: 
v_add_co_u32_e32 v7, vcc, 0x80000000, v6 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc @@ -5832,18 +5827,18 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v19 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15] -; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v17 -; GFX10-NEXT: v_add_co_u32 v7, s5, 0x80000000, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo -; GFX10-NEXT: v_add_co_u32 v4, s4, 0x80000000, v3 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v16, v3, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v5, 1, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc_lo +; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v17 +; GFX10-NEXT: v_and_b32_e32 v3, 1, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v17, v4, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v12, v6, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v13, v6, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v18, v6, s4 @@ -5882,18 +5877,17 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v19 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15] -; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v17 -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_add_co_u32 v7, null, 0x80000000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo -; 
GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_add_co_u32 v4, null, 0x80000000, v3 ; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v3, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v2, v16, v3 :: v_dual_and_b32 v5, 1, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v3, v17, v4, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v5 +; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v17 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v2 :: v_dual_and_b32 v3, 1, v1 +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v2, v16, v2 :: v_dual_cndmask_b32 v3, v17, v4 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v12, v6, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v13, v6, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v18, v6, s0 @@ -5927,7 +5921,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_ashr_i32 s0, s17, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 @@ -5960,7 +5954,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_ashr_i32 s4, s3, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: v_mov_b32_e32 v3, s1 @@ -6011,7 +6005,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_ashr_i32 s0, s17, 31 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; 
GFX8-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 @@ -6050,7 +6044,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_ashr_i32 s4, s3, 31 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 @@ -6101,7 +6095,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_ashr_i32 s0, s17, 31 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 @@ -6140,7 +6134,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -6184,7 +6178,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: s_and_b32 s1, 1, s1 ; GFX10-NEXT: s_ashr_i32 s10, s17, 31 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 -; GFX10-NEXT: s_add_u32 s11, s10, 0x80000000 +; GFX10-NEXT: s_add_i32 s11, s10, 0x80000000 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 ; GFX10-NEXT: s_add_u32 s0, s4, s12 @@ -6221,7 +6215,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, 
s17 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s10, vcc_lo -; GFX10-NEXT: s_add_u32 s0, s4, 0x80000000 +; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000 ; GFX10-NEXT: v_readfirstlane_b32 s1, v4 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo @@ -6261,7 +6255,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX11-NEXT: s_and_b32 s1, 1, s1 ; GFX11-NEXT: s_ashr_i32 s10, s17, 31 ; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 -; GFX11-NEXT: s_add_u32 s11, s10, 0x80000000 +; GFX11-NEXT: s_add_i32 s11, s10, 0x80000000 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 ; GFX11-NEXT: s_add_u32 s0, s4, s12 @@ -6299,7 +6293,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s10, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo -; GFX11-NEXT: s_add_u32 s0, s4, 0x80000000 +; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: v_readfirstlane_b32 s1, v4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 0a6b7af..84906c0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -3091,253 +3091,252 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 0, v1 -; GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], 0, 0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, 0, v1 ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v1 -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v3 -; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v1, vcc -; 
GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 +; GISEL-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v4 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v4 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v9, v7 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v9 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0 +; GISEL-NEXT: v_mov_b32_e32 v5, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, 
v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v7 +; GISEL-NEXT: v_mul_hi_u32 v14, v10, v7 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9] +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7 +; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_mul_lo_u32 v14, v13, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v4 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v4 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 0, v0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v4 -; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v5 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0 +; GISEL-NEXT: v_mov_b32_e32 v5, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v7 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, 0, 
v0 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9] +; GISEL-NEXT: v_mul_hi_u32 v0, v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7 +; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 +; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v2 +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v10, v4 -; GISEL-NEXT: v_mul_hi_u32 v8, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 -; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; 
GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v13, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v3, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v3, v0 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v3, v5 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v8, v11, v4 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v7 +; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v3, v0, v[5:6] -; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v6 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v9, v[7:8] -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v10, v4 -; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], v11, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v11, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_mov_b32_e32 v5, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v0, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v10, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v11, v7 +; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], v3, v5, 
vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], 0, v2 -; GISEL-NEXT: v_addc_u32_e64 v2, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v2 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1 -; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc -; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v13 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v6, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v7 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 -; GISEL-NEXT: v_trunc_f32_e32 v6, v6 -; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], 0, v2 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v2 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 +; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc +; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v4 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v7, v8, v9, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v6, v1 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v5, vcc +; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v4 +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v2 ; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v6 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v13, 0 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v2, vcc -; 
GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v14, v[6:7] -; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v9 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v16, v13, v[6:7] -; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v3 -; GISEL-NEXT: v_mul_lo_u32 v7, v14, v5 -; GISEL-NEXT: v_mul_lo_u32 v10, v13, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v1 -; GISEL-NEXT: v_mul_hi_u32 v1, v13, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v11, 0 +; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v13, v[5:6] +; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v10 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v11, v[5:6] +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v0, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v13, v4 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v9, v18, v1, vcc +; GISEL-NEXT: v_mul_hi_u32 v1, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v14, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v13, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: 
v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_mul_hi_u32 v6, v14, v6 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v1 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v14, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v10, 0 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v17 -; GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v11, v[1:2] -; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v18, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v16, v10, v[6:7] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v13, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v18, v14, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_mul_lo_u32 v7, v11, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v10, v6 -; GISEL-NEXT: v_mul_hi_u32 v13, v10, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], 0, v12 -; GISEL-NEXT: v_addc_u32_e64 v12, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v13 +; GISEL-NEXT: v_mul_lo_u32 v6, v13, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_mul_hi_u32 v5, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v1 +; 
GISEL-NEXT: v_addc_u32_e32 v11, vcc, v13, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v8, 0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v5 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v11, v[1:2] +; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v17, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v8, v[5:6] +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v16, v13, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GISEL-NEXT: v_mul_lo_u32 v6, v11, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v8, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], 0, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v13, v11, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v10, v6 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v13, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 -; GISEL-NEXT: v_mul_hi_u32 v6, v11, v6 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v7 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v12, v11, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v8, v5 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v12, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 -; GISEL-NEXT: 
v_addc_u32_e64 v6, s[4:5], v11, v6, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v9, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v12, v6 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v8, v4 +; GISEL-NEXT: v_addc_u32_e64 v5, s[4:5], v11, v5, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v6, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v0, v9, vcc +; GISEL-NEXT: v_mul_hi_u32 v0, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v6 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v3, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, v[0:1] +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GISEL-NEXT: 
v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v4, v0 +; GISEL-NEXT: v_mul_hi_u32 v9, v3, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v7, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v9, v[0:1] ; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v1 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v3, vcc -; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v9, v5 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v6, vcc -; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v12, v6 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v2 -; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v6, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v4 -; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v2 -; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, v7, v9, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v8 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[5:6] +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v10, v4 +; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], v3, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v4, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; 
GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v6, v8, v10, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v7 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v6, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v7 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v9, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v8 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v10, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0, v2 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index c455b24..83ebc84 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -3034,253 +3034,251 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_and_b32_e32 
v1, 0xffffff, v4 +; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], 0, 0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, 0, v1 -; GISEL-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v1 -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v3 -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v1 -; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v3, vcc -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v1 +; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 +; GISEL-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v4 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v4 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_mul_f32_e32 v5, 
0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v9, v7 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v9 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0 +; GISEL-NEXT: v_mov_b32_e32 v5, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v7 +; GISEL-NEXT: v_mul_hi_u32 v14, v10, v7 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9] +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7 +; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_mul_lo_u32 v14, v13, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v4 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v4 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 0, v0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v4 -; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; 
GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v5 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0 +; GISEL-NEXT: v_mov_b32_e32 v5, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v7 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, 0, v0 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9] +; GISEL-NEXT: v_mul_hi_u32 v0, v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7 +; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 +; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v2 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v10, v4 -; GISEL-NEXT: v_mul_hi_u32 v8, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 -; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: 
v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v13, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v3, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v3, v0 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v3, v5 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v8, v11, v4 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v9, 0 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v7 +; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v7, v[0:1] +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v5, v[0:1] ; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v6 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 
v3, v9, v[7:8] -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v10, v4 -; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v11, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v11, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 +; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v11, v7 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v10, v[8:9] +; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], 0, v0 -; GISEL-NEXT: v_addc_u32_e64 v4, s[4:5], 0, 0, s[4:5] ; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v2 -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v4 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v5, v3, vcc -; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v10 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v9, v6, v9, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v5, v3, vcc +; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v4 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v11, v6, v7, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v8, v1 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v7, v1 +; GISEL-NEXT: v_subbrev_u32_e64 v13, s[4:5], 0, v10, vcc ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v0 -; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v13, vcc -; GISEL-NEXT: v_sub_i32_e64 v16, s[4:5], 0, v2 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, v4, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v15, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v7 -; GISEL-NEXT: 
v_cmp_ge_u32_e64 s[4:5], v14, v3 -; GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v16, v18, v[0:1] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v6, v4 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v0 +; GISEL-NEXT: v_sub_i32_e64 v15, s[4:5], 0, v2 +; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, v3, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v6 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v3 +; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v17, v[0:1] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v15, v[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v7, v19, v0, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v0, v18, v5 -; GISEL-NEXT: v_mul_lo_u32 v19, v15, v6 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v13, v3, vcc -; GISEL-NEXT: v_mul_hi_u32 v13, v15, v5 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v14, v[5:6] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v13, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v6, v18, v0, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v0, v17, v4 +; GISEL-NEXT: v_mul_lo_u32 v18, v14, v5 +; GISEL-NEXT: v_mul_hi_u32 v19, v14, v4 +; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v10, v3, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v18, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v18, v5 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v19, v0 -; 
GISEL-NEXT: v_mul_hi_u32 v19, v15, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v19 +; GISEL-NEXT: v_mul_lo_u32 v19, v17, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v17, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v18, v14, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v19, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 -; GISEL-NEXT: v_mul_hi_u32 v6, v18, v6 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v0 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v18, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v13, 0 -; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v10, v1 -; GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v15, v[0:1] -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, v[0:1] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v10, v18, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v6, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v15, v5 -; GISEL-NEXT: v_mul_lo_u32 v7, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v11, v13, v5 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], 0, v12 -; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v15, v5 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 -; GISEL-NEXT: v_mul_hi_u32 v7, v13, v0 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], 
v11, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18 +; GISEL-NEXT: v_mul_hi_u32 v5, v17, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v18, v4 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v0 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v11, v1 +; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v17, v[0:1] +; GISEL-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v10, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v14, v[0:1] +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v11, v18, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v13, v10, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v17, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v14, v4 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], 0, v12 ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v6 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v13, v5 -; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v15, v0, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GISEL-NEXT: v_mul_hi_u32 v9, v8, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v10, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; 
GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v10, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v4, v17, v4 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v7, v14, v0 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v10, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v5 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v14, v4 +; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v17, v0, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v5, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v9, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc +; GISEL-NEXT: v_mul_hi_u32 v6, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v3, v0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v7, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v0 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_mul_hi_u32 v7, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v0, v10, v0 -; GISEL-NEXT: 
v_mad_u64_u32 v[5:6], s[4:5], v2, v9, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v0, v7 -; GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v7, v[0:1] +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v4, v5 +; GISEL-NEXT: v_mul_hi_u32 v0, v3, v0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v7, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v0, v6 +; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[0:1] ; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v1 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v9, v[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v3, vcc -; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v8, v5 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v10, v6, vcc -; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v10, v6 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[5:6] +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v9, v4 +; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], v3, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v4 -; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v6, v4, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 +; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v3, v2 -; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v4 +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v4, v2 +; 
GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v4 -; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v6, v4, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0, v2 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index 61e1e67..320dfbb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -4142,11 +4142,11 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) { ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, 
s[6:7], s[4:5] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4162,7 +4162,7 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) { ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 ; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo @@ -4179,7 +4179,7 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) { ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 ; GFX11-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4202,7 +4202,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 ; GFX6-NEXT: s_ashr_i32 s2, s7, 31 ; GFX6-NEXT: s_ashr_i32 s5, s7, 15 -; GFX6-NEXT: s_add_u32 s2, s2, 0xffff8000 +; GFX6-NEXT: s_addk_i32 s2, 0x8000 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 @@ -4227,7 +4227,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 ; GFX8-NEXT: s_ashr_i32 s2, s7, 31 ; GFX8-NEXT: s_ashr_i32 s5, s7, 15 -; GFX8-NEXT: s_add_u32 s2, s2, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s2, 0x8000 ; GFX8-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 @@ -4250,7 +4250,7 @@ define 
amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 ; GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 @@ -4274,7 +4274,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: s_ashr_i32 s2, s5, 31 -; GFX10-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX10-NEXT: s_xor_b32 s0, s1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 @@ -4293,7 +4293,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1] ; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0 ; GFX11-NEXT: s_ashr_i32 s2, s5, 31 -; GFX11-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX11-NEXT: s_xor_b32 s0, s1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 @@ -4351,11 +4351,11 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v1, vcc -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], 0, v[0:1] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, 
vcc ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4371,7 +4371,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4388,7 +4388,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4442,15 +4442,15 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX9-LABEL: ssubsat_i48_vs: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] -; GFX9-NEXT: s_lshl_b64 s[2:3], s[0:1], 16 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[2:3], 0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: 
v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4466,7 +4466,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4483,7 +4483,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0 ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4529,11 +4529,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4546,7 +4546,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX10-NEXT: 
v_cmp_lt_i64_e64 s4, 0, v[2:3] ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo @@ -4560,7 +4560,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[2:3] ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4578,7 +4578,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 ; GFX6-NEXT: s_ashr_i32 s2, s5, 31 -; GFX6-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX6-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 @@ -4599,7 +4599,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 ; GFX8-NEXT: s_ashr_i32 s2, s5, 31 -; GFX8-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX8-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 @@ -4620,7 +4620,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 ; GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX9-NEXT: 
v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 @@ -4641,7 +4641,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: s_ashr_i32 s2, s5, 31 -; GFX10-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX10-NEXT: s_xor_b32 s0, s1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 @@ -4657,7 +4657,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1] ; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0 ; GFX11-NEXT: s_ashr_i32 s2, s5, 31 -; GFX11-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX11-NEXT: s_xor_b32 s0, s1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 @@ -4702,11 +4702,11 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v1, vcc -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], 0, v[0:1] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: ; return to shader part epilog @@ -4718,7 +4718,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] -; 
GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4731,7 +4731,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: ; return to shader part epilog @@ -4774,11 +4774,11 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: ; return to shader part epilog @@ -4790,7 +4790,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4803,7 
+4803,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0 ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: ; return to shader part epilog @@ -4866,21 +4866,20 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v0, v4 ; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v1, v5, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[4:5] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[4:5] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v9 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1 -; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v2, v6 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v7, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[6:7] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[6:7] ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 0x80000000, v2 -; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 +; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4896,10 +4895,10 @@ define <2 x i64> 
@v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[4:5] ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v11 -; GFX10-NEXT: v_cmp_lt_i64_e64 s6, 0, v[6:7] -; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v12 ; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3] -; GFX10-NEXT: v_add_co_u32 v3, s7, 0x80000000, v4 +; GFX10-NEXT: v_cmp_lt_i64_e64 s6, 0, v[6:7] +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v12 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v12, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo @@ -4921,8 +4920,8 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v11 ; GFX11-NEXT: v_cmp_lt_i64_e64 s1, v[10:11], v[2:3] ; GFX11-NEXT: v_cmp_lt_i64_e64 s2, 0, v[6:7] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v12 -; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v12 :: v_dual_cndmask_b32 v1, v9, v1 ; GFX11-NEXT: s_xor_b32 vcc_lo, s2, s1 @@ -4942,7 +4941,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX6-NEXT: s_ashr_i32 s4, s9, 31 -; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 @@ -4957,7 +4956,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 ; GFX6-NEXT: s_ashr_i32 s4, s1, 31 -; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000 +; 
GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v4, s0 @@ -4980,7 +4979,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX8-NEXT: s_ashr_i32 s4, s9, 31 -; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 @@ -4995,7 +4994,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 ; GFX8-NEXT: s_ashr_i32 s4, s1, 31 -; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 @@ -5018,7 +5017,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX9-NEXT: s_ashr_i32 s4, s9, 31 -; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 @@ -5033,7 +5032,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 ; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v4, s0 @@ -5056,7 +5055,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[4:5], 0 ; GFX10-NEXT: 
s_ashr_i32 s4, s9, 31 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX10-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX10-NEXT: s_xor_b32 s8, s1, s0 ; GFX10-NEXT: s_sub_u32 s0, s2, s6 ; GFX10-NEXT: s_subb_u32 s1, s3, s7 @@ -5067,7 +5066,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s8 ; GFX10-NEXT: s_ashr_i32 s4, s1, 31 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, s8 -; GFX10-NEXT: s_add_u32 s0, s4, 0x80000000 +; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000 ; GFX10-NEXT: s_xor_b32 s1, s3, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1 @@ -5085,7 +5084,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1] ; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[4:5], 0 ; GFX11-NEXT: s_ashr_i32 s4, s9, 31 -; GFX11-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX11-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX11-NEXT: s_xor_b32 s8, s1, s0 ; GFX11-NEXT: s_sub_u32 s0, s2, s6 ; GFX11-NEXT: s_subb_u32 s1, s3, s7 @@ -5095,7 +5094,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s8 ; GFX11-NEXT: s_ashr_i32 s4, s1, 31 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s5, s8 -; GFX11-NEXT: s_add_u32 s0, s4, 0x80000000 +; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000 ; GFX11-NEXT: s_xor_b32 s1, s3, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1 @@ -5134,7 +5133,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_ashr_i32 s0, s11, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 @@ -5183,7 +5182,7 @@ 
define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_ashr_i32 s0, s11, 31 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 @@ -5232,7 +5231,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_ashr_i32 s0, s11, 31 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 @@ -5274,7 +5273,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX10-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, s9 ; GFX10-NEXT: v_mov_b32_e32 v3, s11 @@ -5317,7 +5316,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX11-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX11-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v2 :: v_dual_mov_b32 v2, s9 ; GFX11-NEXT: v_mov_b32_e32 v3, s11 ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 @@ -5427,9 +5426,8 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v8 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1 ; 
GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc @@ -5456,7 +5454,7 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] ; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10-NEXT: v_add_co_u32 v3, s0, 0x80000000, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2 ; GFX10-NEXT: v_xor_b32_e32 v0, v0, v8 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 @@ -5484,8 +5482,7 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX11-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2 ; GFX11-NEXT: v_xor_b32_e32 v0, v0, v8 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 @@ -5594,9 +5591,8 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc @@ -5625,7 +5621,7 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 
-; GFX10-NEXT: v_add_co_u32 v3, s0, 0x80000000, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 @@ -5652,12 +5648,12 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], 0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] -; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 ; GFX11-NEXT: s_and_b32 s0, 1, s4 -; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v2 +; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 @@ -5805,9 +5801,8 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v19 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc @@ -5831,8 +5826,8 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX9-NEXT: v_xor_b32_e32 v4, v5, v4 ; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v11 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 0x80000000, v6 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX9-NEXT: v_add_u32_e32 v7, 0x80000000, v6 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc @@ 
-5877,18 +5872,18 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v21 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15] -; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v19 -; GFX10-NEXT: v_add_co_u32 v7, s5, 0x80000000, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo -; GFX10-NEXT: v_add_co_u32 v4, s4, 0x80000000, v3 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v3, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v5, 1, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc_lo +; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v19 +; GFX10-NEXT: v_and_b32_e32 v3, 1, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v4, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v6, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v6, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v20, v6, s4 @@ -5931,18 +5926,16 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v21 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15] -; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v19 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v5, v4 :: v_dual_add_nc_u32 v7, 0x80000000, v6 +; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1 +; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v19 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_add_co_u32 v7, null, 0x80000000, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2 ; 
GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_add_co_u32 v4, null, 0x80000000, v3 -; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v16, v3, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v3 :: v_dual_and_b32 v5, 1, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v4, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v5 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v16, v2 :: v_dual_and_b32 v3, 1, v1 +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v2 :: v_dual_cndmask_b32 v3, v19, v4 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v6, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, v6, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v20, v6, s0 @@ -5978,7 +5971,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_ashr_i32 s0, s19, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v2, s16 ; GFX6-NEXT: v_mov_b32_e32 v3, s17 @@ -6013,7 +6006,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_ashr_i32 s4, s3, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: v_mov_b32_e32 v3, s1 @@ -6066,7 +6059,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_ashr_i32 s0, s19, 31 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s16 ; GFX8-NEXT: v_mov_b32_e32 
v3, s17 @@ -6107,7 +6100,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_ashr_i32 s4, s3, 31 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 @@ -6160,7 +6153,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_ashr_i32 s0, s19, 31 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-NEXT: v_mov_b32_e32 v3, s17 @@ -6201,7 +6194,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -6244,7 +6237,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 ; GFX10-NEXT: s_ashr_i32 s8, s17, 31 ; GFX10-NEXT: s_and_b32 s1, 1, s1 -; GFX10-NEXT: s_add_u32 s9, s8, 0x80000000 +; GFX10-NEXT: s_add_i32 s9, s8, 0x80000000 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 @@ -6273,7 +6266,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: s_cselect_b32 s5, 1, 0 ; GFX10-NEXT: s_ashr_i32 s4, s3, 31 ; GFX10-NEXT: s_and_b32 s5, 1, s5 -; GFX10-NEXT: s_add_u32 s0, s4, 0x80000000 +; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6 ; 
GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 @@ -6326,7 +6319,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX11-NEXT: s_cselect_b32 s1, 1, 0 ; GFX11-NEXT: s_ashr_i32 s8, s19, 31 ; GFX11-NEXT: s_and_b32 s1, 1, s1 -; GFX11-NEXT: s_add_u32 s9, s8, 0x80000000 +; GFX11-NEXT: s_add_i32 s9, s8, 0x80000000 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 @@ -6357,7 +6350,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6 ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 -; GFX11-NEXT: s_add_u32 s0, s4, 0x80000000 +; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v4, v3 :: v_dual_mov_b32 v3, s16 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s18 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll index 887c43f..d155513 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -2062,13 +2062,9 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_mul_hi_u32 v17, v2, v5 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 @@ -2077,10 +2073,6 @@ define <2 x i64> 
@v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index 5c6bb6d..07480a0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -2480,13 +2480,9 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_mul_hi_u32 v17, v2, v5 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 @@ -2495,10 +2491,6 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 
-- cgit v1.1 From c7c561ef98ad783d257dab3940dd2378ef8760bf Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 15 Mar 2024 15:51:53 +0530 Subject: AMDGPU: Enable ExpandLargeFpConvert for > 64-bit types Fixes casts between double/float/half and i128. The pass seems to be broken for bfloat though. I also believe we could have a better implementation which attempts to make use the native 32-bit conversion instructions like the 64-bit expansion does. --- llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 1510 ++++++++++++++++++++++++++ llvm/test/CodeGen/AMDGPU/itofp.i128.ll | 1802 ++++++++++++++++++++++++++++++++ 2 files changed, 3312 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/fptoi.i128.ll create mode 100644 llvm/test/CodeGen/AMDGPU/itofp.i128.ll (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll new file mode 100644 index 0000000..a229889 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -0,0 +1,1510 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GISEL %s + +define i128 @fptosi_f64_to_i128(double %x) { +; SDAG-LABEL: fptosi_f64_to_i128: +; SDAG: ; %bb.0: ; %fp-to-i-entry +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v5, v1 +; SDAG-NEXT: v_bfe_u32 v6, v5, 20, 11 +; SDAG-NEXT: v_mov_b32_e32 v7, 0 +; SDAG-NEXT: s_mov_b64 s[4:5], 0x3fe +; SDAG-NEXT: v_mov_b32_e32 v4, v0 +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-NEXT: v_mov_b32_e32 v3, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc +; SDAG-NEXT: s_cbranch_execz .LBB0_10 +; SDAG-NEXT: ; %bb.1: ; 
%fp-to-i-if-end +; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6 +; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v7, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v7, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v7, vcc +; SDAG-NEXT: s_mov_b64 s[6:7], 0xffffff7f +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], -1, v[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-NEXT: v_and_b32_e32 v0, 1, v0 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] +; SDAG-NEXT: s_cbranch_execz .LBB0_7 +; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SDAG-NEXT: v_add_co_u32_e32 v9, vcc, -1, v0 +; SDAG-NEXT: v_addc_co_u32_e64 v10, s[6:7], 0, -1, vcc +; SDAG-NEXT: s_mov_b64 s[6:7], 0x432 +; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v5 +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, -1, 1, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v5, 0x100000, v0 +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] +; SDAG-NEXT: s_cbranch_execz .LBB0_4 +; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else +; SDAG-NEXT: v_sub_u32_e32 v0, 0x473, v6 +; SDAG-NEXT: v_add_u32_e32 v2, 0xfffffb8d, v6 +; SDAG-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[4:5] +; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v7 
+; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v1, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v11, 0 +; SDAG-NEXT: v_mov_b32_e32 v3, 0 +; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v11, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v2, v1 +; SDAG-NEXT: v_mul_lo_u32 v6, v11, v6 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v11, v[2:3] +; SDAG-NEXT: v_mul_lo_u32 v10, v10, v12 +; SDAG-NEXT: v_add3_u32 v5, v5, v6, v13 +; SDAG-NEXT: v_mov_b32_e32 v6, v2 +; SDAG-NEXT: v_mov_b32_e32 v2, v3 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v12, v8, v[1:2] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v9, v12, v[4:5] +; SDAG-NEXT: v_add_co_u32_e32 v5, vcc, v6, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v6, s[6:7], 0, 0, vcc +; SDAG-NEXT: v_mul_lo_u32 v9, v9, v7 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[6:7], v7, v8, v[5:6] +; SDAG-NEXT: ; implicit-def: $vgpr11 +; SDAG-NEXT: ; implicit-def: $vgpr8 +; SDAG-NEXT: v_add3_u32 v4, v10, v4, v9 +; SDAG-NEXT: v_add_co_u32_e32 v2, vcc, v5, v3 +; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc +; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: ; implicit-def: $vgpr10 +; SDAG-NEXT: .LBB0_4: ; %Flow +; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] +; SDAG-NEXT: s_cbranch_execz .LBB0_6 +; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 +; SDAG-NEXT: v_sub_u32_e32 v2, 0x433, v6 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v6, v0, v4, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v5, v1, v5, s[6:7] +; 
SDAG-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v11, 0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v11, v[1:2] +; SDAG-NEXT: v_mov_b32_e32 v7, v4 +; SDAG-NEXT: v_mov_b32_e32 v4, v2 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v6, v8, v[3:4] +; SDAG-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[6:7], 0, 0, vcc +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v8, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v9, v6, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v10, v6, v[3:4] +; SDAG-NEXT: v_mad_i32_i24 v3, v9, v5, v3 +; SDAG-NEXT: .LBB0_6: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: .LBB0_7: ; %Flow2 +; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[10:11] +; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 +; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 +; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, v2 +; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: ; %bb.9: ; %Flow3 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: .LBB0_10: ; %fp-to-i-cleanup +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: fptosi_f64_to_i128: +; GISEL: ; %bb.0: ; %fp-to-i-entry +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v5, v1 +; GISEL-NEXT: v_mov_b32_e32 v4, v0 +; GISEL-NEXT: v_lshrrev_b32_e32 v0, 20, v5 +; GISEL-NEXT: v_and_b32_e32 v6, 0x7ff, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x3ff +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-NEXT: v_mov_b32_e32 v7, 0 +; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s7 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GISEL-NEXT: s_cbranch_execz .LBB0_10 +; 
GISEL-NEXT: ; %bb.1: ; %fp-to-i-if-end +; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6 +; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff80 +; GISEL-NEXT: v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc +; GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GISEL-NEXT: v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3] +; GISEL-NEXT: v_addc_co_u32_e64 v9, s[6:7], 0, -1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GISEL-NEXT: v_cmp_lt_i64_e64 s[4:5], -1, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB0_7 +; GISEL-NEXT: ; %bb.2: ; %fp-to-i-if-end9 +; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] +; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] +; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v8, 3, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v9, 4, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v10, 5, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v11, 6, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v12, 7, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v13, 8, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v14, 9, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v15, 10, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v16, 11, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v17, 12, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v18, 13, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v19, 14, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v20, 15, v0 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v3 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v3 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v8 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v8 +; 
GISEL-NEXT: v_or_b32_e32 v0, v0, v9 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v9 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v10 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v10 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v11 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v11 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v12 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v12 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v13 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v13 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v14 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v14 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v15 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v15 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v16 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v16 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v17 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v17 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v18 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v18 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v19 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v19 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v20 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v20 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GISEL-NEXT: v_lshl_or_b32 v10, v0, 16, v0 +; GISEL-NEXT: v_or3_b32 v8, v1, v2, 1 +; GISEL-NEXT: v_or3_b32 v9, v0, v2, 0 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x433 +; GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-NEXT: v_and_b32_e32 v2, 0xfffff, v5 +; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] +; GISEL-NEXT: v_or_b32_e32 v5, 0x100000, v2 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB0_4 +; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else +; GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffffbcd, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0 +; GISEL-NEXT: v_subrev_u32_e32 v7, 
64, v6 +; GISEL-NEXT: v_sub_u32_e32 v2, 64, v6 +; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] +; GISEL-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v6 +; GISEL-NEXT: v_mul_lo_u32 v6, v11, v10 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v12, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: ; implicit-def: $vgpr10 +; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] +; GISEL-NEXT: ; implicit-def: $vgpr6 +; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: .LBB0_4: ; %Flow +; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[16:17] +; GISEL-NEXT: s_cbranch_execz .LBB0_6 +; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12 +; GISEL-NEXT: v_sub_co_u32_e32 v6, vcc, 0x433, v6 +; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v6 +; GISEL-NEXT: v_lshrrev_b64 v[0:1], v6, v[4:5] +; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, 0 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v10, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0 
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3] +; GISEL-NEXT: v_mul_lo_u32 v6, v5, v10 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v4, v10 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: .LBB0_6: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: .LBB0_7: ; %Flow2 +; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[14:15] +; GISEL-NEXT: s_cbranch_execz .LBB0_9 +; GISEL-NEXT: ; %bb.8: ; %fp-to-i-if-then5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v1, 1, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v1 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v1 +; GISEL-NEXT: v_or_b32_e32 v2, v1, v2 +; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 4, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 5, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 +; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 6, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 7, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 +; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 8, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 9, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 +; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 10, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v12, 11, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 +; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 +; GISEL-NEXT: v_lshlrev_b32_e32 v13, 12, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v14, 13, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 +; GISEL-NEXT: v_or3_b32 v0, v0, v13, v14 +; GISEL-NEXT: v_lshlrev_b32_e32 v15, 14, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v16, 15, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v13, v14 +; GISEL-NEXT: 
v_or3_b32 v0, v0, v15, v16 +; GISEL-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v18, 17, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16 +; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 +; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18 +; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20 +; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 +; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 +; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 +; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 28, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 +; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 +; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 +; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1 +; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 +; GISEL-NEXT: v_add_co_u32_e32 v3, vcc, 0x80000000, v1 +; GISEL-NEXT: v_mov_b32_e32 v2, v1 +; GISEL-NEXT: .LBB0_9: ; %Flow3 +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB0_10: ; %fp-to-i-cleanup +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: s_setpc_b64 s[30:31] + %cvt = fptosi double %x to i128 + ret i128 %cvt +} + +define i128 @fptoui_f64_to_i128(double %x) { +; SDAG-LABEL: fptoui_f64_to_i128: +; SDAG: ; %bb.0: ; %fp-to-i-entry +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v5, v1 +; 
SDAG-NEXT: v_bfe_u32 v6, v5, 20, 11 +; SDAG-NEXT: v_mov_b32_e32 v7, 0 +; SDAG-NEXT: s_mov_b64 s[4:5], 0x3fe +; SDAG-NEXT: v_mov_b32_e32 v4, v0 +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-NEXT: v_mov_b32_e32 v3, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc +; SDAG-NEXT: s_cbranch_execz .LBB1_10 +; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end +; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6 +; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v7, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v7, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v7, vcc +; SDAG-NEXT: s_mov_b64 s[6:7], 0xffffff7f +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], -1, v[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-NEXT: v_and_b32_e32 v0, 1, v0 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] +; SDAG-NEXT: s_cbranch_execz .LBB1_7 +; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SDAG-NEXT: v_add_co_u32_e32 v9, vcc, -1, v0 +; SDAG-NEXT: v_addc_co_u32_e64 v10, s[6:7], 0, -1, vcc +; SDAG-NEXT: s_mov_b64 s[6:7], 0x432 +; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v5 +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, -1, 1, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v5, 0x100000, v0 +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] +; 
SDAG-NEXT: s_cbranch_execz .LBB1_4 +; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else +; SDAG-NEXT: v_sub_u32_e32 v0, 0x473, v6 +; SDAG-NEXT: v_add_u32_e32 v2, 0xfffffb8d, v6 +; SDAG-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[4:5] +; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v7 +; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v1, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v11, 0 +; SDAG-NEXT: v_mov_b32_e32 v3, 0 +; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v11, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v2, v1 +; SDAG-NEXT: v_mul_lo_u32 v6, v11, v6 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v11, v[2:3] +; SDAG-NEXT: v_mul_lo_u32 v10, v10, v12 +; SDAG-NEXT: v_add3_u32 v5, v5, v6, v13 +; SDAG-NEXT: v_mov_b32_e32 v6, v2 +; SDAG-NEXT: v_mov_b32_e32 v2, v3 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v12, v8, v[1:2] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v9, v12, v[4:5] +; SDAG-NEXT: v_add_co_u32_e32 v5, vcc, v6, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v6, s[6:7], 0, 0, vcc +; SDAG-NEXT: v_mul_lo_u32 v9, v9, v7 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[6:7], v7, v8, v[5:6] +; SDAG-NEXT: ; implicit-def: $vgpr11 +; SDAG-NEXT: ; implicit-def: $vgpr8 +; SDAG-NEXT: v_add3_u32 v4, v10, v4, v9 +; SDAG-NEXT: v_add_co_u32_e32 v2, vcc, v5, v3 +; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc +; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: ; implicit-def: $vgpr10 +; SDAG-NEXT: .LBB1_4: ; %Flow +; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] +; SDAG-NEXT: 
s_cbranch_execz .LBB1_6 +; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 +; SDAG-NEXT: v_sub_u32_e32 v2, 0x433, v6 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v6, v0, v4, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v5, v1, v5, s[6:7] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v11, 0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v11, v[1:2] +; SDAG-NEXT: v_mov_b32_e32 v7, v4 +; SDAG-NEXT: v_mov_b32_e32 v4, v2 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v6, v8, v[3:4] +; SDAG-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[6:7], 0, 0, vcc +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v8, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v9, v6, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v10, v6, v[3:4] +; SDAG-NEXT: v_mad_i32_i24 v3, v9, v5, v3 +; SDAG-NEXT: .LBB1_6: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: .LBB1_7: ; %Flow2 +; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[10:11] +; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 +; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 +; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, v2 +; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: ; %bb.9: ; %Flow3 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: .LBB1_10: ; %fp-to-i-cleanup +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: fptoui_f64_to_i128: +; GISEL: ; %bb.0: ; %fp-to-i-entry +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v5, v1 +; GISEL-NEXT: v_mov_b32_e32 v4, v0 +; GISEL-NEXT: v_lshrrev_b32_e32 v0, 20, v5 +; GISEL-NEXT: v_and_b32_e32 v6, 0x7ff, v0 +; GISEL-NEXT: v_mov_b32_e32 
v0, 0x3ff +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-NEXT: v_mov_b32_e32 v7, 0 +; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s7 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GISEL-NEXT: s_cbranch_execz .LBB1_10 +; GISEL-NEXT: ; %bb.1: ; %fp-to-i-if-end +; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6 +; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff80 +; GISEL-NEXT: v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc +; GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GISEL-NEXT: v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3] +; GISEL-NEXT: v_addc_co_u32_e64 v9, s[6:7], 0, -1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GISEL-NEXT: v_cmp_lt_i64_e64 s[4:5], -1, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB1_7 +; GISEL-NEXT: ; %bb.2: ; %fp-to-i-if-end9 +; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] +; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] +; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v8, 3, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v9, 4, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v10, 5, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v11, 6, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v12, 7, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v13, 8, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v14, 9, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v15, 10, v0 +; GISEL-NEXT: 
v_lshlrev_b16_e32 v16, 11, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v17, 12, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v18, 13, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v19, 14, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v20, 15, v0 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v3 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v3 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v8 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v8 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v9 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v9 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v10 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v10 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v11 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v11 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v12 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v12 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v13 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v13 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v14 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v14 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v15 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v15 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v16 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v16 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v17 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v17 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v18 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v18 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v19 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v19 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v20 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v20 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GISEL-NEXT: v_lshl_or_b32 v10, v0, 16, v0 +; GISEL-NEXT: v_or3_b32 v8, v1, v2, 1 +; GISEL-NEXT: v_or3_b32 v9, v0, v2, 0 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x433 +; GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-NEXT: v_and_b32_e32 v2, 0xfffff, v5 +; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] +; GISEL-NEXT: v_or_b32_e32 v5, 0x100000, v2 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_xor_b64 
s[16:17], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB1_4 +; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else +; GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffffbcd, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0 +; GISEL-NEXT: v_subrev_u32_e32 v7, 64, v6 +; GISEL-NEXT: v_sub_u32_e32 v2, 64, v6 +; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] +; GISEL-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v6 +; GISEL-NEXT: v_mul_lo_u32 v6, v11, v10 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v12, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: ; implicit-def: $vgpr10 +; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] +; GISEL-NEXT: ; implicit-def: $vgpr6 +; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: .LBB1_4: ; %Flow +; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[16:17] +; GISEL-NEXT: s_cbranch_execz .LBB1_6 +; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12 +; GISEL-NEXT: v_sub_co_u32_e32 v6, vcc, 0x433, v6 +; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v6 +; GISEL-NEXT: v_lshrrev_b64 v[0:1], v6, 
v[4:5] +; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, 0 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v10, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3] +; GISEL-NEXT: v_mul_lo_u32 v6, v5, v10 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v4, v10 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: .LBB1_6: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: .LBB1_7: ; %Flow2 +; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[14:15] +; GISEL-NEXT: s_cbranch_execz .LBB1_9 +; GISEL-NEXT: ; %bb.8: ; %fp-to-i-if-then5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v1, 1, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v1 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v1 +; GISEL-NEXT: v_or_b32_e32 v2, v1, v2 +; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 4, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 5, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 +; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 6, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 7, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 +; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 8, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 9, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 +; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 10, v1 +; 
GISEL-NEXT: v_lshlrev_b32_e32 v12, 11, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 +; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 +; GISEL-NEXT: v_lshlrev_b32_e32 v13, 12, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v14, 13, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 +; GISEL-NEXT: v_or3_b32 v0, v0, v13, v14 +; GISEL-NEXT: v_lshlrev_b32_e32 v15, 14, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v16, 15, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v13, v14 +; GISEL-NEXT: v_or3_b32 v0, v0, v15, v16 +; GISEL-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v18, 17, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16 +; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 +; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18 +; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20 +; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 +; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 +; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 +; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 28, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 +; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 +; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 +; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1 +; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 +; GISEL-NEXT: v_add_co_u32_e32 v3, vcc, 0x80000000, v1 +; GISEL-NEXT: v_mov_b32_e32 v2, v1 +; GISEL-NEXT: .LBB1_9: 
; %Flow3 +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB1_10: ; %fp-to-i-cleanup +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: s_setpc_b64 s[30:31] + %cvt = fptoui double %x to i128 + ret i128 %cvt +} + +define i128 @fptosi_f32_to_i128(float %x) { +; SDAG-LABEL: fptosi_f32_to_i128: +; SDAG: ; %bb.0: ; %fp-to-i-entry +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v4, v0 +; SDAG-NEXT: v_bfe_u32 v5, v4, 23, 8 +; SDAG-NEXT: s_movk_i32 s4, 0x7e +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v6, 0 +; SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-NEXT: v_mov_b32_e32 v3, 0 +; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 +; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc +; SDAG-NEXT: s_cbranch_execz .LBB2_10 +; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end +; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 +; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc +; SDAG-NEXT: s_mov_b64 s[6:7], 0xffffff7f +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; SDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v4 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-NEXT: v_and_b32_e32 v0, 1, v0 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] +; SDAG-NEXT: s_cbranch_execz .LBB2_7 +; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SDAG-NEXT: v_add_co_u32_e32 v9, vcc, -1, v0 +; SDAG-NEXT: v_addc_co_u32_e64 v11, s[6:7], 0, -1, vcc +; SDAG-NEXT: s_mov_b64 s[6:7], 0x95 +; SDAG-NEXT: v_and_b32_e32 v0, 
0x7fffff, v4 +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[5:6] +; SDAG-NEXT: v_mov_b32_e32 v7, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v6, 0x800000, v0 +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] +; SDAG-NEXT: s_cbranch_execz .LBB2_4 +; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else +; SDAG-NEXT: v_sub_u32_e32 v0, 0xd6, v5 +; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff2a, v5 +; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff6a, v5 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[6:7] +; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v4 +; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e32 v13, 0, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v13, v10, 0 +; SDAG-NEXT: v_mul_lo_u32 v14, v8, v2 +; SDAG-NEXT: v_mul_lo_u32 v15, v10, v3 +; SDAG-NEXT: v_mov_b32_e32 v6, v1 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v12, v10, v[6:7] +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v10, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v6, v5 +; SDAG-NEXT: v_mov_b32_e32 v5, v7 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v13, v8, v[4:5] +; SDAG-NEXT: v_add3_u32 v3, v3, v15, v14 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v9, v13, v[2:3] +; SDAG-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 +; SDAG-NEXT: v_addc_co_u32_e64 v6, s[6:7], 0, 0, vcc +; SDAG-NEXT: v_mul_lo_u32 v3, v9, v12 +; SDAG-NEXT: v_mul_lo_u32 v7, v11, v13 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[6:7], v12, v8, v[5:6] +; SDAG-NEXT: ; implicit-def: $vgpr10 +; SDAG-NEXT: ; implicit-def: $vgpr8 +; 
SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 +; SDAG-NEXT: v_add_co_u32_e32 v2, vcc, v5, v1 +; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc +; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 +; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; SDAG-NEXT: .LBB2_4: ; %Flow +; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13] +; SDAG-NEXT: s_cbranch_execz .LBB2_6 +; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 +; SDAG-NEXT: v_sub_u32_e32 v2, 0x96, v5 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v3, v10, 0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[12:13], v3, v8, v[1:2] +; SDAG-NEXT: v_mov_b32_e32 v1, v5 +; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[12:13], v9, v3, v[1:2] +; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: .LBB2_6: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: .LBB2_7: ; %Flow2 +; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[10:11] +; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 +; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 +; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, v2 +; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: ; %bb.9: ; %Flow3 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: .LBB2_10: ; %fp-to-i-cleanup +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: fptosi_f32_to_i128: +; GISEL: ; %bb.0: ; %fp-to-i-entry +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v4, v0 +; GISEL-NEXT: v_mov_b32_e32 v5, 0 +; GISEL-NEXT: v_lshrrev_b64 v[0:1], 23, v[4:5] +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_bfe_u32 v6, v0, 0, 8 +; GISEL-NEXT: 
v_mov_b32_e32 v0, 0x7f +; GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-NEXT: v_mov_b32_e32 v7, v5 +; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s7 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GISEL-NEXT: s_cbranch_execz .LBB2_10 +; GISEL-NEXT: ; %bb.1: ; %fp-to-i-if-end +; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v6 +; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff80 +; GISEL-NEXT: v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc +; GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GISEL-NEXT: v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3] +; GISEL-NEXT: v_addc_co_u32_e64 v9, s[6:7], 0, -1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GISEL-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB2_7 +; GISEL-NEXT: ; %bb.2: ; %fp-to-i-if-end9 +; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] +; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] +; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v5, 3, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v8, 4, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v9, 5, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v10, 6, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v11, 7, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v12, 8, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v13, 9, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v14, 10, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v15, 11, v0 +; 
GISEL-NEXT: v_lshlrev_b16_e32 v16, 12, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v17, 13, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v18, 14, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v19, 15, v0 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v3 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v3 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v5 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v5 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v8 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v8 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v9 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v9 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v10 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v10 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v11 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v11 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v12 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v12 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v13 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v13 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v14 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v14 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v15 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v15 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v16 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v16 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v17 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v17 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v18 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v18 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v19 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v19 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GISEL-NEXT: v_lshl_or_b32 v10, v0, 16, v0 +; GISEL-NEXT: v_or3_b32 v9, v1, v2, 1 +; GISEL-NEXT: v_or3_b32 v8, v0, v2, 0 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x96 +; GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v4 +; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] +; GISEL-NEXT: v_or_b32_e32 v4, 0x800000, v2 +; GISEL-NEXT: v_mov_b32_e32 v5, 0 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_xor_b64 
s[16:17], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB2_4 +; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else +; GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 0xffffff6a, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0 +; GISEL-NEXT: v_subrev_u32_e32 v7, 64, v6 +; GISEL-NEXT: v_sub_u32_e32 v2, 64, v6 +; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] +; GISEL-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v9, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v6 +; GISEL-NEXT: v_mul_lo_u32 v6, v11, v10 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v8, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v12, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v9, v[1:2] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: ; implicit-def: $vgpr10 +; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7] +; GISEL-NEXT: ; implicit-def: $vgpr6 +; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: .LBB2_4: ; %Flow +; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17] +; GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12 +; GISEL-NEXT: v_sub_co_u32_e32 v3, vcc, 0x96, v6 +; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v3 +; GISEL-NEXT: v_lshrrev_b64 v[0:1], v3, 
v[4:5] +; GISEL-NEXT: v_lshrrev_b64 v[1:2], v2, 0 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v10, 0 +; GISEL-NEXT: v_mul_lo_u32 v5, v4, v10 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: .LBB2_6: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB2_7: ; %Flow2 +; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[14:15] +; GISEL-NEXT: s_cbranch_execz .LBB2_9 +; GISEL-NEXT: ; %bb.8: ; %fp-to-i-if-then5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v1, 1, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v1 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v1 +; GISEL-NEXT: v_or_b32_e32 v2, v1, v2 +; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 4, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 5, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 +; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 6, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 7, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 +; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 8, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 9, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 +; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 10, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v12, 11, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 +; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 +; GISEL-NEXT: v_lshlrev_b32_e32 v13, 12, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v14, 13, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 +; GISEL-NEXT: v_or3_b32 v0, v0, v13, v14 +; GISEL-NEXT: 
v_lshlrev_b32_e32 v15, 14, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v16, 15, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v13, v14 +; GISEL-NEXT: v_or3_b32 v0, v0, v15, v16 +; GISEL-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v18, 17, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16 +; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 +; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18 +; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20 +; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 +; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 +; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 +; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 28, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 +; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 +; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 +; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1 +; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 +; GISEL-NEXT: v_add_co_u32_e32 v3, vcc, 0x80000000, v1 +; GISEL-NEXT: v_mov_b32_e32 v2, v1 +; GISEL-NEXT: .LBB2_9: ; %Flow3 +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB2_10: ; %fp-to-i-cleanup +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: s_setpc_b64 s[30:31] + %cvt = fptosi float %x to i128 + ret i128 %cvt +} + +define i128 @fptoui_f32_to_i128(float %x) { +; SDAG-LABEL: fptoui_f32_to_i128: +; 
SDAG: ; %bb.0: ; %fp-to-i-entry +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v4, v0 +; SDAG-NEXT: v_bfe_u32 v5, v4, 23, 8 +; SDAG-NEXT: s_movk_i32 s4, 0x7e +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v6, 0 +; SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-NEXT: v_mov_b32_e32 v3, 0 +; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 +; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc +; SDAG-NEXT: s_cbranch_execz .LBB3_10 +; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end +; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 +; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc +; SDAG-NEXT: s_mov_b64 s[6:7], 0xffffff7f +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; SDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v4 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-NEXT: v_and_b32_e32 v0, 1, v0 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] +; SDAG-NEXT: s_cbranch_execz .LBB3_7 +; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SDAG-NEXT: v_add_co_u32_e32 v9, vcc, -1, v0 +; SDAG-NEXT: v_addc_co_u32_e64 v11, s[6:7], 0, -1, vcc +; SDAG-NEXT: s_mov_b64 s[6:7], 0x95 +; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v4 +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[5:6] +; SDAG-NEXT: v_mov_b32_e32 v7, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v6, 0x800000, v0 +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: ; implicit-def: 
$vgpr2_vgpr3 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] +; SDAG-NEXT: s_cbranch_execz .LBB3_4 +; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else +; SDAG-NEXT: v_sub_u32_e32 v0, 0xd6, v5 +; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff2a, v5 +; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff6a, v5 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[6:7] +; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v4 +; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e32 v13, 0, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v13, v10, 0 +; SDAG-NEXT: v_mul_lo_u32 v14, v8, v2 +; SDAG-NEXT: v_mul_lo_u32 v15, v10, v3 +; SDAG-NEXT: v_mov_b32_e32 v6, v1 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v12, v10, v[6:7] +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v10, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v6, v5 +; SDAG-NEXT: v_mov_b32_e32 v5, v7 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v13, v8, v[4:5] +; SDAG-NEXT: v_add3_u32 v3, v3, v15, v14 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v9, v13, v[2:3] +; SDAG-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 +; SDAG-NEXT: v_addc_co_u32_e64 v6, s[6:7], 0, 0, vcc +; SDAG-NEXT: v_mul_lo_u32 v3, v9, v12 +; SDAG-NEXT: v_mul_lo_u32 v7, v11, v13 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[6:7], v12, v8, v[5:6] +; SDAG-NEXT: ; implicit-def: $vgpr10 +; SDAG-NEXT: ; implicit-def: $vgpr8 +; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 +; SDAG-NEXT: v_add_co_u32_e32 v2, vcc, v5, v1 +; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc +; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 +; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; SDAG-NEXT: .LBB3_4: ; 
%Flow +; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13] +; SDAG-NEXT: s_cbranch_execz .LBB3_6 +; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 +; SDAG-NEXT: v_sub_u32_e32 v2, 0x96, v5 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v3, v10, 0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[12:13], v3, v8, v[1:2] +; SDAG-NEXT: v_mov_b32_e32 v1, v5 +; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[12:13], v9, v3, v[1:2] +; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: .LBB3_6: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: .LBB3_7: ; %Flow2 +; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[10:11] +; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 +; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 +; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, v2 +; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: ; %bb.9: ; %Flow3 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: .LBB3_10: ; %fp-to-i-cleanup +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: fptoui_f32_to_i128: +; GISEL: ; %bb.0: ; %fp-to-i-entry +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v4, v0 +; GISEL-NEXT: v_mov_b32_e32 v5, 0 +; GISEL-NEXT: v_lshrrev_b64 v[0:1], 23, v[4:5] +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_bfe_u32 v6, v0, 0, 8 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x7f +; GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-NEXT: v_mov_b32_e32 v7, v5 +; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s7 +; 
GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GISEL-NEXT: s_cbranch_execz .LBB3_10 +; GISEL-NEXT: ; %bb.1: ; %fp-to-i-if-end +; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v6 +; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff80 +; GISEL-NEXT: v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc +; GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GISEL-NEXT: v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3] +; GISEL-NEXT: v_addc_co_u32_e64 v9, s[6:7], 0, -1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GISEL-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB3_7 +; GISEL-NEXT: ; %bb.2: ; %fp-to-i-if-end9 +; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] +; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] +; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v5, 3, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v8, 4, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v9, 5, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v10, 6, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v11, 7, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v12, 8, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v13, 9, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v14, 10, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v15, 11, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v16, 12, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v17, 13, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v18, 14, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v19, 15, v0 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v3 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v3 +; 
GISEL-NEXT: v_or_b32_e32 v0, v0, v5 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v5 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v8 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v8 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v9 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v9 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v10 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v10 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v11 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v11 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v12 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v12 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v13 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v13 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v14 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v14 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v15 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v15 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v16 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v16 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v17 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v17 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v18 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v18 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v19 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v19 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GISEL-NEXT: v_lshl_or_b32 v10, v0, 16, v0 +; GISEL-NEXT: v_or3_b32 v9, v1, v2, 1 +; GISEL-NEXT: v_or3_b32 v8, v0, v2, 0 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x96 +; GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v4 +; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] +; GISEL-NEXT: v_or_b32_e32 v4, 0x800000, v2 +; GISEL-NEXT: v_mov_b32_e32 v5, 0 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB3_4 +; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else +; GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 0xffffff6a, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 
v12, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0 +; GISEL-NEXT: v_subrev_u32_e32 v7, 64, v6 +; GISEL-NEXT: v_sub_u32_e32 v2, 64, v6 +; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] +; GISEL-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v9, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v6 +; GISEL-NEXT: v_mul_lo_u32 v6, v11, v10 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v8, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v12, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v9, v[1:2] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: ; implicit-def: $vgpr10 +; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7] +; GISEL-NEXT: ; implicit-def: $vgpr6 +; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: .LBB3_4: ; %Flow +; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17] +; GISEL-NEXT: s_cbranch_execz .LBB3_6 +; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12 +; GISEL-NEXT: v_sub_co_u32_e32 v3, vcc, 0x96, v6 +; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v3 +; GISEL-NEXT: v_lshrrev_b64 v[0:1], v3, v[4:5] +; GISEL-NEXT: v_lshrrev_b64 v[1:2], v2, 0 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], 
v4, v10, 0 +; GISEL-NEXT: v_mul_lo_u32 v5, v4, v10 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: .LBB3_6: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB3_7: ; %Flow2 +; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[14:15] +; GISEL-NEXT: s_cbranch_execz .LBB3_9 +; GISEL-NEXT: ; %bb.8: ; %fp-to-i-if-then5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v1, 1, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v1 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v1 +; GISEL-NEXT: v_or_b32_e32 v2, v1, v2 +; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 4, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 5, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 +; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 6, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 7, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 +; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 8, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 9, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 +; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 10, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v12, 11, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 +; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 +; GISEL-NEXT: v_lshlrev_b32_e32 v13, 12, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v14, 13, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 +; GISEL-NEXT: v_or3_b32 v0, v0, v13, v14 +; GISEL-NEXT: v_lshlrev_b32_e32 v15, 14, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v16, 15, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v13, v14 +; GISEL-NEXT: v_or3_b32 v0, v0, v15, v16 +; GISEL-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v18, 17, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16 +; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 +; 
GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18 +; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20 +; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 +; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 +; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 +; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 28, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 +; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 +; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 +; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1 +; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 +; GISEL-NEXT: v_add_co_u32_e32 v3, vcc, 0x80000000, v1 +; GISEL-NEXT: v_mov_b32_e32 v2, v1 +; GISEL-NEXT: .LBB3_9: ; %Flow3 +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB3_10: ; %fp-to-i-cleanup +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: s_setpc_b64 s[30:31] + %cvt = fptoui float %x to i128 + ret i128 %cvt +} + +define i128 @fptosi_f16_to_i128(half %x) { +; GCN-LABEL: fptosi_f16_to_i128: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NEXT: v_mov_b32_e32 v2, v1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %cvt = fptosi half %x to i128 + ret i128 %cvt +} 
+ +define i128 @fptoui_f16_to_i128(half %x) { +; GCN-LABEL: fptoui_f16_to_i128: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %cvt = fptoui half %x to i128 + ret i128 %cvt +} + +; FIXME: ExpandLargeFpConvert asserts on bfloat +; define i128 @fptosi_bf16_to_i128(bfloat %x) { +; %cvt = fptosi bfloat %x to i128 +; ret i128 %cvt +; } + +; define i128 @fptoui_bf16_to_i128(bfloat %x) { +; %cvt = fptoui bfloat %x to i128 +; ret i128 %cvt +; } diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll new file mode 100644 index 0000000..e4e8d52 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll @@ -0,0 +1,1802 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GISEL %s + +define float @sitofp_i128_to_f32(i128 %x) { +; SDAG-LABEL: sitofp_i128_to_f32: +; SDAG: ; %bb.0: ; %itofp-entry +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_or_b32_e32 v5, v1, v3 +; SDAG-NEXT: v_or_b32_e32 v4, v0, v2 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SDAG-NEXT: s_cbranch_execz .LBB0_16 +; SDAG-NEXT: ; %bb.1: ; %itofp-if-end +; SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v3 +; SDAG-NEXT: v_xor_b32_e32 v0, v5, v0 +; SDAG-NEXT: v_xor_b32_e32 v1, v5, v1 +; SDAG-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v5 +; SDAG-NEXT: v_xor_b32_e32 v2, v5, v2 +; SDAG-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc +; SDAG-NEXT: v_xor_b32_e32 v6, v5, v3 +; SDAG-NEXT: v_subb_co_u32_e32 v4, 
vcc, v2, v5, vcc +; SDAG-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc +; SDAG-NEXT: v_ffbh_u32_e32 v2, v4 +; SDAG-NEXT: v_add_u32_e32 v2, 32, v2 +; SDAG-NEXT: v_ffbh_u32_e32 v6, v5 +; SDAG-NEXT: v_min_u32_e32 v2, v2, v6 +; SDAG-NEXT: v_ffbh_u32_e32 v6, v0 +; SDAG-NEXT: v_add_u32_e32 v6, 32, v6 +; SDAG-NEXT: v_ffbh_u32_e32 v7, v1 +; SDAG-NEXT: v_min_u32_e32 v6, v6, v7 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: v_add_u32_e32 v6, 64, v6 +; SDAG-NEXT: v_cndmask_b32_e32 v9, v6, v2, vcc +; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v9 +; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v2 +; SDAG-NEXT: ; implicit-def: $vgpr6 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SDAG-NEXT: ; %bb.2: ; %itofp-if-else +; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff98, v9 +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 +; SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; SDAG-NEXT: ; %bb.3: ; %Flow6 +; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB0_15 +; SDAG-NEXT: ; %bb.4: ; %NodeBlock +; SDAG-NEXT: v_sub_u32_e32 v8, 0x80, v9 +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v8 +; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: s_mov_b64 s[4:5], 0 +; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc +; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[12:13] +; SDAG-NEXT: ; %bb.5: ; %LeafBlock1 +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v8 +; SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; SDAG-NEXT: ; %bb.6: ; %Flow3 +; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] +; SDAG-NEXT: ; %bb.7: ; %LeafBlock +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 25, v8 +; SDAG-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; SDAG-NEXT: s_and_b64 s[14:15], vcc, exec +; SDAG-NEXT: s_mov_b64 s[10:11], exec +; SDAG-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] +; SDAG-NEXT: ; %bb.8: ; %Flow4 +; SDAG-NEXT: s_or_b64 exec, exec, 
s[12:13] +; SDAG-NEXT: v_mov_b32_e32 v7, v1 +; SDAG-NEXT: v_mov_b32_e32 v6, v0 +; SDAG-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[12:13] +; SDAG-NEXT: s_cbranch_execz .LBB0_10 +; SDAG-NEXT: ; %bb.9: ; %itofp-sw-default +; SDAG-NEXT: v_sub_u32_e32 v12, 0x66, v9 +; SDAG-NEXT: v_sub_u32_e32 v10, 64, v12 +; SDAG-NEXT: v_lshrrev_b64 v[6:7], v12, v[0:1] +; SDAG-NEXT: v_lshlrev_b64 v[10:11], v10, v[4:5] +; SDAG-NEXT: v_sub_u32_e32 v13, 38, v9 +; SDAG-NEXT: v_or_b32_e32 v11, v7, v11 +; SDAG-NEXT: v_or_b32_e32 v10, v6, v10 +; SDAG-NEXT: v_lshrrev_b64 v[6:7], v13, v[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 +; SDAG-NEXT: v_add_u32_e32 v14, 26, v9 +; SDAG-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 +; SDAG-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc +; SDAG-NEXT: v_lshrrev_b64 v[10:11], v13, v[0:1] +; SDAG-NEXT: v_lshlrev_b64 v[12:13], v14, v[4:5] +; SDAG-NEXT: v_subrev_u32_e32 v9, 38, v9 +; SDAG-NEXT: v_cndmask_b32_e64 v15, v6, v0, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v6, v13, v11 +; SDAG-NEXT: v_or_b32_e32 v11, v12, v10 +; SDAG-NEXT: v_lshlrev_b64 v[9:10], v9, v[0:1] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 +; SDAG-NEXT: v_cndmask_b32_e64 v7, v7, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 +; SDAG-NEXT: v_cndmask_b32_e64 v10, v6, v5, s[4:5] +; SDAG-NEXT: v_lshlrev_b64 v[5:6], v14, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc +; SDAG-NEXT: v_or_b32_e32 v5, v6, v10 +; SDAG-NEXT: v_or_b32_e32 v4, v9, v4 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: s_andn2_b64 s[10:11], s[10:11], exec +; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v6, v15, v4 +; SDAG-NEXT: .LBB0_10: ; %Flow5 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: 
s_and_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: ; %bb.11: ; %itofp-sw-bb +; SDAG-NEXT: v_lshlrev_b64 v[6:7], 1, v[0:1] +; SDAG-NEXT: ; %bb.12: ; %itofp-sw-epilog +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: v_lshrrev_b32_e32 v0, 2, v6 +; SDAG-NEXT: v_and_or_b32 v0, v0, 1, v6 +; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 +; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc +; SDAG-NEXT: v_and_b32_e32 v4, 0x4000000, v0 +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SDAG-NEXT: v_alignbit_b32 v6, v1, v0, 2 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: ; %bb.13: ; %itofp-if-then20 +; SDAG-NEXT: v_alignbit_b32 v6, v1, v0, 3 +; SDAG-NEXT: v_mov_b32_e32 v2, v8 +; SDAG-NEXT: ; %bb.14: ; %Flow +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB0_15: ; %Flow7 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_and_b32_e32 v0, 0x80000000, v3 +; SDAG-NEXT: v_lshl_add_u32 v1, v2, 23, 1.0 +; SDAG-NEXT: v_and_b32_e32 v2, 0x7fffff, v6 +; SDAG-NEXT: v_or3_b32 v4, v2, v0, v1 +; SDAG-NEXT: .LBB0_16: ; %Flow8 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: sitofp_i128_to_f32: +; GISEL: ; %bb.0: ; %itofp-entry +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_or_b32_e32 v4, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v5, v1, v3 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: v_mov_b32_e32 v4, s4 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_cbranch_execz .LBB0_16 +; GISEL-NEXT: ; %bb.1: ; %itofp-if-end +; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v3 +; GISEL-NEXT: v_xor_b32_e32 v0, v8, v0 +; GISEL-NEXT: v_xor_b32_e32 v1, v8, v1 +; GISEL-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_xor_b32_e32 v2, v8, v2 +; GISEL-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc +; GISEL-NEXT: v_xor_b32_e32 v3, v8, v3 +; GISEL-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v8, vcc +; GISEL-NEXT: v_subb_co_u32_e32 v7, 
vcc, v3, v8, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v3, v0 +; GISEL-NEXT: v_ffbh_u32_e32 v2, v1 +; GISEL-NEXT: v_add_u32_e32 v3, 32, v3 +; GISEL-NEXT: v_ffbh_u32_e32 v4, v6 +; GISEL-NEXT: v_min_u32_e32 v2, v2, v3 +; GISEL-NEXT: v_ffbh_u32_e32 v3, v7 +; GISEL-NEXT: v_add_u32_e32 v4, 32, v4 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v2, 64, v2 +; GISEL-NEXT: v_min_u32_e32 v3, v3, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v11, v3, v2, vcc +; GISEL-NEXT: v_sub_u32_e32 v9, 0x7f, v11 +; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v9 +; GISEL-NEXT: ; implicit-def: $vgpr2 +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GISEL-NEXT: ; %bb.2: ; %itofp-if-else +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff98, v11 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc +; GISEL-NEXT: ; implicit-def: $vgpr0 +; GISEL-NEXT: ; implicit-def: $vgpr11 +; GISEL-NEXT: ; implicit-def: $vgpr6 +; GISEL-NEXT: ; %bb.3: ; %Flow6 +; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB0_15 +; GISEL-NEXT: ; %bb.4: ; %NodeBlock +; GISEL-NEXT: v_sub_u32_e32 v10, 0x80, v11 +; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v10 +; GISEL-NEXT: s_mov_b64 s[10:11], 0 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[12:13] +; GISEL-NEXT: ; %bb.5: ; %LeafBlock1 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v10 +; GISEL-NEXT: s_andn2_b64 s[4:5], 0, exec +; GISEL-NEXT: s_and_b64 s[14:15], exec, vcc +; GISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] +; GISEL-NEXT: ; %bb.6: ; %Flow3 +; GISEL-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] +; GISEL-NEXT: ; %bb.7: ; %LeafBlock +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 25, v10 +; GISEL-NEXT: s_andn2_b64 s[10:11], 0, exec +; GISEL-NEXT: s_and_b64 s[14:15], exec, -1 +; GISEL-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] +; GISEL-NEXT: 
s_andn2_b64 s[4:5], s[4:5], exec +; GISEL-NEXT: s_and_b64 s[14:15], exec, vcc +; GISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] +; GISEL-NEXT: ; %bb.8: ; %Flow4 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: v_mov_b32_e32 v5, v3 +; GISEL-NEXT: v_mov_b32_e32 v4, v2 +; GISEL-NEXT: v_mov_b32_e32 v3, v1 +; GISEL-NEXT: v_mov_b32_e32 v2, v0 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[12:13] +; GISEL-NEXT: s_cbranch_execz .LBB0_10 +; GISEL-NEXT: ; %bb.9: ; %itofp-sw-default +; GISEL-NEXT: v_sub_u32_e32 v12, 0x66, v11 +; GISEL-NEXT: v_sub_u32_e32 v4, 64, v12 +; GISEL-NEXT: v_lshrrev_b64 v[2:3], v12, v[0:1] +; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7] +; GISEL-NEXT: v_subrev_u32_e32 v13, 64, v12 +; GISEL-NEXT: v_or_b32_e32 v4, v2, v4 +; GISEL-NEXT: v_or_b32_e32 v5, v3, v5 +; GISEL-NEXT: v_lshrrev_b64 v[2:3], v13, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v13, 26, v11 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 +; GISEL-NEXT: v_sub_u32_e32 v11, 64, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], v13, -1 +; GISEL-NEXT: v_lshlrev_b64 v[11:12], v11, -1 +; GISEL-NEXT: v_subrev_u32_e32 v14, 64, v13 +; GISEL-NEXT: v_or_b32_e32 v15, v4, v11 +; GISEL-NEXT: v_or_b32_e32 v16, v5, v12 +; GISEL-NEXT: v_lshrrev_b64 v[11:12], v14, -1 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v12, -1, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v4, v4, v6 +; GISEL-NEXT: v_and_b32_e32 v5, v5, 
v7 +; GISEL-NEXT: v_and_or_b32 v4, v11, v0, v4 +; GISEL-NEXT: v_and_or_b32 v5, v12, v1, v5 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GISEL-NEXT: s_andn2_b64 s[4:5], s[10:11], exec +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: s_and_b64 s[10:11], exec, 0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v4 +; GISEL-NEXT: s_or_b64 s[10:11], s[4:5], s[10:11] +; GISEL-NEXT: .LBB0_10: ; %Flow5 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: ; %bb.11: ; %itofp-sw-bb +; GISEL-NEXT: v_lshlrev_b64 v[2:3], 1, v[0:1] +; GISEL-NEXT: ; %bb.12: ; %itofp-sw-epilog +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: v_bfe_u32 v0, v2, 2, 1 +; GISEL-NEXT: v_or_b32_e32 v0, v2, v0 +; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 +; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GISEL-NEXT: v_lshrrev_b64 v[2:3], 2, v[0:1] +; GISEL-NEXT: v_and_b32_e32 v3, 0x4000000, v0 +; GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[3:4] +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: ; %bb.13: ; %itofp-if-then20 +; GISEL-NEXT: v_lshrrev_b64 v[2:3], 3, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v9, v10 +; GISEL-NEXT: ; %bb.14: ; %Flow +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB0_15: ; %Flow7 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: v_and_b32_e32 v0, 0x80000000, v8 +; GISEL-NEXT: v_lshl_add_u32 v1, v9, 23, 1.0 +; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v2 +; GISEL-NEXT: v_or3_b32 v4, v2, v0, v1 +; GISEL-NEXT: .LBB0_16: ; %Flow8 +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: v_mov_b32_e32 v0, v4 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %cvt = sitofp i128 %x to float + ret float %cvt +} + +define float @uitofp_i128_to_f32(i128 %x) { +; SDAG-LABEL: uitofp_i128_to_f32: +; SDAG: ; %bb.0: ; %itofp-entry +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_or_b32_e32 v5, v1, v3 +; SDAG-NEXT: v_or_b32_e32 v4, v0, v2 +; SDAG-NEXT: 
v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SDAG-NEXT: s_cbranch_execz .LBB1_16 +; SDAG-NEXT: ; %bb.1: ; %itofp-if-end +; SDAG-NEXT: v_ffbh_u32_e32 v4, v2 +; SDAG-NEXT: v_add_u32_e32 v4, 32, v4 +; SDAG-NEXT: v_ffbh_u32_e32 v5, v3 +; SDAG-NEXT: v_min_u32_e32 v4, v4, v5 +; SDAG-NEXT: v_ffbh_u32_e32 v5, v0 +; SDAG-NEXT: v_add_u32_e32 v5, 32, v5 +; SDAG-NEXT: v_ffbh_u32_e32 v6, v1 +; SDAG-NEXT: v_min_u32_e32 v5, v5, v6 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: v_add_u32_e32 v5, 64, v5 +; SDAG-NEXT: v_cndmask_b32_e32 v8, v5, v4, vcc +; SDAG-NEXT: v_sub_u32_e32 v6, 0x7f, v8 +; SDAG-NEXT: v_mov_b32_e32 v5, v1 +; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v6 +; SDAG-NEXT: v_mov_b32_e32 v4, v0 +; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SDAG-NEXT: ; %bb.2: ; %itofp-if-else +; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff98, v8 +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v9, 0, v0, vcc +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: ; implicit-def: $vgpr8 +; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; SDAG-NEXT: ; %bb.3: ; %Flow6 +; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB1_15 +; SDAG-NEXT: ; %bb.4: ; %NodeBlock +; SDAG-NEXT: v_sub_u32_e32 v7, 0x80, v8 +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v7 +; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: s_mov_b64 s[4:5], 0 +; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc +; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[12:13] +; SDAG-NEXT: ; %bb.5: ; %LeafBlock1 +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v7 +; SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; SDAG-NEXT: ; %bb.6: ; %Flow3 +; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] +; SDAG-NEXT: ; %bb.7: ; %LeafBlock +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 25, v7 +; 
SDAG-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; SDAG-NEXT: s_and_b64 s[14:15], vcc, exec +; SDAG-NEXT: s_mov_b64 s[10:11], exec +; SDAG-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] +; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; SDAG-NEXT: ; %bb.8: ; %Flow4 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[12:13] +; SDAG-NEXT: s_cbranch_execz .LBB1_10 +; SDAG-NEXT: ; %bb.9: ; %itofp-sw-default +; SDAG-NEXT: v_sub_u32_e32 v11, 0x66, v8 +; SDAG-NEXT: v_sub_u32_e32 v9, 64, v11 +; SDAG-NEXT: v_lshrrev_b64 v[4:5], v11, v[0:1] +; SDAG-NEXT: v_lshlrev_b64 v[9:10], v9, v[2:3] +; SDAG-NEXT: v_sub_u32_e32 v12, 38, v8 +; SDAG-NEXT: v_or_b32_e32 v10, v5, v10 +; SDAG-NEXT: v_or_b32_e32 v9, v4, v9 +; SDAG-NEXT: v_lshrrev_b64 v[4:5], v12, v[2:3] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v11 +; SDAG-NEXT: v_add_u32_e32 v13, 26, v8 +; SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v11 +; SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc +; SDAG-NEXT: v_lshrrev_b64 v[9:10], v12, v[0:1] +; SDAG-NEXT: v_lshlrev_b64 v[11:12], v13, v[2:3] +; SDAG-NEXT: v_subrev_u32_e32 v8, 38, v8 +; SDAG-NEXT: v_cndmask_b32_e64 v14, v4, v0, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v4, v12, v10 +; SDAG-NEXT: v_or_b32_e32 v10, v11, v9 +; SDAG-NEXT: v_lshlrev_b64 v[8:9], v8, v[0:1] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v13 +; SDAG-NEXT: v_cndmask_b32_e64 v5, v5, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13 +; SDAG-NEXT: v_cndmask_b32_e64 v9, v4, v3, s[4:5] +; SDAG-NEXT: v_lshlrev_b64 v[3:4], v13, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v3, vcc +; SDAG-NEXT: v_or_b32_e32 v3, v4, v9 +; SDAG-NEXT: v_or_b32_e32 v2, v8, v2 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: s_andn2_b64 
s[10:11], s[10:11], exec +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v4, v14, v2 +; SDAG-NEXT: .LBB1_10: ; %Flow5 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: ; %bb.11: ; %itofp-sw-bb +; SDAG-NEXT: v_lshlrev_b64 v[4:5], 1, v[0:1] +; SDAG-NEXT: ; %bb.12: ; %itofp-sw-epilog +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: v_lshrrev_b32_e32 v0, 2, v4 +; SDAG-NEXT: v_and_or_b32 v0, v0, 1, v4 +; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 +; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc +; SDAG-NEXT: v_and_b32_e32 v2, 0x4000000, v0 +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SDAG-NEXT: v_alignbit_b32 v9, v1, v0, 2 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: ; %bb.13: ; %itofp-if-then20 +; SDAG-NEXT: v_alignbit_b32 v9, v1, v0, 3 +; SDAG-NEXT: v_mov_b32_e32 v6, v7 +; SDAG-NEXT: ; %bb.14: ; %Flow +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB1_15: ; %Flow7 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v9 +; SDAG-NEXT: v_lshl_or_b32 v0, v6, 23, v0 +; SDAG-NEXT: v_add_u32_e32 v4, 1.0, v0 +; SDAG-NEXT: .LBB1_16: ; %Flow8 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: uitofp_i128_to_f32: +; GISEL: ; %bb.0: ; %itofp-entry +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v4, v2 +; GISEL-NEXT: v_mov_b32_e32 v5, v3 +; GISEL-NEXT: v_or_b32_e32 v2, v0, v4 +; GISEL-NEXT: v_or_b32_e32 v3, v1, v5 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_cbranch_execz .LBB1_16 +; GISEL-NEXT: ; %bb.1: ; %itofp-if-end +; GISEL-NEXT: v_ffbh_u32_e32 v3, v0 +; GISEL-NEXT: v_ffbh_u32_e32 v2, v1 +; GISEL-NEXT: v_add_u32_e32 v3, 32, v3 +; GISEL-NEXT: v_ffbh_u32_e32 v6, v4 +; GISEL-NEXT: 
v_min_u32_e32 v2, v2, v3 +; GISEL-NEXT: v_ffbh_u32_e32 v3, v5 +; GISEL-NEXT: v_add_u32_e32 v6, 32, v6 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GISEL-NEXT: v_add_u32_e32 v2, 64, v2 +; GISEL-NEXT: v_min_u32_e32 v3, v3, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v3, v2, vcc +; GISEL-NEXT: v_sub_u32_e32 v10, 0x7f, v12 +; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v10 +; GISEL-NEXT: ; implicit-def: $vgpr2 +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GISEL-NEXT: ; %bb.2: ; %itofp-if-else +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff98, v12 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc +; GISEL-NEXT: ; implicit-def: $vgpr0 +; GISEL-NEXT: ; implicit-def: $vgpr12 +; GISEL-NEXT: ; implicit-def: $vgpr4 +; GISEL-NEXT: ; %bb.3: ; %Flow6 +; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB1_15 +; GISEL-NEXT: ; %bb.4: ; %NodeBlock +; GISEL-NEXT: v_sub_u32_e32 v11, 0x80, v12 +; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v11 +; GISEL-NEXT: s_mov_b64 s[10:11], 0 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[12:13] +; GISEL-NEXT: ; %bb.5: ; %LeafBlock1 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v11 +; GISEL-NEXT: s_andn2_b64 s[4:5], 0, exec +; GISEL-NEXT: s_and_b64 s[14:15], exec, vcc +; GISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] +; GISEL-NEXT: ; %bb.6: ; %Flow3 +; GISEL-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] +; GISEL-NEXT: ; %bb.7: ; %LeafBlock +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 25, v11 +; GISEL-NEXT: s_andn2_b64 s[10:11], 0, exec +; GISEL-NEXT: s_and_b64 s[14:15], exec, -1 +; GISEL-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] +; GISEL-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; GISEL-NEXT: s_and_b64 s[14:15], exec, vcc +; GISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] +; GISEL-NEXT: ; %bb.8: ; %Flow4 +; GISEL-NEXT: s_or_b64 exec, 
exec, s[12:13] +; GISEL-NEXT: v_mov_b32_e32 v9, v3 +; GISEL-NEXT: v_mov_b32_e32 v7, v1 +; GISEL-NEXT: v_mov_b32_e32 v6, v0 +; GISEL-NEXT: v_mov_b32_e32 v8, v2 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[12:13] +; GISEL-NEXT: s_cbranch_execz .LBB1_10 +; GISEL-NEXT: ; %bb.9: ; %itofp-sw-default +; GISEL-NEXT: v_sub_u32_e32 v8, 0x66, v12 +; GISEL-NEXT: v_sub_u32_e32 v6, 64, v8 +; GISEL-NEXT: v_lshrrev_b64 v[2:3], v8, v[0:1] +; GISEL-NEXT: v_lshlrev_b64 v[6:7], v6, v[4:5] +; GISEL-NEXT: v_subrev_u32_e32 v9, 64, v8 +; GISEL-NEXT: v_or_b32_e32 v6, v2, v6 +; GISEL-NEXT: v_or_b32_e32 v7, v3, v7 +; GISEL-NEXT: v_lshrrev_b64 v[2:3], v9, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8 +; GISEL-NEXT: v_add_u32_e32 v12, 26, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_sub_u32_e32 v8, 64, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc +; GISEL-NEXT: v_lshrrev_b64 v[2:3], v12, -1 +; GISEL-NEXT: v_lshlrev_b64 v[8:9], v8, -1 +; GISEL-NEXT: v_subrev_u32_e32 v13, 64, v12 +; GISEL-NEXT: v_or_b32_e32 v14, v2, v8 +; GISEL-NEXT: v_or_b32_e32 v15, v3, v9 +; GISEL-NEXT: v_lshrrev_b64 v[8:9], v13, -1 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v8, v14, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, -1, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v2, v2, v4 +; GISEL-NEXT: v_and_b32_e32 v3, v3, v5 +; GISEL-NEXT: v_and_or_b32 v2, v8, v0, v2 +; GISEL-NEXT: v_and_or_b32 v3, v9, v1, v3 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: s_andn2_b64 s[4:5], s[10:11], exec +; GISEL-NEXT: 
v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: s_and_b64 s[10:11], exec, 0 +; GISEL-NEXT: v_or_b32_e32 v6, v6, v2 +; GISEL-NEXT: s_or_b64 s[10:11], s[4:5], s[10:11] +; GISEL-NEXT: .LBB1_10: ; %Flow5 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: ; %bb.11: ; %itofp-sw-bb +; GISEL-NEXT: v_lshlrev_b64 v[6:7], 1, v[0:1] +; GISEL-NEXT: ; %bb.12: ; %itofp-sw-epilog +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: v_bfe_u32 v0, v6, 2, 1 +; GISEL-NEXT: v_or_b32_e32 v0, v6, v0 +; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 +; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc +; GISEL-NEXT: v_lshrrev_b64 v[2:3], 2, v[0:1] +; GISEL-NEXT: v_and_b32_e32 v3, 0x4000000, v0 +; GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[3:4] +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: ; %bb.13: ; %itofp-if-then20 +; GISEL-NEXT: v_lshrrev_b64 v[2:3], 3, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v10, v11 +; GISEL-NEXT: ; %bb.14: ; %Flow +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB1_15: ; %Flow7 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: v_lshl_add_u32 v0, v10, 23, 1.0 +; GISEL-NEXT: v_mov_b32_e32 v1, 0x7fffff +; GISEL-NEXT: v_and_or_b32 v2, v2, v1, v0 +; GISEL-NEXT: .LBB1_16: ; %Flow8 +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: v_mov_b32_e32 v0, v2 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %cvt = uitofp i128 %x to float + ret float %cvt +} + +define double @sitofp_i128_to_f64(i128 %x) { +; SDAG-LABEL: sitofp_i128_to_f64: +; SDAG: ; %bb.0: ; %itofp-entry +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v5, v1 +; SDAG-NEXT: v_mov_b32_e32 v4, v0 +; SDAG-NEXT: v_or_b32_e32 v1, v5, v3 +; SDAG-NEXT: v_or_b32_e32 v0, v4, v2 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SDAG-NEXT: s_cbranch_execz .LBB2_16 
+; SDAG-NEXT: ; %bb.1: ; %itofp-if-end +; SDAG-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; SDAG-NEXT: v_xor_b32_e32 v4, v0, v4 +; SDAG-NEXT: v_xor_b32_e32 v5, v0, v5 +; SDAG-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v0 +; SDAG-NEXT: v_xor_b32_e32 v2, v0, v2 +; SDAG-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v0, vcc +; SDAG-NEXT: v_xor_b32_e32 v1, v0, v3 +; SDAG-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v0, vcc +; SDAG-NEXT: v_subb_co_u32_e32 v7, vcc, v1, v0, vcc +; SDAG-NEXT: v_ffbh_u32_e32 v0, v6 +; SDAG-NEXT: v_add_u32_e32 v0, 32, v0 +; SDAG-NEXT: v_ffbh_u32_e32 v1, v7 +; SDAG-NEXT: v_min_u32_e32 v0, v0, v1 +; SDAG-NEXT: v_ffbh_u32_e32 v1, v4 +; SDAG-NEXT: v_add_u32_e32 v1, 32, v1 +; SDAG-NEXT: v_ffbh_u32_e32 v2, v5 +; SDAG-NEXT: v_min_u32_e32 v1, v1, v2 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; SDAG-NEXT: v_add_u32_e32 v1, 64, v1 +; SDAG-NEXT: v_cndmask_b32_e32 v11, v1, v0, vcc +; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v11 +; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 54, v2 +; SDAG-NEXT: ; implicit-def: $vgpr8 +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SDAG-NEXT: ; %bb.2: ; %itofp-if-else +; SDAG-NEXT: v_add_u32_e32 v6, 0xffffffb5, v11 +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; SDAG-NEXT: ; implicit-def: $vgpr11 +; SDAG-NEXT: ; %bb.3: ; %Flow6 +; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB2_15 +; SDAG-NEXT: ; %bb.4: ; %NodeBlock +; SDAG-NEXT: v_sub_u32_e32 v10, 0x80, v11 +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 54, v10 +; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: s_mov_b64 s[4:5], 0 +; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc +; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[12:13] +; SDAG-NEXT: ; %bb.5: ; %LeafBlock1 +; SDAG-NEXT: 
v_cmp_ne_u32_e32 vcc, 55, v10 +; SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; SDAG-NEXT: ; %bb.6: ; %Flow3 +; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] +; SDAG-NEXT: ; %bb.7: ; %LeafBlock +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 54, v10 +; SDAG-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; SDAG-NEXT: s_and_b64 s[14:15], vcc, exec +; SDAG-NEXT: s_mov_b64 s[10:11], exec +; SDAG-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] +; SDAG-NEXT: ; %bb.8: ; %Flow4 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: v_mov_b32_e32 v9, v7 +; SDAG-NEXT: v_mov_b32_e32 v1, v5 +; SDAG-NEXT: v_mov_b32_e32 v8, v6 +; SDAG-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[12:13] +; SDAG-NEXT: s_cbranch_execz .LBB2_10 +; SDAG-NEXT: ; %bb.9: ; %itofp-sw-default +; SDAG-NEXT: v_sub_u32_e32 v12, 0x49, v11 +; SDAG-NEXT: v_sub_u32_e32 v8, 64, v12 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v12, v[4:5] +; SDAG-NEXT: v_lshlrev_b64 v[8:9], v8, v[6:7] +; SDAG-NEXT: v_sub_u32_e32 v13, 9, v11 +; SDAG-NEXT: v_or_b32_e32 v9, v1, v9 +; SDAG-NEXT: v_or_b32_e32 v8, v0, v8 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v13, v[6:7] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 +; SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; SDAG-NEXT: v_lshrrev_b64 v[8:9], v12, v[6:7] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 +; SDAG-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc +; SDAG-NEXT: v_add_u32_e32 v9, 55, v11 +; SDAG-NEXT: v_lshrrev_b64 v[12:13], v13, v[4:5] +; SDAG-NEXT: v_lshlrev_b64 v[14:15], v9, v[6:7] +; SDAG-NEXT: v_add_u32_e32 v11, -9, v11 +; SDAG-NEXT: v_or_b32_e32 v14, v14, v12 +; SDAG-NEXT: v_lshlrev_b64 v[11:12], v11, v[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc +; SDAG-NEXT: v_or_b32_e32 v13, v15, v13 +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v9 +; SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc +; 
SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9 +; SDAG-NEXT: v_cndmask_b32_e64 v13, v12, v7, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v14, v11, v14, vcc +; SDAG-NEXT: v_lshlrev_b64 v[11:12], v9, v[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v14, v6, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc +; SDAG-NEXT: v_or_b32_e32 v12, v12, v13 +; SDAG-NEXT: v_or_b32_e32 v11, v11, v9 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[11:12] +; SDAG-NEXT: s_andn2_b64 s[10:11], s[10:11], exec +; SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v0, v0, v9 +; SDAG-NEXT: .LBB2_10: ; %Flow5 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: ; %bb.11: ; %itofp-sw-bb +; SDAG-NEXT: v_lshlrev_b64 v[8:9], 1, v[6:7] +; SDAG-NEXT: v_lshrrev_b32_e32 v6, 31, v5 +; SDAG-NEXT: v_lshlrev_b64 v[0:1], 1, v[4:5] +; SDAG-NEXT: v_or_b32_e32 v8, v8, v6 +; SDAG-NEXT: ; %bb.12: ; %itofp-sw-epilog +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: v_lshrrev_b32_e32 v4, 2, v0 +; SDAG-NEXT: v_and_or_b32 v0, v4, 1, v0 +; SDAG-NEXT: v_add_co_u32_e32 v4, vcc, 1, v0 +; SDAG-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc +; SDAG-NEXT: v_lshrrev_b64 v[0:1], 2, v[4:5] +; SDAG-NEXT: v_lshlrev_b32_e32 v7, 30, v6 +; SDAG-NEXT: v_or_b32_e32 v8, v1, v7 +; SDAG-NEXT: v_and_b32_e32 v1, 0x800000, v5 +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: ; %bb.13: ; %itofp-if-then20 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], 3, v[4:5] +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 29, v6 +; SDAG-NEXT: v_or_b32_e32 v8, v1, v2 +; SDAG-NEXT: v_mov_b32_e32 v2, v10 +; SDAG-NEXT: ; %bb.14: ; %Flow +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB2_15: ; %Flow7 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_and_b32_e32 v1, 0x80000000, v3 +; SDAG-NEXT: v_mov_b32_e32 v3, 0x3ff00000 +; SDAG-NEXT: v_lshl_add_u32 
v2, v2, 20, v3 +; SDAG-NEXT: v_and_b32_e32 v3, 0xfffff, v8 +; SDAG-NEXT: v_or3_b32 v1, v3, v1, v2 +; SDAG-NEXT: .LBB2_16: ; %Flow8 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: sitofp_i128_to_f64: +; GISEL: ; %bb.0: ; %itofp-entry +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v4, v0 +; GISEL-NEXT: v_mov_b32_e32 v5, v1 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_or_b32_e32 v0, v4, v2 +; GISEL-NEXT: v_or_b32_e32 v1, v5, v3 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_cbranch_execz .LBB2_16 +; GISEL-NEXT: ; %bb.1: ; %itofp-if-end +; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v3 +; GISEL-NEXT: v_xor_b32_e32 v0, v8, v4 +; GISEL-NEXT: v_xor_b32_e32 v1, v8, v5 +; GISEL-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_xor_b32_e32 v2, v8, v2 +; GISEL-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc +; GISEL-NEXT: v_xor_b32_e32 v3, v8, v3 +; GISEL-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v8, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v5, v0 +; GISEL-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v8, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v4, v1 +; GISEL-NEXT: v_add_u32_e32 v5, 32, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v6, v2 +; GISEL-NEXT: v_min_u32_e32 v4, v4, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v5, v3 +; GISEL-NEXT: v_add_u32_e32 v6, 32, v6 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_add_u32_e32 v4, 64, v4 +; GISEL-NEXT: v_min_u32_e32 v5, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v11, v5, v4, vcc +; GISEL-NEXT: v_sub_u32_e32 v9, 0x7f, v11 +; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 53, v9 +; GISEL-NEXT: ; implicit-def: $vgpr6 +; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GISEL-NEXT: ; %bb.2: ; %itofp-if-else +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffffb5, v11 +; GISEL-NEXT: 
v_lshlrev_b64 v[0:1], v2, v[0:1] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc +; GISEL-NEXT: ; implicit-def: $vgpr0 +; GISEL-NEXT: ; implicit-def: $vgpr11 +; GISEL-NEXT: ; %bb.3: ; %Flow6 +; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB2_15 +; GISEL-NEXT: ; %bb.4: ; %NodeBlock +; GISEL-NEXT: v_sub_u32_e32 v10, 0x80, v11 +; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 55, v10 +; GISEL-NEXT: s_mov_b64 s[10:11], 0 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[12:13] +; GISEL-NEXT: ; %bb.5: ; %LeafBlock1 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 55, v10 +; GISEL-NEXT: s_andn2_b64 s[4:5], 0, exec +; GISEL-NEXT: s_and_b64 s[14:15], exec, vcc +; GISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] +; GISEL-NEXT: ; %bb.6: ; %Flow3 +; GISEL-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] +; GISEL-NEXT: ; %bb.7: ; %LeafBlock +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 54, v10 +; GISEL-NEXT: s_andn2_b64 s[10:11], 0, exec +; GISEL-NEXT: s_and_b64 s[14:15], exec, -1 +; GISEL-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] +; GISEL-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; GISEL-NEXT: s_and_b64 s[14:15], exec, vcc +; GISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] +; GISEL-NEXT: ; %bb.8: ; %Flow4 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: v_mov_b32_e32 v7, v3 +; GISEL-NEXT: v_mov_b32_e32 v6, v2 +; GISEL-NEXT: v_mov_b32_e32 v5, v1 +; GISEL-NEXT: v_mov_b32_e32 v4, v0 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[12:13] +; GISEL-NEXT: s_cbranch_execz .LBB2_10 +; GISEL-NEXT: ; %bb.9: ; %itofp-sw-default +; GISEL-NEXT: v_sub_u32_e32 v14, 0x49, v11 +; GISEL-NEXT: v_sub_u32_e32 v6, 64, v14 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], v14, v[0:1] +; GISEL-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3] +; GISEL-NEXT: v_subrev_u32_e32 v15, 64, v14 +; GISEL-NEXT: 
v_or_b32_e32 v6, v4, v6 +; GISEL-NEXT: v_or_b32_e32 v7, v5, v7 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], v15, v[2:3] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 +; GISEL-NEXT: v_lshrrev_b64 v[12:13], v14, v[2:3] +; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GISEL-NEXT: v_add_u32_e32 v7, 55, v11 +; GISEL-NEXT: v_sub_u32_e32 v13, 64, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 +; GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v12, vcc +; GISEL-NEXT: v_lshrrev_b64 v[11:12], v7, -1 +; GISEL-NEXT: v_lshlrev_b64 v[13:14], v13, -1 +; GISEL-NEXT: v_subrev_u32_e32 v15, 64, v7 +; GISEL-NEXT: v_or_b32_e32 v16, v11, v13 +; GISEL-NEXT: v_or_b32_e32 v17, v12, v14 +; GISEL-NEXT: v_lshrrev_b64 v[13:14], v15, -1 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v4, v4, v0, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v14, v14, v17, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, v13, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v13, v14, -1, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v11, v11, v2 +; GISEL-NEXT: v_and_b32_e32 v12, v12, v3 +; GISEL-NEXT: v_and_or_b32 v11, v7, v0, v11 +; GISEL-NEXT: v_and_or_b32 v12, v13, v1, v12 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[11:12] +; GISEL-NEXT: s_andn2_b64 s[4:5], s[10:11], exec +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: s_and_b64 s[10:11], exec, 0 +; GISEL-NEXT: v_or_b32_e32 v4, v4, v7 +; GISEL-NEXT: s_or_b64 s[10:11], s[4:5], s[10:11] +; GISEL-NEXT: .LBB2_10: ; %Flow5 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: ; %bb.11: ; %itofp-sw-bb +; GISEL-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; GISEL-NEXT: v_lshlrev_b64 v[4:5], 1, v[0:1] +; GISEL-NEXT: 
v_lshrrev_b32_e32 v0, 31, v1 +; GISEL-NEXT: v_or_b32_e32 v6, v2, v0 +; GISEL-NEXT: ; %bb.12: ; %itofp-sw-epilog +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: v_bfe_u32 v0, v4, 2, 1 +; GISEL-NEXT: v_or_b32_e32 v0, v4, v0 +; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 +; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc +; GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v6, vcc +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v6, 0 +; GISEL-NEXT: v_and_b32_e32 v7, 0x800000, v1 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GISEL-NEXT: v_lshl_or_b32 v6, v2, 30, v5 +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: ; %bb.13: ; %itofp-if-then20 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v9, v10 +; GISEL-NEXT: v_lshl_or_b32 v6, v2, 29, v5 +; GISEL-NEXT: ; %bb.14: ; %Flow +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB2_15: ; %Flow7 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: v_and_b32_e32 v0, 0x80000000, v8 +; GISEL-NEXT: v_mov_b32_e32 v1, 0x3ff00000 +; GISEL-NEXT: v_mov_b32_e32 v2, 0xfffff +; GISEL-NEXT: v_lshl_add_u32 v1, v9, 20, v1 +; GISEL-NEXT: v_and_or_b32 v2, v6, v2, v0 +; GISEL-NEXT: v_and_or_b32 v0, v4, -1, 0 +; GISEL-NEXT: v_or3_b32 v1, v2, v1, 0 +; GISEL-NEXT: .LBB2_16: ; %Flow8 +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: s_setpc_b64 s[30:31] + %cvt = sitofp i128 %x to double + ret double %cvt +} + +define double @uitofp_i128_to_f64(i128 %x) { +; SDAG-LABEL: uitofp_i128_to_f64: +; SDAG: ; %bb.0: ; %itofp-entry +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_or_b32_e32 v5, v1, v3 +; SDAG-NEXT: v_or_b32_e32 v4, v0, v2 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SDAG-NEXT: s_cbranch_execz .LBB3_16 +; SDAG-NEXT: ; %bb.1: ; %itofp-if-end +; SDAG-NEXT: v_ffbh_u32_e32 v4, v2 +; SDAG-NEXT: v_add_u32_e32 v4, 32, 
v4 +; SDAG-NEXT: v_ffbh_u32_e32 v5, v3 +; SDAG-NEXT: v_min_u32_e32 v4, v4, v5 +; SDAG-NEXT: v_ffbh_u32_e32 v5, v0 +; SDAG-NEXT: v_add_u32_e32 v5, 32, v5 +; SDAG-NEXT: v_ffbh_u32_e32 v6, v1 +; SDAG-NEXT: v_min_u32_e32 v5, v5, v6 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: v_add_u32_e32 v5, 64, v5 +; SDAG-NEXT: v_cndmask_b32_e32 v11, v5, v4, vcc +; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; SDAG-NEXT: v_sub_u32_e32 v9, 0x7f, v11 +; SDAG-NEXT: v_mov_b32_e32 v6, v1 +; SDAG-NEXT: v_mov_b32_e32 v8, v3 +; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 54, v9 +; SDAG-NEXT: v_mov_b32_e32 v5, v0 +; SDAG-NEXT: v_mov_b32_e32 v7, v2 +; SDAG-NEXT: ; implicit-def: $vgpr12 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SDAG-NEXT: ; %bb.2: ; %itofp-if-else +; SDAG-NEXT: v_add_u32_e32 v2, 0xffffffb5, v11 +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: ; implicit-def: $vgpr11 +; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 +; SDAG-NEXT: ; implicit-def: $vgpr7_vgpr8 +; SDAG-NEXT: ; %bb.3: ; %Flow6 +; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB3_15 +; SDAG-NEXT: ; %bb.4: ; %NodeBlock +; SDAG-NEXT: v_sub_u32_e32 v10, 0x80, v11 +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 54, v10 +; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: s_mov_b64 s[4:5], 0 +; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc +; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[12:13] +; SDAG-NEXT: ; %bb.5: ; %LeafBlock1 +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 55, v10 +; SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; SDAG-NEXT: ; %bb.6: ; %Flow3 +; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] +; SDAG-NEXT: ; %bb.7: ; %LeafBlock +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 54, v10 +; SDAG-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; 
SDAG-NEXT: s_and_b64 s[14:15], vcc, exec +; SDAG-NEXT: s_mov_b64 s[10:11], exec +; SDAG-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] +; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 +; SDAG-NEXT: ; implicit-def: $vgpr7_vgpr8 +; SDAG-NEXT: ; %bb.8: ; %Flow4 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[12:13] +; SDAG-NEXT: s_cbranch_execz .LBB3_10 +; SDAG-NEXT: ; %bb.9: ; %itofp-sw-default +; SDAG-NEXT: v_sub_u32_e32 v8, 0x49, v11 +; SDAG-NEXT: v_sub_u32_e32 v6, 64, v8 +; SDAG-NEXT: v_lshrrev_b64 v[4:5], v8, v[0:1] +; SDAG-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3] +; SDAG-NEXT: v_sub_u32_e32 v13, 9, v11 +; SDAG-NEXT: v_or_b32_e32 v7, v5, v7 +; SDAG-NEXT: v_or_b32_e32 v12, v4, v6 +; SDAG-NEXT: v_lshrrev_b64 v[4:5], v13, v[2:3] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8 +; SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 +; SDAG-NEXT: v_cndmask_b32_e64 v6, v5, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v7, v4, v12, vcc +; SDAG-NEXT: v_lshrrev_b64 v[4:5], v8, v[2:3] +; SDAG-NEXT: v_add_u32_e32 v16, 55, v11 +; SDAG-NEXT: v_cndmask_b32_e64 v8, v7, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; SDAG-NEXT: v_lshrrev_b64 v[12:13], v13, v[0:1] +; SDAG-NEXT: v_lshlrev_b64 v[14:15], v16, v[2:3] +; SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v4, vcc +; SDAG-NEXT: v_add_u32_e32 v4, -9, v11 +; SDAG-NEXT: v_lshlrev_b64 v[4:5], v4, v[0:1] +; SDAG-NEXT: v_or_b32_e32 v13, v15, v13 +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 +; SDAG-NEXT: v_or_b32_e32 v12, v14, v12 +; SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; SDAG-NEXT: v_cndmask_b32_e64 v11, v5, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v12, v4, v12, vcc +; SDAG-NEXT: v_lshlrev_b64 v[4:5], v16, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v12, v12, v2, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: 
v_or_b32_e32 v5, v5, v11 +; SDAG-NEXT: v_or_b32_e32 v4, v4, v12 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: s_andn2_b64 s[10:11], s[10:11], exec +; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v5, v8, v4 +; SDAG-NEXT: .LBB3_10: ; %Flow5 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: ; %bb.11: ; %itofp-sw-bb +; SDAG-NEXT: v_lshlrev_b64 v[7:8], 1, v[2:3] +; SDAG-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; SDAG-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] +; SDAG-NEXT: v_or_b32_e32 v7, v7, v2 +; SDAG-NEXT: ; %bb.12: ; %itofp-sw-epilog +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: v_lshrrev_b32_e32 v0, 2, v5 +; SDAG-NEXT: v_and_or_b32 v0, v0, 1, v5 +; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 +; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v7, vcc +; SDAG-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] +; SDAG-NEXT: v_and_b32_e32 v3, 0x800000, v1 +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SDAG-NEXT: v_alignbit_b32 v12, v2, v1, 2 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: ; %bb.13: ; %itofp-if-then20 +; SDAG-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] +; SDAG-NEXT: v_alignbit_b32 v12, v2, v1, 3 +; SDAG-NEXT: v_mov_b32_e32 v9, v10 +; SDAG-NEXT: ; %bb.14: ; %Flow +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB3_15: ; %Flow7 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v12 +; SDAG-NEXT: v_lshl_or_b32 v0, v9, 20, v0 +; SDAG-NEXT: v_add_u32_e32 v5, 0x3ff00000, v0 +; SDAG-NEXT: .LBB3_16: ; %Flow8 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: v_mov_b32_e32 v1, v5 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: uitofp_i128_to_f64: +; GISEL: ; %bb.0: ; %itofp-entry +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_or_b32_e32 v4, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v5, v1, v3 
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GISEL-NEXT: v_mov_b32_e32 v4, s4 +; GISEL-NEXT: v_mov_b32_e32 v5, s5 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_cbranch_execz .LBB3_16 +; GISEL-NEXT: ; %bb.1: ; %itofp-if-end +; GISEL-NEXT: v_ffbh_u32_e32 v5, v0 +; GISEL-NEXT: v_ffbh_u32_e32 v4, v1 +; GISEL-NEXT: v_add_u32_e32 v5, 32, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v6, v2 +; GISEL-NEXT: v_min_u32_e32 v4, v4, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v5, v3 +; GISEL-NEXT: v_add_u32_e32 v6, 32, v6 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_add_u32_e32 v4, 64, v4 +; GISEL-NEXT: v_min_u32_e32 v5, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v5, v4, vcc +; GISEL-NEXT: v_sub_u32_e32 v8, 0x7f, v10 +; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 53, v8 +; GISEL-NEXT: ; implicit-def: $vgpr6 +; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GISEL-NEXT: ; %bb.2: ; %itofp-if-else +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffffb5, v10 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc +; GISEL-NEXT: ; implicit-def: $vgpr0 +; GISEL-NEXT: ; implicit-def: $vgpr10 +; GISEL-NEXT: ; %bb.3: ; %Flow6 +; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB3_15 +; GISEL-NEXT: ; %bb.4: ; %NodeBlock +; GISEL-NEXT: v_sub_u32_e32 v9, 0x80, v10 +; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 55, v9 +; GISEL-NEXT: s_mov_b64 s[10:11], 0 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[12:13] +; GISEL-NEXT: ; %bb.5: ; %LeafBlock1 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 55, v9 +; GISEL-NEXT: s_andn2_b64 s[4:5], 0, exec +; GISEL-NEXT: s_and_b64 s[14:15], exec, vcc +; GISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] +; GISEL-NEXT: ; %bb.6: ; %Flow3 +; GISEL-NEXT: 
s_andn2_saveexec_b64 s[12:13], s[12:13] +; GISEL-NEXT: ; %bb.7: ; %LeafBlock +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 54, v9 +; GISEL-NEXT: s_andn2_b64 s[10:11], 0, exec +; GISEL-NEXT: s_and_b64 s[14:15], exec, -1 +; GISEL-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] +; GISEL-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; GISEL-NEXT: s_and_b64 s[14:15], exec, vcc +; GISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] +; GISEL-NEXT: ; %bb.8: ; %Flow4 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: v_mov_b32_e32 v7, v3 +; GISEL-NEXT: v_mov_b32_e32 v6, v2 +; GISEL-NEXT: v_mov_b32_e32 v5, v1 +; GISEL-NEXT: v_mov_b32_e32 v4, v0 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[12:13] +; GISEL-NEXT: s_cbranch_execz .LBB3_10 +; GISEL-NEXT: ; %bb.9: ; %itofp-sw-default +; GISEL-NEXT: v_sub_u32_e32 v13, 0x49, v10 +; GISEL-NEXT: v_sub_u32_e32 v6, 64, v13 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], v13, v[0:1] +; GISEL-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3] +; GISEL-NEXT: v_subrev_u32_e32 v14, 64, v13 +; GISEL-NEXT: v_lshrrev_b64 v[11:12], v13, v[2:3] +; GISEL-NEXT: v_or_b32_e32 v6, v4, v6 +; GISEL-NEXT: v_or_b32_e32 v7, v5, v7 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], v14, v[2:3] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v13 +; GISEL-NEXT: v_add_u32_e32 v14, 55, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v12, vcc +; GISEL-NEXT: v_sub_u32_e32 v12, 64, v14 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v11, vcc +; GISEL-NEXT: v_lshrrev_b64 v[10:11], v14, -1 +; GISEL-NEXT: v_lshlrev_b64 v[12:13], v12, -1 +; GISEL-NEXT: v_subrev_u32_e32 v15, 64, v14 +; GISEL-NEXT: v_or_b32_e32 v16, v10, v12 +; GISEL-NEXT: v_or_b32_e32 v17, v11, v13 +; GISEL-NEXT: v_lshrrev_b64 v[12:13], v15, -1 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v4, v4, v0, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, 
v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v12, v12, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v13, v13, -1, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v10, v10, v2 +; GISEL-NEXT: v_and_b32_e32 v11, v11, v3 +; GISEL-NEXT: v_and_or_b32 v10, v12, v0, v10 +; GISEL-NEXT: v_and_or_b32 v11, v13, v1, v11 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; GISEL-NEXT: s_andn2_b64 s[4:5], s[10:11], exec +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: s_and_b64 s[10:11], exec, 0 +; GISEL-NEXT: v_or_b32_e32 v4, v4, v10 +; GISEL-NEXT: s_or_b64 s[10:11], s[4:5], s[10:11] +; GISEL-NEXT: .LBB3_10: ; %Flow5 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: ; %bb.11: ; %itofp-sw-bb +; GISEL-NEXT: v_lshlrev_b64 v[6:7], 1, v[2:3] +; GISEL-NEXT: v_lshlrev_b64 v[4:5], 1, v[0:1] +; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v1 +; GISEL-NEXT: v_or_b32_e32 v6, v6, v0 +; GISEL-NEXT: ; %bb.12: ; %itofp-sw-epilog +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: v_bfe_u32 v0, v4, 2, 1 +; GISEL-NEXT: v_or_b32_e32 v0, v4, v0 +; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 +; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc +; GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v6, vcc +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v7, vcc +; GISEL-NEXT: v_mov_b32_e32 v5, 0 +; GISEL-NEXT: v_and_b32_e32 v6, 0x800000, v1 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[5:6] +; GISEL-NEXT: v_lshlrev_b64 v[5:6], 30, v[2:3] +; GISEL-NEXT: v_lshrrev_b32_e32 v6, 2, v1 +; GISEL-NEXT: v_or_b32_e32 v6, v6, v5 +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: ; %bb.13: ; %itofp-if-then20 +; GISEL-NEXT: v_lshlrev_b64 v[2:3], 29, v[2:3] +; 
GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] +; GISEL-NEXT: v_lshrrev_b32_e32 v0, 3, v1 +; GISEL-NEXT: v_or_b32_e32 v6, v0, v2 +; GISEL-NEXT: v_mov_b32_e32 v8, v9 +; GISEL-NEXT: ; %bb.14: ; %Flow +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB3_15: ; %Flow7 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: v_mov_b32_e32 v0, 0x3ff00000 +; GISEL-NEXT: v_lshl_add_u32 v0, v8, 20, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xfffff, v6 +; GISEL-NEXT: v_and_or_b32 v4, v4, -1, 0 +; GISEL-NEXT: v_or3_b32 v5, v1, v0, 0 +; GISEL-NEXT: .LBB3_16: ; %Flow8 +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: v_mov_b32_e32 v0, v4 +; GISEL-NEXT: v_mov_b32_e32 v1, v5 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %cvt = uitofp i128 %x to double + ret double %cvt +} + +define half @sitofp_i128_to_f16(i128 %x) { +; SDAG-LABEL: sitofp_i128_to_f16: +; SDAG: ; %bb.0: ; %itofp-entry +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_or_b32_e32 v5, v1, v3 +; SDAG-NEXT: v_or_b32_e32 v4, v0, v2 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SDAG-NEXT: s_cbranch_execz .LBB4_16 +; SDAG-NEXT: ; %bb.1: ; %itofp-if-end +; SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v3 +; SDAG-NEXT: v_xor_b32_e32 v0, v5, v0 +; SDAG-NEXT: v_xor_b32_e32 v1, v5, v1 +; SDAG-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v5 +; SDAG-NEXT: v_xor_b32_e32 v2, v5, v2 +; SDAG-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc +; SDAG-NEXT: v_xor_b32_e32 v6, v5, v3 +; SDAG-NEXT: v_subb_co_u32_e32 v4, vcc, v2, v5, vcc +; SDAG-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc +; SDAG-NEXT: v_ffbh_u32_e32 v2, v4 +; SDAG-NEXT: v_add_u32_e32 v2, 32, v2 +; SDAG-NEXT: v_ffbh_u32_e32 v6, v5 +; SDAG-NEXT: v_min_u32_e32 v2, v2, v6 +; SDAG-NEXT: v_ffbh_u32_e32 v6, v0 +; SDAG-NEXT: v_add_u32_e32 v6, 32, v6 +; SDAG-NEXT: v_ffbh_u32_e32 v7, v1 +; SDAG-NEXT: v_min_u32_e32 v6, v6, v7 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: v_add_u32_e32 v6, 
64, v6 +; SDAG-NEXT: v_cndmask_b32_e32 v9, v6, v2, vcc +; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v9 +; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v2 +; SDAG-NEXT: ; implicit-def: $vgpr6 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SDAG-NEXT: ; %bb.2: ; %itofp-if-else +; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff98, v9 +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 +; SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; SDAG-NEXT: ; %bb.3: ; %Flow6 +; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB4_15 +; SDAG-NEXT: ; %bb.4: ; %NodeBlock +; SDAG-NEXT: v_sub_u32_e32 v8, 0x80, v9 +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v8 +; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: s_mov_b64 s[4:5], 0 +; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc +; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[12:13] +; SDAG-NEXT: ; %bb.5: ; %LeafBlock1 +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v8 +; SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; SDAG-NEXT: ; %bb.6: ; %Flow3 +; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] +; SDAG-NEXT: ; %bb.7: ; %LeafBlock +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 25, v8 +; SDAG-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; SDAG-NEXT: s_and_b64 s[14:15], vcc, exec +; SDAG-NEXT: s_mov_b64 s[10:11], exec +; SDAG-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] +; SDAG-NEXT: ; %bb.8: ; %Flow4 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: v_mov_b32_e32 v7, v1 +; SDAG-NEXT: v_mov_b32_e32 v6, v0 +; SDAG-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[12:13] +; SDAG-NEXT: s_cbranch_execz .LBB4_10 +; SDAG-NEXT: ; %bb.9: ; %itofp-sw-default +; SDAG-NEXT: v_sub_u32_e32 v12, 0x66, v9 +; SDAG-NEXT: v_sub_u32_e32 v10, 64, v12 +; SDAG-NEXT: v_lshrrev_b64 v[6:7], v12, v[0:1] +; SDAG-NEXT: v_lshlrev_b64 v[10:11], v10, v[4:5] +; 
SDAG-NEXT: v_sub_u32_e32 v13, 38, v9 +; SDAG-NEXT: v_or_b32_e32 v11, v7, v11 +; SDAG-NEXT: v_or_b32_e32 v10, v6, v10 +; SDAG-NEXT: v_lshrrev_b64 v[6:7], v13, v[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 +; SDAG-NEXT: v_add_u32_e32 v14, 26, v9 +; SDAG-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 +; SDAG-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc +; SDAG-NEXT: v_lshrrev_b64 v[10:11], v13, v[0:1] +; SDAG-NEXT: v_lshlrev_b64 v[12:13], v14, v[4:5] +; SDAG-NEXT: v_subrev_u32_e32 v9, 38, v9 +; SDAG-NEXT: v_cndmask_b32_e64 v15, v6, v0, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v6, v13, v11 +; SDAG-NEXT: v_or_b32_e32 v11, v12, v10 +; SDAG-NEXT: v_lshlrev_b64 v[9:10], v9, v[0:1] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 +; SDAG-NEXT: v_cndmask_b32_e64 v7, v7, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 +; SDAG-NEXT: v_cndmask_b32_e64 v10, v6, v5, s[4:5] +; SDAG-NEXT: v_lshlrev_b64 v[5:6], v14, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc +; SDAG-NEXT: v_or_b32_e32 v5, v6, v10 +; SDAG-NEXT: v_or_b32_e32 v4, v9, v4 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: s_andn2_b64 s[10:11], s[10:11], exec +; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v6, v15, v4 +; SDAG-NEXT: .LBB4_10: ; %Flow5 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: ; %bb.11: ; %itofp-sw-bb +; SDAG-NEXT: v_lshlrev_b64 v[6:7], 1, v[0:1] +; SDAG-NEXT: ; %bb.12: ; %itofp-sw-epilog +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: v_lshrrev_b32_e32 v0, 2, v6 +; SDAG-NEXT: v_and_or_b32 v0, v0, 1, v6 +; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 +; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc +; SDAG-NEXT: v_and_b32_e32 v4, 0x4000000, v0 +; SDAG-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v4 +; SDAG-NEXT: v_alignbit_b32 v6, v1, v0, 2 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: ; %bb.13: ; %itofp-if-then20 +; SDAG-NEXT: v_alignbit_b32 v6, v1, v0, 3 +; SDAG-NEXT: v_mov_b32_e32 v2, v8 +; SDAG-NEXT: ; %bb.14: ; %Flow +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB4_15: ; %Flow7 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_and_b32_e32 v0, 0x80000000, v3 +; SDAG-NEXT: v_lshl_add_u32 v1, v2, 23, 1.0 +; SDAG-NEXT: v_and_b32_e32 v2, 0x7fffff, v6 +; SDAG-NEXT: v_or3_b32 v0, v2, v0, v1 +; SDAG-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SDAG-NEXT: .LBB4_16: ; %Flow8 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: sitofp_i128_to_f16: +; GISEL: ; %bb.0: ; %itofp-entry +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_or_b32_e32 v4, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v5, v1, v3 +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GISEL-NEXT: v_mov_b32_e32 v4, s4 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_cbranch_execz .LBB4_16 +; GISEL-NEXT: ; %bb.1: ; %itofp-if-end +; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v3 +; GISEL-NEXT: v_xor_b32_e32 v0, v8, v0 +; GISEL-NEXT: v_xor_b32_e32 v1, v8, v1 +; GISEL-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_xor_b32_e32 v2, v8, v2 +; GISEL-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc +; GISEL-NEXT: v_xor_b32_e32 v3, v8, v3 +; GISEL-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v8, vcc +; GISEL-NEXT: v_subb_co_u32_e32 v7, vcc, v3, v8, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v3, v0 +; GISEL-NEXT: v_ffbh_u32_e32 v2, v1 +; GISEL-NEXT: v_add_u32_e32 v3, 32, v3 +; GISEL-NEXT: v_ffbh_u32_e32 v4, v6 +; GISEL-NEXT: v_min_u32_e32 v2, v2, v3 +; GISEL-NEXT: v_ffbh_u32_e32 v3, v7 +; GISEL-NEXT: v_add_u32_e32 v4, 32, v4 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v2, 64, v2 +; GISEL-NEXT: v_min_u32_e32 v3, v3, 
v4 +; GISEL-NEXT: v_cndmask_b32_e32 v11, v3, v2, vcc +; GISEL-NEXT: v_sub_u32_e32 v9, 0x7f, v11 +; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v9 +; GISEL-NEXT: ; implicit-def: $vgpr2 +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GISEL-NEXT: ; %bb.2: ; %itofp-if-else +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff98, v11 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc +; GISEL-NEXT: ; implicit-def: $vgpr0 +; GISEL-NEXT: ; implicit-def: $vgpr11 +; GISEL-NEXT: ; implicit-def: $vgpr6 +; GISEL-NEXT: ; %bb.3: ; %Flow6 +; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB4_15 +; GISEL-NEXT: ; %bb.4: ; %NodeBlock +; GISEL-NEXT: v_sub_u32_e32 v10, 0x80, v11 +; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v10 +; GISEL-NEXT: s_mov_b64 s[10:11], 0 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[12:13] +; GISEL-NEXT: ; %bb.5: ; %LeafBlock1 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v10 +; GISEL-NEXT: s_andn2_b64 s[4:5], 0, exec +; GISEL-NEXT: s_and_b64 s[14:15], exec, vcc +; GISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] +; GISEL-NEXT: ; %bb.6: ; %Flow3 +; GISEL-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] +; GISEL-NEXT: ; %bb.7: ; %LeafBlock +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 25, v10 +; GISEL-NEXT: s_andn2_b64 s[10:11], 0, exec +; GISEL-NEXT: s_and_b64 s[14:15], exec, -1 +; GISEL-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] +; GISEL-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; GISEL-NEXT: s_and_b64 s[14:15], exec, vcc +; GISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] +; GISEL-NEXT: ; %bb.8: ; %Flow4 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: v_mov_b32_e32 v5, v3 +; GISEL-NEXT: v_mov_b32_e32 v4, v2 +; GISEL-NEXT: v_mov_b32_e32 v3, v1 +; GISEL-NEXT: v_mov_b32_e32 v2, v0 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; GISEL-NEXT: 
s_xor_b64 s[12:13], exec, s[12:13] +; GISEL-NEXT: s_cbranch_execz .LBB4_10 +; GISEL-NEXT: ; %bb.9: ; %itofp-sw-default +; GISEL-NEXT: v_sub_u32_e32 v12, 0x66, v11 +; GISEL-NEXT: v_sub_u32_e32 v4, 64, v12 +; GISEL-NEXT: v_lshrrev_b64 v[2:3], v12, v[0:1] +; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7] +; GISEL-NEXT: v_subrev_u32_e32 v13, 64, v12 +; GISEL-NEXT: v_or_b32_e32 v4, v2, v4 +; GISEL-NEXT: v_or_b32_e32 v5, v3, v5 +; GISEL-NEXT: v_lshrrev_b64 v[2:3], v13, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v13, 26, v11 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 +; GISEL-NEXT: v_sub_u32_e32 v11, 64, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], v13, -1 +; GISEL-NEXT: v_lshlrev_b64 v[11:12], v11, -1 +; GISEL-NEXT: v_subrev_u32_e32 v14, 64, v13 +; GISEL-NEXT: v_or_b32_e32 v15, v4, v11 +; GISEL-NEXT: v_or_b32_e32 v16, v5, v12 +; GISEL-NEXT: v_lshrrev_b64 v[11:12], v14, -1 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v12, -1, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v4, v4, v6 +; GISEL-NEXT: v_and_b32_e32 v5, v5, v7 +; GISEL-NEXT: v_and_or_b32 v4, v11, v0, v4 +; GISEL-NEXT: v_and_or_b32 v5, v12, v1, v5 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GISEL-NEXT: s_andn2_b64 s[4:5], s[10:11], exec +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: s_and_b64 s[10:11], exec, 0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v4 +; GISEL-NEXT: s_or_b64 s[10:11], s[4:5], s[10:11] +; GISEL-NEXT: .LBB4_10: ; %Flow5 +; 
GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: ; %bb.11: ; %itofp-sw-bb +; GISEL-NEXT: v_lshlrev_b64 v[2:3], 1, v[0:1] +; GISEL-NEXT: ; %bb.12: ; %itofp-sw-epilog +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: v_bfe_u32 v0, v2, 2, 1 +; GISEL-NEXT: v_or_b32_e32 v0, v2, v0 +; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 +; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GISEL-NEXT: v_lshrrev_b64 v[2:3], 2, v[0:1] +; GISEL-NEXT: v_and_b32_e32 v3, 0x4000000, v0 +; GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[3:4] +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: ; %bb.13: ; %itofp-if-then20 +; GISEL-NEXT: v_lshrrev_b64 v[2:3], 3, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v9, v10 +; GISEL-NEXT: ; %bb.14: ; %Flow +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB4_15: ; %Flow7 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: v_and_b32_e32 v0, 0x80000000, v8 +; GISEL-NEXT: v_lshl_add_u32 v1, v9, 23, 1.0 +; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v2 +; GISEL-NEXT: v_or3_b32 v0, v2, v0, v1 +; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GISEL-NEXT: .LBB4_16: ; %Flow8 +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: v_mov_b32_e32 v0, v4 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %cvt = sitofp i128 %x to half + ret half %cvt +} + +define half @uitofp_i128_to_f16(i128 %x) { +; SDAG-LABEL: uitofp_i128_to_f16: +; SDAG: ; %bb.0: ; %itofp-entry +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_or_b32_e32 v5, v1, v3 +; SDAG-NEXT: v_or_b32_e32 v4, v0, v2 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SDAG-NEXT: s_cbranch_execz .LBB5_16 +; SDAG-NEXT: ; %bb.1: ; %itofp-if-end +; SDAG-NEXT: v_ffbh_u32_e32 v4, v2 +; SDAG-NEXT: v_add_u32_e32 v4, 32, v4 +; SDAG-NEXT: v_ffbh_u32_e32 v5, v3 +; SDAG-NEXT: v_min_u32_e32 v4, v4, v5 +; SDAG-NEXT: v_ffbh_u32_e32 v5, v0 
+; SDAG-NEXT: v_add_u32_e32 v5, 32, v5 +; SDAG-NEXT: v_ffbh_u32_e32 v6, v1 +; SDAG-NEXT: v_min_u32_e32 v5, v5, v6 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: v_add_u32_e32 v5, 64, v5 +; SDAG-NEXT: v_cndmask_b32_e32 v8, v5, v4, vcc +; SDAG-NEXT: v_sub_u32_e32 v6, 0x7f, v8 +; SDAG-NEXT: v_mov_b32_e32 v5, v1 +; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v6 +; SDAG-NEXT: v_mov_b32_e32 v4, v0 +; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SDAG-NEXT: ; %bb.2: ; %itofp-if-else +; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff98, v8 +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v9, 0, v0, vcc +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: ; implicit-def: $vgpr8 +; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; SDAG-NEXT: ; %bb.3: ; %Flow6 +; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB5_15 +; SDAG-NEXT: ; %bb.4: ; %NodeBlock +; SDAG-NEXT: v_sub_u32_e32 v7, 0x80, v8 +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v7 +; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: s_mov_b64 s[4:5], 0 +; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc +; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[12:13] +; SDAG-NEXT: ; %bb.5: ; %LeafBlock1 +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v7 +; SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; SDAG-NEXT: ; %bb.6: ; %Flow3 +; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] +; SDAG-NEXT: ; %bb.7: ; %LeafBlock +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 25, v7 +; SDAG-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; SDAG-NEXT: s_and_b64 s[14:15], vcc, exec +; SDAG-NEXT: s_mov_b64 s[10:11], exec +; SDAG-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] +; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; SDAG-NEXT: ; %bb.8: ; %Flow4 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; SDAG-NEXT: s_xor_b64 s[12:13], exec, 
s[12:13] +; SDAG-NEXT: s_cbranch_execz .LBB5_10 +; SDAG-NEXT: ; %bb.9: ; %itofp-sw-default +; SDAG-NEXT: v_sub_u32_e32 v11, 0x66, v8 +; SDAG-NEXT: v_sub_u32_e32 v9, 64, v11 +; SDAG-NEXT: v_lshrrev_b64 v[4:5], v11, v[0:1] +; SDAG-NEXT: v_lshlrev_b64 v[9:10], v9, v[2:3] +; SDAG-NEXT: v_sub_u32_e32 v12, 38, v8 +; SDAG-NEXT: v_or_b32_e32 v10, v5, v10 +; SDAG-NEXT: v_or_b32_e32 v9, v4, v9 +; SDAG-NEXT: v_lshrrev_b64 v[4:5], v12, v[2:3] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v11 +; SDAG-NEXT: v_add_u32_e32 v13, 26, v8 +; SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v11 +; SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc +; SDAG-NEXT: v_lshrrev_b64 v[9:10], v12, v[0:1] +; SDAG-NEXT: v_lshlrev_b64 v[11:12], v13, v[2:3] +; SDAG-NEXT: v_subrev_u32_e32 v8, 38, v8 +; SDAG-NEXT: v_cndmask_b32_e64 v14, v4, v0, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v4, v12, v10 +; SDAG-NEXT: v_or_b32_e32 v10, v11, v9 +; SDAG-NEXT: v_lshlrev_b64 v[8:9], v8, v[0:1] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v13 +; SDAG-NEXT: v_cndmask_b32_e64 v5, v5, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13 +; SDAG-NEXT: v_cndmask_b32_e64 v9, v4, v3, s[4:5] +; SDAG-NEXT: v_lshlrev_b64 v[3:4], v13, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v3, vcc +; SDAG-NEXT: v_or_b32_e32 v3, v4, v9 +; SDAG-NEXT: v_or_b32_e32 v2, v8, v2 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: s_andn2_b64 s[10:11], s[10:11], exec +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v4, v14, v2 +; SDAG-NEXT: .LBB5_10: ; %Flow5 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: ; %bb.11: ; %itofp-sw-bb +; SDAG-NEXT: v_lshlrev_b64 v[4:5], 1, v[0:1] +; SDAG-NEXT: ; %bb.12: ; %itofp-sw-epilog +; SDAG-NEXT: s_or_b64 
exec, exec, s[4:5] +; SDAG-NEXT: v_lshrrev_b32_e32 v0, 2, v4 +; SDAG-NEXT: v_and_or_b32 v0, v0, 1, v4 +; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 +; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc +; SDAG-NEXT: v_and_b32_e32 v2, 0x4000000, v0 +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SDAG-NEXT: v_alignbit_b32 v9, v1, v0, 2 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: ; %bb.13: ; %itofp-if-then20 +; SDAG-NEXT: v_alignbit_b32 v9, v1, v0, 3 +; SDAG-NEXT: v_mov_b32_e32 v6, v7 +; SDAG-NEXT: ; %bb.14: ; %Flow +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB5_15: ; %Flow7 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v9 +; SDAG-NEXT: v_lshl_or_b32 v0, v6, 23, v0 +; SDAG-NEXT: v_add_u32_e32 v0, 1.0, v0 +; SDAG-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SDAG-NEXT: .LBB5_16: ; %Flow8 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: uitofp_i128_to_f16: +; GISEL: ; %bb.0: ; %itofp-entry +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v4, v2 +; GISEL-NEXT: v_mov_b32_e32 v5, v3 +; GISEL-NEXT: v_or_b32_e32 v2, v0, v4 +; GISEL-NEXT: v_or_b32_e32 v3, v1, v5 +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_cbranch_execz .LBB5_16 +; GISEL-NEXT: ; %bb.1: ; %itofp-if-end +; GISEL-NEXT: v_ffbh_u32_e32 v3, v0 +; GISEL-NEXT: v_ffbh_u32_e32 v2, v1 +; GISEL-NEXT: v_add_u32_e32 v3, 32, v3 +; GISEL-NEXT: v_ffbh_u32_e32 v6, v4 +; GISEL-NEXT: v_min_u32_e32 v2, v2, v3 +; GISEL-NEXT: v_ffbh_u32_e32 v3, v5 +; GISEL-NEXT: v_add_u32_e32 v6, 32, v6 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GISEL-NEXT: v_add_u32_e32 v2, 64, v2 +; GISEL-NEXT: v_min_u32_e32 v3, v3, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v3, v2, vcc +; GISEL-NEXT: v_sub_u32_e32 v10, 0x7f, v12 +; GISEL-NEXT: v_cmp_ge_i32_e32 
vcc, 24, v10 +; GISEL-NEXT: ; implicit-def: $vgpr2 +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GISEL-NEXT: ; %bb.2: ; %itofp-if-else +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff98, v12 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc +; GISEL-NEXT: ; implicit-def: $vgpr0 +; GISEL-NEXT: ; implicit-def: $vgpr12 +; GISEL-NEXT: ; implicit-def: $vgpr4 +; GISEL-NEXT: ; %bb.3: ; %Flow6 +; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB5_15 +; GISEL-NEXT: ; %bb.4: ; %NodeBlock +; GISEL-NEXT: v_sub_u32_e32 v11, 0x80, v12 +; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v11 +; GISEL-NEXT: s_mov_b64 s[10:11], 0 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[12:13] +; GISEL-NEXT: ; %bb.5: ; %LeafBlock1 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v11 +; GISEL-NEXT: s_andn2_b64 s[4:5], 0, exec +; GISEL-NEXT: s_and_b64 s[14:15], exec, vcc +; GISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] +; GISEL-NEXT: ; %bb.6: ; %Flow3 +; GISEL-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] +; GISEL-NEXT: ; %bb.7: ; %LeafBlock +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 25, v11 +; GISEL-NEXT: s_andn2_b64 s[10:11], 0, exec +; GISEL-NEXT: s_and_b64 s[14:15], exec, -1 +; GISEL-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] +; GISEL-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; GISEL-NEXT: s_and_b64 s[14:15], exec, vcc +; GISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] +; GISEL-NEXT: ; %bb.8: ; %Flow4 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: v_mov_b32_e32 v9, v3 +; GISEL-NEXT: v_mov_b32_e32 v7, v1 +; GISEL-NEXT: v_mov_b32_e32 v6, v0 +; GISEL-NEXT: v_mov_b32_e32 v8, v2 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[12:13] +; GISEL-NEXT: s_cbranch_execz .LBB5_10 +; GISEL-NEXT: ; %bb.9: ; %itofp-sw-default +; 
GISEL-NEXT: v_sub_u32_e32 v8, 0x66, v12 +; GISEL-NEXT: v_sub_u32_e32 v6, 64, v8 +; GISEL-NEXT: v_lshrrev_b64 v[2:3], v8, v[0:1] +; GISEL-NEXT: v_lshlrev_b64 v[6:7], v6, v[4:5] +; GISEL-NEXT: v_subrev_u32_e32 v9, 64, v8 +; GISEL-NEXT: v_or_b32_e32 v6, v2, v6 +; GISEL-NEXT: v_or_b32_e32 v7, v3, v7 +; GISEL-NEXT: v_lshrrev_b64 v[2:3], v9, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8 +; GISEL-NEXT: v_add_u32_e32 v12, 26, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_sub_u32_e32 v8, 64, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc +; GISEL-NEXT: v_lshrrev_b64 v[2:3], v12, -1 +; GISEL-NEXT: v_lshlrev_b64 v[8:9], v8, -1 +; GISEL-NEXT: v_subrev_u32_e32 v13, 64, v12 +; GISEL-NEXT: v_or_b32_e32 v14, v2, v8 +; GISEL-NEXT: v_or_b32_e32 v15, v3, v9 +; GISEL-NEXT: v_lshrrev_b64 v[8:9], v13, -1 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v8, v14, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, -1, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v2, v2, v4 +; GISEL-NEXT: v_and_b32_e32 v3, v3, v5 +; GISEL-NEXT: v_and_or_b32 v2, v8, v0, v2 +; GISEL-NEXT: v_and_or_b32 v3, v9, v1, v3 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: s_andn2_b64 s[4:5], s[10:11], exec +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: s_and_b64 s[10:11], exec, 0 +; GISEL-NEXT: v_or_b32_e32 v6, v6, v2 +; GISEL-NEXT: s_or_b64 s[10:11], s[4:5], s[10:11] +; GISEL-NEXT: .LBB5_10: ; %Flow5 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: ; %bb.11: ; %itofp-sw-bb +; GISEL-NEXT: 
v_lshlrev_b64 v[6:7], 1, v[0:1] +; GISEL-NEXT: ; %bb.12: ; %itofp-sw-epilog +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: v_bfe_u32 v0, v6, 2, 1 +; GISEL-NEXT: v_or_b32_e32 v0, v6, v0 +; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 +; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc +; GISEL-NEXT: v_lshrrev_b64 v[2:3], 2, v[0:1] +; GISEL-NEXT: v_and_b32_e32 v3, 0x4000000, v0 +; GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[3:4] +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: ; %bb.13: ; %itofp-if-then20 +; GISEL-NEXT: v_lshrrev_b64 v[2:3], 3, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v10, v11 +; GISEL-NEXT: ; %bb.14: ; %Flow +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB5_15: ; %Flow7 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: v_lshl_add_u32 v0, v10, 23, 1.0 +; GISEL-NEXT: v_mov_b32_e32 v1, 0x7fffff +; GISEL-NEXT: v_and_or_b32 v0, v2, v1, v0 +; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v0 +; GISEL-NEXT: .LBB5_16: ; %Flow8 +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: v_mov_b32_e32 v0, v2 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %cvt = uitofp i128 %x to half + ret half %cvt +} + +; FIXME: ExpandLargeFpConvert asserts on bfloat +; define bfloat @sitofp_i128_to_bf16(i128 %x) { +; %cvt = sitofp i128 %x to bfloat +; ret bfloat %cvt +; } + +; define bfloat @uitofp_i128_to_bf16(i128 %x) { +; %cvt = uitofp i128 %x to bfloat +; ret bfloat %cvt +; } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GCN: {{.*}} -- cgit v1.1 From 9b5d9a81bd2695443254be8489f4325fbb259776 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 15 Mar 2024 16:23:34 +0530 Subject: AMDGPU: Regenerate test checks from c7c561ef9 The test output changed after initial commit/test in 5f774619eac5db73398225a4c924a9c1d437fb40 --- llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index a229889..b2311a8 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -365,7 +365,7 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 ; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1 ; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 -; GISEL-NEXT: v_add_co_u32_e32 v3, vcc, 0x80000000, v1 +; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-NEXT: .LBB0_9: ; %Flow3 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] @@ -739,7 +739,7 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 ; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1 ; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 -; GISEL-NEXT: v_add_co_u32_e32 v3, vcc, 0x80000000, v1 +; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-NEXT: .LBB1_9: ; %Flow3 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] @@ -1099,7 +1099,7 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 ; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1 ; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 -; GISEL-NEXT: v_add_co_u32_e32 v3, vcc, 0x80000000, v1 +; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-NEXT: .LBB2_9: ; %Flow3 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] @@ -1459,7 +1459,7 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_or3_b32 v2, v2, v11, 
v12 ; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1 ; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 -; GISEL-NEXT: v_add_co_u32_e32 v3, vcc, 0x80000000, v1 +; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-NEXT: .LBB3_9: ; %Flow3 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] -- cgit v1.1 From 092999e70b349ac521cab2648152ababeb12873f Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 15 Mar 2024 14:08:43 +0000 Subject: [AMDGPU] Update checks in new test after #85370 --- llvm/test/CodeGen/AMDGPU/itofp.i128.ll | 2186 +++++++++++++++----------------- 1 file changed, 1001 insertions(+), 1185 deletions(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll index e4e8d52..bfeb214 100644 --- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll @@ -11,7 +11,7 @@ define float @sitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; SDAG-NEXT: v_mov_b32_e32 v4, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc -; SDAG-NEXT: s_cbranch_execz .LBB0_16 +; SDAG-NEXT: s_cbranch_execz .LBB0_14 ; SDAG-NEXT: ; %bb.1: ; %itofp-if-end ; SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v3 ; SDAG-NEXT: v_xor_b32_e32 v0, v5, v0 @@ -32,112 +32,100 @@ define float @sitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: v_min_u32_e32 v6, v6, v7 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; SDAG-NEXT: v_add_u32_e32 v6, 64, v6 -; SDAG-NEXT: v_cndmask_b32_e32 v9, v6, v2, vcc -; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v9 -; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v2 -; SDAG-NEXT: ; implicit-def: $vgpr6 +; SDAG-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc +; SDAG-NEXT: v_sub_u32_e32 v6, 0x80, v7 +; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v7 +; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v6 +; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SDAG-NEXT: ; %bb.2: ; %itofp-if-else -; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff98, 
v9 +; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff98, v7 ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] ; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 -; SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc +; SDAG-NEXT: ; implicit-def: $vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: ; implicit-def: $vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SDAG-NEXT: ; %bb.3: ; %Flow6 +; SDAG-NEXT: ; %bb.3: ; %Flow3 ; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB0_15 +; SDAG-NEXT: s_cbranch_execz .LBB0_13 ; SDAG-NEXT: ; %bb.4: ; %NodeBlock -; SDAG-NEXT: v_sub_u32_e32 v8, 0x80, v9 -; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v8 -; SDAG-NEXT: s_mov_b64 s[10:11], 0 -; SDAG-NEXT: s_mov_b64 s[4:5], 0 +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v6 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB0_8 +; SDAG-NEXT: ; %bb.5: ; %LeafBlock +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v6 ; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc -; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[12:13] -; SDAG-NEXT: ; %bb.5: ; %LeafBlock1 -; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v8 -; SDAG-NEXT: s_and_b64 s[4:5], vcc, exec -; SDAG-NEXT: ; %bb.6: ; %Flow3 -; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] -; SDAG-NEXT: ; %bb.7: ; %LeafBlock -; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 25, v8 -; SDAG-NEXT: s_andn2_b64 s[4:5], s[4:5], exec -; SDAG-NEXT: s_and_b64 s[14:15], vcc, exec -; SDAG-NEXT: s_mov_b64 s[10:11], exec -; SDAG-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] -; SDAG-NEXT: ; %bb.8: ; %Flow4 -; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] -; SDAG-NEXT: v_mov_b32_e32 v7, v1 -; SDAG-NEXT: v_mov_b32_e32 v6, v0 -; SDAG-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] -; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[12:13] -; SDAG-NEXT: s_cbranch_execz .LBB0_10 -; SDAG-NEXT: ; %bb.9: ; %itofp-sw-default -; SDAG-NEXT: v_sub_u32_e32 v12, 0x66, v9 +; 
SDAG-NEXT: s_cbranch_execz .LBB0_7 +; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default +; SDAG-NEXT: v_sub_u32_e32 v12, 0x66, v7 ; SDAG-NEXT: v_sub_u32_e32 v10, 64, v12 -; SDAG-NEXT: v_lshrrev_b64 v[6:7], v12, v[0:1] +; SDAG-NEXT: v_lshrrev_b64 v[8:9], v12, v[0:1] ; SDAG-NEXT: v_lshlrev_b64 v[10:11], v10, v[4:5] -; SDAG-NEXT: v_sub_u32_e32 v13, 38, v9 -; SDAG-NEXT: v_or_b32_e32 v11, v7, v11 -; SDAG-NEXT: v_or_b32_e32 v10, v6, v10 -; SDAG-NEXT: v_lshrrev_b64 v[6:7], v13, v[4:5] +; SDAG-NEXT: v_sub_u32_e32 v13, 38, v7 +; SDAG-NEXT: v_or_b32_e32 v11, v9, v11 +; SDAG-NEXT: v_or_b32_e32 v10, v8, v10 +; SDAG-NEXT: v_lshrrev_b64 v[8:9], v13, v[4:5] ; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 -; SDAG-NEXT: v_add_u32_e32 v14, 26, v9 -; SDAG-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc +; SDAG-NEXT: v_add_u32_e32 v14, 26, v7 +; SDAG-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 -; SDAG-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc ; SDAG-NEXT: v_lshrrev_b64 v[10:11], v13, v[0:1] ; SDAG-NEXT: v_lshlrev_b64 v[12:13], v14, v[4:5] -; SDAG-NEXT: v_subrev_u32_e32 v9, 38, v9 -; SDAG-NEXT: v_cndmask_b32_e64 v15, v6, v0, s[4:5] -; SDAG-NEXT: v_or_b32_e32 v6, v13, v11 -; SDAG-NEXT: v_or_b32_e32 v11, v12, v10 -; SDAG-NEXT: v_lshlrev_b64 v[9:10], v9, v[0:1] +; SDAG-NEXT: v_subrev_u32_e32 v7, 38, v7 +; SDAG-NEXT: v_cndmask_b32_e64 v15, v8, v0, s[4:5] +; SDAG-NEXT: v_lshlrev_b64 v[7:8], v7, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v11, v13, v11 +; SDAG-NEXT: v_or_b32_e32 v10, v12, v10 ; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 -; SDAG-NEXT: v_cndmask_b32_e64 v7, v7, v1, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v14, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 -; SDAG-NEXT: v_cndmask_b32_e64 v10, v6, v5, s[4:5] -; SDAG-NEXT: v_lshlrev_b64 v[5:6], v14, v[0:1] -; SDAG-NEXT: 
v_cndmask_b32_e32 v9, v9, v11, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc -; SDAG-NEXT: v_or_b32_e32 v5, v6, v10 -; SDAG-NEXT: v_or_b32_e32 v4, v9, v4 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; SDAG-NEXT: s_andn2_b64 s[10:11], s[10:11], exec -; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v6, v15, v4 -; SDAG-NEXT: .LBB0_10: ; %Flow5 +; SDAG-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SDAG-NEXT: v_or_b32_e32 v1, v1, v5 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v4 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v8, v15, v0 +; SDAG-NEXT: v_mov_b32_e32 v0, v8 +; SDAG-NEXT: v_mov_b32_e32 v1, v9 +; SDAG-NEXT: .LBB0_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], s[10:11] -; SDAG-NEXT: ; %bb.11: ; %itofp-sw-bb -; SDAG-NEXT: v_lshlrev_b64 v[6:7], 1, v[0:1] -; SDAG-NEXT: ; %bb.12: ; %itofp-sw-epilog +; SDAG-NEXT: .LBB0_8: ; %Flow2 +; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: ; %bb.9: ; %itofp-sw-bb +; SDAG-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: v_lshrrev_b32_e32 v0, 2, v6 -; SDAG-NEXT: v_and_or_b32 v0, v0, 1, v6 +; SDAG-NEXT: v_lshrrev_b32_e32 v4, 2, v0 +; SDAG-NEXT: v_and_or_b32 v0, v4, 1, v0 ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 -; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; SDAG-NEXT: v_and_b32_e32 v4, 0x4000000, v0 ; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SDAG-NEXT: v_alignbit_b32 v6, v1, v0, 2 +; SDAG-NEXT: v_alignbit_b32 v8, v1, v0, 2 ; SDAG-NEXT: 
s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: ; %bb.13: ; %itofp-if-then20 -; SDAG-NEXT: v_alignbit_b32 v6, v1, v0, 3 -; SDAG-NEXT: v_mov_b32_e32 v2, v8 -; SDAG-NEXT: ; %bb.14: ; %Flow +; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 +; SDAG-NEXT: v_alignbit_b32 v8, v1, v0, 3 +; SDAG-NEXT: v_mov_b32_e32 v2, v6 +; SDAG-NEXT: ; %bb.12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB0_15: ; %Flow7 +; SDAG-NEXT: .LBB0_13: ; %Flow4 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: v_and_b32_e32 v0, 0x80000000, v3 ; SDAG-NEXT: v_lshl_add_u32 v1, v2, 23, 1.0 -; SDAG-NEXT: v_and_b32_e32 v2, 0x7fffff, v6 +; SDAG-NEXT: v_and_b32_e32 v2, 0x7fffff, v8 ; SDAG-NEXT: v_or3_b32 v4, v2, v0, v1 -; SDAG-NEXT: .LBB0_16: ; %Flow8 +; SDAG-NEXT: .LBB0_14: ; %Flow5 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_mov_b32_e32 v0, v4 ; SDAG-NEXT: s_setpc_b64 s[30:31] @@ -151,144 +139,126 @@ define float @sitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: s_mov_b32 s4, 0 ; GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_cbranch_execz .LBB0_16 +; GISEL-NEXT: s_cbranch_execz .LBB0_14 ; GISEL-NEXT: ; %bb.1: ; %itofp-if-end -; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v3 -; GISEL-NEXT: v_xor_b32_e32 v0, v8, v0 -; GISEL-NEXT: v_xor_b32_e32 v1, v8, v1 -; GISEL-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v8 -; GISEL-NEXT: v_xor_b32_e32 v2, v8, v2 -; GISEL-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_xor_b32_e32 v3, v8, v3 -; GISEL-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v8, vcc -; GISEL-NEXT: v_subb_co_u32_e32 v7, vcc, v3, v8, vcc -; GISEL-NEXT: v_ffbh_u32_e32 v3, v0 -; GISEL-NEXT: v_ffbh_u32_e32 v2, v1 -; GISEL-NEXT: v_add_u32_e32 v3, 32, v3 -; GISEL-NEXT: v_ffbh_u32_e32 v4, v6 -; GISEL-NEXT: v_min_u32_e32 v2, v2, v3 -; GISEL-NEXT: v_ffbh_u32_e32 v3, v7 -; GISEL-NEXT: v_add_u32_e32 v4, 32, v4 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GISEL-NEXT: v_add_u32_e32 v2, 64, v2 -; GISEL-NEXT: v_min_u32_e32 v3, v3, v4 -; GISEL-NEXT: 
v_cndmask_b32_e32 v11, v3, v2, vcc -; GISEL-NEXT: v_sub_u32_e32 v9, 0x7f, v11 -; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v9 -; GISEL-NEXT: ; implicit-def: $vgpr2 +; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v3 +; GISEL-NEXT: v_xor_b32_e32 v0, v6, v0 +; GISEL-NEXT: v_xor_b32_e32 v1, v6, v1 +; GISEL-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v6 +; GISEL-NEXT: v_xor_b32_e32 v2, v6, v2 +; GISEL-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc +; GISEL-NEXT: v_xor_b32_e32 v3, v6, v3 +; GISEL-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v5, v0 +; GISEL-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v4, v1 +; GISEL-NEXT: v_add_u32_e32 v5, 32, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v7, v2 +; GISEL-NEXT: v_min_u32_e32 v4, v4, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v5, v3 +; GISEL-NEXT: v_add_u32_e32 v7, 32, v7 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_add_u32_e32 v4, 64, v4 +; GISEL-NEXT: v_min_u32_e32 v5, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc +; GISEL-NEXT: v_sub_u32_e32 v8, 0x80, v5 +; GISEL-NEXT: v_sub_u32_e32 v7, 0x7f, v5 +; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v8 +; GISEL-NEXT: ; implicit-def: $vgpr4 ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GISEL-NEXT: ; %bb.2: ; %itofp-if-else -; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff98, v11 +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff98, v5 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: ; implicit-def: $vgpr0 -; GISEL-NEXT: ; implicit-def: $vgpr11 -; GISEL-NEXT: ; implicit-def: $vgpr6 -; GISEL-NEXT: ; %bb.3: ; %Flow6 +; GISEL-NEXT: ; implicit-def: $vgpr5 +; GISEL-NEXT: ; implicit-def: $vgpr2 +; GISEL-NEXT: ; %bb.3: ; %Flow3 ; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB0_15 +; GISEL-NEXT: 
s_cbranch_execz .LBB0_13 ; GISEL-NEXT: ; %bb.4: ; %NodeBlock -; GISEL-NEXT: v_sub_u32_e32 v10, 0x80, v11 -; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v10 -; GISEL-NEXT: s_mov_b64 s[10:11], 0 -; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v8 +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB0_8 +; GISEL-NEXT: ; %bb.5: ; %LeafBlock +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v8 ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[12:13] -; GISEL-NEXT: ; %bb.5: ; %LeafBlock1 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v10 -; GISEL-NEXT: s_andn2_b64 s[4:5], 0, exec -; GISEL-NEXT: s_and_b64 s[14:15], exec, vcc -; GISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] -; GISEL-NEXT: ; %bb.6: ; %Flow3 -; GISEL-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] -; GISEL-NEXT: ; %bb.7: ; %LeafBlock -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 25, v10 -; GISEL-NEXT: s_andn2_b64 s[10:11], 0, exec -; GISEL-NEXT: s_and_b64 s[14:15], exec, -1 -; GISEL-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GISEL-NEXT: s_andn2_b64 s[4:5], s[4:5], exec -; GISEL-NEXT: s_and_b64 s[14:15], exec, vcc -; GISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] -; GISEL-NEXT: ; %bb.8: ; %Flow4 -; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] -; GISEL-NEXT: v_mov_b32_e32 v5, v3 -; GISEL-NEXT: v_mov_b32_e32 v4, v2 -; GISEL-NEXT: v_mov_b32_e32 v3, v1 -; GISEL-NEXT: v_mov_b32_e32 v2, v0 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] -; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[12:13] -; GISEL-NEXT: s_cbranch_execz .LBB0_10 -; GISEL-NEXT: ; %bb.9: ; %itofp-sw-default -; GISEL-NEXT: v_sub_u32_e32 v12, 0x66, v11 -; GISEL-NEXT: v_sub_u32_e32 v4, 64, v12 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v12, v[0:1] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7] -; GISEL-NEXT: v_subrev_u32_e32 v13, 64, v12 -; GISEL-NEXT: v_or_b32_e32 v4, v2, v4 -; GISEL-NEXT: v_or_b32_e32 v5, v3, v5 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], 
v13, v[6:7] -; GISEL-NEXT: v_add_u32_e32 v13, 26, v11 -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 -; GISEL-NEXT: v_sub_u32_e32 v11, 64, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 -; GISEL-NEXT: v_lshrrev_b64 v[4:5], v13, -1 +; GISEL-NEXT: s_cbranch_execz .LBB0_7 +; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default +; GISEL-NEXT: v_sub_u32_e32 v4, 0x66, v5 +; GISEL-NEXT: v_sub_u32_e32 v11, 64, v4 +; GISEL-NEXT: v_lshrrev_b64 v[9:10], v4, v[0:1] +; GISEL-NEXT: v_lshlrev_b64 v[11:12], v11, v[2:3] +; GISEL-NEXT: v_subrev_u32_e32 v13, 64, v4 +; GISEL-NEXT: v_or_b32_e32 v11, v9, v11 +; GISEL-NEXT: v_or_b32_e32 v12, v10, v12 +; GISEL-NEXT: v_lshrrev_b64 v[9:10], v13, v[2:3] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 +; GISEL-NEXT: v_add_u32_e32 v5, 26, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_sub_u32_e32 v11, 64, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v9, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v10, v1, vcc +; GISEL-NEXT: v_lshrrev_b64 v[9:10], v5, -1 ; GISEL-NEXT: v_lshlrev_b64 v[11:12], v11, -1 -; GISEL-NEXT: v_subrev_u32_e32 v14, 64, v13 -; GISEL-NEXT: v_or_b32_e32 v15, v4, v11 -; GISEL-NEXT: v_or_b32_e32 v16, v5, v12 +; GISEL-NEXT: v_subrev_u32_e32 v14, 64, v5 +; GISEL-NEXT: v_or_b32_e32 v15, v9, v11 +; GISEL-NEXT: v_or_b32_e32 v16, v10, v12 ; GISEL-NEXT: v_lshrrev_b64 v[11:12], v14, -1 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v13 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e64 
v11, v11, -1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v12, v12, -1, s[4:5] -; GISEL-NEXT: v_and_b32_e32 v4, v4, v6 -; GISEL-NEXT: v_and_b32_e32 v5, v5, v7 -; GISEL-NEXT: v_and_or_b32 v4, v11, v0, v4 -; GISEL-NEXT: v_and_or_b32 v5, v12, v1, v5 -; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GISEL-NEXT: s_andn2_b64 s[4:5], s[10:11], exec -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: s_and_b64 s[10:11], exec, 0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v4 -; GISEL-NEXT: s_or_b64 s[10:11], s[4:5], s[10:11] -; GISEL-NEXT: .LBB0_10: ; %Flow5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, v11, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v11, v12, -1, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v2, v9, v2 +; GISEL-NEXT: v_and_b32_e32 v3, v10, v3 +; GISEL-NEXT: v_and_or_b32 v0, v5, v0, v2 +; GISEL-NEXT: v_and_or_b32 v1, v11, v1, v3 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v3, v13, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GISEL-NEXT: v_mov_b32_e32 v2, v5 +; GISEL-NEXT: v_mov_b32_e32 v3, v6 +; GISEL-NEXT: .LBB0_7: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], s[10:11] -; GISEL-NEXT: ; %bb.11: ; %itofp-sw-bb -; GISEL-NEXT: v_lshlrev_b64 v[2:3], 1, v[0:1] -; GISEL-NEXT: ; %bb.12: ; %itofp-sw-epilog +; GISEL-NEXT: .LBB0_8: ; %Flow2 +; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb +; GISEL-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: v_bfe_u32 v0, v2, 2, 1 -; GISEL-NEXT: v_or_b32_e32 v0, v2, v0 +; GISEL-NEXT: v_bfe_u32 v2, v0, 2, 1 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 -; GISEL-NEXT: 
v_addc_co_u32_e32 v1, vcc, 0, v3, vcc -; GISEL-NEXT: v_lshrrev_b64 v[2:3], 2, v[0:1] -; GISEL-NEXT: v_and_b32_e32 v3, 0x4000000, v0 -; GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[3:4] +; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_and_b32_e32 v2, 0x4000000, v0 +; GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: ; %bb.13: ; %itofp-if-then20 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], 3, v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v9, v10 -; GISEL-NEXT: ; %bb.14: ; %Flow +; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v7, v8 +; GISEL-NEXT: ; %bb.12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: .LBB0_15: ; %Flow7 +; GISEL-NEXT: .LBB0_13: ; %Flow4 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] -; GISEL-NEXT: v_and_b32_e32 v0, 0x80000000, v8 -; GISEL-NEXT: v_lshl_add_u32 v1, v9, 23, 1.0 -; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v2 +; GISEL-NEXT: v_and_b32_e32 v0, 0x80000000, v6 +; GISEL-NEXT: v_lshl_add_u32 v1, v7, 23, 1.0 +; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v4 ; GISEL-NEXT: v_or3_b32 v4, v2, v0, v1 -; GISEL-NEXT: .LBB0_16: ; %Flow8 +; GISEL-NEXT: .LBB0_14: ; %Flow5 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: v_mov_b32_e32 v0, v4 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -305,7 +275,7 @@ define float @uitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; SDAG-NEXT: v_mov_b32_e32 v4, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc -; SDAG-NEXT: s_cbranch_execz .LBB1_16 +; SDAG-NEXT: s_cbranch_execz .LBB1_14 ; SDAG-NEXT: ; %bb.1: ; %itofp-if-end ; SDAG-NEXT: v_ffbh_u32_e32 v4, v2 ; SDAG-NEXT: v_add_u32_e32 v4, 32, v4 @@ -317,113 +287,99 @@ define float @uitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: v_min_u32_e32 v5, v5, v6 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; SDAG-NEXT: 
v_add_u32_e32 v5, 64, v5 -; SDAG-NEXT: v_cndmask_b32_e32 v8, v5, v4, vcc -; SDAG-NEXT: v_sub_u32_e32 v6, 0x7f, v8 -; SDAG-NEXT: v_mov_b32_e32 v5, v1 -; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v6 -; SDAG-NEXT: v_mov_b32_e32 v4, v0 -; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; SDAG-NEXT: v_sub_u32_e32 v5, 0x80, v6 +; SDAG-NEXT: v_sub_u32_e32 v4, 0x7f, v6 +; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v5 +; SDAG-NEXT: ; implicit-def: $vgpr7 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SDAG-NEXT: ; %bb.2: ; %itofp-if-else -; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff98, v8 +; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff98, v6 ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 -; SDAG-NEXT: v_cndmask_b32_e32 v9, 0, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v0, vcc +; SDAG-NEXT: ; implicit-def: $vgpr5 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SDAG-NEXT: ; implicit-def: $vgpr8 +; SDAG-NEXT: ; implicit-def: $vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SDAG-NEXT: ; %bb.3: ; %Flow6 +; SDAG-NEXT: ; %bb.3: ; %Flow3 ; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB1_15 +; SDAG-NEXT: s_cbranch_execz .LBB1_13 ; SDAG-NEXT: ; %bb.4: ; %NodeBlock -; SDAG-NEXT: v_sub_u32_e32 v7, 0x80, v8 -; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v7 -; SDAG-NEXT: s_mov_b64 s[10:11], 0 -; SDAG-NEXT: s_mov_b64 s[4:5], 0 +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v5 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB1_8 +; SDAG-NEXT: ; %bb.5: ; %LeafBlock +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v5 ; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc -; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[12:13] -; SDAG-NEXT: ; %bb.5: ; %LeafBlock1 -; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v7 -; SDAG-NEXT: s_and_b64 s[4:5], vcc, exec -; SDAG-NEXT: ; %bb.6: ; %Flow3 -; 
SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] -; SDAG-NEXT: ; %bb.7: ; %LeafBlock -; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 25, v7 -; SDAG-NEXT: s_andn2_b64 s[4:5], s[4:5], exec -; SDAG-NEXT: s_and_b64 s[14:15], vcc, exec -; SDAG-NEXT: s_mov_b64 s[10:11], exec -; SDAG-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] -; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SDAG-NEXT: ; %bb.8: ; %Flow4 -; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] -; SDAG-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] -; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[12:13] -; SDAG-NEXT: s_cbranch_execz .LBB1_10 -; SDAG-NEXT: ; %bb.9: ; %itofp-sw-default -; SDAG-NEXT: v_sub_u32_e32 v11, 0x66, v8 +; SDAG-NEXT: s_cbranch_execz .LBB1_7 +; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default +; SDAG-NEXT: v_sub_u32_e32 v11, 0x66, v6 ; SDAG-NEXT: v_sub_u32_e32 v9, 64, v11 -; SDAG-NEXT: v_lshrrev_b64 v[4:5], v11, v[0:1] +; SDAG-NEXT: v_lshrrev_b64 v[7:8], v11, v[0:1] ; SDAG-NEXT: v_lshlrev_b64 v[9:10], v9, v[2:3] -; SDAG-NEXT: v_sub_u32_e32 v12, 38, v8 -; SDAG-NEXT: v_or_b32_e32 v10, v5, v10 -; SDAG-NEXT: v_or_b32_e32 v9, v4, v9 -; SDAG-NEXT: v_lshrrev_b64 v[4:5], v12, v[2:3] +; SDAG-NEXT: v_sub_u32_e32 v12, 38, v6 +; SDAG-NEXT: v_or_b32_e32 v10, v8, v10 +; SDAG-NEXT: v_or_b32_e32 v9, v7, v9 +; SDAG-NEXT: v_lshrrev_b64 v[7:8], v12, v[2:3] ; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v11 -; SDAG-NEXT: v_add_u32_e32 v13, 26, v8 -; SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc +; SDAG-NEXT: v_add_u32_e32 v13, 26, v6 +; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v11 -; SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; SDAG-NEXT: v_lshrrev_b64 v[9:10], v12, v[0:1] ; SDAG-NEXT: v_lshlrev_b64 v[11:12], v13, v[2:3] -; SDAG-NEXT: v_subrev_u32_e32 v8, 38, v8 -; SDAG-NEXT: v_cndmask_b32_e64 v14, v4, v0, s[4:5] -; SDAG-NEXT: v_or_b32_e32 v4, v12, v10 -; SDAG-NEXT: v_or_b32_e32 v10, v11, v9 -; SDAG-NEXT: v_lshlrev_b64 v[8:9], v8, v[0:1] +; SDAG-NEXT: v_subrev_u32_e32 
v6, 38, v6 +; SDAG-NEXT: v_cndmask_b32_e64 v14, v7, v0, s[4:5] +; SDAG-NEXT: v_lshlrev_b64 v[6:7], v6, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v10, v12, v10 +; SDAG-NEXT: v_or_b32_e32 v9, v11, v9 ; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v13 -; SDAG-NEXT: v_cndmask_b32_e64 v5, v5, v1, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v13, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13 -; SDAG-NEXT: v_cndmask_b32_e64 v9, v4, v3, s[4:5] -; SDAG-NEXT: v_lshlrev_b64 v[3:4], v13, v[0:1] -; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v3, vcc -; SDAG-NEXT: v_or_b32_e32 v3, v4, v9 -; SDAG-NEXT: v_or_b32_e32 v2, v8, v2 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: s_andn2_b64 s[10:11], s[10:11], exec -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v4, v14, v2 -; SDAG-NEXT: .LBB1_10: ; %Flow5 +; SDAG-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SDAG-NEXT: v_or_b32_e32 v1, v1, v3 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v7, v14, v0 +; SDAG-NEXT: v_mov_b32_e32 v0, v7 +; SDAG-NEXT: v_mov_b32_e32 v1, v8 +; SDAG-NEXT: .LBB1_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], s[10:11] -; SDAG-NEXT: ; %bb.11: ; %itofp-sw-bb -; SDAG-NEXT: v_lshlrev_b64 v[4:5], 1, v[0:1] -; SDAG-NEXT: ; %bb.12: ; %itofp-sw-epilog +; SDAG-NEXT: .LBB1_8: ; %Flow2 +; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: ; %bb.9: ; 
%itofp-sw-bb +; SDAG-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: v_lshrrev_b32_e32 v0, 2, v4 -; SDAG-NEXT: v_and_or_b32 v0, v0, 1, v4 +; SDAG-NEXT: v_lshrrev_b32_e32 v2, 2, v0 +; SDAG-NEXT: v_and_or_b32 v0, v2, 1, v0 ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 -; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; SDAG-NEXT: v_and_b32_e32 v2, 0x4000000, v0 ; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SDAG-NEXT: v_alignbit_b32 v9, v1, v0, 2 +; SDAG-NEXT: v_alignbit_b32 v7, v1, v0, 2 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: ; %bb.13: ; %itofp-if-then20 -; SDAG-NEXT: v_alignbit_b32 v9, v1, v0, 3 -; SDAG-NEXT: v_mov_b32_e32 v6, v7 -; SDAG-NEXT: ; %bb.14: ; %Flow +; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 +; SDAG-NEXT: v_alignbit_b32 v7, v1, v0, 3 +; SDAG-NEXT: v_mov_b32_e32 v4, v5 +; SDAG-NEXT: ; %bb.12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB1_15: ; %Flow7 +; SDAG-NEXT: .LBB1_13: ; %Flow4 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v9 -; SDAG-NEXT: v_lshl_or_b32 v0, v6, 23, v0 +; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v7 +; SDAG-NEXT: v_lshl_or_b32 v0, v4, 23, v0 ; SDAG-NEXT: v_add_u32_e32 v4, 1.0, v0 -; SDAG-NEXT: .LBB1_16: ; %Flow8 +; SDAG-NEXT: .LBB1_14: ; %Flow5 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_mov_b32_e32 v0, v4 ; SDAG-NEXT: s_setpc_b64 s[30:31] @@ -431,144 +387,124 @@ define float @uitofp_i128_to_f32(i128 %x) { ; GISEL-LABEL: uitofp_i128_to_f32: ; GISEL: ; %bb.0: ; %itofp-entry ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v4, v2 -; GISEL-NEXT: v_mov_b32_e32 v5, v3 -; GISEL-NEXT: v_or_b32_e32 v2, v0, v4 -; GISEL-NEXT: v_or_b32_e32 v3, v1, v5 -; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_or_b32_e32 v4, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v5, v1, v3 +; GISEL-NEXT: 
v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GISEL-NEXT: s_mov_b32 s4, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_cbranch_execz .LBB1_16 +; GISEL-NEXT: s_cbranch_execz .LBB1_14 ; GISEL-NEXT: ; %bb.1: ; %itofp-if-end -; GISEL-NEXT: v_ffbh_u32_e32 v3, v0 -; GISEL-NEXT: v_ffbh_u32_e32 v2, v1 -; GISEL-NEXT: v_add_u32_e32 v3, 32, v3 -; GISEL-NEXT: v_ffbh_u32_e32 v6, v4 -; GISEL-NEXT: v_min_u32_e32 v2, v2, v3 -; GISEL-NEXT: v_ffbh_u32_e32 v3, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v5, v0 +; GISEL-NEXT: v_ffbh_u32_e32 v4, v1 +; GISEL-NEXT: v_add_u32_e32 v5, 32, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v6, v2 +; GISEL-NEXT: v_min_u32_e32 v4, v4, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v5, v3 ; GISEL-NEXT: v_add_u32_e32 v6, 32, v6 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] -; GISEL-NEXT: v_add_u32_e32 v2, 64, v2 -; GISEL-NEXT: v_min_u32_e32 v3, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v3, v2, vcc -; GISEL-NEXT: v_sub_u32_e32 v10, 0x7f, v12 -; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v10 -; GISEL-NEXT: ; implicit-def: $vgpr2 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_add_u32_e32 v4, 64, v4 +; GISEL-NEXT: v_min_u32_e32 v5, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc +; GISEL-NEXT: v_sub_u32_e32 v7, 0x80, v5 +; GISEL-NEXT: v_sub_u32_e32 v6, 0x7f, v5 +; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v7 +; GISEL-NEXT: ; implicit-def: $vgpr4 ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GISEL-NEXT: ; %bb.2: ; %itofp-if-else -; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff98, v12 +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff98, v5 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GISEL-NEXT: ; implicit-def: $vgpr7 ; GISEL-NEXT: ; implicit-def: $vgpr0 -; GISEL-NEXT: ; implicit-def: $vgpr12 -; GISEL-NEXT: ; implicit-def: 
$vgpr4 -; GISEL-NEXT: ; %bb.3: ; %Flow6 +; GISEL-NEXT: ; implicit-def: $vgpr5 +; GISEL-NEXT: ; implicit-def: $vgpr2 +; GISEL-NEXT: ; %bb.3: ; %Flow3 ; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB1_15 +; GISEL-NEXT: s_cbranch_execz .LBB1_13 ; GISEL-NEXT: ; %bb.4: ; %NodeBlock -; GISEL-NEXT: v_sub_u32_e32 v11, 0x80, v12 -; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v11 -; GISEL-NEXT: s_mov_b64 s[10:11], 0 -; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v7 +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB1_8 +; GISEL-NEXT: ; %bb.5: ; %LeafBlock +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v7 ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[12:13] -; GISEL-NEXT: ; %bb.5: ; %LeafBlock1 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v11 -; GISEL-NEXT: s_andn2_b64 s[4:5], 0, exec -; GISEL-NEXT: s_and_b64 s[14:15], exec, vcc -; GISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] -; GISEL-NEXT: ; %bb.6: ; %Flow3 -; GISEL-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] -; GISEL-NEXT: ; %bb.7: ; %LeafBlock -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 25, v11 -; GISEL-NEXT: s_andn2_b64 s[10:11], 0, exec -; GISEL-NEXT: s_and_b64 s[14:15], exec, -1 -; GISEL-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GISEL-NEXT: s_andn2_b64 s[4:5], s[4:5], exec -; GISEL-NEXT: s_and_b64 s[14:15], exec, vcc -; GISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] -; GISEL-NEXT: ; %bb.8: ; %Flow4 -; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] -; GISEL-NEXT: v_mov_b32_e32 v9, v3 -; GISEL-NEXT: v_mov_b32_e32 v7, v1 -; GISEL-NEXT: v_mov_b32_e32 v6, v0 -; GISEL-NEXT: v_mov_b32_e32 v8, v2 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] -; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[12:13] -; GISEL-NEXT: s_cbranch_execz .LBB1_10 -; GISEL-NEXT: ; %bb.9: ; %itofp-sw-default -; GISEL-NEXT: v_sub_u32_e32 v8, 0x66, v12 -; GISEL-NEXT: v_sub_u32_e32 v6, 64, v8 -; 
GISEL-NEXT: v_lshrrev_b64 v[2:3], v8, v[0:1] -; GISEL-NEXT: v_lshlrev_b64 v[6:7], v6, v[4:5] -; GISEL-NEXT: v_subrev_u32_e32 v9, 64, v8 -; GISEL-NEXT: v_or_b32_e32 v6, v2, v6 -; GISEL-NEXT: v_or_b32_e32 v7, v3, v7 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v9, v[4:5] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8 -; GISEL-NEXT: v_add_u32_e32 v12, 26, v12 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_sub_u32_e32 v8, 64, v12 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v12, -1 -; GISEL-NEXT: v_lshlrev_b64 v[8:9], v8, -1 -; GISEL-NEXT: v_subrev_u32_e32 v13, 64, v12 -; GISEL-NEXT: v_or_b32_e32 v14, v2, v8 -; GISEL-NEXT: v_or_b32_e32 v15, v3, v9 -; GISEL-NEXT: v_lshrrev_b64 v[8:9], v13, -1 -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v8, v14, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, -1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, -1, s[4:5] -; GISEL-NEXT: v_and_b32_e32 v2, v2, v4 -; GISEL-NEXT: v_and_b32_e32 v3, v3, v5 -; GISEL-NEXT: v_and_or_b32 v2, v8, v0, v2 -; GISEL-NEXT: v_and_or_b32 v3, v9, v1, v3 -; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GISEL-NEXT: s_andn2_b64 s[4:5], s[10:11], exec -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: s_and_b64 s[10:11], exec, 0 -; GISEL-NEXT: v_or_b32_e32 v6, v6, v2 -; GISEL-NEXT: s_or_b64 s[10:11], s[4:5], s[10:11] -; GISEL-NEXT: .LBB1_10: ; %Flow5 +; GISEL-NEXT: s_cbranch_execz .LBB1_7 +; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default +; GISEL-NEXT: v_sub_u32_e32 v4, 0x66, v5 +; GISEL-NEXT: v_sub_u32_e32 v10, 64, v4 +; GISEL-NEXT: v_lshrrev_b64 v[8:9], v4, v[0:1] +; GISEL-NEXT: v_lshlrev_b64 
v[10:11], v10, v[2:3] +; GISEL-NEXT: v_subrev_u32_e32 v12, 64, v4 +; GISEL-NEXT: v_or_b32_e32 v10, v8, v10 +; GISEL-NEXT: v_or_b32_e32 v11, v9, v11 +; GISEL-NEXT: v_lshrrev_b64 v[8:9], v12, v[2:3] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 +; GISEL-NEXT: v_add_u32_e32 v5, 26, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_sub_u32_e32 v10, 64, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v8, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v1, vcc +; GISEL-NEXT: v_lshrrev_b64 v[8:9], v5, -1 +; GISEL-NEXT: v_lshlrev_b64 v[10:11], v10, -1 +; GISEL-NEXT: v_subrev_u32_e32 v13, 64, v5 +; GISEL-NEXT: v_or_b32_e32 v14, v8, v10 +; GISEL-NEXT: v_or_b32_e32 v15, v9, v11 +; GISEL-NEXT: v_lshrrev_b64 v[10:11], v13, -1 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, v10, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v10, v11, -1, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v2, v8, v2 +; GISEL-NEXT: v_and_b32_e32 v3, v9, v3 +; GISEL-NEXT: v_and_or_b32 v0, v5, v0, v2 +; GISEL-NEXT: v_and_or_b32 v1, v10, v1, v3 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v3, v12, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GISEL-NEXT: v_mov_b32_e32 v2, v5 +; GISEL-NEXT: v_mov_b32_e32 v3, v6 +; GISEL-NEXT: .LBB1_7: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], s[10:11] -; GISEL-NEXT: ; %bb.11: ; %itofp-sw-bb -; GISEL-NEXT: v_lshlrev_b64 v[6:7], 1, v[0:1] -; GISEL-NEXT: ; %bb.12: ; %itofp-sw-epilog +; GISEL-NEXT: .LBB1_8: ; %Flow2 +; GISEL-NEXT: s_andn2_saveexec_b64 
s[4:5], s[10:11] +; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb +; GISEL-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: v_bfe_u32 v0, v6, 2, 1 -; GISEL-NEXT: v_or_b32_e32 v0, v6, v0 +; GISEL-NEXT: v_bfe_u32 v2, v0, 2, 1 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 -; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc -; GISEL-NEXT: v_lshrrev_b64 v[2:3], 2, v[0:1] -; GISEL-NEXT: v_and_b32_e32 v3, 0x4000000, v0 -; GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[3:4] +; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_and_b32_e32 v2, 0x4000000, v0 +; GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: ; %bb.13: ; %itofp-if-then20 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], 3, v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v10, v11 -; GISEL-NEXT: ; %bb.14: ; %Flow +; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v6, v7 +; GISEL-NEXT: ; %bb.12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: .LBB1_15: ; %Flow7 +; GISEL-NEXT: .LBB1_13: ; %Flow4 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] -; GISEL-NEXT: v_lshl_add_u32 v0, v10, 23, 1.0 +; GISEL-NEXT: v_lshl_add_u32 v0, v6, 23, 1.0 ; GISEL-NEXT: v_mov_b32_e32 v1, 0x7fffff -; GISEL-NEXT: v_and_or_b32 v2, v2, v1, v0 -; GISEL-NEXT: .LBB1_16: ; %Flow8 +; GISEL-NEXT: v_and_or_b32 v4, v4, v1, v0 +; GISEL-NEXT: .LBB1_14: ; %Flow5 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] -; GISEL-NEXT: v_mov_b32_e32 v0, v2 +; GISEL-NEXT: v_mov_b32_e32 v0, v4 ; GISEL-NEXT: s_setpc_b64 s[30:31] %cvt = uitofp i128 %x to float ret float %cvt @@ -586,7 +522,7 @@ define double @sitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-NEXT: 
s_and_saveexec_b64 s[6:7], vcc -; SDAG-NEXT: s_cbranch_execz .LBB2_16 +; SDAG-NEXT: s_cbranch_execz .LBB2_14 ; SDAG-NEXT: ; %bb.1: ; %itofp-if-end ; SDAG-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; SDAG-NEXT: v_xor_b32_e32 v4, v0, v4 @@ -607,128 +543,116 @@ define double @sitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: v_min_u32_e32 v1, v1, v2 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] ; SDAG-NEXT: v_add_u32_e32 v1, 64, v1 -; SDAG-NEXT: v_cndmask_b32_e32 v11, v1, v0, vcc -; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v11 -; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 54, v2 -; SDAG-NEXT: ; implicit-def: $vgpr8 +; SDAG-NEXT: v_cndmask_b32_e32 v9, v1, v0, vcc +; SDAG-NEXT: v_sub_u32_e32 v8, 0x80, v9 +; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v9 +; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 54, v8 +; SDAG-NEXT: ; implicit-def: $vgpr10 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SDAG-NEXT: ; %bb.2: ; %itofp-if-else -; SDAG-NEXT: v_add_u32_e32 v6, 0xffffffb5, v11 +; SDAG-NEXT: v_add_u32_e32 v6, 0xffffffb5, v9 ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] ; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 -; SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v1, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SDAG-NEXT: ; implicit-def: $vgpr11 -; SDAG-NEXT: ; %bb.3: ; %Flow6 +; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: ; %bb.3: ; %Flow3 ; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB2_15 +; SDAG-NEXT: s_cbranch_execz .LBB2_13 ; SDAG-NEXT: ; %bb.4: ; %NodeBlock -; SDAG-NEXT: v_sub_u32_e32 v10, 0x80, v11 -; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 54, v10 -; SDAG-NEXT: s_mov_b64 s[10:11], 0 -; SDAG-NEXT: s_mov_b64 s[4:5], 0 +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 54, v8 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[10:11], exec, 
s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB2_8 +; SDAG-NEXT: ; %bb.5: ; %LeafBlock +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 55, v8 ; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc -; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[12:13] -; SDAG-NEXT: ; %bb.5: ; %LeafBlock1 -; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 55, v10 -; SDAG-NEXT: s_and_b64 s[4:5], vcc, exec -; SDAG-NEXT: ; %bb.6: ; %Flow3 -; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] -; SDAG-NEXT: ; %bb.7: ; %LeafBlock -; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 54, v10 -; SDAG-NEXT: s_andn2_b64 s[4:5], s[4:5], exec -; SDAG-NEXT: s_and_b64 s[14:15], vcc, exec -; SDAG-NEXT: s_mov_b64 s[10:11], exec -; SDAG-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] -; SDAG-NEXT: ; %bb.8: ; %Flow4 -; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] -; SDAG-NEXT: v_mov_b32_e32 v0, v4 -; SDAG-NEXT: v_mov_b32_e32 v9, v7 -; SDAG-NEXT: v_mov_b32_e32 v1, v5 -; SDAG-NEXT: v_mov_b32_e32 v8, v6 -; SDAG-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] -; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[12:13] -; SDAG-NEXT: s_cbranch_execz .LBB2_10 -; SDAG-NEXT: ; %bb.9: ; %itofp-sw-default -; SDAG-NEXT: v_sub_u32_e32 v12, 0x49, v11 -; SDAG-NEXT: v_sub_u32_e32 v8, 64, v12 +; SDAG-NEXT: s_cbranch_execz .LBB2_7 +; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default +; SDAG-NEXT: v_sub_u32_e32 v12, 0x49, v9 +; SDAG-NEXT: v_sub_u32_e32 v10, 64, v12 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v12, v[4:5] -; SDAG-NEXT: v_lshlrev_b64 v[8:9], v8, v[6:7] -; SDAG-NEXT: v_sub_u32_e32 v13, 9, v11 -; SDAG-NEXT: v_or_b32_e32 v9, v1, v9 -; SDAG-NEXT: v_or_b32_e32 v8, v0, v8 +; SDAG-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] +; SDAG-NEXT: v_sub_u32_e32 v13, 9, v9 +; SDAG-NEXT: v_or_b32_e32 v11, v1, v11 +; SDAG-NEXT: v_or_b32_e32 v10, v0, v10 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v13, v[6:7] ; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 -; SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; SDAG-NEXT: v_lshrrev_b64 v[8:9], v12, v[6:7] +; SDAG-NEXT: v_add_u32_e32 v16, 55, v9 +; SDAG-NEXT: 
v_cndmask_b32_e32 v1, v1, v11, vcc ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 -; SDAG-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc -; SDAG-NEXT: v_add_u32_e32 v9, 55, v11 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc +; SDAG-NEXT: v_lshrrev_b64 v[10:11], v12, v[6:7] ; SDAG-NEXT: v_lshrrev_b64 v[12:13], v13, v[4:5] -; SDAG-NEXT: v_lshlrev_b64 v[14:15], v9, v[6:7] -; SDAG-NEXT: v_add_u32_e32 v11, -9, v11 +; SDAG-NEXT: v_lshlrev_b64 v[14:15], v16, v[6:7] +; SDAG-NEXT: v_add_u32_e32 v9, -9, v9 +; SDAG-NEXT: v_or_b32_e32 v15, v15, v13 ; SDAG-NEXT: v_or_b32_e32 v14, v14, v12 -; SDAG-NEXT: v_lshlrev_b64 v[11:12], v11, v[4:5] -; SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc -; SDAG-NEXT: v_or_b32_e32 v13, v15, v13 -; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v9 +; SDAG-NEXT: v_lshlrev_b64 v[12:13], v9, v[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 ; SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9 -; SDAG-NEXT: v_cndmask_b32_e64 v13, v12, v7, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e32 v14, v11, v14, vcc -; SDAG-NEXT: v_lshlrev_b64 v[11:12], v9, v[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v9, v14, v6, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc -; SDAG-NEXT: v_or_b32_e32 v12, v12, v13 -; SDAG-NEXT: v_or_b32_e32 v11, v11, v9 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[11:12] -; SDAG-NEXT: s_andn2_b64 s[10:11], s[10:11], exec -; SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v0, v0, v9 -; SDAG-NEXT: .LBB2_10: ; %Flow5 +; SDAG-NEXT: v_cndmask_b32_e32 v9, v13, v15, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; SDAG-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v9, v12, v14, vcc +; 
SDAG-NEXT: v_cndmask_b32_e64 v6, v9, v6, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_or_b32_e32 v5, v5, v7 +; SDAG-NEXT: v_or_b32_e32 v4, v4, v6 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v6, v10 +; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v0, v0, v4 +; SDAG-NEXT: v_mov_b32_e32 v5, v1 +; SDAG-NEXT: v_mov_b32_e32 v4, v0 +; SDAG-NEXT: v_mov_b32_e32 v7, v11 +; SDAG-NEXT: .LBB2_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], s[10:11] -; SDAG-NEXT: ; %bb.11: ; %itofp-sw-bb -; SDAG-NEXT: v_lshlrev_b64 v[8:9], 1, v[6:7] -; SDAG-NEXT: v_lshrrev_b32_e32 v6, 31, v5 -; SDAG-NEXT: v_lshlrev_b64 v[0:1], 1, v[4:5] -; SDAG-NEXT: v_or_b32_e32 v8, v8, v6 -; SDAG-NEXT: ; %bb.12: ; %itofp-sw-epilog +; SDAG-NEXT: .LBB2_8: ; %Flow2 +; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: ; %bb.9: ; %itofp-sw-bb +; SDAG-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] +; SDAG-NEXT: v_lshrrev_b32_e32 v0, 31, v5 +; SDAG-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] +; SDAG-NEXT: v_or_b32_e32 v6, v6, v0 +; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: v_lshrrev_b32_e32 v4, 2, v0 -; SDAG-NEXT: v_and_or_b32 v0, v4, 1, v0 +; SDAG-NEXT: v_lshrrev_b32_e32 v0, 2, v4 +; SDAG-NEXT: v_and_or_b32 v0, v0, 1, v4 ; SDAG-NEXT: v_add_co_u32_e32 v4, vcc, 1, v0 -; SDAG-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; SDAG-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; SDAG-NEXT: v_lshrrev_b64 v[0:1], 2, v[4:5] ; SDAG-NEXT: v_lshlrev_b32_e32 v7, 30, v6 -; SDAG-NEXT: v_or_b32_e32 v8, v1, v7 +; SDAG-NEXT: v_or_b32_e32 v10, v1, v7 ; SDAG-NEXT: v_and_b32_e32 v1, 0x800000, v5 ; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: ; %bb.13: ; 
%itofp-if-then20 +; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], 3, v[4:5] ; SDAG-NEXT: v_lshlrev_b32_e32 v2, 29, v6 -; SDAG-NEXT: v_or_b32_e32 v8, v1, v2 -; SDAG-NEXT: v_mov_b32_e32 v2, v10 -; SDAG-NEXT: ; %bb.14: ; %Flow +; SDAG-NEXT: v_or_b32_e32 v10, v1, v2 +; SDAG-NEXT: v_mov_b32_e32 v2, v8 +; SDAG-NEXT: ; %bb.12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB2_15: ; %Flow7 +; SDAG-NEXT: .LBB2_13: ; %Flow4 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: v_and_b32_e32 v1, 0x80000000, v3 ; SDAG-NEXT: v_mov_b32_e32 v3, 0x3ff00000 ; SDAG-NEXT: v_lshl_add_u32 v2, v2, 20, v3 -; SDAG-NEXT: v_and_b32_e32 v3, 0xfffff, v8 +; SDAG-NEXT: v_and_b32_e32 v3, 0xfffff, v10 ; SDAG-NEXT: v_or3_b32 v1, v3, v1, v2 -; SDAG-NEXT: .LBB2_16: ; %Flow8 +; SDAG-NEXT: .LBB2_14: ; %Flow5 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -744,156 +668,142 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_cbranch_execz .LBB2_16 +; GISEL-NEXT: s_cbranch_execz .LBB2_14 ; GISEL-NEXT: ; %bb.1: ; %itofp-if-end -; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v3 -; GISEL-NEXT: v_xor_b32_e32 v0, v8, v4 -; GISEL-NEXT: v_xor_b32_e32 v1, v8, v5 -; GISEL-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v8 -; GISEL-NEXT: v_xor_b32_e32 v2, v8, v2 -; GISEL-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_xor_b32_e32 v3, v8, v3 -; GISEL-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v8, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v3 +; GISEL-NEXT: v_xor_b32_e32 v0, v6, v4 +; GISEL-NEXT: v_xor_b32_e32 v1, v6, v5 +; GISEL-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v6 +; GISEL-NEXT: v_xor_b32_e32 v2, v6, v2 +; GISEL-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc +; GISEL-NEXT: v_xor_b32_e32 v3, v6, v3 +; GISEL-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc ; GISEL-NEXT: v_ffbh_u32_e32 v5, v0 -; GISEL-NEXT: v_subb_co_u32_e32 
v3, vcc, v3, v8, vcc +; GISEL-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc ; GISEL-NEXT: v_ffbh_u32_e32 v4, v1 ; GISEL-NEXT: v_add_u32_e32 v5, 32, v5 -; GISEL-NEXT: v_ffbh_u32_e32 v6, v2 +; GISEL-NEXT: v_ffbh_u32_e32 v7, v2 ; GISEL-NEXT: v_min_u32_e32 v4, v4, v5 ; GISEL-NEXT: v_ffbh_u32_e32 v5, v3 -; GISEL-NEXT: v_add_u32_e32 v6, 32, v6 +; GISEL-NEXT: v_add_u32_e32 v7, 32, v7 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GISEL-NEXT: v_add_u32_e32 v4, 64, v4 -; GISEL-NEXT: v_min_u32_e32 v5, v5, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v5, v4, vcc -; GISEL-NEXT: v_sub_u32_e32 v9, 0x7f, v11 -; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 53, v9 -; GISEL-NEXT: ; implicit-def: $vgpr6 +; GISEL-NEXT: v_min_u32_e32 v5, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v9, v5, v4, vcc +; GISEL-NEXT: v_sub_u32_e32 v8, 0x80, v9 +; GISEL-NEXT: v_sub_u32_e32 v7, 0x7f, v9 +; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 53, v8 +; GISEL-NEXT: ; implicit-def: $vgpr10 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GISEL-NEXT: ; %bb.2: ; %itofp-if-else -; GISEL-NEXT: v_add_u32_e32 v2, 0xffffffb5, v11 +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffffb5, v9 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v1, vcc +; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: ; implicit-def: $vgpr0 -; GISEL-NEXT: ; implicit-def: $vgpr11 -; GISEL-NEXT: ; %bb.3: ; %Flow6 +; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: ; %bb.3: ; %Flow3 ; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB2_15 +; GISEL-NEXT: s_cbranch_execz .LBB2_13 ; GISEL-NEXT: ; %bb.4: ; %NodeBlock -; GISEL-NEXT: v_sub_u32_e32 v10, 0x80, v11 -; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 55, v10 -; GISEL-NEXT: s_mov_b64 s[10:11], 0 -; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: 
v_cmp_le_i32_e32 vcc, 55, v8 +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB2_8 +; GISEL-NEXT: ; %bb.5: ; %LeafBlock +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 55, v8 ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[12:13] -; GISEL-NEXT: ; %bb.5: ; %LeafBlock1 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 55, v10 -; GISEL-NEXT: s_andn2_b64 s[4:5], 0, exec -; GISEL-NEXT: s_and_b64 s[14:15], exec, vcc -; GISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] -; GISEL-NEXT: ; %bb.6: ; %Flow3 -; GISEL-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] -; GISEL-NEXT: ; %bb.7: ; %LeafBlock -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 54, v10 -; GISEL-NEXT: s_andn2_b64 s[10:11], 0, exec -; GISEL-NEXT: s_and_b64 s[14:15], exec, -1 -; GISEL-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GISEL-NEXT: s_andn2_b64 s[4:5], s[4:5], exec -; GISEL-NEXT: s_and_b64 s[14:15], exec, vcc -; GISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] -; GISEL-NEXT: ; %bb.8: ; %Flow4 -; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] -; GISEL-NEXT: v_mov_b32_e32 v7, v3 -; GISEL-NEXT: v_mov_b32_e32 v6, v2 -; GISEL-NEXT: v_mov_b32_e32 v5, v1 -; GISEL-NEXT: v_mov_b32_e32 v4, v0 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] -; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[12:13] -; GISEL-NEXT: s_cbranch_execz .LBB2_10 -; GISEL-NEXT: ; %bb.9: ; %itofp-sw-default -; GISEL-NEXT: v_sub_u32_e32 v14, 0x49, v11 -; GISEL-NEXT: v_sub_u32_e32 v6, 64, v14 +; GISEL-NEXT: s_cbranch_execz .LBB2_7 +; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default +; GISEL-NEXT: v_sub_u32_e32 v14, 0x49, v9 +; GISEL-NEXT: v_sub_u32_e32 v10, 64, v14 ; GISEL-NEXT: v_lshrrev_b64 v[4:5], v14, v[0:1] -; GISEL-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3] +; GISEL-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] ; GISEL-NEXT: v_subrev_u32_e32 v15, 64, v14 -; GISEL-NEXT: v_or_b32_e32 v6, v4, v6 -; GISEL-NEXT: v_or_b32_e32 v7, v5, v7 +; GISEL-NEXT: v_or_b32_e32 v10, v4, v10 +; GISEL-NEXT: 
v_or_b32_e32 v11, v5, v11 ; GISEL-NEXT: v_lshrrev_b64 v[4:5], v15, v[2:3] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GISEL-NEXT: v_lshrrev_b64 v[12:13], v14, v[2:3] -; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GISEL-NEXT: v_add_u32_e32 v7, 55, v11 -; GISEL-NEXT: v_sub_u32_e32 v13, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 -; GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v12, vcc -; GISEL-NEXT: v_lshrrev_b64 v[11:12], v7, -1 -; GISEL-NEXT: v_lshlrev_b64 v[13:14], v13, -1 -; GISEL-NEXT: v_subrev_u32_e32 v15, 64, v7 -; GISEL-NEXT: v_or_b32_e32 v16, v11, v13 -; GISEL-NEXT: v_or_b32_e32 v17, v12, v14 -; GISEL-NEXT: v_lshrrev_b64 v[13:14], v15, -1 -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v4, v0, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v14, v14, v17, vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v7, v13, -1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v13, v14, -1, s[4:5] -; GISEL-NEXT: v_and_b32_e32 v11, v11, v2 -; GISEL-NEXT: v_and_b32_e32 v12, v12, v3 -; GISEL-NEXT: v_and_or_b32 v11, v7, v0, v11 -; GISEL-NEXT: v_and_or_b32 v12, v13, v1, v12 -; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[11:12] -; GISEL-NEXT: s_andn2_b64 s[4:5], s[10:11], exec -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: s_and_b64 s[10:11], exec, 0 -; GISEL-NEXT: v_or_b32_e32 v4, v4, v7 -; GISEL-NEXT: s_or_b64 s[10:11], s[4:5], s[10:11] -; GISEL-NEXT: .LBB2_10: ; %Flow5 +; GISEL-NEXT: v_add_u32_e32 v14, 55, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; GISEL-NEXT: v_sub_u32_e32 v11, 64, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v13, v4, v0, s[4:5] +; 
GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v12, vcc +; GISEL-NEXT: v_lshrrev_b64 v[9:10], v14, -1 +; GISEL-NEXT: v_lshlrev_b64 v[11:12], v11, -1 +; GISEL-NEXT: v_subrev_u32_e32 v15, 64, v14 +; GISEL-NEXT: v_or_b32_e32 v16, v9, v11 +; GISEL-NEXT: v_or_b32_e32 v17, v10, v12 +; GISEL-NEXT: v_lshrrev_b64 v[11:12], v15, -1 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 +; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v17, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 +; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v12, -1, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v2, v9, v2 +; GISEL-NEXT: v_and_b32_e32 v3, v10, v3 +; GISEL-NEXT: v_and_or_b32 v0, v11, v0, v2 +; GISEL-NEXT: v_and_or_b32 v1, v12, v1, v3 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v3, v13, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GISEL-NEXT: v_mov_b32_e32 v2, v5 +; GISEL-NEXT: v_mov_b32_e32 v3, v6 +; GISEL-NEXT: .LBB2_7: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], s[10:11] -; GISEL-NEXT: ; %bb.11: ; %itofp-sw-bb +; GISEL-NEXT: .LBB2_8: ; %Flow2 +; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb +; GISEL-NEXT: v_lshlrev_b64 v[9:10], 1, v[0:1] ; GISEL-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], 1, v[0:1] ; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v1 -; GISEL-NEXT: v_or_b32_e32 v6, v2, v0 -; GISEL-NEXT: ; %bb.12: ; %itofp-sw-epilog +; GISEL-NEXT: v_or_b32_e32 v11, v2, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v9 +; GISEL-NEXT: v_mov_b32_e32 v1, v10 +; GISEL-NEXT: v_mov_b32_e32 v2, v11 +; GISEL-NEXT: v_mov_b32_e32 v3, v12 +; GISEL-NEXT: ; %bb.10: ; 
%itofp-sw-epilog ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: v_bfe_u32 v0, v4, 2, 1 -; GISEL-NEXT: v_or_b32_e32 v0, v4, v0 +; GISEL-NEXT: v_bfe_u32 v3, v0, 2, 1 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v3 ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 -; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc -; GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v6, vcc +; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v6, 0 -; GISEL-NEXT: v_and_b32_e32 v7, 0x800000, v1 -; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GISEL-NEXT: v_lshl_or_b32 v6, v2, 30, v5 +; GISEL-NEXT: v_mov_b32_e32 v9, 0 +; GISEL-NEXT: v_and_b32_e32 v10, 0x800000, v1 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10] +; GISEL-NEXT: v_lshl_or_b32 v10, v2, 30, v5 ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: ; %bb.13: ; %itofp-if-then20 +; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 ; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v9, v10 -; GISEL-NEXT: v_lshl_or_b32 v6, v2, 29, v5 -; GISEL-NEXT: ; %bb.14: ; %Flow +; GISEL-NEXT: v_mov_b32_e32 v7, v8 +; GISEL-NEXT: v_lshl_or_b32 v10, v2, 29, v5 +; GISEL-NEXT: ; %bb.12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: .LBB2_15: ; %Flow7 +; GISEL-NEXT: .LBB2_13: ; %Flow4 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] -; GISEL-NEXT: v_and_b32_e32 v0, 0x80000000, v8 +; GISEL-NEXT: v_and_b32_e32 v0, 0x80000000, v6 ; GISEL-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GISEL-NEXT: v_mov_b32_e32 v2, 0xfffff -; GISEL-NEXT: v_lshl_add_u32 v1, v9, 20, v1 -; GISEL-NEXT: v_and_or_b32 v2, v6, v2, v0 +; GISEL-NEXT: v_lshl_add_u32 v1, v7, 20, v1 +; GISEL-NEXT: v_and_or_b32 v2, v10, v2, v0 ; GISEL-NEXT: v_and_or_b32 v0, v4, -1, 0 ; GISEL-NEXT: v_or3_b32 v1, v2, v1, 0 -; GISEL-NEXT: .LBB2_16: ; %Flow8 +; GISEL-NEXT: .LBB2_14: ; %Flow5 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: s_setpc_b64 
s[30:31] %cvt = sitofp i128 %x to double @@ -910,7 +820,7 @@ define double @uitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: v_mov_b32_e32 v4, 0 ; SDAG-NEXT: v_mov_b32_e32 v5, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc -; SDAG-NEXT: s_cbranch_execz .LBB3_16 +; SDAG-NEXT: s_cbranch_execz .LBB3_14 ; SDAG-NEXT: ; %bb.1: ; %itofp-if-end ; SDAG-NEXT: v_ffbh_u32_e32 v4, v2 ; SDAG-NEXT: v_add_u32_e32 v4, 32, v4 @@ -922,128 +832,112 @@ define double @uitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: v_min_u32_e32 v5, v5, v6 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; SDAG-NEXT: v_add_u32_e32 v5, 64, v5 -; SDAG-NEXT: v_cndmask_b32_e32 v11, v5, v4, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v8, v5, v4, vcc +; SDAG-NEXT: v_sub_u32_e32 v7, 0x80, v8 +; SDAG-NEXT: v_sub_u32_e32 v6, 0x7f, v8 +; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 54, v7 +; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SDAG-NEXT: v_sub_u32_e32 v9, 0x7f, v11 -; SDAG-NEXT: v_mov_b32_e32 v6, v1 -; SDAG-NEXT: v_mov_b32_e32 v8, v3 -; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 54, v9 -; SDAG-NEXT: v_mov_b32_e32 v5, v0 -; SDAG-NEXT: v_mov_b32_e32 v7, v2 -; SDAG-NEXT: ; implicit-def: $vgpr12 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SDAG-NEXT: ; %bb.2: ; %itofp-if-else -; SDAG-NEXT: v_add_u32_e32 v2, 0xffffffb5, v11 +; SDAG-NEXT: v_add_u32_e32 v2, 0xffffffb5, v8 ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 -; SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; SDAG-NEXT: ; implicit-def: $vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SDAG-NEXT: ; implicit-def: $vgpr11 -; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 -; SDAG-NEXT: ; implicit-def: $vgpr7_vgpr8 -; SDAG-NEXT: ; %bb.3: ; %Flow6 +; SDAG-NEXT: ; implicit-def: $vgpr8 +; SDAG-NEXT: ; %bb.3: ; %Flow3 ; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], 
s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB3_15 +; SDAG-NEXT: s_cbranch_execz .LBB3_13 ; SDAG-NEXT: ; %bb.4: ; %NodeBlock -; SDAG-NEXT: v_sub_u32_e32 v10, 0x80, v11 -; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 54, v10 -; SDAG-NEXT: s_mov_b64 s[10:11], 0 -; SDAG-NEXT: s_mov_b64 s[4:5], 0 +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 54, v7 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB3_8 +; SDAG-NEXT: ; %bb.5: ; %LeafBlock +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 55, v7 ; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc -; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[12:13] -; SDAG-NEXT: ; %bb.5: ; %LeafBlock1 -; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 55, v10 -; SDAG-NEXT: s_and_b64 s[4:5], vcc, exec -; SDAG-NEXT: ; %bb.6: ; %Flow3 -; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] -; SDAG-NEXT: ; %bb.7: ; %LeafBlock -; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 54, v10 -; SDAG-NEXT: s_andn2_b64 s[4:5], s[4:5], exec -; SDAG-NEXT: s_and_b64 s[14:15], vcc, exec -; SDAG-NEXT: s_mov_b64 s[10:11], exec -; SDAG-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] -; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 -; SDAG-NEXT: ; implicit-def: $vgpr7_vgpr8 -; SDAG-NEXT: ; %bb.8: ; %Flow4 -; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] -; SDAG-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] -; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[12:13] -; SDAG-NEXT: s_cbranch_execz .LBB3_10 -; SDAG-NEXT: ; %bb.9: ; %itofp-sw-default -; SDAG-NEXT: v_sub_u32_e32 v8, 0x49, v11 -; SDAG-NEXT: v_sub_u32_e32 v6, 64, v8 -; SDAG-NEXT: v_lshrrev_b64 v[4:5], v8, v[0:1] -; SDAG-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3] -; SDAG-NEXT: v_sub_u32_e32 v13, 9, v11 -; SDAG-NEXT: v_or_b32_e32 v7, v5, v7 -; SDAG-NEXT: v_or_b32_e32 v12, v4, v6 -; SDAG-NEXT: v_lshrrev_b64 v[4:5], v13, v[2:3] -; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8 -; SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 -; SDAG-NEXT: v_cndmask_b32_e64 v6, v5, v1, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e32 v7, 
v4, v12, vcc -; SDAG-NEXT: v_lshrrev_b64 v[4:5], v8, v[2:3] -; SDAG-NEXT: v_add_u32_e32 v16, 55, v11 -; SDAG-NEXT: v_cndmask_b32_e64 v8, v7, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc -; SDAG-NEXT: v_lshrrev_b64 v[12:13], v13, v[0:1] -; SDAG-NEXT: v_lshlrev_b64 v[14:15], v16, v[2:3] -; SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v4, vcc -; SDAG-NEXT: v_add_u32_e32 v4, -9, v11 -; SDAG-NEXT: v_lshlrev_b64 v[4:5], v4, v[0:1] -; SDAG-NEXT: v_or_b32_e32 v13, v15, v13 -; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 -; SDAG-NEXT: v_or_b32_e32 v12, v14, v12 -; SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 -; SDAG-NEXT: v_cndmask_b32_e64 v11, v5, v3, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e32 v12, v4, v12, vcc -; SDAG-NEXT: v_lshlrev_b64 v[4:5], v16, v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v12, v12, v2, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; SDAG-NEXT: v_or_b32_e32 v5, v5, v11 -; SDAG-NEXT: v_or_b32_e32 v4, v4, v12 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; SDAG-NEXT: s_andn2_b64 s[10:11], s[10:11], exec -; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v5, v8, v4 -; SDAG-NEXT: .LBB3_10: ; %Flow5 +; SDAG-NEXT: s_cbranch_execz .LBB3_7 +; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default +; SDAG-NEXT: v_sub_u32_e32 v11, 0x49, v8 +; SDAG-NEXT: v_sub_u32_e32 v9, 64, v11 +; SDAG-NEXT: v_lshrrev_b64 v[4:5], v11, v[0:1] +; SDAG-NEXT: v_lshlrev_b64 v[9:10], v9, v[2:3] +; SDAG-NEXT: v_sub_u32_e32 v12, 9, v8 +; SDAG-NEXT: v_or_b32_e32 v10, v5, v10 +; SDAG-NEXT: v_or_b32_e32 v9, v4, v9 +; SDAG-NEXT: v_lshrrev_b64 v[4:5], v12, v[2:3] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v11 +; SDAG-NEXT: v_add_u32_e32 v15, 55, v8 +; SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v11 +; SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc +; SDAG-NEXT: v_lshrrev_b64 v[9:10], v11, v[2:3] +; SDAG-NEXT: v_lshrrev_b64 v[11:12], v12, v[0:1] +; 
SDAG-NEXT: v_lshlrev_b64 v[13:14], v15, v[2:3] +; SDAG-NEXT: v_add_u32_e32 v8, -9, v8 +; SDAG-NEXT: v_or_b32_e32 v14, v14, v12 +; SDAG-NEXT: v_or_b32_e32 v13, v13, v11 +; SDAG-NEXT: v_lshlrev_b64 v[11:12], v8, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 +; SDAG-NEXT: v_cndmask_b32_e64 v5, v5, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v4, v4, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v8, v12, v14, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v15, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v8, v11, v13, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SDAG-NEXT: v_or_b32_e32 v1, v1, v3 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; SDAG-NEXT: v_mov_b32_e32 v2, v9 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v4, v4, v0 +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: v_mov_b32_e32 v1, v5 +; SDAG-NEXT: v_mov_b32_e32 v3, v10 +; SDAG-NEXT: .LBB3_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], s[10:11] -; SDAG-NEXT: ; %bb.11: ; %itofp-sw-bb -; SDAG-NEXT: v_lshlrev_b64 v[7:8], 1, v[2:3] -; SDAG-NEXT: v_lshrrev_b32_e32 v2, 31, v1 -; SDAG-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] -; SDAG-NEXT: v_or_b32_e32 v7, v7, v2 -; SDAG-NEXT: ; %bb.12: ; %itofp-sw-epilog +; SDAG-NEXT: .LBB3_8: ; %Flow2 +; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: ; %bb.9: ; %itofp-sw-bb +; SDAG-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; SDAG-NEXT: v_lshrrev_b32_e32 v3, 31, v1 +; SDAG-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; SDAG-NEXT: v_or_b32_e32 v2, v2, v3 +; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: v_lshrrev_b32_e32 v0, 2, v5 -; 
SDAG-NEXT: v_and_or_b32 v0, v0, 1, v5 +; SDAG-NEXT: v_lshrrev_b32_e32 v3, 2, v0 +; SDAG-NEXT: v_and_or_b32 v0, v3, 1, v0 ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 -; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc -; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v7, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; SDAG-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] ; SDAG-NEXT: v_and_b32_e32 v3, 0x800000, v1 ; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SDAG-NEXT: v_alignbit_b32 v12, v2, v1, 2 +; SDAG-NEXT: v_alignbit_b32 v9, v2, v1, 2 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: ; %bb.13: ; %itofp-if-then20 +; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 ; SDAG-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] -; SDAG-NEXT: v_alignbit_b32 v12, v2, v1, 3 -; SDAG-NEXT: v_mov_b32_e32 v9, v10 -; SDAG-NEXT: ; %bb.14: ; %Flow +; SDAG-NEXT: v_alignbit_b32 v9, v2, v1, 3 +; SDAG-NEXT: v_mov_b32_e32 v6, v7 +; SDAG-NEXT: ; %bb.12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB3_15: ; %Flow7 +; SDAG-NEXT: .LBB3_13: ; %Flow4 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v12 -; SDAG-NEXT: v_lshl_or_b32 v0, v9, 20, v0 +; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v9 +; SDAG-NEXT: v_lshl_or_b32 v0, v6, 20, v0 ; SDAG-NEXT: v_add_u32_e32 v5, 0x3ff00000, v0 -; SDAG-NEXT: .LBB3_16: ; %Flow8 +; SDAG-NEXT: .LBB3_14: ; %Flow5 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_mov_b32_e32 v0, v4 ; SDAG-NEXT: v_mov_b32_e32 v1, v5 @@ -1059,7 +953,7 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GISEL-NEXT: v_mov_b32_e32 v5, s5 ; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_cbranch_execz .LBB3_16 +; GISEL-NEXT: s_cbranch_execz .LBB3_14 ; GISEL-NEXT: ; %bb.1: ; %itofp-if-end ; GISEL-NEXT: v_ffbh_u32_e32 v5, v0 ; GISEL-NEXT: v_ffbh_u32_e32 v4, v1 @@ -1071,139 +965,125 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: 
v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GISEL-NEXT: v_add_u32_e32 v4, 64, v4 ; GISEL-NEXT: v_min_u32_e32 v5, v5, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v10, v5, v4, vcc -; GISEL-NEXT: v_sub_u32_e32 v8, 0x7f, v10 -; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 53, v8 -; GISEL-NEXT: ; implicit-def: $vgpr6 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v5, v4, vcc +; GISEL-NEXT: v_sub_u32_e32 v7, 0x80, v8 +; GISEL-NEXT: v_sub_u32_e32 v6, 0x7f, v8 +; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 53, v7 +; GISEL-NEXT: ; implicit-def: $vgpr9 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GISEL-NEXT: ; %bb.2: ; %itofp-if-else -; GISEL-NEXT: v_add_u32_e32 v2, 0xffffffb5, v10 +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffffb5, v8 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc +; GISEL-NEXT: ; implicit-def: $vgpr7 ; GISEL-NEXT: ; implicit-def: $vgpr0 -; GISEL-NEXT: ; implicit-def: $vgpr10 -; GISEL-NEXT: ; %bb.3: ; %Flow6 +; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: ; %bb.3: ; %Flow3 ; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB3_15 +; GISEL-NEXT: s_cbranch_execz .LBB3_13 ; GISEL-NEXT: ; %bb.4: ; %NodeBlock -; GISEL-NEXT: v_sub_u32_e32 v9, 0x80, v10 -; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 55, v9 -; GISEL-NEXT: s_mov_b64 s[10:11], 0 -; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 55, v7 +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB3_8 +; GISEL-NEXT: ; %bb.5: ; %LeafBlock +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 55, v7 ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[12:13] -; GISEL-NEXT: ; %bb.5: ; %LeafBlock1 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 55, v9 -; GISEL-NEXT: 
s_andn2_b64 s[4:5], 0, exec -; GISEL-NEXT: s_and_b64 s[14:15], exec, vcc -; GISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] -; GISEL-NEXT: ; %bb.6: ; %Flow3 -; GISEL-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] -; GISEL-NEXT: ; %bb.7: ; %LeafBlock -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 54, v9 -; GISEL-NEXT: s_andn2_b64 s[10:11], 0, exec -; GISEL-NEXT: s_and_b64 s[14:15], exec, -1 -; GISEL-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GISEL-NEXT: s_andn2_b64 s[4:5], s[4:5], exec -; GISEL-NEXT: s_and_b64 s[14:15], exec, vcc -; GISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] -; GISEL-NEXT: ; %bb.8: ; %Flow4 -; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] -; GISEL-NEXT: v_mov_b32_e32 v7, v3 -; GISEL-NEXT: v_mov_b32_e32 v6, v2 -; GISEL-NEXT: v_mov_b32_e32 v5, v1 -; GISEL-NEXT: v_mov_b32_e32 v4, v0 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] -; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[12:13] -; GISEL-NEXT: s_cbranch_execz .LBB3_10 -; GISEL-NEXT: ; %bb.9: ; %itofp-sw-default -; GISEL-NEXT: v_sub_u32_e32 v13, 0x49, v10 -; GISEL-NEXT: v_sub_u32_e32 v6, 64, v13 +; GISEL-NEXT: s_cbranch_execz .LBB3_7 +; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default +; GISEL-NEXT: v_sub_u32_e32 v13, 0x49, v8 +; GISEL-NEXT: v_sub_u32_e32 v9, 64, v13 ; GISEL-NEXT: v_lshrrev_b64 v[4:5], v13, v[0:1] -; GISEL-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3] +; GISEL-NEXT: v_lshlrev_b64 v[9:10], v9, v[2:3] ; GISEL-NEXT: v_subrev_u32_e32 v14, 64, v13 ; GISEL-NEXT: v_lshrrev_b64 v[11:12], v13, v[2:3] -; GISEL-NEXT: v_or_b32_e32 v6, v4, v6 -; GISEL-NEXT: v_or_b32_e32 v7, v5, v7 +; GISEL-NEXT: v_or_b32_e32 v9, v4, v9 +; GISEL-NEXT: v_or_b32_e32 v10, v5, v10 ; GISEL-NEXT: v_lshrrev_b64 v[4:5], v14, v[2:3] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v13 -; GISEL-NEXT: v_add_u32_e32 v14, 55, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v12, vcc -; GISEL-NEXT: v_sub_u32_e32 v12, 64, v14 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GISEL-NEXT: v_add_u32_e32 v8, 55, v8 +; 
GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v11, vcc -; GISEL-NEXT: v_lshrrev_b64 v[10:11], v14, -1 +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v11, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v12, vcc +; GISEL-NEXT: v_sub_u32_e32 v12, 64, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v4, v0, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v9, v5, v1, s[4:5] +; GISEL-NEXT: v_lshrrev_b64 v[4:5], v8, -1 ; GISEL-NEXT: v_lshlrev_b64 v[12:13], v12, -1 -; GISEL-NEXT: v_subrev_u32_e32 v15, 64, v14 -; GISEL-NEXT: v_or_b32_e32 v16, v10, v12 -; GISEL-NEXT: v_or_b32_e32 v17, v11, v13 +; GISEL-NEXT: v_subrev_u32_e32 v15, 64, v8 +; GISEL-NEXT: v_or_b32_e32 v16, v4, v12 +; GISEL-NEXT: v_or_b32_e32 v17, v5, v13 ; GISEL-NEXT: v_lshrrev_b64 v[12:13], v15, -1 -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v4, v0, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v1, s[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8 ; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 -; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, v12, -1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v13, v13, -1, s[4:5] -; GISEL-NEXT: v_and_b32_e32 v10, v10, v2 -; GISEL-NEXT: v_and_b32_e32 v11, v11, v3 -; GISEL-NEXT: v_and_or_b32 v10, v12, v0, v10 -; GISEL-NEXT: v_and_or_b32 v11, v13, v1, v11 -; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] -; GISEL-NEXT: s_andn2_b64 s[4:5], s[10:11], exec -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: s_and_b64 s[10:11], exec, 0 -; GISEL-NEXT: v_or_b32_e32 v4, v4, v10 -; GISEL-NEXT: s_or_b64 s[10:11], s[4:5], s[10:11] -; GISEL-NEXT: .LBB3_10: ; %Flow5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 
0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, v12, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v13, -1, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v2, v4, v2 +; GISEL-NEXT: v_and_b32_e32 v3, v5, v3 +; GISEL-NEXT: v_and_or_b32 v0, v8, v0, v2 +; GISEL-NEXT: v_and_or_b32 v1, v12, v1, v3 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v8, v14, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v8 +; GISEL-NEXT: v_mov_b32_e32 v1, v9 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mov_b32_e32 v3, v11 +; GISEL-NEXT: .LBB3_7: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], s[10:11] -; GISEL-NEXT: ; %bb.11: ; %itofp-sw-bb -; GISEL-NEXT: v_lshlrev_b64 v[6:7], 1, v[2:3] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], 1, v[0:1] +; GISEL-NEXT: .LBB3_8: ; %Flow2 +; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb +; GISEL-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1] +; GISEL-NEXT: v_lshlrev_b64 v[10:11], 1, v[2:3] ; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v1 -; GISEL-NEXT: v_or_b32_e32 v6, v6, v0 -; GISEL-NEXT: ; %bb.12: ; %itofp-sw-epilog +; GISEL-NEXT: v_or_b32_e32 v10, v10, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v8 +; GISEL-NEXT: v_mov_b32_e32 v1, v9 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mov_b32_e32 v3, v11 +; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: v_bfe_u32 v0, v4, 2, 1 -; GISEL-NEXT: v_or_b32_e32 v0, v4, v0 +; GISEL-NEXT: v_bfe_u32 v4, v0, 2, 1 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v4 ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 -; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc -; GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v6, vcc +; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_mov_b32_e32 v8, 0 +; 
GISEL-NEXT: v_and_b32_e32 v9, 0x800000, v1 ; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] -; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v7, vcc -; GISEL-NEXT: v_mov_b32_e32 v5, 0 -; GISEL-NEXT: v_and_b32_e32 v6, 0x800000, v1 -; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[5:6] -; GISEL-NEXT: v_lshlrev_b64 v[5:6], 30, v[2:3] -; GISEL-NEXT: v_lshrrev_b32_e32 v6, 2, v1 -; GISEL-NEXT: v_or_b32_e32 v6, v6, v5 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GISEL-NEXT: v_lshlrev_b64 v[8:9], 30, v[2:3] +; GISEL-NEXT: v_lshrrev_b32_e32 v5, 2, v1 +; GISEL-NEXT: v_or_b32_e32 v9, v5, v8 ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: ; %bb.13: ; %itofp-if-then20 +; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 ; GISEL-NEXT: v_lshlrev_b64 v[2:3], 29, v[2:3] ; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] ; GISEL-NEXT: v_lshrrev_b32_e32 v0, 3, v1 -; GISEL-NEXT: v_or_b32_e32 v6, v0, v2 -; GISEL-NEXT: v_mov_b32_e32 v8, v9 -; GISEL-NEXT: ; %bb.14: ; %Flow +; GISEL-NEXT: v_or_b32_e32 v9, v0, v2 +; GISEL-NEXT: v_mov_b32_e32 v6, v7 +; GISEL-NEXT: ; %bb.12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: .LBB3_15: ; %Flow7 +; GISEL-NEXT: .LBB3_13: ; %Flow4 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: v_mov_b32_e32 v0, 0x3ff00000 -; GISEL-NEXT: v_lshl_add_u32 v0, v8, 20, v0 -; GISEL-NEXT: v_and_b32_e32 v1, 0xfffff, v6 +; GISEL-NEXT: v_lshl_add_u32 v0, v6, 20, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xfffff, v9 ; GISEL-NEXT: v_and_or_b32 v4, v4, -1, 0 ; GISEL-NEXT: v_or3_b32 v5, v1, v0, 0 -; GISEL-NEXT: .LBB3_16: ; %Flow8 +; GISEL-NEXT: .LBB3_14: ; %Flow5 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: v_mov_b32_e32 v0, v4 ; GISEL-NEXT: v_mov_b32_e32 v1, v5 @@ -1221,7 +1101,7 @@ define half @sitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; SDAG-NEXT: v_mov_b32_e32 v4, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc -; SDAG-NEXT: s_cbranch_execz .LBB4_16 +; SDAG-NEXT: s_cbranch_execz .LBB4_14 ; SDAG-NEXT: ; %bb.1: ; %itofp-if-end ; 
SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v3 ; SDAG-NEXT: v_xor_b32_e32 v0, v5, v0 @@ -1242,113 +1122,101 @@ define half @sitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: v_min_u32_e32 v6, v6, v7 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; SDAG-NEXT: v_add_u32_e32 v6, 64, v6 -; SDAG-NEXT: v_cndmask_b32_e32 v9, v6, v2, vcc -; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v9 -; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v2 -; SDAG-NEXT: ; implicit-def: $vgpr6 +; SDAG-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc +; SDAG-NEXT: v_sub_u32_e32 v6, 0x80, v7 +; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v7 +; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v6 +; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SDAG-NEXT: ; %bb.2: ; %itofp-if-else -; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff98, v9 +; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff98, v7 ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] ; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 -; SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc +; SDAG-NEXT: ; implicit-def: $vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: ; implicit-def: $vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SDAG-NEXT: ; %bb.3: ; %Flow6 +; SDAG-NEXT: ; %bb.3: ; %Flow3 ; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB4_15 +; SDAG-NEXT: s_cbranch_execz .LBB4_13 ; SDAG-NEXT: ; %bb.4: ; %NodeBlock -; SDAG-NEXT: v_sub_u32_e32 v8, 0x80, v9 -; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v8 -; SDAG-NEXT: s_mov_b64 s[10:11], 0 -; SDAG-NEXT: s_mov_b64 s[4:5], 0 +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v6 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB4_8 +; SDAG-NEXT: ; %bb.5: ; %LeafBlock +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v6 ; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc -; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[12:13] -; SDAG-NEXT: ; %bb.5: ; 
%LeafBlock1 -; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v8 -; SDAG-NEXT: s_and_b64 s[4:5], vcc, exec -; SDAG-NEXT: ; %bb.6: ; %Flow3 -; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] -; SDAG-NEXT: ; %bb.7: ; %LeafBlock -; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 25, v8 -; SDAG-NEXT: s_andn2_b64 s[4:5], s[4:5], exec -; SDAG-NEXT: s_and_b64 s[14:15], vcc, exec -; SDAG-NEXT: s_mov_b64 s[10:11], exec -; SDAG-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] -; SDAG-NEXT: ; %bb.8: ; %Flow4 -; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] -; SDAG-NEXT: v_mov_b32_e32 v7, v1 -; SDAG-NEXT: v_mov_b32_e32 v6, v0 -; SDAG-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] -; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[12:13] -; SDAG-NEXT: s_cbranch_execz .LBB4_10 -; SDAG-NEXT: ; %bb.9: ; %itofp-sw-default -; SDAG-NEXT: v_sub_u32_e32 v12, 0x66, v9 +; SDAG-NEXT: s_cbranch_execz .LBB4_7 +; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default +; SDAG-NEXT: v_sub_u32_e32 v12, 0x66, v7 ; SDAG-NEXT: v_sub_u32_e32 v10, 64, v12 -; SDAG-NEXT: v_lshrrev_b64 v[6:7], v12, v[0:1] +; SDAG-NEXT: v_lshrrev_b64 v[8:9], v12, v[0:1] ; SDAG-NEXT: v_lshlrev_b64 v[10:11], v10, v[4:5] -; SDAG-NEXT: v_sub_u32_e32 v13, 38, v9 -; SDAG-NEXT: v_or_b32_e32 v11, v7, v11 -; SDAG-NEXT: v_or_b32_e32 v10, v6, v10 -; SDAG-NEXT: v_lshrrev_b64 v[6:7], v13, v[4:5] +; SDAG-NEXT: v_sub_u32_e32 v13, 38, v7 +; SDAG-NEXT: v_or_b32_e32 v11, v9, v11 +; SDAG-NEXT: v_or_b32_e32 v10, v8, v10 +; SDAG-NEXT: v_lshrrev_b64 v[8:9], v13, v[4:5] ; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 -; SDAG-NEXT: v_add_u32_e32 v14, 26, v9 -; SDAG-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc +; SDAG-NEXT: v_add_u32_e32 v14, 26, v7 +; SDAG-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 -; SDAG-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc ; SDAG-NEXT: v_lshrrev_b64 v[10:11], v13, v[0:1] ; SDAG-NEXT: v_lshlrev_b64 v[12:13], v14, v[4:5] -; SDAG-NEXT: v_subrev_u32_e32 v9, 38, v9 -; SDAG-NEXT: v_cndmask_b32_e64 v15, v6, 
v0, s[4:5] -; SDAG-NEXT: v_or_b32_e32 v6, v13, v11 -; SDAG-NEXT: v_or_b32_e32 v11, v12, v10 -; SDAG-NEXT: v_lshlrev_b64 v[9:10], v9, v[0:1] +; SDAG-NEXT: v_subrev_u32_e32 v7, 38, v7 +; SDAG-NEXT: v_cndmask_b32_e64 v15, v8, v0, s[4:5] +; SDAG-NEXT: v_lshlrev_b64 v[7:8], v7, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v11, v13, v11 +; SDAG-NEXT: v_or_b32_e32 v10, v12, v10 ; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 -; SDAG-NEXT: v_cndmask_b32_e64 v7, v7, v1, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v14, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 -; SDAG-NEXT: v_cndmask_b32_e64 v10, v6, v5, s[4:5] -; SDAG-NEXT: v_lshlrev_b64 v[5:6], v14, v[0:1] -; SDAG-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc -; SDAG-NEXT: v_or_b32_e32 v5, v6, v10 -; SDAG-NEXT: v_or_b32_e32 v4, v9, v4 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; SDAG-NEXT: s_andn2_b64 s[10:11], s[10:11], exec -; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v6, v15, v4 -; SDAG-NEXT: .LBB4_10: ; %Flow5 +; SDAG-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SDAG-NEXT: v_or_b32_e32 v1, v1, v5 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v4 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v8, v15, v0 +; SDAG-NEXT: v_mov_b32_e32 v0, v8 +; SDAG-NEXT: v_mov_b32_e32 v1, v9 +; SDAG-NEXT: .LBB4_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], s[10:11] -; SDAG-NEXT: ; %bb.11: ; %itofp-sw-bb -; SDAG-NEXT: v_lshlrev_b64 
v[6:7], 1, v[0:1] -; SDAG-NEXT: ; %bb.12: ; %itofp-sw-epilog +; SDAG-NEXT: .LBB4_8: ; %Flow2 +; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: ; %bb.9: ; %itofp-sw-bb +; SDAG-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: v_lshrrev_b32_e32 v0, 2, v6 -; SDAG-NEXT: v_and_or_b32 v0, v0, 1, v6 +; SDAG-NEXT: v_lshrrev_b32_e32 v4, 2, v0 +; SDAG-NEXT: v_and_or_b32 v0, v4, 1, v0 ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 -; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; SDAG-NEXT: v_and_b32_e32 v4, 0x4000000, v0 ; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SDAG-NEXT: v_alignbit_b32 v6, v1, v0, 2 +; SDAG-NEXT: v_alignbit_b32 v8, v1, v0, 2 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: ; %bb.13: ; %itofp-if-then20 -; SDAG-NEXT: v_alignbit_b32 v6, v1, v0, 3 -; SDAG-NEXT: v_mov_b32_e32 v2, v8 -; SDAG-NEXT: ; %bb.14: ; %Flow +; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 +; SDAG-NEXT: v_alignbit_b32 v8, v1, v0, 3 +; SDAG-NEXT: v_mov_b32_e32 v2, v6 +; SDAG-NEXT: ; %bb.12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB4_15: ; %Flow7 +; SDAG-NEXT: .LBB4_13: ; %Flow4 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: v_and_b32_e32 v0, 0x80000000, v3 ; SDAG-NEXT: v_lshl_add_u32 v1, v2, 23, 1.0 -; SDAG-NEXT: v_and_b32_e32 v2, 0x7fffff, v6 +; SDAG-NEXT: v_and_b32_e32 v2, 0x7fffff, v8 ; SDAG-NEXT: v_or3_b32 v0, v2, v0, v1 ; SDAG-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SDAG-NEXT: .LBB4_16: ; %Flow8 +; SDAG-NEXT: .LBB4_14: ; %Flow5 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_mov_b32_e32 v0, v4 ; SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1362,145 +1230,127 @@ define half @sitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_cbranch_execz .LBB4_16 +; GISEL-NEXT: s_cbranch_execz 
.LBB4_14 ; GISEL-NEXT: ; %bb.1: ; %itofp-if-end -; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v3 -; GISEL-NEXT: v_xor_b32_e32 v0, v8, v0 -; GISEL-NEXT: v_xor_b32_e32 v1, v8, v1 -; GISEL-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v8 -; GISEL-NEXT: v_xor_b32_e32 v2, v8, v2 -; GISEL-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_xor_b32_e32 v3, v8, v3 -; GISEL-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v8, vcc -; GISEL-NEXT: v_subb_co_u32_e32 v7, vcc, v3, v8, vcc -; GISEL-NEXT: v_ffbh_u32_e32 v3, v0 -; GISEL-NEXT: v_ffbh_u32_e32 v2, v1 -; GISEL-NEXT: v_add_u32_e32 v3, 32, v3 -; GISEL-NEXT: v_ffbh_u32_e32 v4, v6 -; GISEL-NEXT: v_min_u32_e32 v2, v2, v3 -; GISEL-NEXT: v_ffbh_u32_e32 v3, v7 -; GISEL-NEXT: v_add_u32_e32 v4, 32, v4 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GISEL-NEXT: v_add_u32_e32 v2, 64, v2 -; GISEL-NEXT: v_min_u32_e32 v3, v3, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v3, v2, vcc -; GISEL-NEXT: v_sub_u32_e32 v9, 0x7f, v11 -; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v9 -; GISEL-NEXT: ; implicit-def: $vgpr2 +; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v3 +; GISEL-NEXT: v_xor_b32_e32 v0, v6, v0 +; GISEL-NEXT: v_xor_b32_e32 v1, v6, v1 +; GISEL-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v6 +; GISEL-NEXT: v_xor_b32_e32 v2, v6, v2 +; GISEL-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc +; GISEL-NEXT: v_xor_b32_e32 v3, v6, v3 +; GISEL-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v5, v0 +; GISEL-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v4, v1 +; GISEL-NEXT: v_add_u32_e32 v5, 32, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v7, v2 +; GISEL-NEXT: v_min_u32_e32 v4, v4, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v5, v3 +; GISEL-NEXT: v_add_u32_e32 v7, 32, v7 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_add_u32_e32 v4, 64, v4 +; GISEL-NEXT: v_min_u32_e32 v5, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc +; GISEL-NEXT: v_sub_u32_e32 v8, 0x80, v5 +; GISEL-NEXT: v_sub_u32_e32 v7, 0x7f, v5 +; GISEL-NEXT: 
v_cmp_ge_i32_e32 vcc, 24, v8 +; GISEL-NEXT: ; implicit-def: $vgpr4 ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GISEL-NEXT: ; %bb.2: ; %itofp-if-else -; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff98, v11 +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff98, v5 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: ; implicit-def: $vgpr0 -; GISEL-NEXT: ; implicit-def: $vgpr11 -; GISEL-NEXT: ; implicit-def: $vgpr6 -; GISEL-NEXT: ; %bb.3: ; %Flow6 +; GISEL-NEXT: ; implicit-def: $vgpr5 +; GISEL-NEXT: ; implicit-def: $vgpr2 +; GISEL-NEXT: ; %bb.3: ; %Flow3 ; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB4_15 +; GISEL-NEXT: s_cbranch_execz .LBB4_13 ; GISEL-NEXT: ; %bb.4: ; %NodeBlock -; GISEL-NEXT: v_sub_u32_e32 v10, 0x80, v11 -; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v10 -; GISEL-NEXT: s_mov_b64 s[10:11], 0 -; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v8 +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB4_8 +; GISEL-NEXT: ; %bb.5: ; %LeafBlock +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v8 ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[12:13] -; GISEL-NEXT: ; %bb.5: ; %LeafBlock1 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v10 -; GISEL-NEXT: s_andn2_b64 s[4:5], 0, exec -; GISEL-NEXT: s_and_b64 s[14:15], exec, vcc -; GISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] -; GISEL-NEXT: ; %bb.6: ; %Flow3 -; GISEL-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] -; GISEL-NEXT: ; %bb.7: ; %LeafBlock -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 25, v10 -; GISEL-NEXT: s_andn2_b64 s[10:11], 0, exec -; GISEL-NEXT: s_and_b64 s[14:15], exec, -1 -; GISEL-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; 
GISEL-NEXT: s_andn2_b64 s[4:5], s[4:5], exec -; GISEL-NEXT: s_and_b64 s[14:15], exec, vcc -; GISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] -; GISEL-NEXT: ; %bb.8: ; %Flow4 -; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] -; GISEL-NEXT: v_mov_b32_e32 v5, v3 -; GISEL-NEXT: v_mov_b32_e32 v4, v2 -; GISEL-NEXT: v_mov_b32_e32 v3, v1 -; GISEL-NEXT: v_mov_b32_e32 v2, v0 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] -; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[12:13] -; GISEL-NEXT: s_cbranch_execz .LBB4_10 -; GISEL-NEXT: ; %bb.9: ; %itofp-sw-default -; GISEL-NEXT: v_sub_u32_e32 v12, 0x66, v11 -; GISEL-NEXT: v_sub_u32_e32 v4, 64, v12 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v12, v[0:1] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7] -; GISEL-NEXT: v_subrev_u32_e32 v13, 64, v12 -; GISEL-NEXT: v_or_b32_e32 v4, v2, v4 -; GISEL-NEXT: v_or_b32_e32 v5, v3, v5 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v13, v[6:7] -; GISEL-NEXT: v_add_u32_e32 v13, 26, v11 -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 -; GISEL-NEXT: v_sub_u32_e32 v11, 64, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 -; GISEL-NEXT: v_lshrrev_b64 v[4:5], v13, -1 +; GISEL-NEXT: s_cbranch_execz .LBB4_7 +; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default +; GISEL-NEXT: v_sub_u32_e32 v4, 0x66, v5 +; GISEL-NEXT: v_sub_u32_e32 v11, 64, v4 +; GISEL-NEXT: v_lshrrev_b64 v[9:10], v4, v[0:1] +; GISEL-NEXT: v_lshlrev_b64 v[11:12], v11, v[2:3] +; GISEL-NEXT: v_subrev_u32_e32 v13, 64, v4 +; GISEL-NEXT: v_or_b32_e32 v11, v9, v11 +; GISEL-NEXT: v_or_b32_e32 v12, v10, v12 +; GISEL-NEXT: v_lshrrev_b64 v[9:10], v13, v[2:3] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 +; GISEL-NEXT: v_add_u32_e32 v5, 26, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_sub_u32_e32 v11, 64, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v9, v0, vcc +; GISEL-NEXT: 
v_cndmask_b32_e32 v4, v10, v1, vcc +; GISEL-NEXT: v_lshrrev_b64 v[9:10], v5, -1 ; GISEL-NEXT: v_lshlrev_b64 v[11:12], v11, -1 -; GISEL-NEXT: v_subrev_u32_e32 v14, 64, v13 -; GISEL-NEXT: v_or_b32_e32 v15, v4, v11 -; GISEL-NEXT: v_or_b32_e32 v16, v5, v12 +; GISEL-NEXT: v_subrev_u32_e32 v14, 64, v5 +; GISEL-NEXT: v_or_b32_e32 v15, v9, v11 +; GISEL-NEXT: v_or_b32_e32 v16, v10, v12 ; GISEL-NEXT: v_lshrrev_b64 v[11:12], v14, -1 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v13 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, -1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v12, v12, -1, s[4:5] -; GISEL-NEXT: v_and_b32_e32 v4, v4, v6 -; GISEL-NEXT: v_and_b32_e32 v5, v5, v7 -; GISEL-NEXT: v_and_or_b32 v4, v11, v0, v4 -; GISEL-NEXT: v_and_or_b32 v5, v12, v1, v5 -; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GISEL-NEXT: s_andn2_b64 s[4:5], s[10:11], exec -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: s_and_b64 s[10:11], exec, 0 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v4 -; GISEL-NEXT: s_or_b64 s[10:11], s[4:5], s[10:11] -; GISEL-NEXT: .LBB4_10: ; %Flow5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, v11, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v11, v12, -1, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v2, v9, v2 +; GISEL-NEXT: v_and_b32_e32 v3, v10, v3 +; GISEL-NEXT: v_and_or_b32 v0, v5, v0, v2 +; GISEL-NEXT: v_and_or_b32 v1, v11, v1, v3 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v3, v13, v0 
+; GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GISEL-NEXT: v_mov_b32_e32 v2, v5 +; GISEL-NEXT: v_mov_b32_e32 v3, v6 +; GISEL-NEXT: .LBB4_7: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], s[10:11] -; GISEL-NEXT: ; %bb.11: ; %itofp-sw-bb -; GISEL-NEXT: v_lshlrev_b64 v[2:3], 1, v[0:1] -; GISEL-NEXT: ; %bb.12: ; %itofp-sw-epilog +; GISEL-NEXT: .LBB4_8: ; %Flow2 +; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb +; GISEL-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: v_bfe_u32 v0, v2, 2, 1 -; GISEL-NEXT: v_or_b32_e32 v0, v2, v0 +; GISEL-NEXT: v_bfe_u32 v2, v0, 2, 1 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 -; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc -; GISEL-NEXT: v_lshrrev_b64 v[2:3], 2, v[0:1] -; GISEL-NEXT: v_and_b32_e32 v3, 0x4000000, v0 -; GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[3:4] +; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_and_b32_e32 v2, 0x4000000, v0 +; GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: ; %bb.13: ; %itofp-if-then20 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], 3, v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v9, v10 -; GISEL-NEXT: ; %bb.14: ; %Flow +; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v7, v8 +; GISEL-NEXT: ; %bb.12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: .LBB4_15: ; %Flow7 +; GISEL-NEXT: .LBB4_13: ; %Flow4 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] -; GISEL-NEXT: v_and_b32_e32 v0, 0x80000000, v8 -; GISEL-NEXT: v_lshl_add_u32 v1, v9, 23, 1.0 -; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v2 +; GISEL-NEXT: 
v_and_b32_e32 v0, 0x80000000, v6 +; GISEL-NEXT: v_lshl_add_u32 v1, v7, 23, 1.0 +; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v4 ; GISEL-NEXT: v_or3_b32 v0, v2, v0, v1 ; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GISEL-NEXT: .LBB4_16: ; %Flow8 +; GISEL-NEXT: .LBB4_14: ; %Flow5 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: v_mov_b32_e32 v0, v4 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1517,7 +1367,7 @@ define half @uitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; SDAG-NEXT: v_mov_b32_e32 v4, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc -; SDAG-NEXT: s_cbranch_execz .LBB5_16 +; SDAG-NEXT: s_cbranch_execz .LBB5_14 ; SDAG-NEXT: ; %bb.1: ; %itofp-if-end ; SDAG-NEXT: v_ffbh_u32_e32 v4, v2 ; SDAG-NEXT: v_add_u32_e32 v4, 32, v4 @@ -1529,114 +1379,100 @@ define half @uitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: v_min_u32_e32 v5, v5, v6 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; SDAG-NEXT: v_add_u32_e32 v5, 64, v5 -; SDAG-NEXT: v_cndmask_b32_e32 v8, v5, v4, vcc -; SDAG-NEXT: v_sub_u32_e32 v6, 0x7f, v8 -; SDAG-NEXT: v_mov_b32_e32 v5, v1 -; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v6 -; SDAG-NEXT: v_mov_b32_e32 v4, v0 -; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; SDAG-NEXT: v_sub_u32_e32 v5, 0x80, v6 +; SDAG-NEXT: v_sub_u32_e32 v4, 0x7f, v6 +; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v5 +; SDAG-NEXT: ; implicit-def: $vgpr7 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SDAG-NEXT: ; %bb.2: ; %itofp-if-else -; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff98, v8 +; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff98, v6 ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 -; SDAG-NEXT: v_cndmask_b32_e32 v9, 0, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v0, vcc +; SDAG-NEXT: ; implicit-def: $vgpr5 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SDAG-NEXT: ; implicit-def: $vgpr8 +; SDAG-NEXT: ; implicit-def: $vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; 
SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SDAG-NEXT: ; %bb.3: ; %Flow6 +; SDAG-NEXT: ; %bb.3: ; %Flow3 ; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB5_15 +; SDAG-NEXT: s_cbranch_execz .LBB5_13 ; SDAG-NEXT: ; %bb.4: ; %NodeBlock -; SDAG-NEXT: v_sub_u32_e32 v7, 0x80, v8 -; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v7 -; SDAG-NEXT: s_mov_b64 s[10:11], 0 -; SDAG-NEXT: s_mov_b64 s[4:5], 0 +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v5 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB5_8 +; SDAG-NEXT: ; %bb.5: ; %LeafBlock +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v5 ; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc -; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[12:13] -; SDAG-NEXT: ; %bb.5: ; %LeafBlock1 -; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v7 -; SDAG-NEXT: s_and_b64 s[4:5], vcc, exec -; SDAG-NEXT: ; %bb.6: ; %Flow3 -; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] -; SDAG-NEXT: ; %bb.7: ; %LeafBlock -; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 25, v7 -; SDAG-NEXT: s_andn2_b64 s[4:5], s[4:5], exec -; SDAG-NEXT: s_and_b64 s[14:15], vcc, exec -; SDAG-NEXT: s_mov_b64 s[10:11], exec -; SDAG-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] -; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SDAG-NEXT: ; %bb.8: ; %Flow4 -; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] -; SDAG-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] -; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[12:13] -; SDAG-NEXT: s_cbranch_execz .LBB5_10 -; SDAG-NEXT: ; %bb.9: ; %itofp-sw-default -; SDAG-NEXT: v_sub_u32_e32 v11, 0x66, v8 +; SDAG-NEXT: s_cbranch_execz .LBB5_7 +; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default +; SDAG-NEXT: v_sub_u32_e32 v11, 0x66, v6 ; SDAG-NEXT: v_sub_u32_e32 v9, 64, v11 -; SDAG-NEXT: v_lshrrev_b64 v[4:5], v11, v[0:1] +; SDAG-NEXT: v_lshrrev_b64 v[7:8], v11, v[0:1] ; SDAG-NEXT: v_lshlrev_b64 v[9:10], v9, v[2:3] -; SDAG-NEXT: v_sub_u32_e32 v12, 38, v8 -; SDAG-NEXT: v_or_b32_e32 v10, v5, v10 -; SDAG-NEXT: v_or_b32_e32 v9, v4, v9 -; 
SDAG-NEXT: v_lshrrev_b64 v[4:5], v12, v[2:3] +; SDAG-NEXT: v_sub_u32_e32 v12, 38, v6 +; SDAG-NEXT: v_or_b32_e32 v10, v8, v10 +; SDAG-NEXT: v_or_b32_e32 v9, v7, v9 +; SDAG-NEXT: v_lshrrev_b64 v[7:8], v12, v[2:3] ; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v11 -; SDAG-NEXT: v_add_u32_e32 v13, 26, v8 -; SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc +; SDAG-NEXT: v_add_u32_e32 v13, 26, v6 +; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v11 -; SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; SDAG-NEXT: v_lshrrev_b64 v[9:10], v12, v[0:1] ; SDAG-NEXT: v_lshlrev_b64 v[11:12], v13, v[2:3] -; SDAG-NEXT: v_subrev_u32_e32 v8, 38, v8 -; SDAG-NEXT: v_cndmask_b32_e64 v14, v4, v0, s[4:5] -; SDAG-NEXT: v_or_b32_e32 v4, v12, v10 -; SDAG-NEXT: v_or_b32_e32 v10, v11, v9 -; SDAG-NEXT: v_lshlrev_b64 v[8:9], v8, v[0:1] +; SDAG-NEXT: v_subrev_u32_e32 v6, 38, v6 +; SDAG-NEXT: v_cndmask_b32_e64 v14, v7, v0, s[4:5] +; SDAG-NEXT: v_lshlrev_b64 v[6:7], v6, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v10, v12, v10 +; SDAG-NEXT: v_or_b32_e32 v9, v11, v9 ; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v13 -; SDAG-NEXT: v_cndmask_b32_e64 v5, v5, v1, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v13, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13 -; SDAG-NEXT: v_cndmask_b32_e64 v9, v4, v3, s[4:5] -; SDAG-NEXT: v_lshlrev_b64 v[3:4], v13, v[0:1] -; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v3, vcc -; SDAG-NEXT: v_or_b32_e32 v3, v4, v9 -; SDAG-NEXT: v_or_b32_e32 v2, v8, v2 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: s_andn2_b64 s[10:11], s[10:11], exec -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v4, v14, v2 
-; SDAG-NEXT: .LBB5_10: ; %Flow5 +; SDAG-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SDAG-NEXT: v_or_b32_e32 v1, v1, v3 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v7, v14, v0 +; SDAG-NEXT: v_mov_b32_e32 v0, v7 +; SDAG-NEXT: v_mov_b32_e32 v1, v8 +; SDAG-NEXT: .LBB5_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], s[10:11] -; SDAG-NEXT: ; %bb.11: ; %itofp-sw-bb -; SDAG-NEXT: v_lshlrev_b64 v[4:5], 1, v[0:1] -; SDAG-NEXT: ; %bb.12: ; %itofp-sw-epilog +; SDAG-NEXT: .LBB5_8: ; %Flow2 +; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: ; %bb.9: ; %itofp-sw-bb +; SDAG-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: v_lshrrev_b32_e32 v0, 2, v4 -; SDAG-NEXT: v_and_or_b32 v0, v0, 1, v4 +; SDAG-NEXT: v_lshrrev_b32_e32 v2, 2, v0 +; SDAG-NEXT: v_and_or_b32 v0, v2, 1, v0 ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 -; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; SDAG-NEXT: v_and_b32_e32 v2, 0x4000000, v0 ; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SDAG-NEXT: v_alignbit_b32 v9, v1, v0, 2 +; SDAG-NEXT: v_alignbit_b32 v7, v1, v0, 2 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: ; %bb.13: ; %itofp-if-then20 -; SDAG-NEXT: v_alignbit_b32 v9, v1, v0, 3 -; SDAG-NEXT: v_mov_b32_e32 v6, v7 -; SDAG-NEXT: ; %bb.14: ; %Flow +; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 +; SDAG-NEXT: v_alignbit_b32 v7, v1, v0, 3 +; SDAG-NEXT: v_mov_b32_e32 v4, v5 +; SDAG-NEXT: ; %bb.12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB5_15: ; %Flow7 +; SDAG-NEXT: .LBB5_13: ; 
%Flow4 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v9 -; SDAG-NEXT: v_lshl_or_b32 v0, v6, 23, v0 +; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v7 +; SDAG-NEXT: v_lshl_or_b32 v0, v4, 23, v0 ; SDAG-NEXT: v_add_u32_e32 v0, 1.0, v0 ; SDAG-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SDAG-NEXT: .LBB5_16: ; %Flow8 +; SDAG-NEXT: .LBB5_14: ; %Flow5 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_mov_b32_e32 v0, v4 ; SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1644,145 +1480,125 @@ define half @uitofp_i128_to_f16(i128 %x) { ; GISEL-LABEL: uitofp_i128_to_f16: ; GISEL: ; %bb.0: ; %itofp-entry ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v4, v2 -; GISEL-NEXT: v_mov_b32_e32 v5, v3 -; GISEL-NEXT: v_or_b32_e32 v2, v0, v4 -; GISEL-NEXT: v_or_b32_e32 v3, v1, v5 +; GISEL-NEXT: v_or_b32_e32 v4, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v5, v1, v3 ; GISEL-NEXT: s_mov_b32 s4, 0 -; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_cbranch_execz .LBB5_16 +; GISEL-NEXT: s_cbranch_execz .LBB5_14 ; GISEL-NEXT: ; %bb.1: ; %itofp-if-end -; GISEL-NEXT: v_ffbh_u32_e32 v3, v0 -; GISEL-NEXT: v_ffbh_u32_e32 v2, v1 -; GISEL-NEXT: v_add_u32_e32 v3, 32, v3 -; GISEL-NEXT: v_ffbh_u32_e32 v6, v4 -; GISEL-NEXT: v_min_u32_e32 v2, v2, v3 -; GISEL-NEXT: v_ffbh_u32_e32 v3, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v5, v0 +; GISEL-NEXT: v_ffbh_u32_e32 v4, v1 +; GISEL-NEXT: v_add_u32_e32 v5, 32, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v6, v2 +; GISEL-NEXT: v_min_u32_e32 v4, v4, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v5, v3 ; GISEL-NEXT: v_add_u32_e32 v6, 32, v6 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] -; GISEL-NEXT: v_add_u32_e32 v2, 64, v2 -; GISEL-NEXT: v_min_u32_e32 v3, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v3, v2, vcc -; GISEL-NEXT: v_sub_u32_e32 v10, 0x7f, v12 -; GISEL-NEXT: 
v_cmp_ge_i32_e32 vcc, 24, v10 -; GISEL-NEXT: ; implicit-def: $vgpr2 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_add_u32_e32 v4, 64, v4 +; GISEL-NEXT: v_min_u32_e32 v5, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc +; GISEL-NEXT: v_sub_u32_e32 v7, 0x80, v5 +; GISEL-NEXT: v_sub_u32_e32 v6, 0x7f, v5 +; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v7 +; GISEL-NEXT: ; implicit-def: $vgpr4 ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GISEL-NEXT: ; %bb.2: ; %itofp-if-else -; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff98, v12 +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff98, v5 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GISEL-NEXT: ; implicit-def: $vgpr7 ; GISEL-NEXT: ; implicit-def: $vgpr0 -; GISEL-NEXT: ; implicit-def: $vgpr12 -; GISEL-NEXT: ; implicit-def: $vgpr4 -; GISEL-NEXT: ; %bb.3: ; %Flow6 +; GISEL-NEXT: ; implicit-def: $vgpr5 +; GISEL-NEXT: ; implicit-def: $vgpr2 +; GISEL-NEXT: ; %bb.3: ; %Flow3 ; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB5_15 +; GISEL-NEXT: s_cbranch_execz .LBB5_13 ; GISEL-NEXT: ; %bb.4: ; %NodeBlock -; GISEL-NEXT: v_sub_u32_e32 v11, 0x80, v12 -; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v11 -; GISEL-NEXT: s_mov_b64 s[10:11], 0 -; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v7 +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB5_8 +; GISEL-NEXT: ; %bb.5: ; %LeafBlock +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v7 ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[12:13] -; GISEL-NEXT: ; %bb.5: ; %LeafBlock1 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v11 -; GISEL-NEXT: s_andn2_b64 s[4:5], 0, exec -; GISEL-NEXT: s_and_b64 s[14:15], exec, vcc -; GISEL-NEXT: s_or_b64 
s[4:5], s[4:5], s[14:15] -; GISEL-NEXT: ; %bb.6: ; %Flow3 -; GISEL-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] -; GISEL-NEXT: ; %bb.7: ; %LeafBlock -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 25, v11 -; GISEL-NEXT: s_andn2_b64 s[10:11], 0, exec -; GISEL-NEXT: s_and_b64 s[14:15], exec, -1 -; GISEL-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GISEL-NEXT: s_andn2_b64 s[4:5], s[4:5], exec -; GISEL-NEXT: s_and_b64 s[14:15], exec, vcc -; GISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] -; GISEL-NEXT: ; %bb.8: ; %Flow4 -; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] -; GISEL-NEXT: v_mov_b32_e32 v9, v3 -; GISEL-NEXT: v_mov_b32_e32 v7, v1 -; GISEL-NEXT: v_mov_b32_e32 v6, v0 -; GISEL-NEXT: v_mov_b32_e32 v8, v2 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] -; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[12:13] -; GISEL-NEXT: s_cbranch_execz .LBB5_10 -; GISEL-NEXT: ; %bb.9: ; %itofp-sw-default -; GISEL-NEXT: v_sub_u32_e32 v8, 0x66, v12 -; GISEL-NEXT: v_sub_u32_e32 v6, 64, v8 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v8, v[0:1] -; GISEL-NEXT: v_lshlrev_b64 v[6:7], v6, v[4:5] -; GISEL-NEXT: v_subrev_u32_e32 v9, 64, v8 -; GISEL-NEXT: v_or_b32_e32 v6, v2, v6 -; GISEL-NEXT: v_or_b32_e32 v7, v3, v7 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v9, v[4:5] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8 -; GISEL-NEXT: v_add_u32_e32 v12, 26, v12 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_sub_u32_e32 v8, 64, v12 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v12, -1 -; GISEL-NEXT: v_lshlrev_b64 v[8:9], v8, -1 -; GISEL-NEXT: v_subrev_u32_e32 v13, 64, v12 -; GISEL-NEXT: v_or_b32_e32 v14, v2, v8 -; GISEL-NEXT: v_or_b32_e32 v15, v3, v9 -; GISEL-NEXT: v_lshrrev_b64 v[8:9], v13, -1 -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v8, v14, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v15, 
vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 -; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, -1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, -1, s[4:5] -; GISEL-NEXT: v_and_b32_e32 v2, v2, v4 -; GISEL-NEXT: v_and_b32_e32 v3, v3, v5 -; GISEL-NEXT: v_and_or_b32 v2, v8, v0, v2 -; GISEL-NEXT: v_and_or_b32 v3, v9, v1, v3 -; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GISEL-NEXT: s_andn2_b64 s[4:5], s[10:11], exec -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: s_and_b64 s[10:11], exec, 0 -; GISEL-NEXT: v_or_b32_e32 v6, v6, v2 -; GISEL-NEXT: s_or_b64 s[10:11], s[4:5], s[10:11] -; GISEL-NEXT: .LBB5_10: ; %Flow5 +; GISEL-NEXT: s_cbranch_execz .LBB5_7 +; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default +; GISEL-NEXT: v_sub_u32_e32 v4, 0x66, v5 +; GISEL-NEXT: v_sub_u32_e32 v10, 64, v4 +; GISEL-NEXT: v_lshrrev_b64 v[8:9], v4, v[0:1] +; GISEL-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] +; GISEL-NEXT: v_subrev_u32_e32 v12, 64, v4 +; GISEL-NEXT: v_or_b32_e32 v10, v8, v10 +; GISEL-NEXT: v_or_b32_e32 v11, v9, v11 +; GISEL-NEXT: v_lshrrev_b64 v[8:9], v12, v[2:3] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 +; GISEL-NEXT: v_add_u32_e32 v5, 26, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_sub_u32_e32 v10, 64, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v8, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v1, vcc +; GISEL-NEXT: v_lshrrev_b64 v[8:9], v5, -1 +; GISEL-NEXT: v_lshlrev_b64 v[10:11], v10, -1 +; GISEL-NEXT: v_subrev_u32_e32 v13, 64, v5 +; GISEL-NEXT: v_or_b32_e32 v14, v8, v10 +; GISEL-NEXT: v_or_b32_e32 v15, v9, v11 +; GISEL-NEXT: v_lshrrev_b64 v[10:11], v13, -1 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 +; 
GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, v10, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v10, v11, -1, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v2, v8, v2 +; GISEL-NEXT: v_and_b32_e32 v3, v9, v3 +; GISEL-NEXT: v_and_or_b32 v0, v5, v0, v2 +; GISEL-NEXT: v_and_or_b32 v1, v10, v1, v3 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v3, v12, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GISEL-NEXT: v_mov_b32_e32 v2, v5 +; GISEL-NEXT: v_mov_b32_e32 v3, v6 +; GISEL-NEXT: .LBB5_7: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], s[10:11] -; GISEL-NEXT: ; %bb.11: ; %itofp-sw-bb -; GISEL-NEXT: v_lshlrev_b64 v[6:7], 1, v[0:1] -; GISEL-NEXT: ; %bb.12: ; %itofp-sw-epilog +; GISEL-NEXT: .LBB5_8: ; %Flow2 +; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb +; GISEL-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: v_bfe_u32 v0, v6, 2, 1 -; GISEL-NEXT: v_or_b32_e32 v0, v6, v0 +; GISEL-NEXT: v_bfe_u32 v2, v0, 2, 1 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 -; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc -; GISEL-NEXT: v_lshrrev_b64 v[2:3], 2, v[0:1] -; GISEL-NEXT: v_and_b32_e32 v3, 0x4000000, v0 -; GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[3:4] +; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_and_b32_e32 v2, 0x4000000, v0 +; GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: ; %bb.13: ; %itofp-if-then20 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], 3, v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v10, v11 -; 
GISEL-NEXT: ; %bb.14: ; %Flow +; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v6, v7 +; GISEL-NEXT: ; %bb.12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: .LBB5_15: ; %Flow7 +; GISEL-NEXT: .LBB5_13: ; %Flow4 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] -; GISEL-NEXT: v_lshl_add_u32 v0, v10, 23, 1.0 +; GISEL-NEXT: v_lshl_add_u32 v0, v6, 23, 1.0 ; GISEL-NEXT: v_mov_b32_e32 v1, 0x7fffff -; GISEL-NEXT: v_and_or_b32 v0, v2, v1, v0 -; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v0 -; GISEL-NEXT: .LBB5_16: ; %Flow8 +; GISEL-NEXT: v_and_or_b32 v0, v4, v1, v0 +; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GISEL-NEXT: .LBB5_14: ; %Flow5 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] -; GISEL-NEXT: v_mov_b32_e32 v0, v2 +; GISEL-NEXT: v_mov_b32_e32 v0, v4 ; GISEL-NEXT: s_setpc_b64 s[30:31] %cvt = uitofp i128 %x to half ret half %cvt -- cgit v1.1 From ec34699f750efc5292db503c6700ddeede59ff03 Mon Sep 17 00:00:00 2001 From: Sameer Sahasrabuddhe Date: Mon, 18 Mar 2024 10:34:11 +0530 Subject: [GlobalISel] convergence control tokens and intrinsics (#67006) [GlobalISel] Implement convergence control tokens and intrinsics in GMIR In the IR translator, convert the LLVM token type to LLT::token(), which is an alias for the s0 type. These show up as implicit uses on convergent operations. 
Differential Revision: https://reviews.llvm.org/D158147 --- llvm/test/CodeGen/AMDGPU/convergence-tokens.ll | 59 +++++++++++++++++++++----- 1 file changed, 48 insertions(+), 11 deletions(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll b/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll index 2ed6d7f..6beccce 100644 --- a/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll +++ b/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll @@ -1,10 +1,12 @@ ; RUN: llc --amdgpu-disable-structurizer -stop-after=amdgpu-isel -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,ISEL %s ; RUN: llc --amdgpu-disable-structurizer -stop-after=dead-mi-elimination -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,DEADMI %s +; RUN: llc --amdgpu-disable-structurizer -global-isel -stop-after=irtranslator -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK,GISEL ; CHECK-LABEL: name: basic_call -; CHECK: [[TOKEN:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ENTRY +; CHECK: [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ENTRY ; ISEL: {{.*}} SI_CALL_ISEL {{.*}}, @foo, [[TOKEN]], csr_amdgpu, {{.*}} ; DEADMI: {{.*}} SI_CALL {{.*}}, @foo, csr_amdgpu, {{.*}}, implicit [[TOKEN]] +; GISEL: {{.*}} G_SI_CALL {{.*}}, @foo, csr_amdgpu, {{.*}}, implicit [[TOKEN]] define i32 @basic_call(i32 %src) #0 { %t = call token @llvm.experimental.convergence.entry() %r = call i32 @foo(i32 %src) [ "convergencectrl"(token %t) ] @@ -12,10 +14,11 @@ define i32 @basic_call(i32 %src) #0 { } ; CHECK-LABEL: name: basic_intrinsic -; CHECK: [[TOKEN:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ANCHOR +; CHECK: [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ANCHOR ; ISEL: CONVERGENCECTRL_GLUE [[TOKEN]] ; DEADMI-NOT: CONVERGENCECTRL_GLUE -; CHECK: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[TOKEN]] +; ISEL: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[TOKEN]] +; GISEL: 
{{.*}} = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane){{.*}}, implicit [[TOKEN]] define i32 @basic_intrinsic(i32 %src) #0 { %t = call token @llvm.experimental.convergence.anchor() %r = call i32 @llvm.amdgcn.readfirstlane(i32 %src) [ "convergencectrl"(token %t) ] @@ -30,12 +33,13 @@ define i32 @uncontrolled_call(i32 %src) #0 { } ; CHECK-LABEL: name: basic_branch -; CHECK: bb.0.entry: -; CHECK: [[TOKEN:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ANCHOR -; CHECK: bb.1.then: +; CHECK: bb.[[#]].entry: +; CHECK: [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ANCHOR +; CHECK: bb.[[#]].then: ; ISEL: CONVERGENCECTRL_GLUE [[TOKEN]] ; DEADMI-NOT: CONVERGENCECTRL_GLUE -; CHECK: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[TOKEN]] +; ISEL: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[TOKEN]] +; GISEL: {{.*}} = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane){{.*}}, implicit [[TOKEN]] define i32 @basic_branch(i32 %src, i1 %cond) #0 { entry: %t = call token @llvm.experimental.convergence.anchor() @@ -52,12 +56,13 @@ else: } ; CHECK-LABEL: name: basic_loop -; CHECK: [[TOKEN:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ANCHOR -; CHECK: bb.1.loop: -; CHECK: [[LOOP:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_LOOP [[TOKEN]] +; CHECK: [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ANCHOR +; CHECK: bb.[[#]].loop: +; CHECK: [[LOOP:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_LOOP [[TOKEN]] ; ISEL: CONVERGENCECTRL_GLUE [[LOOP]] ; DEADMI-NOT: CONVERGENCECTRL_GLUE -; CHECK: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[LOOP]] +; ISEL: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[LOOP]] +; GISEL: {{.*}} = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane){{.*}}, implicit [[LOOP]] define i32 @basic_loop(i32 %src, i1 %cond) #0 { %t1 = call token @llvm.experimental.convergence.anchor() br label %loop @@ -71,6 +76,38 @@ end: ret i32 %r } +; CHECK-LABEL: name: nested +; CHECK: [[ENTRY:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ENTRY +; CHECK: [[ANCHOR:%[0-9]+]]{{[^ ]*}} = 
CONVERGENCECTRL_ANCHOR +; ISEL: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[ANCHOR]] +; GISEL: {{.*}} = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane){{.*}}, implicit [[ANCHOR]] +; ISEL: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[ENTRY]] +; GISEL: {{.*}} = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane){{.*}}, implicit [[ENTRY]] +define i32 @nested(i32 %src) #0 { + %t1 = call token @llvm.experimental.convergence.entry() + %t2 = call token @llvm.experimental.convergence.anchor() + %r2 = call i32 @llvm.amdgcn.readfirstlane(i32 %src) [ "convergencectrl"(token %t2) ] + %r1 = call i32 @llvm.amdgcn.readfirstlane(i32 %src) [ "convergencectrl"(token %t1) ] + %sum = add i32 %r1, %r2 + ret i32 %sum +} + +; COM: FIXME: Tokens on tail-call have not been implemented for SelectionDAG +; COM: yet; the corresponding checks have been commented out. +; +; CHECK-LABEL: name: tail_call_void_func_void +; GISEL: [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ENTRY +; COM: CHECK: [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ENTRY +; COM: ISEL: {{.*}} SI_CALL_ISEL {{.*}}, @external_void_func_void, [[TOKEN]], csr_amdgpu, {{.*}} +; COM: DEADMI: {{.*}} SI_CALL {{.*}}, @external_void_func_void, csr_amdgpu, {{.*}}, implicit [[TOKEN]] +; GISEL: {{.*}} SI_TCRETURN {{.*}}, @external_void_func_void, 0, csr_amdgpu, implicit [[TOKEN]] +define void @tail_call_void_func_void() #0 { + %t1 = call token @llvm.experimental.convergence.entry() + tail call void @external_void_func_void() [ "convergencectrl"(token %t1) ] + ret void +} + +declare hidden void @external_void_func_void() #0 declare i32 @foo(i32 %x) #0 declare i32 @llvm.amdgcn.readfirstlane(i32) #0 -- cgit v1.1 From 9b98692eedb78aa106539c36ba02944f32cae1ff Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Mon, 18 Mar 2024 09:09:43 +0100 Subject: [AMDGPU] Run LowerLDS at the end of the fullLTO pipeline (#75333) This change allows us to use `--lto-partitions` in some cases (not at all guaranteed it works 
perfectly), as LDS is lowered before the module is split for parallel codegen. We must run LowerLDS before splitting modules as it needs to see all callers of functions with LDS to properly lower them. --- llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll | 47 ++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll new file mode 100644 index 0000000..b813b80 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll @@ -0,0 +1,47 @@ + +; Default O0 +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc +; RUN: llvm-lto2 run -O0 -cg-opt-level 0 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s + +; Unified O0 +; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc +; RUN: llvm-lto2 run -unified-lto=full -O0 -cg-opt-level 0 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s + +; Default O1 +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc +; RUN: llvm-lto2 run -O1 -cg-opt-level 1 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s + +; Unified O1 +; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc +; RUN: llvm-lto2 run -unified-lto=full -O1 -cg-opt-level 1 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s + +; Default O2 +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc +; RUN: llvm-lto2 run -O2 -cg-opt-level 2 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s + +; Unified O2 +; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc +; RUN: llvm-lto2 run -unified-lto=full 
-O2 -cg-opt-level 2 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s + +; Default O3 +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc +; RUN: llvm-lto2 run -O3 -cg-opt-level 3 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s + +; Unified O3 +; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc +; RUN: llvm-lto2 run -unified-lto=full -O3 -cg-opt-level 3 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s + +; First print will be from the New PM during the full LTO pipeline. +; Second print will be from the legacy PM during the CG pipeline. + +; CHECK: Running pass: AMDGPULowerModuleLDSPass on [module] +; CHECK: ModulePass Manager +; CHECK: Lower uses of LDS variables from non-kernel functions + +@lds = internal unnamed_addr addrspace(3) global i32 undef, align 4 + +define amdgpu_kernel void @test() { +entry: + store i32 1, ptr addrspace(3) @lds + ret void +} -- cgit v1.1 From 3493438605079c001b554327c02a4432204aab69 Mon Sep 17 00:00:00 2001 From: pvanhout Date: Mon, 18 Mar 2024 11:18:57 +0100 Subject: Revert "[AMDGPU] Run LowerLDS at the end of the fullLTO pipeline (#75333)" This reverts commit 9b98692eedb78aa106539c36ba02944f32cae1ff. 
--- llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll | 47 ------------------------ 1 file changed, 47 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll deleted file mode 100644 index b813b80..0000000 --- a/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll +++ /dev/null @@ -1,47 +0,0 @@ - -; Default O0 -; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc -; RUN: llvm-lto2 run -O0 -cg-opt-level 0 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s - -; Unified O0 -; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc -; RUN: llvm-lto2 run -unified-lto=full -O0 -cg-opt-level 0 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s - -; Default O1 -; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc -; RUN: llvm-lto2 run -O1 -cg-opt-level 1 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s - -; Unified O1 -; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc -; RUN: llvm-lto2 run -unified-lto=full -O1 -cg-opt-level 1 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s - -; Default O2 -; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc -; RUN: llvm-lto2 run -O2 -cg-opt-level 2 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s - -; Unified O2 -; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc -; RUN: llvm-lto2 run -unified-lto=full -O2 -cg-opt-level 2 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s - -; Default O3 -; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc -; RUN: 
llvm-lto2 run -O3 -cg-opt-level 3 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s - -; Unified O3 -; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc -; RUN: llvm-lto2 run -unified-lto=full -O3 -cg-opt-level 3 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s - -; First print will be from the New PM during the full LTO pipeline. -; Second print will be from the legacy PM during the CG pipeline. - -; CHECK: Running pass: AMDGPULowerModuleLDSPass on [module] -; CHECK: ModulePass Manager -; CHECK: Lower uses of LDS variables from non-kernel functions - -@lds = internal unnamed_addr addrspace(3) global i32 undef, align 4 - -define amdgpu_kernel void @test() { -entry: - store i32 1, ptr addrspace(3) @lds - ret void -} -- cgit v1.1 From 38a44bdc93db5b00310230f6542df39017b9a41b Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Mon, 18 Mar 2024 18:27:45 +0800 Subject: [CodeGenPrepare] Reverse the canonicalization of isInf/isNanOrInf (#81572) In commit https://github.com/llvm/llvm-project/commit/2b582440c16c72b6b021ea5c212ceda3bdfb2b9b, we canonicalize the isInf/isNanOrInf idiom into fabs+fcmp for better analysis/codegen (See also the discussion in https://github.com/llvm/llvm-project/pull/76338). This patch reverses the fabs+fcmp to `is.fpclass`. If the `is.fpclass` is not supported by the target, it will be expanded by TLI. Fixes the regression introduced by https://github.com/llvm/llvm-project/commit/2b582440c16c72b6b021ea5c212ceda3bdfb2b9b and https://github.com/llvm/llvm-project/pull/80414#issuecomment-1936374206. 
--- llvm/test/CodeGen/AMDGPU/fp-classify.ll | 60 ++++++------ llvm/test/CodeGen/AMDGPU/fract-match.ll | 167 ++++++++++++++++---------------- 2 files changed, 118 insertions(+), 109 deletions(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll index 6fa7df9..18d2e52 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll @@ -618,16 +618,16 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %out, half %x) #0 { ; SI-LABEL: test_isinf_pattern_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s1, 0x7f800000 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e64 v0, |s0| -; SI-NEXT: v_cmp_eq_f32_e32 vcc, s1, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_and_b32 s4, s4, 0x7fff +; SI-NEXT: s_cmpk_eq_i32 s4, 0x7c00 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isinf_pattern_f16: @@ -667,16 +667,19 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocapture %out, half %x) #0 { ; SI-LABEL: test_isfinite_pattern_0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_movk_i32 s1, 0x1f8 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb 
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, s1 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_and_b32 s4, s4, 0x7fff +; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; SI-NEXT: s_cmpk_lg_i32 s4, 0x7c00 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_pattern_0_f16: @@ -718,16 +721,19 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocapture %out, half %x) #0 { ; SI-LABEL: test_isfinite_pattern_4_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_movk_i32 s1, 0x1f8 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, s1 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_and_b32 s4, s4, 0x7fff +; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; SI-NEXT: s_cmpk_lt_i32 s4, 0x7c00 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_pattern_4_f16: diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll 
b/llvm/test/CodeGen/AMDGPU/fract-match.ll index 3a0b825..e361aa4 100644 --- a/llvm/test/CodeGen/AMDGPU/fract-match.ll +++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll @@ -1705,16 +1705,16 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap ; GFX6-NEXT: v_min_f32_e32 v7, 0x3f7fffff, v7 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc ; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX6-NEXT: s_movk_i32 s10, 0x204 +; GFX6-NEXT: v_mov_b32_e32 v8, 0x204 ; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc -; GFX6-NEXT: v_cmp_class_f32_e64 s[8:9], v0, s10 +; GFX6-NEXT: v_cmp_class_f32_e32 vcc, v0, v8 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v7, 0, s[8:9] -; GFX6-NEXT: v_cmp_class_f32_e64 s[8:9], v1, s10 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v7, 0, vcc +; GFX6-NEXT: v_cmp_class_f32_e32 vcc, v1, v8 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: v_cndmask_b32_e64 v1, v6, 0, s[8:9] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v6, 0, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[4:5], v[2:3], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1722,19 +1722,19 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap ; GFX7-LABEL: safe_math_fract_v2f32: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s8, 0x7f800000 +; GFX7-NEXT: v_mov_b32_e32 v8, 0x204 ; GFX7-NEXT: v_fract_f32_e32 v6, v0 -; GFX7-NEXT: v_cmp_neq_f32_e64 vcc, |v0|, s8 +; GFX7-NEXT: v_cmp_class_f32_e32 vcc, v0, v8 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_floor_f32_e32 v4, v0 ; GFX7-NEXT: v_fract_f32_e32 v7, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc -; GFX7-NEXT: v_cmp_neq_f32_e64 vcc, |v1|, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc +; GFX7-NEXT: v_cmp_class_f32_e32 vcc, v1, v8 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: 
v_floor_f32_e32 v5, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v1, v7, 0, vcc ; GFX7-NEXT: buffer_store_dwordx2 v[4:5], v[2:3], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1742,15 +1742,15 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap ; GFX8-LABEL: safe_math_fract_v2f32: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX8-NEXT: v_mov_b32_e32 v8, 0x204 ; GFX8-NEXT: v_fract_f32_e32 v6, v0 -; GFX8-NEXT: v_cmp_neq_f32_e64 vcc, |v0|, s4 +; GFX8-NEXT: v_cmp_class_f32_e32 vcc, v0, v8 ; GFX8-NEXT: v_floor_f32_e32 v4, v0 ; GFX8-NEXT: v_fract_f32_e32 v7, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc -; GFX8-NEXT: v_cmp_neq_f32_e64 vcc, |v1|, s4 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc +; GFX8-NEXT: v_cmp_class_f32_e32 vcc, v1, v8 ; GFX8-NEXT: v_floor_f32_e32 v5, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v7, 0, vcc ; GFX8-NEXT: global_store_dwordx2 v[2:3], v[4:5], off ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1759,14 +1759,15 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f32_e32 v6, v0 -; GFX11-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX11-NEXT: v_cmp_class_f32_e64 s0, v0, 0x204 ; GFX11-NEXT: v_fract_f32_e32 v7, v1 ; GFX11-NEXT: v_floor_f32_e32 v4, v0 ; GFX11-NEXT: v_floor_f32_e32 v5, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc_lo -; GFX11-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v1| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, 0, s0 +; GFX11-NEXT: v_cmp_class_f32_e64 s0, v1, 0x204 ; GFX11-NEXT: global_store_b64 v[2:3], v[4:5], off -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc_lo +; GFX11-NEXT: 
v_cndmask_b32_e64 v1, v7, 0, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call <2 x float> @llvm.floor.v2f32(<2 x float> %x) @@ -1937,21 +1938,22 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) nocapture writeonly % ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s8, 0x7f800000 +; GFX6-NEXT: s_movk_i32 s8, 0x7c00 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: v_floor_f32_e32 v3, v0 -; GFX6-NEXT: v_sub_f32_e32 v4, v0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_min_f32_e32 v4, 0x3f7fe000, v4 -; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc -; GFX6-NEXT: v_cmp_neq_f32_e64 vcc, |v0|, s8 -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc -; GFX6-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 +; GFX6-NEXT: v_floor_f32_e32 v4, v3 +; GFX6-NEXT: v_sub_f32_e32 v5, v3, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_min_f32_e32 v5, 0x3f7fe000, v5 +; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, s8, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GFX6-NEXT: buffer_store_short v4, v[1:2], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -1959,21 +1961,22 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) nocapture writeonly % ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_mov_b32 s8, 0x7f800000 +; GFX7-NEXT: s_movk_i32 s8, 0x7c00 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 
v3, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_floor_f32_e32 v3, v0 -; GFX7-NEXT: v_sub_f32_e32 v4, v0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_min_f32_e32 v4, 0x3f7fe000, v4 -; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc -; GFX7-NEXT: v_cmp_neq_f32_e64 vcc, |v0|, s8 -; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc -; GFX7-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: v_floor_f32_e32 v4, v3 +; GFX7-NEXT: v_sub_f32_e32 v5, v3, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_min_f32_e32 v5, 0x3f7fe000, v5 +; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s8, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GFX7-NEXT: buffer_store_short v4, v[1:2], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2062,12 +2065,12 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s8, 0x7f800000 +; GFX6-NEXT: s_movk_i32 s8, 0x7c00 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX6-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; GFX6-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX6-NEXT: v_floor_f32_e32 v6, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 ; GFX6-NEXT: v_floor_f32_e32 v8, v5 @@ -2080,10 +2083,10 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu ; GFX6-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX6-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GFX6-NEXT: v_cmp_neq_f32_e32 vcc, s8, v0 +; GFX6-NEXT: v_cmp_ne_u32_e32 
vcc, s8, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc -; GFX6-NEXT: v_cmp_neq_f32_e32 vcc, s8, v1 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, s8, v1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 @@ -2098,12 +2101,12 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_mov_b32 s8, 0x7f800000 +; GFX7-NEXT: s_movk_i32 s8, 0x7c00 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX7-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; GFX7-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX7-NEXT: v_floor_f32_e32 v6, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 ; GFX7-NEXT: v_floor_f32_e32 v8, v5 @@ -2116,10 +2119,10 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu ; GFX7-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GFX7-NEXT: v_cmp_neq_f32_e32 vcc, s8, v0 +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s8, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc -; GFX7-NEXT: v_cmp_neq_f32_e32 vcc, s8, v1 +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s8, v1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 @@ -2133,16 +2136,16 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: s_movk_i32 s6, 0x204 +; GFX8-NEXT: v_mov_b32_e32 v7, 0x204 ; GFX8-NEXT: v_floor_f16_e32 v4, v3 ; GFX8-NEXT: v_floor_f16_e32 v5, v0 ; GFX8-NEXT: v_fract_f16_e32 v6, v3 -; GFX8-NEXT: 
v_cmp_class_f16_e64 s[4:5], v3, s6 +; GFX8-NEXT: v_cmp_class_f16_e32 vcc, v3, v7 ; GFX8-NEXT: v_pack_b32_f16 v4, v5, v4 ; GFX8-NEXT: v_fract_f16_e32 v5, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, 0, s[4:5] -; GFX8-NEXT: v_cmp_class_f16_e64 s[4:5], v0, s6 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, 0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, 0, vcc +; GFX8-NEXT: v_cmp_class_f16_e32 vcc, v0, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, 0, vcc ; GFX8-NEXT: v_pack_b32_f16 v0, v0, v3 ; GFX8-NEXT: global_store_dword v[1:2], v4, off ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2237,19 +2240,19 @@ define <2 x double> @safe_math_fract_v2f64(<2 x double> %x, ptr addrspace(1) noc ; GFX6-NEXT: v_cndmask_b32_e32 v11, v11, v3, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v10, v10, v2, vcc ; GFX6-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] -; GFX6-NEXT: s_movk_i32 s10, 0x204 -; GFX6-NEXT: v_cmp_class_f64_e64 s[8:9], v[0:1], s10 +; GFX6-NEXT: v_mov_b32_e32 v14, 0x204 ; GFX6-NEXT: v_cndmask_b32_e32 v13, v13, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v12, v12, v0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v0, v12, 0, s[8:9] -; GFX6-NEXT: v_cndmask_b32_e64 v1, v13, 0, s[8:9] -; GFX6-NEXT: v_cmp_class_f64_e64 s[8:9], v[2:3], s10 +; GFX6-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v14 ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v1, v13, 0, vcc +; GFX6-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v14 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v10, 0, s[8:9] -; GFX6-NEXT: v_cndmask_b32_e64 v3, v11, 0, s[8:9] +; GFX6-NEXT: v_cndmask_b32_e64 v2, v10, 0, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v3, v11, 0, vcc ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2257,39 +2260,39 @@ define <2 x double> @safe_math_fract_v2f64(<2 x double> %x, ptr addrspace(1) noc ; GFX7-LABEL: 
safe_math_fract_v2f64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0x204 +; GFX7-NEXT: v_mov_b32_e32 v6, 0x204 ; GFX7-NEXT: v_fract_f64_e32 v[10:11], v[0:1] -; GFX7-NEXT: v_cmp_class_f64_e64 s[8:9], v[0:1], s4 +; GFX7-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v6 ; GFX7-NEXT: v_fract_f64_e32 v[12:13], v[2:3] -; GFX7-NEXT: v_cmp_class_f64_e64 s[10:11], v[2:3], s4 +; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v6 ; GFX7-NEXT: v_floor_f64_e32 v[8:9], v[2:3] ; GFX7-NEXT: v_floor_f64_e32 v[6:7], v[0:1] -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v10, 0, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v11, 0, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v12, 0, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v13, 0, s[10:11] -; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_mov_b32 s8, s10 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v10, 0, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v1, v11, 0, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v12, 0, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v13, 0, s[4:5] +; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[8:11], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: safe_math_fract_v2f64: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s6, 0x204 +; GFX8-NEXT: v_mov_b32_e32 v6, 0x204 ; GFX8-NEXT: v_fract_f64_e32 v[10:11], v[0:1] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], s6 +; GFX8-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v6 ; GFX8-NEXT: v_fract_f64_e32 v[12:13], v[2:3] -; GFX8-NEXT: v_cmp_class_f64_e64 s[6:7], v[2:3], s6 +; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v6 ; GFX8-NEXT: v_floor_f64_e32 v[8:9], v[2:3] ; GFX8-NEXT: v_floor_f64_e32 
v[6:7], v[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v10, 0, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v11, 0, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v12, 0, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v13, 0, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v10, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v11, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v12, 0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v13, 0, s[4:5] ; GFX8-NEXT: global_store_dwordx4 v[4:5], v[6:9], off ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] -- cgit v1.1 From 09bc6abba6e226ad5e9d18d4365690d6f04de21a Mon Sep 17 00:00:00 2001 From: Jonas Paulsson Date: Mon, 18 Mar 2024 10:37:59 -0400 Subject: [MachineFrameInfo] Refactoring around computeMaxcallFrameSize() (NFC) (#78001) - Use computeMaxCallFrameSize() in PEI::calculateCallFrameInfo() instead of duplicating the code. - Set AdjustsStack in FinalizeISel instead of in computeMaxCallFrameSize(). --- llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir | 2 ++ 1 file changed, 2 insertions(+) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir b/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir index c1da29e..3228962 100644 --- a/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir +++ b/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir @@ -14,6 +14,8 @@ --- name: test_av_spill_cross_bb_usage tracksRegLiveness: true +frameInfo: + adjustsStack: true stack: - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4 } machineFunctionInfo: -- cgit v1.1 From 953c13b5c90bed1e24fe95e90137c4e226ac2d09 Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Tue, 19 Mar 2024 11:49:22 +0100 Subject: [AMDGPU][PromoteAlloca] Whole-function alloca promotion to vector (#84735) Update PromoteAllocaToVector so it considers the whole function before promoting allocas. Allocas are scored & sorted so the highest value ones are seen first. 
The budget is now per function instead of per alloca. Passed internal performance testing. --- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 10 ++-- llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll | 69 ++++++++++++++++++++++ 2 files changed, 74 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 5007f77..0ff5dd3 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -195,13 +195,13 @@ ; GCN-O1-NEXT: Uniformity Analysis ; GCN-O1-NEXT: AMDGPU atomic optimizations ; GCN-O1-NEXT: Expand Atomic instructions -; GCN-O1-NEXT: AMDGPU Promote Alloca ; GCN-O1-NEXT: Dominator Tree Construction +; GCN-O1-NEXT: Natural Loop Information +; GCN-O1-NEXT: AMDGPU Promote Alloca ; GCN-O1-NEXT: Cycle Info Analysis ; GCN-O1-NEXT: Uniformity Analysis ; GCN-O1-NEXT: AMDGPU IR optimizations ; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O1-NEXT: Natural Loop Information ; GCN-O1-NEXT: Canonicalize natural loops ; GCN-O1-NEXT: Scalar Evolution Analysis ; GCN-O1-NEXT: Loop Pass Manager @@ -470,9 +470,9 @@ ; GCN-O1-OPTS-NEXT: Uniformity Analysis ; GCN-O1-OPTS-NEXT: AMDGPU atomic optimizations ; GCN-O1-OPTS-NEXT: Expand Atomic instructions -; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca ; GCN-O1-OPTS-NEXT: Dominator Tree Construction ; GCN-O1-OPTS-NEXT: Natural Loop Information +; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca ; GCN-O1-OPTS-NEXT: Canonicalize natural loops ; GCN-O1-OPTS-NEXT: Lazy Branch Probability Analysis ; GCN-O1-OPTS-NEXT: Lazy Block Frequency Analysis @@ -775,9 +775,9 @@ ; GCN-O2-NEXT: Uniformity Analysis ; GCN-O2-NEXT: AMDGPU atomic optimizations ; GCN-O2-NEXT: Expand Atomic instructions -; GCN-O2-NEXT: AMDGPU Promote Alloca ; GCN-O2-NEXT: Dominator Tree Construction ; GCN-O2-NEXT: Natural Loop Information +; GCN-O2-NEXT: AMDGPU 
Promote Alloca ; GCN-O2-NEXT: Split GEPs to a variadic base and a constant offset for better CSE ; GCN-O2-NEXT: Scalar Evolution Analysis ; GCN-O2-NEXT: Straight line strength reduction @@ -1084,9 +1084,9 @@ ; GCN-O3-NEXT: Uniformity Analysis ; GCN-O3-NEXT: AMDGPU atomic optimizations ; GCN-O3-NEXT: Expand Atomic instructions -; GCN-O3-NEXT: AMDGPU Promote Alloca ; GCN-O3-NEXT: Dominator Tree Construction ; GCN-O3-NEXT: Natural Loop Information +; GCN-O3-NEXT: AMDGPU Promote Alloca ; GCN-O3-NEXT: Split GEPs to a variadic base and a constant offset for better CSE ; GCN-O3-NEXT: Scalar Evolution Analysis ; GCN-O3-NEXT: Straight line strength reduction diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll new file mode 100644 index 0000000..ab03177 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll @@ -0,0 +1,69 @@ +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -debug-only=amdgpu-promote-alloca -amdgpu-promote-alloca-to-vector-limit=512 -passes=amdgpu-promote-alloca %s -o - 2>&1 | FileCheck %s +; REQUIRES: asserts + +; CHECK: Scoring: %simpleuser = alloca [4 x i64], align 4, addrspace(5) +; CHECK-NEXT: [+1]: store i32 42, ptr addrspace(5) %simpleuser, align 4 +; CHECK-NEXT: => Final Score:1 +; CHECK-NEXT: Scoring: %manyusers = alloca [4 x i64], align 4, addrspace(5) +; CHECK-NEXT: [+1]: store i32 %v0.ext, ptr addrspace(5) %manyusers.1, align 4 +; CHECK-NEXT: [+1]: %v0 = load i8, ptr addrspace(5) %manyusers.1, align 1 +; CHECK-NEXT: [+1]: store i32 %v1.ext, ptr addrspace(5) %manyusers.2, align 4 +; CHECK-NEXT: [+1]: %v1 = load i8, ptr addrspace(5) %manyusers.2, align 1 +; CHECK-NEXT: => Final Score:4 +; CHECK-NEXT: Sorted Worklist: +; CHECK-NEXT: %manyusers = alloca [4 x i64], align 4, addrspace(5) +; CHECK-NEXT: %simpleuser = alloca [4 x i64], align 4, addrspace(5) +define amdgpu_kernel void @simple_users_scores() #0 { +entry: + ; should get a score of 1 + %simpleuser = 
alloca [4 x i64], align 4, addrspace(5) + ; should get a score of 4 + %manyusers = alloca [4 x i64], align 4, addrspace(5) + + store i32 42, ptr addrspace(5) %simpleuser + + %manyusers.1 = getelementptr i8, ptr addrspace(5) %manyusers, i64 2 + %v0 = load i8, ptr addrspace(5) %manyusers.1 + %v0.ext = zext i8 %v0 to i32 + store i32 %v0.ext, ptr addrspace(5) %manyusers.1 + + %manyusers.2 = getelementptr i8, ptr addrspace(5) %manyusers, i64 1 + %v1 = load i8, ptr addrspace(5) %manyusers.2 + %v1.ext = zext i8 %v0 to i32 + store i32 %v1.ext, ptr addrspace(5) %manyusers.2 + + ret void +} + +; CHECK: Scoring: %stack = alloca [4 x i64], align 4, addrspace(5) +; CHECK-NEXT: [+5]: store i32 32, ptr addrspace(5) %stack, align 4 +; CHECK-NEXT: [+1]: store i32 42, ptr addrspace(5) %stack, align 4 +; CHECK-NEXT: [+9]: store i32 32, ptr addrspace(5) %stack.1, align 4 +; CHECK-NEXT: [+5]: %outer.cmp = load i1, ptr addrspace(5) %stack.1, align 1 +; CHECK-NEXT: [+1]: store i32 64, ptr addrspace(5) %stack.2, align 4 +; CHECK-NEXT: [+9]: %inner.cmp = load i1, ptr addrspace(5) %stack.2, align 1 +; CHECK-NEXT: => Final Score:30 +define amdgpu_kernel void @loop_users_alloca(i1 %x, i2) #0 { +entry: + ; should get a score of 30 + %stack = alloca [4 x i64], align 4, addrspace(5) + %stack.1 = getelementptr i8, ptr addrspace(5) %stack, i64 4 + %stack.2 = getelementptr i8, ptr addrspace(5) %stack, i64 8 + + store i32 42, ptr addrspace(5) %stack + br label %loop.outer + +loop.outer: + store i32 32, ptr addrspace(5) %stack + %outer.cmp = load i1, ptr addrspace(5) %stack.1 + br label %loop.inner + +loop.inner: + store i32 32, ptr addrspace(5) %stack.1 + %inner.cmp = load i1, ptr addrspace(5) %stack.2 + br i1 %inner.cmp, label %loop.inner, label %loop.outer + +exit: + store i32 64, ptr addrspace(5) %stack.2 + ret void +} -- cgit v1.1 From 08701e35ed6f29a2fca095a31f1ebbfe059d6d6e Mon Sep 17 00:00:00 2001 From: Pravin Jagtap Date: Tue, 19 Mar 2024 18:00:34 +0530 Subject: [AMDGPU][NFC] Test clean up.
(#85775) Added common check for DPP and Iterative strategies for uniform value case since optimization applied is same. Authored-by: Pravin Jagtap --- .../AMDGPU/global_atomic_optimizer_fp_rtn.ll | 307 +++++++-------------- .../AMDGPU/global_atomics_optimizer_fp_no_rtn.ll | 259 ++++++----------- 2 files changed, 184 insertions(+), 382 deletions(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll index e3fada3..538ef42 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll @@ -1,71 +1,43 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck -check-prefix=IR-ITERATIVE %s -; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=DPP -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck -check-prefix=IR-DPP %s +; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck --check-prefixes=IR,IR-ITERATIVE %s +; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=DPP -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck --check-prefixes=IR,IR-DPP %s + +; Tests various combinations of uniform/divergent address and uniform/divergent value inputs of various types for atomic operations. +; The optimization is the same for the Iterative and DPP strategies when the value is uniform. These different scan/reduction +; strategies are valid only for divergent values. This optimization is not applied to divergent addresses. The test also covers different scopes.
define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe( -; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() -; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]] -; IR-ITERATIVE: 2: -; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) -; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 -; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 -; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 -; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) -; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) -; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) -; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 -; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float -; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]] -; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] -; IR-ITERATIVE: 14: -; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: br label [[TMP16]] -; IR-ITERATIVE: 16: -; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] -; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 -; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) -; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float -; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = uitofp i32 [[TMP8]] to float -; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = fmul float [[VAL]], [[TMP21]] -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = fadd float [[TMP20]], 
[[TMP22]] -; IR-ITERATIVE-NEXT: br label [[TMP24]] -; IR-ITERATIVE: 24: -; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ] -; IR-ITERATIVE-NEXT: ret float [[TMP25]] -; -; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe( -; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() -; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]] -; IR-DPP: 2: -; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) -; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 -; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 -; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 -; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) -; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) -; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) -; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 -; IR-DPP-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float -; IR-DPP-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]] -; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] -; IR-DPP: 14: -; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: br label [[TMP16]] -; IR-DPP: 16: -; IR-DPP-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] -; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 -; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) -; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float -; IR-DPP-NEXT: [[TMP21:%.*]] = uitofp i32 [[TMP8]] to float -; IR-DPP-NEXT: [[TMP22:%.*]] = fmul float [[VAL]], [[TMP21]] -; IR-DPP-NEXT: [[TMP23:%.*]] = fadd float [[TMP20]], [[TMP22]] -; IR-DPP-NEXT: br label [[TMP24]] -; IR-DPP: 24: -; IR-DPP-NEXT: 
[[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ] -; IR-DPP-NEXT: ret float [[TMP25]] +; IR-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe( +; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]] +; IR: 2: +; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) +; IR-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float +; IR-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]] +; IR-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR: 14: +; IR-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4 +; IR-NEXT: br label [[TMP16]] +; IR: 16: +; IR-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] +; IR-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 +; IR-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) +; IR-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float +; IR-NEXT: [[TMP21:%.*]] = uitofp i32 [[TMP8]] to float +; IR-NEXT: [[TMP22:%.*]] = fmul float [[VAL]], [[TMP21]] +; IR-NEXT: [[TMP23:%.*]] = fadd float [[TMP20]], [[TMP22]] +; IR-NEXT: br label [[TMP24]] +; IR: 24: +; IR-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ] +; IR-NEXT: ret float [[TMP25]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4 ret float %result @@ 
-411,7 +383,6 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_uni_value_agent_scope_str ret float %result } - define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, float %val) #2 { ; IR-ITERATIVE-LABEL: @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp( ; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] @@ -514,61 +485,33 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str } define amdgpu_ps float @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 { -; IR-ITERATIVE-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe( -; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() -; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]] -; IR-ITERATIVE: 2: -; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) -; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 -; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 -; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 -; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) -; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) -; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] -; IR-ITERATIVE: 10: -; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: br label [[TMP12]] -; IR-ITERATIVE: 12: -; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ] -; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32 -; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) -; IR-ITERATIVE-NEXT: 
[[TMP16:%.*]] = bitcast i32 [[TMP15]] to float -; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = uitofp i32 [[TMP8]] to float -; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = select i1 [[TMP9]], float 0x7FF0000000000000, float [[VAL]] -; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call float @llvm.minnum.f32(float [[TMP16]], float [[TMP18]]) -; IR-ITERATIVE-NEXT: br label [[TMP20]] -; IR-ITERATIVE: 20: -; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP19]], [[TMP12]] ] -; IR-ITERATIVE-NEXT: ret float [[TMP21]] -; -; IR-DPP-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe( -; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() -; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]] -; IR-DPP: 2: -; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) -; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 -; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 -; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 -; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) -; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) -; IR-DPP-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-DPP-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] -; IR-DPP: 10: -; IR-DPP-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: br label [[TMP12]] -; IR-DPP: 12: -; IR-DPP-NEXT: [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ] -; IR-DPP-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32 -; IR-DPP-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) -; IR-DPP-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float -; IR-DPP-NEXT: [[TMP17:%.*]] = uitofp i32 [[TMP8]] to float -; IR-DPP-NEXT: [[TMP18:%.*]] = select i1 [[TMP9]], float 0x7FF0000000000000, float [[VAL]] -; IR-DPP-NEXT: [[TMP19:%.*]] = call float 
@llvm.minnum.f32(float [[TMP16]], float [[TMP18]]) -; IR-DPP-NEXT: br label [[TMP20]] -; IR-DPP: 20: -; IR-DPP-NEXT: [[TMP21:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP19]], [[TMP12]] ] -; IR-DPP-NEXT: ret float [[TMP21]] +; IR-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe( +; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]] +; IR: 2: +; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR: 10: +; IR-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: br label [[TMP12]] +; IR: 12: +; IR-NEXT: [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ] +; IR-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32 +; IR-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) +; IR-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float +; IR-NEXT: [[TMP17:%.*]] = uitofp i32 [[TMP8]] to float +; IR-NEXT: [[TMP18:%.*]] = select i1 [[TMP9]], float 0x7FF0000000000000, float [[VAL]] +; IR-NEXT: [[TMP19:%.*]] = call float @llvm.minnum.f32(float [[TMP16]], float [[TMP18]]) +; IR-NEXT: br label [[TMP20]] +; IR: 20: +; IR-NEXT: [[TMP21:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP19]], [[TMP12]] ] +; IR-NEXT: ret float [[TMP21]] ; %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret float %result @@ -1007,159 +950,109 @@ define amdgpu_ps float 
@global_atomic_fadd_uni_address_div_value_system_scope_st ret float %result } - define amdgpu_ps float @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr, float inreg %val) #0 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] -; -; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret float [[RESULT]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4 ret float %result } define amdgpu_ps float @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr, float %val) #0 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] -; -; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret float [[RESULT]] ; %result = atomicrmw fadd 
ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4 ret float %result } define amdgpu_ps float @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] -; -; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 +; IR-NEXT: ret float [[RESULT]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic ret float %result } define amdgpu_ps float @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] -; -; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] 
syncscope("one-as") monotonic, align 4 +; IR-NEXT: ret float [[RESULT]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic ret float %result } define amdgpu_ps float @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 { -; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] -; -; IR-DPP-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret float [[RESULT]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret float %result } - define amdgpu_ps float @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 { -; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] -; -; IR-DPP-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float 
[[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret float [[RESULT]] ; %result = atomicrmw fsub ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret float %result } define amdgpu_ps float @global_atomic_fmin_div_address_uni_value_agent_scope(ptr addrspace(1) %ptr, float inreg %val) #0 { -; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] -; -; IR-DPP-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret float [[RESULT]] ; %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret float %result } define amdgpu_ps float @global_atomic_fmin_div_address_div_value_agent_scope(ptr addrspace(1) %ptr, float %val) #0 { -; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] -; -; IR-DPP-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: 
ret float [[RESULT]] ; %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret float %result } define amdgpu_ps float @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1{ -; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] -; -; IR-DPP-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret float [[RESULT]] ; %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret float %result } define amdgpu_ps float @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1{ -; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] -; -; IR-DPP-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float 
[[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret float [[RESULT]] ; %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret float %result } define amdgpu_ps float @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] -; -; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-NEXT: ret float [[RESULT]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4 ret float %result } define amdgpu_ps float @global_atomic_fadd_div_address_div_value_system_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] -; -; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-NEXT: ret float [[RESULT]] ; %result = atomicrmw 
fadd ptr addrspace(1) %ptr, float %val monotonic, align 4 ret float %result diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll index f87932b..cc7a45c 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll @@ -1,55 +1,35 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck -check-prefix=IR-ITERATIVE %s -; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=DPP -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck -check-prefix=IR-DPP %s +; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck --check-prefixes=IR,IR-ITERATIVE %s +; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=DPP -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck --check-prefixes=IR,IR-DPP %s + +; Tests various combinations of uniform/divergent address and uniform/divergent value inputs of various types for atomic operations. +; The optimization is the same for the Iterative and DPP strategies when the value is uniform. These different scan/reduction +; strategies are valid only for divergent values. The optimization is not applied for divergent addresses. The test also covers different scopes. 
define amdgpu_ps void @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe( -; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() -; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] -; IR-ITERATIVE: 2: -; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) -; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 -; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 -; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 -; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) -; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) -; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) -; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 -; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float -; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]] -; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] -; IR-ITERATIVE: 14: -; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: br label [[TMP16]] -; IR-ITERATIVE: 16: -; IR-ITERATIVE-NEXT: br label [[TMP17]] -; IR-ITERATIVE: 17: -; IR-ITERATIVE-NEXT: ret void -; -; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe( -; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() -; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] -; IR-DPP: 2: -; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) -; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 -; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 -; IR-DPP-NEXT: 
[[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 -; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) -; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) -; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) -; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 -; IR-DPP-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float -; IR-DPP-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]] -; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] -; IR-DPP: 14: -; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: br label [[TMP16]] -; IR-DPP: 16: -; IR-DPP-NEXT: br label [[TMP17]] -; IR-DPP: 17: -; IR-DPP-NEXT: ret void +; IR-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe( +; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR: 2: +; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) +; IR-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float +; IR-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]] +; IR-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR: 14: +; IR-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4 +; IR-NEXT: br label 
[[TMP16]] +; IR: 16: +; IR-NEXT: br label [[TMP17]] +; IR: 17: +; IR-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4 ret void @@ -325,7 +305,6 @@ define amdgpu_ps void @global_atomic_fsub_uni_address_uni_value_agent_scope_stri ret void } - define amdgpu_ps void @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, float %val) #2 { ; IR-ITERATIVE-LABEL: @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp( ; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] @@ -409,45 +388,25 @@ define amdgpu_ps void @global_atomic_fsub_uni_address_div_value_agent_scope_stri } define amdgpu_ps void @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 { -; IR-ITERATIVE-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe( -; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() -; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] -; IR-ITERATIVE: 2: -; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) -; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 -; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 -; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 -; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) -; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) -; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] -; IR-ITERATIVE: 10: -; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: br label [[TMP12]] -; IR-ITERATIVE: 12: -; IR-ITERATIVE-NEXT: br label [[TMP13]] -; IR-ITERATIVE: 13: -; IR-ITERATIVE-NEXT: ret void -; 
-; IR-DPP-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe( -; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() -; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] -; IR-DPP: 2: -; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) -; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 -; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 -; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 -; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) -; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) -; IR-DPP-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-DPP-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] -; IR-DPP: 10: -; IR-DPP-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: br label [[TMP12]] -; IR-DPP: 12: -; IR-DPP-NEXT: br label [[TMP13]] -; IR-DPP: 13: -; IR-DPP-NEXT: ret void +; IR-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe( +; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] +; IR: 2: +; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR: 10: +; IR-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: br label [[TMP12]] +; IR: 12: +; IR-NEXT: br label [[TMP13]] +; IR: 13: +; IR-NEXT: 
ret void ; %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret void @@ -797,159 +756,109 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_system_scope_str ret void } - define amdgpu_ps void @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr, float inreg %val) #0 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret void -; -; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret void +; IR-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4 ret void } define amdgpu_ps void @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr, float %val) #0 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret void -; -; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret void +; IR-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") 
monotonic, align 4 +; IR-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4 ret void } define amdgpu_ps void @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret void -; -; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 -; IR-DPP-NEXT: ret void +; IR-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 +; IR-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic ret void } define amdgpu_ps void @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret void -; -; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 -; IR-DPP-NEXT: ret void +; IR-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") 
monotonic, align 4 +; IR-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic ret void } define amdgpu_ps void @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 { -; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret void -; -; IR-DPP-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret void +; IR-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret void } - define amdgpu_ps void @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 { -; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret void -; -; IR-DPP-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret void +; IR-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret void ; %result = atomicrmw fsub ptr 
addrspace(1) %ptr, float %val syncscope("agent") monotonic ret void } define amdgpu_ps void @global_atomic_fmin_div_address_uni_value_agent_scope(ptr addrspace(1) %ptr, float inreg %val) #0 { -; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret void -; -; IR-DPP-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret void +; IR-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret void ; %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret void } define amdgpu_ps void @global_atomic_fmin_div_address_div_value_agent_scope(ptr addrspace(1) %ptr, float %val) #0 { -; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret void -; -; IR-DPP-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret void +; IR-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret void ; %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret void } define amdgpu_ps void 
@global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1{ -; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret void -; -; IR-DPP-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret void +; IR-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret void ; %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret void } define amdgpu_ps void @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1{ -; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret void -; -; IR-DPP-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret void +; IR-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret void ; %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret void } define amdgpu_ps void 
@global_atomic_fadd_div_address_uni_value_system_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 -; IR-ITERATIVE-NEXT: ret void -; -; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 -; IR-DPP-NEXT: ret void +; IR-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4 ret void } define amdgpu_ps void @global_atomic_fadd_div_address_div_value_system_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 -; IR-ITERATIVE-NEXT: ret void -; -; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 -; IR-DPP-NEXT: ret void +; IR-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4 ret void -- cgit v1.1 From ab76052fa9331f418d7911cafefabd4dd0c1941e Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Tue, 19 Mar 2024 09:58:09 -0700 Subject: AMDGPU: Treat SWMMAC the same as MFMA and other WMMA for sched_barrier (#85721) 
--- .../llvm.amdgcn.sched.group.barrier.gfx12.ll | 333 +++++++++++++++++++++ 1 file changed, 333 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll new file mode 100644 index 0000000..fdcb177 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll @@ -0,0 +1,333 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -misched-cluster=0 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -misched-cluster=0 -amdgpu-igrouplp-exact-solver-max-branches=250000 < %s | FileCheck -check-prefix=EXACTCUTOFF %s + +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16..i16(<8 x half>, <16 x half>, <8 x half>, i16) + +define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { +; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 4, v0 +; GCN-NEXT: v_mov_b32_e32 v48, 0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v28 +; GCN-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28 +; GCN-NEXT: ds_load_b128 v[8:11], v0 +; GCN-NEXT: ds_load_b128 v[12:15], v0 offset:512 +; GCN-NEXT: ds_load_b128 v[16:19], v0 offset:1536 +; GCN-NEXT: ds_load_b128 v[20:23], v0 offset:3072 +; GCN-NEXT: ds_load_b128 v[24:27], v0 offset:5120 +; GCN-NEXT: ds_load_b128 v[4:7], v0 offset:11280 +; GCN-NEXT: ds_load_b128 v[0:3], v0 offset:11264 +; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(7) SyncID(0) +; GCN-NEXT: s_wait_dscnt 0x6 +; GCN-NEXT: 
v_mov_b32_e32 v31, v11 +; GCN-NEXT: s_wait_dscnt 0x5 +; GCN-NEXT: v_mov_b32_e32 v35, v15 +; GCN-NEXT: s_wait_dscnt 0x4 +; GCN-NEXT: v_mov_b32_e32 v39, v19 +; GCN-NEXT: s_wait_dscnt 0x3 +; GCN-NEXT: v_mov_b32_e32 v43, v23 +; GCN-NEXT: s_wait_dscnt 0x2 +; GCN-NEXT: v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v30, v10 +; GCN-NEXT: v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8 +; GCN-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13 +; GCN-NEXT: v_mov_b32_e32 v32, v12 +; GCN-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v37, v17 +; GCN-NEXT: v_mov_b32_e32 v36, v16 +; GCN-NEXT: v_dual_mov_b32 v42, v22 :: v_dual_mov_b32 v41, v21 +; GCN-NEXT: v_mov_b32_e32 v40, v20 +; GCN-NEXT: v_dual_mov_b32 v46, v26 :: v_dual_mov_b32 v45, v25 +; GCN-NEXT: v_mov_b32_e32 v44, v24 +; GCN-NEXT: s_wait_dscnt 0x0 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[28:31], v[8:11], v[0:7], v48 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[32:35], v[12:15], v[0:7], v48 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[36:39], v[16:19], v[0:7], v48 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[40:43], v[20:23], v[0:7], v48 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[44:47], v[24:27], v[0:7], v48 +; GCN-NEXT: ds_store_b128 v49, v[28:31] +; GCN-NEXT: ds_store_b128 v50, v[32:35] offset:512 +; GCN-NEXT: ds_store_b128 v50, v[36:39] offset:1024 +; GCN-NEXT: ds_store_b128 v50, v[40:43] offset:1536 +; GCN-NEXT: ds_store_b128 v50, v[44:47] offset:2048 +; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(5) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(5) SyncID(0) +; GCN-NEXT: s_endpgm +; +; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster: +; EXACTCUTOFF: ; %bb.0: ; %entry +; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v28, 4, v0 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v48, 0 +; EXACTCUTOFF-NEXT: s_wait_kmcnt 0x0 +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2) +; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v0, s0, v28 +; EXACTCUTOFF-NEXT: 
v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28 +; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v0 +; EXACTCUTOFF-NEXT: ds_load_b128 v[12:15], v0 offset:512 +; EXACTCUTOFF-NEXT: ds_load_b128 v[16:19], v0 offset:1536 +; EXACTCUTOFF-NEXT: ds_load_b128 v[20:23], v0 offset:3072 +; EXACTCUTOFF-NEXT: ds_load_b128 v[24:27], v0 offset:5120 +; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v0 offset:11280 +; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v0 offset:11264 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(7) SyncID(0) +; EXACTCUTOFF-NEXT: s_wait_dscnt 0x6 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v31, v11 +; EXACTCUTOFF-NEXT: s_wait_dscnt 0x5 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v35, v15 +; EXACTCUTOFF-NEXT: s_wait_dscnt 0x4 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v39, v19 +; EXACTCUTOFF-NEXT: s_wait_dscnt 0x3 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v43, v23 +; EXACTCUTOFF-NEXT: s_wait_dscnt 0x2 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v30, v10 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v32, v12 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v37, v17 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v36, v16 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v42, v22 :: v_dual_mov_b32 v41, v21 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v40, v20 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v46, v26 :: v_dual_mov_b32 v45, v25 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v44, v24 +; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[28:31], v[8:11], v[0:7], v48 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[32:35], v[12:15], v[0:7], v48 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[36:39], v[16:19], v[0:7], v48 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[40:43], v[20:23], v[0:7], v48 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[44:47], v[24:27], v[0:7], v48 +; EXACTCUTOFF-NEXT: ds_store_b128 v49, v[28:31] +; 
EXACTCUTOFF-NEXT: ds_store_b128 v50, v[32:35] offset:512 +; EXACTCUTOFF-NEXT: ds_store_b128 v50, v[36:39] offset:1024 +; EXACTCUTOFF-NEXT: ds_store_b128 v50, v[40:43] offset:1536 +; EXACTCUTOFF-NEXT: ds_store_b128 v50, v[44:47] offset:2048 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(5) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(5) SyncID(0) +; EXACTCUTOFF-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %load.0.addr = getelementptr <8 x half>, ptr addrspace(3) %in, i32 %idx + %load.0 = load <8 x half>, ptr addrspace(3) %load.0.addr + %load.1.addr = getelementptr <8 x half>, ptr addrspace(3) %load.0.addr, i32 32 + %load.1 = load <8 x half>, ptr addrspace(3) %load.1.addr + %load.2.addr = getelementptr <8 x half>, ptr addrspace(3) %load.1.addr, i32 64 + %load.2 = load <8 x half>, ptr addrspace(3) %load.2.addr + %load.3.addr = getelementptr <8 x half>, ptr addrspace(3) %load.2.addr, i32 96 + %load.3 = load <8 x half>, ptr addrspace(3) %load.3.addr + %load.4.addr = getelementptr <8 x half>, ptr addrspace(3) %load.3.addr, i32 128 + %load.4 = load <8 x half>, ptr addrspace(3) %load.4.addr + %load.b.addr = getelementptr <16 x half>, ptr addrspace(3) %load.4.addr, i32 192 + %load.b = load <16 x half>, ptr addrspace(3) %load.b.addr + %mai.0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.0, <16 x half> %load.b, <8 x half> %load.0, i1 0) + %mai.1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.1, <16 x half> %load.b, <8 x half> %load.1, i1 0) + %mai.2 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.2, <16 x half> %load.b, <8 x half> %load.2, i1 0) + %mai.3 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.3, <16 x half> %load.b, <8 x half> %load.3, i1 0) + %mai.4 = call <8 x half> 
@llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.4, <16 x half> %load.b, <8 x half> %load.4, i1 0) + %store.0.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 %idx + store <8 x half> %mai.0, ptr addrspace(3) %store.0.addr + %store.1.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 32 + store <8 x half> %mai.1, ptr addrspace(3) %store.1.addr + %store.2.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 64 + store <8 x half> %mai.2, ptr addrspace(3) %store.2.addr + %store.3.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 96 + store <8 x half> %mai.3, ptr addrspace(3) %store.3.addr + %store.4.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 128 + store <8 x half> %mai.4, ptr addrspace(3) %store.4.addr + ; 7 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 7, i32 0) + ; 5 SWMMAC + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 5, i32 0) + ; 5 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 5, i32 0) + ret void +} + +define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { +; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v18, 0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_lshl_add_u32 v17, v0, 5, s0 +; GCN-NEXT: v_lshl_add_u32 v0, v0, 4, s1 +; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:1024 +; GCN-NEXT: ds_load_b128 v[1:4], v17 +; GCN-NEXT: ds_load_b128 v[5:8], v17 offset:16 +; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(3) SyncID(0) +; GCN-NEXT: s_wait_dscnt 0x2 +; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; GCN-NEXT: s_wait_dscnt 0x0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; 
GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-NEXT: ds_store_b128 v0, v[13:16] +; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:2560 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) +; GCN-NEXT: s_wait_dscnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:512 +; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:4608 +; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) +; GCN-NEXT: s_wait_dscnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:1024 +; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:7168 +; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) +; GCN-NEXT: s_wait_dscnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:1536 +; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:10240 +; GCN-NEXT: ; sched_group_barrier mask(0x00000200) 
size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) +; GCN-NEXT: s_wait_dscnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:2048 +; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) +; GCN-NEXT: s_endpgm +; +; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved: +; EXACTCUTOFF: ; %bb.0: ; %entry +; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v18, 0 +; EXACTCUTOFF-NEXT: s_wait_kmcnt 0x0 +; EXACTCUTOFF-NEXT: v_lshl_add_u32 v17, v0, 5, s0 +; EXACTCUTOFF-NEXT: v_lshl_add_u32 v0, v0, 4, s1 +; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:1024 +; EXACTCUTOFF-NEXT: ds_load_b128 v[1:4], v17 +; EXACTCUTOFF-NEXT: ds_load_b128 v[5:8], v17 offset:16 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(3) SyncID(0) +; EXACTCUTOFF-NEXT: s_wait_dscnt 0x2 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] +; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:2560 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, 
v11 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:512 +; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:4608 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:1024 +; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:7168 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:1536 +; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:10240 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:2048 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %load.b.addr = getelementptr <16 x half>, ptr addrspace(3) %in, i32 %idx + %load.b = load <16 x half>, ptr addrspace(3) %load.b.addr + %load.0.addr = getelementptr <8 x half>, ptr addrspace(3) %load.b.addr, i32 64 + %load.0 = load <8 x half>, ptr addrspace(3) %load.0.addr + %load.1.addr = getelementptr <8 x half>, ptr addrspace(3) %load.0.addr, i32 96 + %load.1 = load <8 x half>, ptr addrspace(3) %load.1.addr + %load.2.addr = getelementptr <8 x half>, ptr addrspace(3) %load.1.addr, i32 128 + %load.2 = load <8 x half>, ptr addrspace(3) %load.2.addr + %load.3.addr = getelementptr <8 x half>, ptr addrspace(3) %load.2.addr, i32 160 + %load.3 = load <8 x half>, ptr addrspace(3) %load.3.addr + %load.4.addr = getelementptr <8 x half>, ptr addrspace(3) %load.3.addr, i32 192 + %load.4 = load <8 x half>, ptr addrspace(3) %load.4.addr + %mai.0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.0, <16 x half> %load.b, <8 x half> %load.0, i1 0) + %mai.1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.1, <16 x half> %load.b, <8 x half> %load.1, i1 0) + %mai.2 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.2, <16 x half> %load.b, <8 x half> %load.2, i1 0) + %mai.3 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.3, <16 x half> %load.b, <8 x half> %load.3, i1 0) + %mai.4 = call <8 x half> 
@llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.4, <16 x half> %load.b, <8 x half> %load.4, i1 0) + %store.0.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 %idx + store <8 x half> %mai.0, ptr addrspace(3) %store.0.addr + %store.1.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 32 + store <8 x half> %mai.1, ptr addrspace(3) %store.1.addr + %store.2.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 64 + store <8 x half> %mai.2, ptr addrspace(3) %store.2.addr + %store.3.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 96 + store <8 x half> %mai.3, ptr addrspace(3) %store.3.addr + %store.4.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 128 + store <8 x half> %mai.4, ptr addrspace(3) %store.4.addr + ; 3 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 3, i32 0) + ; 1 SWMMAC + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 1 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0) + ; 1 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0) + ; 1 SWMMAC + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 1 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0) + ; 1 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0) + ; 1 SWMMAC + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 1 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0) + ; 1 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0) + ; 1 SWMMAC + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 1 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0) + ; 1 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0) + ; 1 SWMMAC + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 1 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 
1, i32 0) + ret void +} -- cgit v1.1 From 4a026b5092d77426b70ab299447af4dbd5a012d9 Mon Sep 17 00:00:00 2001 From: Peter Rong Date: Tue, 19 Mar 2024 21:44:08 -0700 Subject: [AMDGCN] Use ZExt when handling indices in insertment element (#85718) When i1 true is used as an index, SExt extends it to i32 -1. This would cause BitVector to overflow. The language manual have specified that the index shall be treated as an unsigned number, this patch fixes that. (https://llvm.org/docs/LangRef.html#insertelement-instruction) This patch fixes #85717 --------- Signed-off-by: Peter Rong --- .../amdgpu-codegenprepare-break-large-phis.ll | 51 ++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll index 192bf7c..93b9aea 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll @@ -1197,3 +1197,54 @@ reallyfinally: store <5 x double> %val, ptr %out, align 1 ret void } + +define amdgpu_kernel void @pr85718(i1 %Bool, ptr %Ptr, <4 x float> %Vec1, <4 x float> %Vec2) { +; OPT-LABEL: @pr85718( +; OPT-NEXT: BB0: +; OPT-NEXT: [[I:%.*]] = insertelement <4 x float> [[VEC1:%.*]], float 4.200000e+01, i1 true +; OPT-NEXT: br label [[BB1:%.*]] +; OPT: BB1: +; OPT-NEXT: [[TMP0:%.*]] = phi float [ [[LARGEPHI_EXTRACTSLICE0:%.*]], [[BB2:%.*]] ], [ [[LARGEPHI_EXTRACTSLICE1:%.*]], [[BB1]] ], [ 0.000000e+00, [[BB0:%.*]] ] +; OPT-NEXT: [[TMP1:%.*]] = phi float [ [[LARGEPHI_EXTRACTSLICE3:%.*]], [[BB2]] ], [ [[LARGEPHI_EXTRACTSLICE4:%.*]], [[BB1]] ], [ 0.000000e+00, [[BB0]] ] +; OPT-NEXT: [[TMP2:%.*]] = phi float [ [[LARGEPHI_EXTRACTSLICE6:%.*]], [[BB2]] ], [ [[LARGEPHI_EXTRACTSLICE7:%.*]], [[BB1]] ], [ 0.000000e+00, [[BB0]] ] +; OPT-NEXT: [[TMP3:%.*]] = phi float [ [[LARGEPHI_EXTRACTSLICE9:%.*]], [[BB2]] ], [ 
[[LARGEPHI_EXTRACTSLICE10:%.*]], [[BB1]] ], [ 0.000000e+00, [[BB0]] ] +; OPT-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = insertelement <4 x float> [[LARGEPHI_INSERTSLICE0]], float [[TMP1]], i64 1 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = insertelement <4 x float> [[LARGEPHI_INSERTSLICE1]], float [[TMP2]], i64 2 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <4 x float> [[LARGEPHI_INSERTSLICE2]], float [[TMP3]], i64 3 +; OPT-NEXT: store <4 x float> [[LARGEPHI_INSERTSLICE3]], ptr [[PTR:%.*]], align 128 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE1]] = extractelement <4 x float> [[VEC2:%.*]], i64 0 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE4]] = extractelement <4 x float> [[VEC2]], i64 1 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE7]] = extractelement <4 x float> [[VEC2]], i64 2 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE10]] = extractelement <4 x float> [[VEC2]], i64 3 +; OPT-NEXT: br i1 [[BOOL:%.*]], label [[BB1]], label [[BB2]] +; OPT: BB2: +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE0]] = extractelement <4 x float> [[I]], i64 0 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE3]] = extractelement <4 x float> [[I]], i64 1 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE6]] = extractelement <4 x float> [[I]], i64 2 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE9]] = extractelement <4 x float> [[I]], i64 3 +; OPT-NEXT: br label [[BB1]] +; +; NOOPT-LABEL: @pr85718( +; NOOPT-NEXT: BB0: +; NOOPT-NEXT: [[I:%.*]] = insertelement <4 x float> [[VEC1:%.*]], float 4.200000e+01, i1 true +; NOOPT-NEXT: br label [[BB1:%.*]] +; NOOPT: BB1: +; NOOPT-NEXT: [[PHI:%.*]] = phi <4 x float> [ [[I]], [[BB2:%.*]] ], [ [[VEC2:%.*]], [[BB1]] ], [ zeroinitializer, [[BB0:%.*]] ] +; NOOPT-NEXT: store <4 x float> [[PHI]], ptr [[PTR:%.*]], align 128 +; NOOPT-NEXT: br i1 [[BOOL:%.*]], label [[BB1]], label [[BB2]] +; NOOPT: BB2: +; NOOPT-NEXT: br label [[BB1]] +; +BB0: + %I = insertelement <4 x float> %Vec1, float 4.200000e+01, i1 true + br label %BB1 + +BB1: 
; preds = %BB0, %BB1, %BB2 + %PHI = phi <4 x float> [ %I, %BB2 ], [ %Vec2, %BB1 ], [ zeroinitializer, %BB0 ] + store <4 x float> %PHI, ptr %Ptr, align 128 + br i1 %Bool, label %BB1, label %BB2 + +BB2: ; preds = %BB1 + br label %BB1 +} -- cgit v1.1 From 070d1e83213f0e9eb71582141dd2323131a307e3 Mon Sep 17 00:00:00 2001 From: Pravin Jagtap Date: Wed, 20 Mar 2024 14:45:38 +0530 Subject: [AMDGPU] Add test for fpext & fptrunc with bf16. (#85909) Authored-by: Pravin Jagtap --- llvm/test/CodeGen/AMDGPU/bf16-conversions.ll | 344 +++++++++++++++++++++++++++ 1 file changed, 344 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/bf16-conversions.ll (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll new file mode 100644 index 0000000..1eb2771 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll @@ -0,0 +1,344 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -march=amdgcn -mcpu=gfx940 < %s | FileCheck --check-prefixes=GCN %s + +; TODO: Add global-isel when it can support bf16 +define amdgpu_ps float @v_test_cvt_bf16_f32_v(bfloat %v) { +; GCN-LABEL: v_test_cvt_bf16_f32_v: +; GCN: ; %bb.0: +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: ; return to shader part epilog + %cvt = fpext bfloat %v to float + ret float %cvt +} +define amdgpu_ps float @v_test_cvt_bf16_f32_s(bfloat inreg %v) { +; GCN-LABEL: v_test_cvt_bf16_f32_s: +; GCN: ; %bb.0: +; GCN-NEXT: s_lshl_b32 s0, s0, 16 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog + %cvt = fpext bfloat %v to float + ret float %cvt +} +define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_v(<2 x float> %src) { +; GCN-LABEL: v_test_cvt_v2f32_v2bf16_v: +; GCN: ; %bb.0: +; GCN-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GCN-NEXT: s_movk_i32 s0, 0x7fff +; GCN-NEXT: v_add3_u32 v2, v2, v0, s0 +; GCN-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GCN-NEXT: 
v_cmp_u_f32_e32 vcc, v0, v0 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GCN-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GCN-NEXT: v_add3_u32 v2, v2, v1, s0 +; GCN-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GCN-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GCN-NEXT: s_mov_b32 s0, 0x7060302 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GCN-NEXT: v_perm_b32 v0, v1, v0, s0 +; GCN-NEXT: ; return to shader part epilog + %res = fptrunc <2 x float> %src to <2 x bfloat> + %cast = bitcast <2 x bfloat> %res to float + ret float %cast +} +define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_s(<2 x float> inreg %src) { +; GCN-LABEL: v_test_cvt_v2f32_v2bf16_s: +; GCN: ; %bb.0: +; GCN-NEXT: s_bfe_u32 s2, s1, 0x10010 +; GCN-NEXT: s_add_i32 s2, s2, s1 +; GCN-NEXT: s_or_b32 s4, s1, 0x400000 +; GCN-NEXT: s_add_i32 s5, s2, 0x7fff +; GCN-NEXT: v_cmp_u_f32_e64 s[2:3], s1, s1 +; GCN-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN-NEXT: s_cselect_b32 s2, s4, s5 +; GCN-NEXT: s_bfe_u32 s1, s0, 0x10010 +; GCN-NEXT: s_add_i32 s1, s1, s0 +; GCN-NEXT: s_or_b32 s3, s0, 0x400000 +; GCN-NEXT: s_add_i32 s4, s1, 0x7fff +; GCN-NEXT: v_cmp_u_f32_e64 s[0:1], s0, s0 +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-NEXT: s_cselect_b32 s0, s3, s4 +; GCN-NEXT: s_pack_hh_b32_b16 s0, s0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog + %res = fptrunc <2 x float> %src to <2 x bfloat> + %cast = bitcast <2 x bfloat> %res to float + ret float %cast +} +define amdgpu_ps float @v_test_cvt_f32_bf16_v(float %src) { +; GCN-LABEL: v_test_cvt_f32_bf16_v: +; GCN: ; %bb.0: +; GCN-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GCN-NEXT: s_movk_i32 s0, 0x7fff +; GCN-NEXT: v_add3_u32 v1, v1, v0, s0 +; GCN-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GCN-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: ; return to shader part epilog + %trunc = fptrunc float %src to bfloat + %ext = 
fpext bfloat %trunc to float + ret float %ext +} +define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) { +; GCN-LABEL: v_test_cvt_v2f64_v2bf16_v: +; GCN: ; %bb.0: +; GCN-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| +; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GCN-NEXT: v_and_b32_e32 v7, 1, v6 +; GCN-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] +; GCN-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GCN-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] +; GCN-NEXT: v_add_u32_e32 v4, v6, v4 +; GCN-NEXT: s_or_b64 vcc, s[0:1], vcc +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GCN-NEXT: s_brev_b32 s4, 1 +; GCN-NEXT: v_and_or_b32 v5, v1, s4, v4 +; GCN-NEXT: v_bfe_u32 v4, v4, 16, 1 +; GCN-NEXT: s_movk_i32 s5, 0x7fff +; GCN-NEXT: v_add3_u32 v4, v4, v5, s5 +; GCN-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GCN-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GCN-NEXT: v_cvt_f32_f64_e64 v5, |v[2:3]| +; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v5 +; GCN-NEXT: v_and_b32_e32 v6, 1, v5 +; GCN-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, v[0:1] +; GCN-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[2:3]|, v[0:1] +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GCN-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3] +; GCN-NEXT: v_add_u32_e32 v0, v5, v0 +; GCN-NEXT: s_or_b64 vcc, s[0:1], vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN-NEXT: v_and_or_b32 v1, v3, s4, v0 +; GCN-NEXT: v_bfe_u32 v0, v0, 16, 1 +; GCN-NEXT: v_add3_u32 v0, v0, v1, s5 +; GCN-NEXT: v_or_b32_e32 v1, 0x400000, v1 +; GCN-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3] +; GCN-NEXT: s_mov_b32 s0, 0x7060302 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_perm_b32 v0, v0, v4, s0 +; GCN-NEXT: ; return to shader part epilog + %res = fptrunc <2 x double> %src to <2 x bfloat> + %cast = bitcast <2 x bfloat> %res to float + ret float %cast +} +define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16(float %a, float 
%b) { +; GCN-LABEL: fptrunc_f32_f32_to_v2bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GCN-NEXT: s_movk_i32 s0, 0x7fff +; GCN-NEXT: v_add3_u32 v2, v2, v0, s0 +; GCN-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GCN-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GCN-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GCN-NEXT: v_add3_u32 v2, v2, v1, s0 +; GCN-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GCN-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GCN-NEXT: s_mov_b32 s0, 0x7060302 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GCN-NEXT: v_perm_b32 v0, v1, v0, s0 +; GCN-NEXT: ; return to shader part epilog +entry: + %a.cvt = fptrunc float %a to bfloat + %b.cvt = fptrunc float %b to bfloat + %v2.1 = insertelement <2 x bfloat> undef, bfloat %a.cvt, i32 0 + %v2.2 = insertelement <2 x bfloat> %v2.1, bfloat %b.cvt, i32 1 + %ret = bitcast <2 x bfloat> %v2.2 to float + ret float %ret +} +define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16_mods(float %a, float %b) { +; GCN-LABEL: fptrunc_f32_f32_to_v2bf16_mods: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v0 +; GCN-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GCN-NEXT: s_movk_i32 s0, 0x7fff +; GCN-NEXT: v_add3_u32 v3, v3, v2, s0 +; GCN-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GCN-NEXT: v_cmp_u_f32_e64 vcc, -v0, -v0 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GCN-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 +; GCN-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GCN-NEXT: v_add3_u32 v3, v3, v2, s0 +; GCN-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GCN-NEXT: v_cmp_u_f32_e64 vcc, |v1|, |v1| +; GCN-NEXT: s_mov_b32 s0, 0x7060302 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GCN-NEXT: v_perm_b32 v0, v1, v0, s0 +; GCN-NEXT: ; return to shader part epilog +entry: + %a.neg = fneg float %a + %a.cvt = fptrunc float %a.neg to bfloat + %b.abs = call float @llvm.fabs.f32(float %b) + %b.cvt = fptrunc float %b.abs to bfloat + %v2.1 = 
insertelement <2 x bfloat> undef, bfloat %a.cvt, i32 0 + %v2.2 = insertelement <2 x bfloat> %v2.1, bfloat %b.cvt, i32 1 + %ret = bitcast <2 x bfloat> %v2.2 to float + ret float %ret +} +define amdgpu_ps void @fptrunc_f32_to_bf16(float %a, ptr %out) { +; GCN-LABEL: fptrunc_f32_to_bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v3, v2 +; GCN-NEXT: v_mov_b32_e32 v2, v1 +; GCN-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GCN-NEXT: s_movk_i32 s0, 0x7fff +; GCN-NEXT: v_add3_u32 v1, v1, v0, s0 +; GCN-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GCN-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc +; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GCN-NEXT: s_endpgm +entry: + %a.cvt = fptrunc float %a to bfloat + store bfloat %a.cvt, ptr %out + ret void +} +define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) { +; GCN-LABEL: fptrunc_f32_to_bf16_abs: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v3, v2 +; GCN-NEXT: v_mov_b32_e32 v2, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0 +; GCN-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GCN-NEXT: s_movk_i32 s0, 0x7fff +; GCN-NEXT: v_add3_u32 v4, v4, v1, s0 +; GCN-NEXT: v_or_b32_e32 v1, 0x400000, v1 +; GCN-NEXT: v_cmp_u_f32_e64 vcc, |v0|, |v0| +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GCN-NEXT: s_endpgm +entry: + %a.abs = call float @llvm.fabs.f32(float %a) + %a.cvt = fptrunc float %a.abs to bfloat + store bfloat %a.cvt, ptr %out + ret void +} +define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) { +; GCN-LABEL: fptrunc_f32_to_bf16_neg: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v3, v2 +; GCN-NEXT: v_mov_b32_e32 v2, v1 +; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v0 +; GCN-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GCN-NEXT: s_movk_i32 s0, 0x7fff +; GCN-NEXT: v_add3_u32 v4, v4, v1, s0 +; GCN-NEXT: v_or_b32_e32 v1, 0x400000, v1 +; GCN-NEXT: v_cmp_u_f32_e64 vcc, -v0, -v0 +; 
GCN-NEXT: s_nop 1 +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GCN-NEXT: s_endpgm +entry: + %a.neg = fneg float %a + %a.cvt = fptrunc float %a.neg to bfloat + store bfloat %a.cvt, ptr %out + ret void +} +define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) { +; GCN-LABEL: fptrunc_f64_to_bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| +; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GCN-NEXT: v_and_b32_e32 v7, 1, v6 +; GCN-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] +; GCN-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GCN-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] +; GCN-NEXT: v_add_u32_e32 v4, v6, v4 +; GCN-NEXT: s_or_b64 vcc, s[0:1], vcc +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GCN-NEXT: s_brev_b32 s0, 1 +; GCN-NEXT: v_and_or_b32 v5, v1, s0, v4 +; GCN-NEXT: v_bfe_u32 v4, v4, 16, 1 +; GCN-NEXT: s_movk_i32 s0, 0x7fff +; GCN-NEXT: v_add3_u32 v4, v4, v5, s0 +; GCN-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GCN-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GCN-NEXT: s_endpgm +entry: + %a.cvt = fptrunc double %a to bfloat + store bfloat %a.cvt, ptr %out + ret void +} +define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) { +; GCN-LABEL: fptrunc_f64_to_bf16_neg: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_cvt_f32_f64_e64 v7, |v[0:1]| +; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 +; GCN-NEXT: v_and_b32_e32 v8, 1, v7 +; GCN-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] +; GCN-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GCN-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] +; GCN-NEXT: v_add_u32_e32 v4, v7, v4 +; GCN-NEXT: s_or_b64 vcc, s[0:1], vcc +; GCN-NEXT: s_brev_b32 s4, 1 +; GCN-NEXT: v_xor_b32_e32 v6, 0x80000000, v1 +; GCN-NEXT: v_cndmask_b32_e32 
v4, v4, v7, vcc +; GCN-NEXT: v_and_or_b32 v5, v6, s4, v4 +; GCN-NEXT: v_bfe_u32 v4, v4, 16, 1 +; GCN-NEXT: s_movk_i32 s0, 0x7fff +; GCN-NEXT: v_add3_u32 v4, v4, v5, s0 +; GCN-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GCN-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[0:1] +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GCN-NEXT: s_endpgm +entry: + %a.neg = fneg double %a + %a.cvt = fptrunc double %a.neg to bfloat + store bfloat %a.cvt, ptr %out + ret void +} +define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) { +; GCN-LABEL: fptrunc_f64_to_bf16_abs: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_cvt_f32_f64_e64 v7, |v[0:1]| +; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 +; GCN-NEXT: v_and_b32_e32 v8, 1, v7 +; GCN-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] +; GCN-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GCN-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] +; GCN-NEXT: v_add_u32_e32 v4, v7, v4 +; GCN-NEXT: s_or_b64 vcc, s[0:1], vcc +; GCN-NEXT: v_and_b32_e32 v6, 0x7fffffff, v1 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GCN-NEXT: s_brev_b32 s0, 1 +; GCN-NEXT: v_and_or_b32 v5, v6, s0, v4 +; GCN-NEXT: v_bfe_u32 v4, v4, 16, 1 +; GCN-NEXT: s_movk_i32 s0, 0x7fff +; GCN-NEXT: v_add3_u32 v4, v4, v5, s0 +; GCN-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GCN-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[0:1]| +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GCN-NEXT: s_endpgm +entry: + %a.abs = call double @llvm.fabs.f64(double %a) + %a.cvt = fptrunc double %a.abs to bfloat + store bfloat %a.cvt, ptr %out + ret void +} + +declare float @llvm.fabs.f32(float) +declare double @llvm.fabs.f64(double) + -- cgit v1.1 From e52a68787122f8f17f2923818b98f3cea4e881a1 Mon Sep 17 00:00:00 2001 From: Pravin Jagtap Date: Wed, 20 Mar 2024 17:29:42 +0530 Subject: [AMDGPU][NFC] Test clean up (#85922) 
Authored-by: Pravin Jagtap --- llvm/test/CodeGen/AMDGPU/bf16-conversions.ll | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll index 1eb2771..7108f3d 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll @@ -2,6 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx940 < %s | FileCheck --check-prefixes=GCN %s ; TODO: Add global-isel when it can support bf16 + define amdgpu_ps float @v_test_cvt_bf16_f32_v(bfloat %v) { ; GCN-LABEL: v_test_cvt_bf16_f32_v: ; GCN: ; %bb.0: @@ -10,6 +11,7 @@ define amdgpu_ps float @v_test_cvt_bf16_f32_v(bfloat %v) { %cvt = fpext bfloat %v to float ret float %cvt } + define amdgpu_ps float @v_test_cvt_bf16_f32_s(bfloat inreg %v) { ; GCN-LABEL: v_test_cvt_bf16_f32_s: ; GCN: ; %bb.0: @@ -19,6 +21,7 @@ define amdgpu_ps float @v_test_cvt_bf16_f32_s(bfloat inreg %v) { %cvt = fpext bfloat %v to float ret float %cvt } + define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_v(<2 x float> %src) { ; GCN-LABEL: v_test_cvt_v2f32_v2bf16_v: ; GCN: ; %bb.0: @@ -42,6 +45,7 @@ define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_v(<2 x float> %src) { %cast = bitcast <2 x bfloat> %res to float ret float %cast } + define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_s(<2 x float> inreg %src) { ; GCN-LABEL: v_test_cvt_v2f32_v2bf16_s: ; GCN: ; %bb.0: @@ -66,6 +70,7 @@ define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_s(<2 x float> inreg %src) { %cast = bitcast <2 x bfloat> %res to float ret float %cast } + define amdgpu_ps float @v_test_cvt_f32_bf16_v(float %src) { ; GCN-LABEL: v_test_cvt_f32_bf16_v: ; GCN: ; %bb.0: @@ -82,6 +87,7 @@ define amdgpu_ps float @v_test_cvt_f32_bf16_v(float %src) { %ext = fpext bfloat %trunc to float ret float %ext } + define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) { ; GCN-LABEL: v_test_cvt_v2f64_v2bf16_v: ; GCN: ; %bb.0: @@ 
-128,6 +134,7 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) { %cast = bitcast <2 x bfloat> %res to float ret float %cast } + define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16(float %a, float %b) { ; GCN-LABEL: fptrunc_f32_f32_to_v2bf16: ; GCN: ; %bb.0: ; %entry @@ -155,6 +162,7 @@ entry: %ret = bitcast <2 x bfloat> %v2.2 to float ret float %ret } + define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16_mods(float %a, float %b) { ; GCN-LABEL: fptrunc_f32_f32_to_v2bf16_mods: ; GCN: ; %bb.0: ; %entry @@ -186,6 +194,7 @@ entry: %ret = bitcast <2 x bfloat> %v2.2 to float ret float %ret } + define amdgpu_ps void @fptrunc_f32_to_bf16(float %a, ptr %out) { ; GCN-LABEL: fptrunc_f32_to_bf16: ; GCN: ; %bb.0: ; %entry @@ -205,6 +214,7 @@ entry: store bfloat %a.cvt, ptr %out ret void } + define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) { ; GCN-LABEL: fptrunc_f32_to_bf16_abs: ; GCN: ; %bb.0: ; %entry @@ -226,6 +236,7 @@ entry: store bfloat %a.cvt, ptr %out ret void } + define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) { ; GCN-LABEL: fptrunc_f32_to_bf16_neg: ; GCN: ; %bb.0: ; %entry @@ -247,6 +258,7 @@ entry: store bfloat %a.cvt, ptr %out ret void } + define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) { ; GCN-LABEL: fptrunc_f64_to_bf16: ; GCN: ; %bb.0: ; %entry @@ -276,6 +288,7 @@ entry: store bfloat %a.cvt, ptr %out ret void } + define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) { ; GCN-LABEL: fptrunc_f64_to_bf16_neg: ; GCN: ; %bb.0: ; %entry @@ -307,6 +320,7 @@ entry: store bfloat %a.cvt, ptr %out ret void } + define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) { ; GCN-LABEL: fptrunc_f64_to_bf16_abs: ; GCN: ; %bb.0: ; %entry @@ -341,4 +355,3 @@ entry: declare float @llvm.fabs.f32(float) declare double @llvm.fabs.f64(double) - -- cgit v1.1 From 05bde30585710a51592eee0a6cf6df8184d09c92 Mon Sep 17 00:00:00 2001 From: Jonas Paulsson Date: Wed, 20 Mar 2024 10:29:12 -0400 Subject: 
Move assertion for AdjustsStack from PEI to MachineVerifier. (#85698) Have the verifier report a missing AdjustsStack flag rather than waiting until PEI asserts. --- llvm/test/CodeGen/AMDGPU/fold-restore-undef-use.mir | 2 ++ llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir | 1 + llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir | 1 + llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir | 2 ++ llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir | 1 + llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir | 1 + llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir | 1 + 7 files changed, 9 insertions(+) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/fold-restore-undef-use.mir b/llvm/test/CodeGen/AMDGPU/fold-restore-undef-use.mir index 3616d61..054eeec 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-restore-undef-use.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-restore-undef-use.mir @@ -7,6 +7,8 @@ --- name: restore_undef_copy_use +frameInfo: + adjustsStack: true tracksRegLiveness: true machineFunctionInfo: maxKernArgAlign: 1 diff --git a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir index bdd89a9..dde84af 100644 --- a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir @@ -13,6 +13,7 @@ name: greedy_fail_alloc_sgpr1024_spill tracksRegLiveness: true frameInfo: + adjustsStack: true hasCalls: true machineFunctionInfo: explicitKernArgSize: 16 diff --git a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir index 2ccc241..fdfc9b0 100644 --- a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir +++ b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir @@ -24,6 +24,7 @@ registers: - { id: 10, class: sreg_64_xexec, preferred-register: '$vcc' } 
frameInfo: maxAlignment: 1 + adjustsStack: true hasCalls: true machineFunctionInfo: maxKernArgAlign: 1 diff --git a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir index c0d1999..158874e 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir @@ -180,6 +180,8 @@ exposesReturnsTwice: false legalized: false regBankSelected: false selected: false +frameInfo: + adjustsStack: true tracksRegLiveness: true liveins: - { reg: '$vgpr0', virtual-reg: '%0' } diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir index efbdbca..c6ccbd9 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir @@ -78,6 +78,7 @@ name: sgpr_spill_wrong_stack_id tracksRegLiveness: true frameInfo: + adjustsStack: true hasCalls: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir index 3558298..f8ec6bb 100644 --- a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir +++ b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir @@ -21,6 +21,7 @@ name: kernel tracksRegLiveness: true frameInfo: + adjustsStack: true hasCalls: true machineFunctionInfo: isEntryFunction: true diff --git a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir index 3d9db68..6659e95 100644 --- a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir @@ -20,6 +20,7 @@ name: undef_identity_copy tracksRegLiveness: true frameInfo: maxAlignment: 4 + adjustsStack: true hasCalls: true machineFunctionInfo: isEntryFunction: true -- cgit v1.1 From 
9ebd329ad87ca4cde3ce62e1bf5612c4fc0fcb7f Mon Sep 17 00:00:00 2001 From: Jonas Paulsson Date: Wed, 20 Mar 2024 11:46:19 -0400 Subject: Revert "Move assertion for AdjustsStack from PEI to MachineVerifier. (#85698)" This reverts commit 05bde30585710a51592eee0a6cf6df8184d09c92. Reverting due to verifier complaints with expensive checks on build-bot. --- llvm/test/CodeGen/AMDGPU/fold-restore-undef-use.mir | 2 -- llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir | 1 - llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir | 1 - llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir | 2 -- llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir | 1 - llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir | 1 - llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir | 1 - 7 files changed, 9 deletions(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/fold-restore-undef-use.mir b/llvm/test/CodeGen/AMDGPU/fold-restore-undef-use.mir index 054eeec..3616d61 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-restore-undef-use.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-restore-undef-use.mir @@ -7,8 +7,6 @@ --- name: restore_undef_copy_use -frameInfo: - adjustsStack: true tracksRegLiveness: true machineFunctionInfo: maxKernArgAlign: 1 diff --git a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir index dde84af..bdd89a9 100644 --- a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir @@ -13,7 +13,6 @@ name: greedy_fail_alloc_sgpr1024_spill tracksRegLiveness: true frameInfo: - adjustsStack: true hasCalls: true machineFunctionInfo: explicitKernArgSize: 16 diff --git a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir index fdfc9b0..2ccc241 100644 --- 
a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir +++ b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir @@ -24,7 +24,6 @@ registers: - { id: 10, class: sreg_64_xexec, preferred-register: '$vcc' } frameInfo: maxAlignment: 1 - adjustsStack: true hasCalls: true machineFunctionInfo: maxKernArgAlign: 1 diff --git a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir index 158874e..c0d1999 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir @@ -180,8 +180,6 @@ exposesReturnsTwice: false legalized: false regBankSelected: false selected: false -frameInfo: - adjustsStack: true tracksRegLiveness: true liveins: - { reg: '$vgpr0', virtual-reg: '%0' } diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir index c6ccbd9..efbdbca 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir @@ -78,7 +78,6 @@ name: sgpr_spill_wrong_stack_id tracksRegLiveness: true frameInfo: - adjustsStack: true hasCalls: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir index f8ec6bb..3558298 100644 --- a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir +++ b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir @@ -21,7 +21,6 @@ name: kernel tracksRegLiveness: true frameInfo: - adjustsStack: true hasCalls: true machineFunctionInfo: isEntryFunction: true diff --git a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir index 6659e95..3d9db68 100644 --- a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir +++ 
b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir @@ -20,7 +20,6 @@ name: undef_identity_copy tracksRegLiveness: true frameInfo: maxAlignment: 4 - adjustsStack: true hasCalls: true machineFunctionInfo: isEntryFunction: true -- cgit v1.1 From deefe3fbc93b3bdc77fbaf718403a45dae983d12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= Date: Thu, 21 Mar 2024 03:56:40 +0100 Subject: [GlobalIsel] Post-review combine ADDO (#85961) https://github.com/llvm/llvm-project/pull/82927 --- llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index b2311a8..a69418d 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -238,7 +238,7 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB0_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffffbcd, v6 +; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffbcd, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc @@ -612,7 +612,7 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB1_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffffbcd, v6 +; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffbcd, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc @@ -978,7 +978,7 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB2_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 0xffffff6a, v6 +; GISEL-NEXT: 
v_add_u32_e32 v6, 0xffffff6a, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc @@ -1338,7 +1338,7 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB3_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 0xffffff6a, v6 +; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff6a, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc -- cgit v1.1 From b6b703b2dfc1d1ba45ebc64ed6b53a3a46f531f5 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 21 Mar 2024 14:24:06 +0530 Subject: AMDGPU: Infer no-agpr usage in AMDGPUAttributor (#85948) SIMachineFunctionInfo has a scan of the function body for inline asm which may use AGPRs, or callees in SIMachineFunctionInfo. Move this into the attributor, so it actually works interprocedurally. Could probably avoid most of the test churn if this bothered to avoid adding this on subtargets without AGPRs. We should also probably try to delete the MIR scan in usesAGPRs but it seems to be trickier to eliminate. 
--- .../CodeGen/AMDGPU/addrspacecast-constantexpr.ll | 6 +- .../CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll | 255 +++++++++++++++++++++ .../AMDGPU/annotate-kernel-features-hsa-call.ll | 44 ++-- .../CodeGen/AMDGPU/annotate-kernel-features-hsa.ll | 26 +-- .../CodeGen/AMDGPU/annotate-kernel-features.ll | 19 +- .../AMDGPU/copy-vgpr-clobber-spill-vgpr.mir | 2 +- llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll | 2 +- .../CodeGen/AMDGPU/duplicate-attribute-indirect.ll | 2 +- .../AMDGPU/implicitarg-offset-attributes.ll | 30 +-- .../CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll | 20 +- .../AMDGPU/propagate-flat-work-group-size.ll | 18 +- llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll | 44 ++-- .../CodeGen/AMDGPU/recursive_global_initializer.ll | 2 +- .../AMDGPU/remove-no-kernel-id-attribute.ll | 8 +- llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll | 2 +- .../AMDGPU/uniform-work-group-attribute-missing.ll | 4 +- .../CodeGen/AMDGPU/uniform-work-group-multistep.ll | 4 +- .../uniform-work-group-nested-function-calls.ll | 4 +- ...orm-work-group-prevent-attribute-propagation.ll | 6 +- .../uniform-work-group-propagate-attribute.ll | 4 +- .../AMDGPU/uniform-work-group-recursion-test.ll | 6 +- .../test/CodeGen/AMDGPU/uniform-work-group-test.ll | 4 +- llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll | 1 + 23 files changed, 385 insertions(+), 128 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll index 66034af..cff9ce0 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll @@ -233,9 +233,9 @@ attributes #1 = { nounwind } ; AKF_HSA: attributes #[[ATTR1]] = { nounwind } ;. 
; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } -; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" 
"amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ;. ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll new file mode 100644 index 0000000..33b1cc6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll @@ -0,0 +1,255 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 4 +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor %s | FileCheck %s + +define amdgpu_kernel void @kernel_uses_asm_virtreg() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: ret void +; + call void asm sideeffect "; use $0", "a"(i32 poison) + ret void +} + +define amdgpu_kernel void @kernel_uses_asm_virtreg_def() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[DEF:%.*]] = call i32 asm sideeffect " +; CHECK-NEXT: ret void +; + %def = call i32 asm sideeffect "; def $0", "=a"() + ret void +} + +define amdgpu_kernel void @kernel_uses_asm_physreg_def_tuple() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_def_tuple( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[DEF:%.*]] = call i64 asm sideeffect " +; CHECK-NEXT: ret void +; + %def = call i64 asm sideeffect "; def $0", "={a[0:1]}"() + ret void +} + +define amdgpu_kernel void @kernel_uses_asm_virtreg_second_arg() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_second_arg( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: ret void +; + call void asm sideeffect "; use $0", "v,a"(i32 poison, i32 poison) + ret void +} + +define amdgpu_kernel void @kernel_uses_non_agpr_asm() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_non_agpr_asm( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: ret 
void +; + call void asm sideeffect "; use $0", "v"(i32 poison) + ret void +} + +define amdgpu_kernel void @kernel_uses_asm_physreg() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: ret void +; + call void asm sideeffect "; use $0", "{a0}"(i32 poison) + ret void +} + +define amdgpu_kernel void @kernel_uses_asm_physreg_tuple() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_tuple( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: ret void +; + call void asm sideeffect "; use $0", "{a[0:1]}"(i64 poison) + ret void +} + +define void @func_uses_asm_virtreg_agpr() { +; CHECK-LABEL: define void @func_uses_asm_virtreg_agpr( +; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: ret void +; + call void asm sideeffect "; use $0", "a"(i32 poison) + ret void +} + +define void @func_uses_asm_physreg_agpr() { +; CHECK-LABEL: define void @func_uses_asm_physreg_agpr( +; CHECK-SAME: ) #[[ATTR2]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: ret void +; + call void asm sideeffect "; use $0", "{a0}"(i32 poison) + ret void +} + +define void @func_uses_asm_physreg_agpr_tuple() { +; CHECK-LABEL: define void @func_uses_asm_physreg_agpr_tuple( +; CHECK-SAME: ) #[[ATTR2]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: ret void +; + call void asm sideeffect "; use $0", "{a[0:1]}"(i64 poison) + ret void +} + +declare void @unknown() + +define amdgpu_kernel void @kernel_calls_extern() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern( +; CHECK-SAME: ) #[[ATTR4:[0-9]+]] { +; CHECK-NEXT: call void @unknown() +; CHECK-NEXT: ret void +; + call void @unknown() + ret void +} + +define amdgpu_kernel void @kernel_calls_extern_marked_callsite() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern_marked_callsite( +; CHECK-SAME: ) #[[ATTR4]] { +; 
CHECK-NEXT: call void @unknown() #[[ATTR9:[0-9]+]] +; CHECK-NEXT: ret void +; + call void @unknown() #0 + ret void +} + +define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) { +; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect( +; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR4]] { +; CHECK-NEXT: call void [[INDIRECT]]() +; CHECK-NEXT: ret void +; + call void %indirect() + ret void +} + +define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(ptr %indirect) { +; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect_marked_callsite( +; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR4]] { +; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR9]] +; CHECK-NEXT: ret void +; + call void %indirect() #0 + ret void +} + +define amdgpu_kernel void @kernel_transitively_uses_agpr_asm() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_transitively_uses_agpr_asm( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @func_uses_asm_physreg_agpr() +; CHECK-NEXT: ret void +; + call void @func_uses_asm_physreg_agpr() + ret void +} + +define void @empty() { +; CHECK-LABEL: define void @empty( +; CHECK-SAME: ) #[[ATTR5:[0-9]+]] { +; CHECK-NEXT: ret void +; + ret void +} + +define void @also_empty() { +; CHECK-LABEL: define void @also_empty( +; CHECK-SAME: ) #[[ATTR5]] { +; CHECK-NEXT: ret void +; + ret void +} + +define amdgpu_kernel void @kernel_calls_empty() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_empty( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @empty() +; CHECK-NEXT: ret void +; + call void @empty() + ret void +} + +define amdgpu_kernel void @kernel_calls_non_agpr_and_agpr() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_non_agpr_and_agpr( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @empty() +; CHECK-NEXT: call void @func_uses_asm_physreg_agpr() +; CHECK-NEXT: ret void +; + call void @empty() + call void @func_uses_asm_physreg_agpr() + ret void +} + +define amdgpu_kernel void 
@kernel_calls_generic_intrinsic(ptr %ptr0, ptr %ptr1, i64 %size) { +; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_generic_intrinsic( +; CHECK-SAME: ptr [[PTR0:%.*]], ptr [[PTR1:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[PTR0]], ptr [[PTR1]], i64 [[SIZE]], i1 false) +; CHECK-NEXT: ret void +; + call void @llvm.memcpy.p0.p0.i64(ptr %ptr0, ptr %ptr1, i64 %size, i1 false) + ret void +} + +declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32 immarg, i32 immarg, i32 immarg) + +define amdgpu_kernel void @kernel_calls_mfma.f32.32x32x1f32(ptr addrspace(1) %out, float %a, float %b, <32 x float> %c) { +; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_mfma.f32.32x32x1f32( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], float [[A:%.*]], float [[B:%.*]], <32 x float> [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[RESULT:%.*]] = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float [[A]], float [[B]], <32 x float> [[C]], i32 0, i32 0, i32 0) +; CHECK-NEXT: store <32 x float> [[RESULT]], ptr addrspace(1) [[OUT]], align 128 +; CHECK-NEXT: ret void +; + %result = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %a, float %b, <32 x float> %c, i32 0, i32 0, i32 0) + store <32 x float> %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @kernel_calls_workitem_id_x(ptr addrspace(1) %out) { +; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_workitem_id_x( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: store i32 [[RESULT]], ptr addrspace(1) [[OUT]], align 4 +; CHECK-NEXT: ret void +; + %result = call i32 @llvm.amdgcn.workitem.id.x() + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) { +; CHECK-LABEL: define amdgpu_kernel void @indirect_calls_none_agpr( +; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR0]] { +; 
CHECK-NEXT: [[FPTR:%.*]] = select i1 [[COND]], ptr @empty, ptr @also_empty +; CHECK-NEXT: call void [[FPTR]]() +; CHECK-NEXT: ret void +; + %fptr = select i1 %cond, ptr @empty, ptr @also_empty + call void %fptr() + ret void +} + + +attributes #0 = { "amdgpu-no-agpr" } +;. +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: 
attributes #[[ATTR4]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR6:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" } +;. diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll index af0eb23..3d4ae84d9 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll @@ -1025,33 +1025,33 @@ attributes #6 = { "enqueued-block" } ; AKF_HSA: attributes #[[ATTR8]] = { "amdgpu-calls" } ;. 
; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" 
"amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" 
"amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes 
#[[ATTR12]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" 
"amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" 
"amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" 
"amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" 
"amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" 
"amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR19]] = { nounwind sanitize_address "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" 
"amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR20]] = { nounwind sanitize_address "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR19]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" 
"uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR20]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR23:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR24:[0-9]+]] = { "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" 
"amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR26]] = { "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR27]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR27]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" 
"amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR28]] = { nounwind } ; ATTRIBUTOR_HSA: attributes #[[ATTR29]] = { "enqueued-block" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll index 9a9c28a..43cdf85 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll @@ -643,19 +643,19 @@ attributes #1 = { nounwind } ; AKF_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-stack-objects" } ;. ; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" 
"amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" 
"amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind 
"amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; 
ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes 
#[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" 
"amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ;. ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll index 6c5e58c..547ff69 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll @@ -393,17 +393,18 @@ define amdgpu_kernel void @use_get_local_size_z(ptr addrspace(1) %ptr) #1 { attributes #0 = { nounwind readnone } attributes #1 = { nounwind } +;. 
; AKF_CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ; AKF_CHECK: attributes #[[ATTR1]] = { nounwind } ;. ; ATTRIBUTOR_CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; ATTRIBUTOR_CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR3]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR4]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" 
"amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR5]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR6]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR7]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR8]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" 
"amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR9]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" 
"amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" 
"uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/copy-vgpr-clobber-spill-vgpr.mir b/llvm/test/CodeGen/AMDGPU/copy-vgpr-clobber-spill-vgpr.mir index 895185c..577d38e 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-vgpr-clobber-spill-vgpr.mir +++ b/llvm/test/CodeGen/AMDGPU/copy-vgpr-clobber-spill-vgpr.mir @@ -333,7 +333,7 @@ ret void } - attributes #0 = { "amdgpu-waves-per-eu"="4,4" } + attributes #0 = { "amdgpu-waves-per-eu"="4,4" "amdgpu-no-agpr" } ... --- diff --git a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll index 0c03419..386f9cd 100644 --- a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll @@ -35,6 +35,6 @@ define amdgpu_kernel void @test_direct_indirect_call() { ret void } ;. 
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll index 0069370..05558c5 100644 --- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll +++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll @@ -42,6 +42,6 @@ attributes #0 = { "amdgpu-no-dispatch-id" } ;. ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-no-dispatch-id" "amdgpu-stack-objects" } ;. 
-; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-dispatch-id" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll index a5792bf..4c21f87 100644 --- a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll +++ b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll @@ -258,25 +258,25 @@ attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memo ;. 
; V4: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; V4: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; V4: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V4: attributes #[[ATTR3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V4: attributes #[[ATTR4]] = { "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V4: attributes #[[ATTR5]] = { "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V4: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; V4: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V4: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V4: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V4: attributes #[[ATTR5]] = { "amdgpu-no-agpr" 
"amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } ;. ; V5: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; V5: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; V5: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V5: attributes #[[ATTR3]] = { "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V5: attributes #[[ATTR4]] = { "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V5: attributes #[[ATTR5]] = { "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V5: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; V5: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V5: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V5: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" 
"uniform-work-group-size"="false" } +; V5: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } ;. ; V6: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; V6: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; V6: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V6: attributes #[[ATTR3]] = { "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V6: attributes #[[ATTR4]] = { "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V6: attributes #[[ATTR5]] = { "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V6: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; V6: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V6: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V6: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" 
"amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V6: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } ;. ; V4: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 400} ;. diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll index e7488e0..20edbd6 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll @@ -157,27 +157,27 @@ define amdgpu_kernel void @test_preload_hint_kernel_1_call_func(ptr %0) #0 { define amdgpu_kernel void @test_preload_hint_kernel_1_call_intrinsic(i16 %0) #0 { ; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic -; NO-PRELOAD-SAME: (i16 [[TMP0:%.*]]) #[[ATTR2]] { +; NO-PRELOAD-SAME: (i16 [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { ; NO-PRELOAD-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) ; NO-PRELOAD-NEXT: ret void ; ; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic -; PRELOAD-1-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR2]] { +; PRELOAD-1-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { ; PRELOAD-1-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) ; PRELOAD-1-NEXT: ret void ; ; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic -; PRELOAD-3-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR2]] { +; PRELOAD-3-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { ; PRELOAD-3-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) ; PRELOAD-3-NEXT: ret void ; ; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic -; PRELOAD-16-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR2]] { +; PRELOAD-16-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { ; PRELOAD-16-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) ; PRELOAD-16-NEXT: ret void ; ; PRELOAD-20-LABEL: define 
{{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic -; PRELOAD-20-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR2]] { +; PRELOAD-20-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { ; PRELOAD-20-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) ; PRELOAD-20-NEXT: ret void ; @@ -235,23 +235,23 @@ define amdgpu_kernel void @test_preload_hint_kernel_2_preexisting(i32 inreg %0, define amdgpu_kernel void @test_preload_hint_kernel_incompatible_attributes(ptr addrspace(4) byref(i32) %0, ptr nest %1) { ; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes -; NO-PRELOAD-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { +; NO-PRELOAD-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { ; NO-PRELOAD-NEXT: ret void ; ; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes -; PRELOAD-1-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { +; PRELOAD-1-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { ; PRELOAD-1-NEXT: ret void ; ; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes -; PRELOAD-3-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { +; PRELOAD-3-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { ; PRELOAD-3-NEXT: ret void ; ; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes -; PRELOAD-16-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { +; PRELOAD-16-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { ; PRELOAD-16-NEXT: ret void ; ; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes -; PRELOAD-20-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { +; 
PRELOAD-20-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { ; PRELOAD-20-NEXT: ret void ; ret void diff --git a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll index d92ba77..d070dc3 100644 --- a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll @@ -203,13 +203,13 @@ attributes #5 = { "amdgpu-flat-work-group-size"="128,512" } attributes #6 = { "amdgpu-flat-work-group-size"="512,512" } attributes #7 = { "amdgpu-flat-work-group-size"="64,256" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="64,128" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" 
"amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="128,128" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="64,256" "amdgpu-no-completion-action" 
"amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="128,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" 
"uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="64,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="128,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" 
"amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="64,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="128,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" 
"amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll index 2df219b..f62f1d5 100644 --- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll @@ -399,26 +399,26 @@ attributes #17 = { "amdgpu-waves-per-eu"="5,8" } attributes #18 = { "amdgpu-waves-per-eu"="9,10" } attributes #19 = { "amdgpu-waves-per-eu"="8,9" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,2" "uniform-work-group-size"="false" } -; 
CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,4" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,9" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" 
"amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,2" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" 
"amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR10]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR11]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="0,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR12]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" 
"amdgpu-waves-per-eu"="1,123" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR13]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR14]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR15]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,9" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR16]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" 
"amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR17]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR18]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR19]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR20]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" 
"amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR21]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,2" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { 
"amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,4" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,9" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" 
"amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,2" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" } +; CHECK: 
attributes #[[ATTR9]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR10]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR11]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="0,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR12]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" 
"amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,123" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR13]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR14]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR15]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,9" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR16]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" 
"amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR17]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR18]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR19]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" 
"amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR20]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR21]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll b/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll index eaef63b..c1d647c 100644 --- a/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll +++ b/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll @@ -19,5 +19,5 @@ define void @hoge() { ret void } ;. 
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll index 297a056..384a9c4 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll @@ -191,11 +191,11 @@ define amdgpu_kernel void @kernel_lds_recursion() { !1 = !{i32 1, !"amdhsa_code_object_version", i32 400} ;. 
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" 
"amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="4" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" 
"amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } ; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll index f229f33..539cfc7 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll @@ -73,7 +73,7 @@ define amdgpu_kernel void @test_simple_indirect_call() { ;. ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-stack-objects" } ;. -; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll index 8d5dc79..049db01 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll @@ -31,6 +31,6 @@ define amdgpu_kernel void @kernel1() #1 { attributes #0 = { "uniform-work-group-size"="true" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes 
#[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll index 7a6f82d..c9387f1 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll @@ -98,7 +98,7 @@ define amdgpu_kernel void @kernel2() #0 { attributes #0 = { "uniform-work-group-size"="true" } ;. ; CHECK: attributes #[[ATTR0]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" 
"amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR2]] = { "uniform-work-group-size"="true" } -; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll index c04154c..7183da2 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll @@ -41,6 +41,6 @@ define amdgpu_kernel void @kernel3() #2 { attributes #2 = { "uniform-work-group-size"="true" } ;. 
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" 
"amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll index 2d5ff04..6ed04cf 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll @@ -41,7 +41,7 @@ define amdgpu_kernel void @kernel2() #2 { attributes #1 = { "uniform-work-group-size"="true" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" 
"amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll index e8bf6fc..d5ba2fd 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll @@ -52,8 +52,8 @@ attributes #0 = { nounwind } attributes #1 = { "uniform-work-group-size"="false" } attributes #2 = { "uniform-work-group-size"="true" } ;. -; CHECK: attributes #[[ATTR0]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" 
"amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR3]] = { "uniform-work-group-size"="true" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll index 473eea4..7f0dfea 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll @@ -101,7 +101,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %m) #1 { attributes #0 = { nounwind readnone } attributes #1 = { "uniform-work-group-size"="true" } ;. 
-; CHECK: attributes #[[ATTR0]] = { nounwind memory(none) "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { nounwind memory(none) "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR0]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" 
"amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll index 221f1a1..8616c73 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll @@ -61,6 +61,6 @@ define amdgpu_kernel void @kernel3() #0 { attributes #0 = { "uniform-work-group-size"="false" } ;. 
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" 
"amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll index 717d3d9..0407994 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll @@ -540,6 +540,7 @@ define internal void @use512vgprs() { } define void @foo() #0 { + call void asm sideeffect "; use $0", "a"(i32 0) ret void } -- cgit v1.1 From ccb3a8feaa5b132dc829e55e069dde62008df4a8 Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Thu, 21 Mar 2024 11:28:35 +0100 Subject: [AMDGPU][LowerModuleLDS] Refactor partially lowered module detection (#85793) Refactor the logic that checks if a module contains mixed absolute/non-lowered LDS GVs. The check now happens latter when the "worklists" are formed. This is because in some cases (OpenMP) we can have non-lowered GVs in a lowered module, and this is normal because those GVs are just unused and removed from the list at some point before the end of `getUsesOfLDSByFunction`. Doing the check later ensures that if a mixed module is spotted, then it's a _real_ mixed module that needs rejection, not a module containing an intentionally ignored GV. 
--- .../AMDGPU/lds-mixed-absolute-addresses-unused.ll | 26 ++++++++++++++++++++++ .../AMDGPU/lds-reject-mixed-absolute-addresses.ll | 2 +- 2 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/lds-mixed-absolute-addresses-unused.ll (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/lds-mixed-absolute-addresses-unused.ll b/llvm/test/CodeGen/AMDGPU/lds-mixed-absolute-addresses-unused.ll new file mode 100644 index 0000000..d101d8d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds-mixed-absolute-addresses-unused.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s + +; This looks like a partially lowered module, but the non-lowered GV isn't used by any kernels. +; In such cases, LowerModuleLDS is free to leave it in and ignore it, and we want to make sure +; LowerModuleLDS doesn't crash if it re-runs on such modules. 
+@notLowered = addrspace(3) global i32 poison +@lowered = addrspace(3) global i32 poison, !absolute_symbol !0 + +@llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @notLowered to ptr)], section "llvm.metadata" + +define amdgpu_kernel void @kern(i32 %val0) { +; CHECK-LABEL: define amdgpu_kernel void @kern( +; CHECK-SAME: i32 [[VAL0:%.*]]) { +; CHECK-NEXT: [[VAL1:%.*]] = add i32 [[VAL0]], 4 +; CHECK-NEXT: store i32 [[VAL1]], ptr addrspace(3) @lowered, align 4 +; CHECK-NEXT: ret void +; + %val1 = add i32 %val0, 4 + store i32 %val1, ptr addrspace(3) @lowered + ret void +} + + +!0 = !{i32 0, i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/lds-reject-mixed-absolute-addresses.ll b/llvm/test/CodeGen/AMDGPU/lds-reject-mixed-absolute-addresses.ll index b512a43..b1f4f2e 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-reject-mixed-absolute-addresses.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-reject-mixed-absolute-addresses.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @kern() { %val0 = load i32, ptr addrspace(3) @var1 %val1 = add i32 %val0, 4 - store i32 %val1, ptr addrspace(3) @var1 + store i32 %val1, ptr addrspace(3) @var2 ret void } -- cgit v1.1 From 95a834a16c3de0de615d0cfa20a6c8bd973b6a1d Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Thu, 21 Mar 2024 11:44:47 +0100 Subject: (Reland) [AMDGPU] Run LowerLDS at the end of the fullLTO pipeline (#85626) Reland of #75333 --- llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll | 47 ++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll new file mode 100644 index 0000000..f1d9463 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll @@ -0,0 +1,47 @@ + +; Default O0 +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc +; RUN: llvm-lto2 run -O0 
-cg-opt-level 0 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s + +; Unified O0 +; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc +; RUN: llvm-lto2 run -unified-lto=full -O0 -cg-opt-level 0 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s + +; Default O1 +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc +; RUN: llvm-lto2 run -O1 -cg-opt-level 1 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s + +; Unified O1 +; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc +; RUN: llvm-lto2 run -unified-lto=full -O1 -cg-opt-level 1 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s + +; Default O2 +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc +; RUN: llvm-lto2 run -O2 -cg-opt-level 2 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s + +; Unified O2 +; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc +; RUN: llvm-lto2 run -unified-lto=full -O2 -cg-opt-level 2 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s + +; Default O3 +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc +; RUN: llvm-lto2 run -O3 -cg-opt-level 3 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s + +; Unified O3 +; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc +; RUN: llvm-lto2 run -unified-lto=full -O3 -cg-opt-level 3 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s + +; First print will be from the New PM during the full LTO pipeline. +; Second print will be from the legacy PM during the CG pipeline. 
+ +; CHECK: Running pass: AMDGPULowerModuleLDSPass on [module] +; CHECK: ModulePass Manager +; CHECK: Lower uses of LDS variables from non-kernel functions + +@lds = internal unnamed_addr addrspace(3) global i32 poison, align 4 + +define amdgpu_kernel void @test() { +entry: + store i32 1, ptr addrspace(3) @lds + ret void +} -- cgit v1.1 From 3ac243bc0d7922d083af2cf025247b5698556062 Mon Sep 17 00:00:00 2001 From: SahilPatidar Date: Thu, 21 Mar 2024 16:52:08 +0530 Subject: Update amdgpu_gfx functions to use s0-s3 for inreg SGPR arguments on targets using scratch instructions for stack #78226 (#81394) Resolve #78226 --- .../GlobalISel/irtranslator-call-non-fixed.ll | 10 +- .../CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll | 10 +- llvm/test/CodeGen/AMDGPU/bf16.ll | 2 +- .../test/CodeGen/AMDGPU/combine_andor_with_cmps.ll | 24 +- llvm/test/CodeGen/AMDGPU/function-args-inreg.ll | 133 + .../CodeGen/AMDGPU/gfx-callable-argument-types.ll | 4371 +++++++++----------- llvm/test/CodeGen/AMDGPU/indirect-call.ll | 60 +- llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll | 2 +- llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll | 4 +- 9 files changed, 2060 insertions(+), 2556 deletions(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll index 5effd24..fad833c0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll @@ -50,10 +50,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg - ; CHECK-NEXT: $sgpr4 = COPY [[C]](s32) + ; CHECK-NEXT: $sgpr0 = COPY [[C]](s32) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY 
$sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>) - ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; CHECK-NEXT: SI_RETURN call amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg 42) @@ -99,11 +99,11 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() # ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16) - ; CHECK-NEXT: $sgpr4 = COPY [[ANYEXT1]](s32) - ; CHECK-NEXT: $sgpr5 = COPY [[LOAD2]](s32) + ; CHECK-NEXT: $sgpr0 = COPY [[ANYEXT1]](s32) + ; CHECK-NEXT: $sgpr1 = COPY [[LOAD2]](s32) ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>) - ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; CHECK-NEXT: SI_RETURN %ptr0 = load ptr addrspace(1), ptr addrspace(4) undef diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll index 392b0ae..7567060 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll 
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll @@ -942,10 +942,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg - ; CHECK-NEXT: $sgpr4 = COPY [[C]](s32) + ; CHECK-NEXT: $sgpr0 = COPY [[C]](s32) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>) - ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; CHECK-NEXT: SI_RETURN call amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg 42) @@ -3984,11 +3984,11 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() # ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16) - ; CHECK-NEXT: $sgpr4 = COPY [[ANYEXT1]](s32) - ; CHECK-NEXT: $sgpr5 = COPY [[LOAD2]](s32) + ; CHECK-NEXT: $sgpr0 = COPY [[ANYEXT1]](s32) + ; CHECK-NEXT: $sgpr1 = COPY [[LOAD2]](s32) ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>) - ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL 
[[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; CHECK-NEXT: SI_RETURN %ptr0 = load ptr addrspace(1), ptr addrspace(4) undef diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 9865883..e369f7e 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -3337,7 +3337,7 @@ define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) ; GFX11-LABEL: test_inreg_arg_store: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-NEXT: s_setpc_b64 s[30:31] store bfloat %in, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll index 10d71a3..e1e3220 100644 --- a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll +++ b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll @@ -472,7 +472,7 @@ define amdgpu_gfx void @test34(i32 inreg %arg1, i32 inreg %arg2) { ; GCN-LABEL: test34: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_min_i32 s0, s4, s5 +; GCN-NEXT: s_min_i32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_cmpk_lt_i32 s0, 0x3e9 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -492,7 +492,7 @@ define amdgpu_gfx void @test35(i32 inreg %arg1, i32 inreg %arg2) { ; GCN-LABEL: test35: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_max_i32 s0, s4, s5 +; GCN-NEXT: s_max_i32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_cmpk_gt_i32 s0, 0x3e8 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -512,9 +512,9 @@ define amdgpu_gfx void @test36(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3 ; GCN-LABEL: test36: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_min_u32 s0, s4, s5 +; GCN-NEXT: s_min_u32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_cmp_lt_u32 s0, s6 +; GCN-NEXT: s_cmp_lt_u32 s0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_cselect_b32 s0, -1, 0 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 @@ -532,9 +532,9 @@ define amdgpu_gfx void @test37(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3 ; GCN-LABEL: test37: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_max_i32 s0, s4, s5 +; GCN-NEXT: s_max_i32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_cmp_ge_i32 s0, s6 +; GCN-NEXT: s_cmp_ge_i32 s0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_cselect_b32 s0, -1, 0 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 @@ -552,7 +552,7 @@ define amdgpu_gfx void @test38(i32 inreg %arg1, i32 inreg %arg2) { ; GCN-LABEL: test38: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_max_u32 s0, s4, s5 +; GCN-NEXT: s_max_u32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_cmpk_lt_u32 s0, 0x3e9 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -572,7 +572,7 @@ define amdgpu_gfx void @test39(i32 inreg %arg1, i32 inreg %arg2) { ; GCN-LABEL: test39: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_min_i32 s0, s4, s5 +; GCN-NEXT: s_min_i32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_cmpk_gt_i32 s0, 0x3e7 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -592,9 +592,9 @@ define amdgpu_gfx void @test40(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3 ; GCN-LABEL: test40: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_max_i32 s0, s4, s5 +; GCN-NEXT: s_max_i32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_cmp_le_i32 s0, s6 +; GCN-NEXT: s_cmp_le_i32 s0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_cselect_b32 s0, -1, 0 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 @@ -612,9 +612,9 @@ define amdgpu_gfx void @test41(i32 inreg %arg1, i32 inreg 
%arg2, i32 inreg %arg3 ; GCN-LABEL: test41: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_min_u32 s0, s4, s5 +; GCN-NEXT: s_min_u32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_cmp_ge_u32 s0, s6 +; GCN-NEXT: s_cmp_ge_u32 s0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_cselect_b32 s0, -1, 0 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll index 44a9127..27845b6 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll @@ -2176,6 +2176,93 @@ define void @void_func_a5i32_inreg([5 x i32] inreg %arg0, ptr addrspace(1) %ptr) declare void @extern() define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %ptr) { +; GFX9-LABEL: void_func_a13i32_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s27, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[28:29], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[28:29] +; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: global_store_dword v[0:1], v2, off offset:48 +; GFX9-NEXT: v_mov_b32_e32 v5, s25 +; GFX9-NEXT: v_mov_b32_e32 v4, s24 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:32 +; GFX9-NEXT: v_writelane_b32 v40, s27, 2 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, extern@gotpcrel32@lo+4 +; GFX9-NEXT: 
s_addc_u32 s17, s17, extern@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s4, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_a13i32_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s23, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s24, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s24 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v3, s19 +; GFX11-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v9, s17 +; GFX11-NEXT: s_getpc_b64 s[18:19] +; GFX11-NEXT: s_add_u32 s18, s18, extern@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s19, s19, extern@gotpcrel32@hi+12 +; GFX11-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_load_b64 s[16:17], s[18:19], 0x0 +; GFX11-NEXT: v_writelane_b32 v40, s23, 2 +; GFX11-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v5, s21 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v13, s3 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v11, s1 +; GFX11-NEXT: v_mov_b32_e32 v10, s0 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b32 v[0:1], v14, off offset:48 +; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off offset:32 +; 
GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] store [13 x i32] %arg0, ptr addrspace(1) %ptr call void @extern() ret void @@ -2203,6 +2290,52 @@ define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %p ; FIXME: Should still fail define void @void_func_a16i32_inreg__noimplicit([16 x i32] inreg %arg0, ptr addrspace(1) %ptr) { +; GFX9-LABEL: void_func_a16i32_inreg__noimplicit: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:48 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: v_mov_b32_e32 v4, s14 +; GFX9-NEXT: v_mov_b32_e32 v3, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s12 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: global_store_dwordx4 v[0:1], 
v[2:5], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_a16i32_inreg__noimplicit: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v5, s15 :: v_dual_mov_b32 v4, s14 +; GFX11-NEXT: v_dual_mov_b32 v3, s13 :: v_dual_mov_b32 v2, s12 +; GFX11-NEXT: v_dual_mov_b32 v9, s11 :: v_dual_mov_b32 v8, s10 +; GFX11-NEXT: v_dual_mov_b32 v7, s9 :: v_dual_mov_b32 v6, s8 +; GFX11-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX11-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GFX11-NEXT: v_dual_mov_b32 v17, s3 :: v_dual_mov_b32 v16, s2 +; GFX11-NEXT: v_dual_mov_b32 v15, s1 :: v_dual_mov_b32 v14, s0 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off offset:48 +; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:32 +; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off offset:16 +; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off +; GFX11-NEXT: s_setpc_b64 s[30:31] store [16 x i32] %arg0, ptr addrspace(1) %ptr ret void } diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index a118fa3..3e1db5f 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -9567,19 +9567,17 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 3 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_i8_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_i8_inreg@abs32@lo -; GFX9-NEXT: s_movk_i32 s4, 0x7b 
+; GFX9-NEXT: s_movk_i32 s0, 0x7b ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 2 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 2 -; GFX9-NEXT: v_readlane_b32 s30, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 3 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -9597,19 +9595,17 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_i8_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_i8_inreg@abs32@lo +; GFX10-NEXT: s_movk_i32 s0, 0x7b ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_movk_i32 s4, 0x7b -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-NEXT: v_readlane_b32 s30, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 3 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -9627,20 +9623,18 @@ define amdgpu_gfx void 
@test_call_external_void_func_i8_imm_inreg(i32) #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 3 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_i8_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_i8_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s3, external_void_func_i8_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s2, external_void_func_i8_inreg@abs32@lo +; GFX11-NEXT: s_movk_i32 s0, 0x7b ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_movk_i32 s4, 0x7b -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 -; GFX11-NEXT: v_writelane_b32 v40, s31, 2 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 2 -; GFX11-NEXT: v_readlane_b32 s30, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 3 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -9658,19 +9652,17 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i8_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i8_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 
external_void_func_i8_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_i8_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: s_movk_i32 s0, 0x7b ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -9692,19 +9684,17 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 3 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_i16_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_i16_inreg@abs32@lo -; GFX9-NEXT: s_movk_i32 s4, 0x7b +; GFX9-NEXT: s_movk_i32 s0, 0x7b ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 2 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 2 -; 
GFX9-NEXT: v_readlane_b32 s30, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 3 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -9722,19 +9712,17 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_i16_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_i16_inreg@abs32@lo +; GFX10-NEXT: s_movk_i32 s0, 0x7b ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_movk_i32 s4, 0x7b -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-NEXT: v_readlane_b32 s30, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 3 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -9752,20 +9740,18 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 3 -; GFX11-NEXT: s_mov_b32 s1, 
external_void_func_i16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_i16_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s3, external_void_func_i16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s2, external_void_func_i16_inreg@abs32@lo +; GFX11-NEXT: s_movk_i32 s0, 0x7b ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_movk_i32 s4, 0x7b -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 -; GFX11-NEXT: v_writelane_b32 v40, s31, 2 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 2 -; GFX11-NEXT: v_readlane_b32 s30, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 3 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -9783,19 +9769,17 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_i16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_i16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: s_movk_i32 s0, 0x7b ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: 
s_movk_i32 s4, 0x7b -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -9817,19 +9801,17 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 3 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_i32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s4, 42 +; GFX9-NEXT: s_mov_b32 s0, 42 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 2 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 2 -; GFX9-NEXT: v_readlane_b32 s30, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 3 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 
s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -9847,19 +9829,17 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_i32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_i32_inreg@abs32@lo +; GFX10-NEXT: s_mov_b32 s0, 42 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 42 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-NEXT: v_readlane_b32 s30, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 3 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -9877,20 +9857,18 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 3 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_i32_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s3, external_void_func_i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s2, 
external_void_func_i32_inreg@abs32@lo +; GFX11-NEXT: s_mov_b32 s0, 42 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 42 -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 -; GFX11-NEXT: v_writelane_b32 v40, s31, 2 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 2 -; GFX11-NEXT: v_readlane_b32 s30, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 3 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -9908,19 +9886,17 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_i32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 42 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 42 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-SCRATCH-NEXT: 
v_readlane_b32 s30, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -9942,22 +9918,18 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 4 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_i64_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_i64_inreg@abs32@lo -; GFX9-NEXT: s_movk_i32 s4, 0x7b -; GFX9-NEXT: s_mov_b32 s5, 0 +; GFX9-NEXT: s_movk_i32 s0, 0x7b +; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 3 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 3 -; GFX9-NEXT: v_readlane_b32 s30, v40, 2 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 4 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 
exec, s[36:37] @@ -9975,22 +9947,18 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 4 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_i64_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_i64_inreg@abs32@lo +; GFX10-NEXT: s_movk_i32 s0, 0x7b +; GFX10-NEXT: s_mov_b32 s1, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_movk_i32 s4, 0x7b -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-NEXT: v_readlane_b32 s30, v40, 2 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 4 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -10008,23 +9976,19 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 4 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_i64_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_i64_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s3, external_void_func_i64_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s2, 
external_void_func_i64_inreg@abs32@lo +; GFX11-NEXT: s_movk_i32 s0, 0x7b +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_movk_i32 s4, 0x7b -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 2 -; GFX11-NEXT: v_writelane_b32 v40, s31, 3 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 3 -; GFX11-NEXT: v_readlane_b32 s30, v40, 2 -; GFX11-NEXT: v_readlane_b32 s5, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 4 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -10042,22 +10006,18 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i64_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i64_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_i64_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_i64_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: s_movk_i32 s0, 0x7b +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: 
s_movk_i32 s4, 0x7b -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -10079,26 +10039,23 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 6 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 4 +; GFX9-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 5 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s36 +; GFX9-NEXT: s_mov_b32 s1, s37 +; GFX9-NEXT: s_mov_b32 
s2, s38 +; GFX9-NEXT: s_mov_b32 s3, s39 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 5 -; GFX9-NEXT: v_readlane_b32 s30, v40, 4 -; GFX9-NEXT: v_readlane_b32 s7, v40, 3 -; GFX9-NEXT: v_readlane_b32 s6, v40, 2 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 6 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -10116,26 +10073,23 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 6 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo -; GFX10-NEXT: v_writelane_b32 v40, s30, 4 -; GFX10-NEXT: v_writelane_b32 v40, s31, 5 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s0, s36 +; GFX10-NEXT: s_mov_b32 s1, s37 +; GFX10-NEXT: s_mov_b32 s2, s38 +; GFX10-NEXT: s_mov_b32 s3, s39 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 5 -; GFX10-NEXT: v_readlane_b32 s30, v40, 4 -; GFX10-NEXT: 
v_readlane_b32 s7, v40, 3 -; GFX10-NEXT: v_readlane_b32 s6, v40, 2 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 6 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -10153,27 +10107,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 6 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 +; GFX11-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: v_writelane_b32 v40, s6, 2 -; GFX11-NEXT: v_writelane_b32 v40, s7, 3 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i64_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i64_inreg@abs32@lo -; GFX11-NEXT: v_writelane_b32 v40, s30, 4 -; GFX11-NEXT: v_writelane_b32 v40, s31, 5 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 5 -; GFX11-NEXT: v_readlane_b32 s30, v40, 4 -; GFX11-NEXT: v_readlane_b32 s7, v40, 3 -; GFX11-NEXT: v_readlane_b32 s6, v40, 2 -; GFX11-NEXT: v_readlane_b32 s5, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, 
v40, 6 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -10191,26 +10137,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i64_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i64_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_swappc_b64 
s[30:31], s[34:35] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -10233,28 +10171,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 6 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v40, s6, 2 -; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: v_writelane_b32 v40, s30, 4 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s4, 1 -; GFX9-NEXT: s_mov_b32 s5, 2 -; GFX9-NEXT: s_mov_b32 s6, 3 -; GFX9-NEXT: s_mov_b32 s7, 4 +; GFX9-NEXT: s_mov_b32 s0, 1 +; GFX9-NEXT: s_mov_b32 s1, 2 +; GFX9-NEXT: s_mov_b32 s2, 3 +; GFX9-NEXT: s_mov_b32 s3, 4 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 5 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 5 -; GFX9-NEXT: v_readlane_b32 s30, v40, 4 -; GFX9-NEXT: v_readlane_b32 s7, v40, 3 -; GFX9-NEXT: v_readlane_b32 s6, v40, 2 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 6 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; 
GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -10272,28 +10202,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 6 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo +; GFX10-NEXT: s_mov_b32 s0, 1 +; GFX10-NEXT: s_mov_b32 s1, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_mov_b32 s2, 3 +; GFX10-NEXT: s_mov_b32 s3, 4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2 -; GFX10-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-NEXT: s_mov_b32 s6, 3 -; GFX10-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-NEXT: s_mov_b32 s7, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 4 -; GFX10-NEXT: v_writelane_b32 v40, s31, 5 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 5 -; GFX10-NEXT: v_readlane_b32 s30, v40, 4 -; GFX10-NEXT: v_readlane_b32 s7, v40, 3 -; GFX10-NEXT: v_readlane_b32 s6, v40, 2 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 6 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -10311,29 +10233,21 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 
-; GFX11-NEXT: v_writelane_b32 v40, s0, 6 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i64_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i64_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo +; GFX11-NEXT: s_mov_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s1, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_mov_b32 s2, 3 +; GFX11-NEXT: s_mov_b32 s3, 4 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2 -; GFX11-NEXT: v_writelane_b32 v40, s6, 2 -; GFX11-NEXT: s_mov_b32 s6, 3 -; GFX11-NEXT: v_writelane_b32 v40, s7, 3 -; GFX11-NEXT: s_mov_b32 s7, 4 -; GFX11-NEXT: v_writelane_b32 v40, s30, 4 -; GFX11-NEXT: v_writelane_b32 v40, s31, 5 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 5 -; GFX11-NEXT: v_readlane_b32 s30, v40, 4 -; GFX11-NEXT: v_readlane_b32 s7, v40, 3 -; GFX11-NEXT: v_readlane_b32 s6, v40, 2 -; GFX11-NEXT: v_readlane_b32 s5, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 6 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -10351,28 +10265,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 
6 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i64_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i64_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 4 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -10394,32 +10300,29 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 { ; 
GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 8 +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v40, s6, 2 -; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s8, 4 -; GFX9-NEXT: v_writelane_b32 v40, s9, 5 -; GFX9-NEXT: v_writelane_b32 v40, s30, 6 +; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i64_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i64_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s8, 1 -; GFX9-NEXT: s_mov_b32 s9, 2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s36 +; GFX9-NEXT: s_mov_b32 s1, s37 +; GFX9-NEXT: s_mov_b32 s2, s38 +; GFX9-NEXT: s_mov_b32 s3, s39 +; GFX9-NEXT: s_mov_b32 s4, 1 +; GFX9-NEXT: s_mov_b32 s5, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 7 +; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 7 -; GFX9-NEXT: v_readlane_b32 s30, v40, 6 -; GFX9-NEXT: v_readlane_b32 s9, v40, 5 -; GFX9-NEXT: v_readlane_b32 s8, v40, 4 -; GFX9-NEXT: v_readlane_b32 s7, v40, 3 -; GFX9-NEXT: v_readlane_b32 s6, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 +; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 8 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -10437,32 +10340,29 @@ 
define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 8 +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i64_inreg@abs32@hi +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i64_inreg@abs32@lo -; GFX10-NEXT: v_writelane_b32 v40, s8, 4 -; GFX10-NEXT: s_mov_b32 s8, 1 -; GFX10-NEXT: v_writelane_b32 v40, s9, 5 -; GFX10-NEXT: s_mov_b32 s9, 2 -; GFX10-NEXT: v_writelane_b32 v40, s30, 6 -; GFX10-NEXT: v_writelane_b32 v40, s31, 7 +; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s0, s36 +; GFX10-NEXT: s_mov_b32 s1, s37 +; GFX10-NEXT: s_mov_b32 s2, s38 +; GFX10-NEXT: s_mov_b32 s3, s39 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 7 -; GFX10-NEXT: v_readlane_b32 s30, v40, 6 -; GFX10-NEXT: v_readlane_b32 s9, v40, 5 -; GFX10-NEXT: v_readlane_b32 s8, v40, 4 -; GFX10-NEXT: v_readlane_b32 s7, v40, 3 -; GFX10-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 8 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 
s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -10480,33 +10380,25 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 8 +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 -; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_mov_b32 s35, external_void_func_v3i64_inreg@abs32@hi +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s34, external_void_func_v3i64_inreg@abs32@lo ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: v_writelane_b32 v40, s6, 2 -; GFX11-NEXT: v_writelane_b32 v40, s7, 3 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i64_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i64_inreg@abs32@lo -; GFX11-NEXT: v_writelane_b32 v40, s8, 4 -; GFX11-NEXT: s_mov_b32 s8, 1 -; GFX11-NEXT: v_writelane_b32 v40, s9, 5 -; GFX11-NEXT: s_mov_b32 s9, 2 -; GFX11-NEXT: v_writelane_b32 v40, s30, 6 -; GFX11-NEXT: v_writelane_b32 v40, s31, 7 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_mov_b32 s5, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 7 -; GFX11-NEXT: v_readlane_b32 s30, v40, 6 -; GFX11-NEXT: v_readlane_b32 s9, v40, 5 -; GFX11-NEXT: v_readlane_b32 s8, v40, 4 -; GFX11-NEXT: v_readlane_b32 s7, v40, 3 -; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: 
v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 8 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -10524,32 +10416,24 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v3i64_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v3i64_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i64_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i64_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 -; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5 -; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 6 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 7 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 7 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 6 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 -; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 8 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -10574,38 +10458,35 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 10 +; GFX9-NEXT: v_writelane_b32 v40, s34, 6 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_writelane_b32 v40, s8, 4 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s9, 5 -; GFX9-NEXT: v_writelane_b32 v40, s10, 6 -; GFX9-NEXT: v_writelane_b32 v40, s11, 7 -; GFX9-NEXT: v_writelane_b32 v40, s30, 8 +; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i64_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i64_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s8, 1 -; GFX9-NEXT: s_mov_b32 s9, 2 -; GFX9-NEXT: s_mov_b32 s10, 3 -; GFX9-NEXT: s_mov_b32 s11, 4 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 9 -; GFX9-NEXT: s_swappc_b64 
s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 9 -; GFX9-NEXT: v_readlane_b32 s30, v40, 8 -; GFX9-NEXT: v_readlane_b32 s11, v40, 7 -; GFX9-NEXT: v_readlane_b32 s10, v40, 6 -; GFX9-NEXT: v_readlane_b32 s9, v40, 5 -; GFX9-NEXT: v_readlane_b32 s8, v40, 4 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s36 +; GFX9-NEXT: s_mov_b32 s1, s37 +; GFX9-NEXT: s_mov_b32 s2, s38 +; GFX9-NEXT: s_mov_b32 s3, s39 +; GFX9-NEXT: s_mov_b32 s4, 1 +; GFX9-NEXT: s_mov_b32 s5, 2 +; GFX9-NEXT: s_mov_b32 s6, 3 +; GFX9-NEXT: s_mov_b32 s7, 4 +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s31, 5 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: v_readlane_b32 s31, v40, 5 +; GFX9-NEXT: v_readlane_b32 s30, v40, 4 ; GFX9-NEXT: v_readlane_b32 s7, v40, 3 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 10 +; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -10623,38 +10504,35 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 10 +; GFX10-NEXT: v_writelane_b32 v40, s34, 6 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0 +; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i64_inreg@abs32@hi ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i64_inreg@abs32@lo +; GFX10-NEXT: s_mov_b32 s4, 1 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: s_mov_b32 s6, 3 ; GFX10-NEXT: v_writelane_b32 
v40, s7, 3 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 -; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i64_inreg@abs32@hi -; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i64_inreg@abs32@lo -; GFX10-NEXT: v_writelane_b32 v40, s8, 4 -; GFX10-NEXT: s_mov_b32 s8, 1 -; GFX10-NEXT: v_writelane_b32 v40, s9, 5 -; GFX10-NEXT: s_mov_b32 s9, 2 -; GFX10-NEXT: v_writelane_b32 v40, s10, 6 -; GFX10-NEXT: s_mov_b32 s10, 3 -; GFX10-NEXT: v_writelane_b32 v40, s11, 7 -; GFX10-NEXT: s_mov_b32 s11, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 8 -; GFX10-NEXT: v_writelane_b32 v40, s31, 9 +; GFX10-NEXT: s_mov_b32 s7, 4 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s0, s36 +; GFX10-NEXT: s_mov_b32 s1, s37 +; GFX10-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-NEXT: s_mov_b32 s2, s38 +; GFX10-NEXT: s_mov_b32 s3, s39 +; GFX10-NEXT: v_writelane_b32 v40, s31, 5 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 9 -; GFX10-NEXT: v_readlane_b32 s30, v40, 8 -; GFX10-NEXT: v_readlane_b32 s11, v40, 7 -; GFX10-NEXT: v_readlane_b32 s10, v40, 6 -; GFX10-NEXT: v_readlane_b32 s9, v40, 5 -; GFX10-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-NEXT: v_readlane_b32 s30, v40, 4 ; GFX10-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 10 +; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -10672,39 +10550,31 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 10 +; GFX11-NEXT: v_writelane_b32 v40, s0, 6 ; GFX11-NEXT: s_mov_b64 
s[0:1], 0 -; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_mov_b32 s35, external_void_func_v4i64_inreg@abs32@hi +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s34, external_void_func_v4i64_inreg@abs32@lo ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: s_mov_b32 s6, 3 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i64_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i64_inreg@abs32@lo -; GFX11-NEXT: v_writelane_b32 v40, s8, 4 -; GFX11-NEXT: s_mov_b32 s8, 1 -; GFX11-NEXT: v_writelane_b32 v40, s9, 5 -; GFX11-NEXT: s_mov_b32 s9, 2 -; GFX11-NEXT: v_writelane_b32 v40, s10, 6 -; GFX11-NEXT: s_mov_b32 s10, 3 -; GFX11-NEXT: v_writelane_b32 v40, s11, 7 -; GFX11-NEXT: s_mov_b32 s11, 4 -; GFX11-NEXT: v_writelane_b32 v40, s30, 8 -; GFX11-NEXT: v_writelane_b32 v40, s31, 9 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_mov_b32 s7, 4 +; GFX11-NEXT: v_writelane_b32 v40, s30, 4 +; GFX11-NEXT: v_writelane_b32 v40, s31, 5 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 9 -; GFX11-NEXT: v_readlane_b32 s30, v40, 8 -; GFX11-NEXT: v_readlane_b32 s11, v40, 7 -; GFX11-NEXT: v_readlane_b32 s10, v40, 6 -; GFX11-NEXT: v_readlane_b32 s9, v40, 5 -; GFX11-NEXT: v_readlane_b32 s8, v40, 4 +; GFX11-NEXT: v_readlane_b32 s31, v40, 5 +; GFX11-NEXT: v_readlane_b32 s30, v40, 4 ; GFX11-NEXT: v_readlane_b32 s7, v40, 3 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 10 +; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 
v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -10722,38 +10592,30 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 10 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v4i64_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v4i64_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i64_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i64_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 -; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5 -; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s10, 6 -; GFX10-SCRATCH-NEXT: s_mov_b32 s10, 3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s11, 7 -; GFX10-SCRATCH-NEXT: s_mov_b32 s11, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 8 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 9 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 8 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s11, v40, 7 -; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s10, v40, 6 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 10 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -10777,19 +10639,17 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 3 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_f16_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_f16_inreg@abs32@lo -; GFX9-NEXT: s_movk_i32 s4, 0x4400 +; GFX9-NEXT: s_movk_i32 s0, 0x4400 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 2 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 2 -; GFX9-NEXT: v_readlane_b32 s30, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 3 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: 
v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -10807,19 +10667,17 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_f16_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_f16_inreg@abs32@lo +; GFX10-NEXT: s_movk_i32 s0, 0x4400 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_movk_i32 s4, 0x4400 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-NEXT: v_readlane_b32 s30, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 3 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -10837,20 +10695,18 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 3 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_f16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_f16_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s3, 
external_void_func_f16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s2, external_void_func_f16_inreg@abs32@lo +; GFX11-NEXT: s_movk_i32 s0, 0x4400 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_movk_i32 s4, 0x4400 -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 -; GFX11-NEXT: v_writelane_b32 v40, s31, 2 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 2 -; GFX11-NEXT: v_readlane_b32 s30, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 3 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -10868,19 +10724,17 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_f16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_f16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_f16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_f16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: s_movk_i32 s0, 0x4400 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x4400 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 -; GFX10-SCRATCH-NEXT: s_swappc_b64 
s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -10902,19 +10756,17 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 3 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_f32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_f32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s4, 4.0 +; GFX9-NEXT: s_mov_b32 s0, 4.0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 2 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 2 -; GFX9-NEXT: v_readlane_b32 s30, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 3 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -10932,19 +10784,17 @@ define 
amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_f32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_f32_inreg@abs32@lo +; GFX10-NEXT: s_mov_b32 s0, 4.0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 4.0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-NEXT: v_readlane_b32 s30, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 3 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -10962,20 +10812,18 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 3 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_f32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_f32_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s3, external_void_func_f32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s2, external_void_func_f32_inreg@abs32@lo +; GFX11-NEXT: s_mov_b32 s0, 4.0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 4.0 -; 
GFX11-NEXT: v_writelane_b32 v40, s30, 1 -; GFX11-NEXT: v_writelane_b32 v40, s31, 2 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 2 -; GFX11-NEXT: v_readlane_b32 s30, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 3 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -10993,19 +10841,17 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_f32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_f32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_f32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_f32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 4.0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 4.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -11027,22 +10873,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 4 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2f32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2f32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s4, 1.0 -; GFX9-NEXT: s_mov_b32 s5, 2.0 +; GFX9-NEXT: s_mov_b32 s0, 1.0 +; GFX9-NEXT: s_mov_b32 s1, 2.0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 3 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 3 -; GFX9-NEXT: v_readlane_b32 s30, v40, 2 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 4 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -11060,22 +10902,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 
; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 4 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2f32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2f32_inreg@abs32@lo +; GFX10-NEXT: s_mov_b32 s0, 1.0 +; GFX10-NEXT: s_mov_b32 s1, 2.0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1.0 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2.0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-NEXT: v_readlane_b32 s30, v40, 2 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 4 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -11093,23 +10931,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 4 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2f32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2f32_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s3, external_void_func_v2f32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s2, external_void_func_v2f32_inreg@abs32@lo +; GFX11-NEXT: s_mov_b32 s0, 1.0 +; GFX11-NEXT: s_mov_b32 s1, 2.0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; 
GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1.0 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2.0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 2 -; GFX11-NEXT: v_writelane_b32 v40, s31, 3 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 3 -; GFX11-NEXT: v_readlane_b32 s30, v40, 2 -; GFX11-NEXT: v_readlane_b32 s5, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 4 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -11127,22 +10961,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2f32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2f32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v2f32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v2f32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1.0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 -; GFX10-SCRATCH-NEXT: 
v_writelane_b32 v40, s30, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -11164,25 +10994,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 5 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v40, s6, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 3 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3f32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3f32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s4, 1.0 -; GFX9-NEXT: s_mov_b32 s5, 2.0 -; GFX9-NEXT: s_mov_b32 s6, 4.0 +; GFX9-NEXT: s_mov_b32 s0, 1.0 +; GFX9-NEXT: s_mov_b32 s1, 2.0 +; GFX9-NEXT: s_mov_b32 s2, 4.0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 4 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 4 -; GFX9-NEXT: v_readlane_b32 s30, v40, 3 -; GFX9-NEXT: v_readlane_b32 s6, v40, 2 -; GFX9-NEXT: v_readlane_b32 
s5, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 5 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -11200,25 +11024,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 5 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3f32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3f32_inreg@abs32@lo +; GFX10-NEXT: s_mov_b32 s0, 1.0 +; GFX10-NEXT: s_mov_b32 s1, 2.0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_mov_b32 s2, 4.0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1.0 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2.0 -; GFX10-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-NEXT: s_mov_b32 s6, 4.0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 3 -; GFX10-NEXT: v_writelane_b32 v40, s31, 4 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 4 -; GFX10-NEXT: v_readlane_b32 s30, v40, 3 -; GFX10-NEXT: v_readlane_b32 s6, v40, 2 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 5 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -11236,26 +11054,20 @@ define 
amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 5 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3f32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3f32_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s35, external_void_func_v3f32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s34, external_void_func_v3f32_inreg@abs32@lo +; GFX11-NEXT: s_mov_b32 s0, 1.0 +; GFX11-NEXT: s_mov_b32 s1, 2.0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_mov_b32 s2, 4.0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1.0 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2.0 -; GFX11-NEXT: v_writelane_b32 v40, s6, 2 -; GFX11-NEXT: s_mov_b32 s6, 4.0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 3 -; GFX11-NEXT: v_writelane_b32 v40, s31, 4 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 4 -; GFX11-NEXT: v_readlane_b32 s30, v40, 3 -; GFX11-NEXT: v_readlane_b32 s6, v40, 2 -; GFX11-NEXT: v_readlane_b32 s5, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 5 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -11273,25 +11085,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; 
GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 5 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3f32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3f32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v3f32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v3f32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1.0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 4.0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 4.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 4 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 4 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -11313,31 +11119,23 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword 
v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 7 +; GFX9-NEXT: v_writelane_b32 v40, s34, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v40, s6, 2 -; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: v_writelane_b32 v40, s8, 4 -; GFX9-NEXT: v_writelane_b32 v40, s30, 5 +; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v5f32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v5f32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s4, 1.0 -; GFX9-NEXT: s_mov_b32 s5, 2.0 -; GFX9-NEXT: s_mov_b32 s6, 4.0 -; GFX9-NEXT: s_mov_b32 s7, -1.0 -; GFX9-NEXT: s_mov_b32 s8, 0.5 +; GFX9-NEXT: s_mov_b32 s0, 1.0 +; GFX9-NEXT: s_mov_b32 s1, 2.0 +; GFX9-NEXT: s_mov_b32 s2, 4.0 +; GFX9-NEXT: s_mov_b32 s3, -1.0 +; GFX9-NEXT: s_mov_b32 s4, 0.5 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 6 +; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 6 -; GFX9-NEXT: v_readlane_b32 s30, v40, 5 -; GFX9-NEXT: v_readlane_b32 s8, v40, 4 -; GFX9-NEXT: v_readlane_b32 s7, v40, 3 -; GFX9-NEXT: v_readlane_b32 s6, v40, 2 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s31, v40, 2 +; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 7 +; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -11355,31 +11153,23 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 7 +; GFX10-NEXT: 
v_writelane_b32 v40, s34, 3 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v5f32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v5f32_inreg@abs32@lo -; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_mov_b32 s0, 1.0 +; GFX10-NEXT: s_mov_b32 s1, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1.0 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2.0 -; GFX10-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-NEXT: s_mov_b32 s6, 4.0 -; GFX10-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-NEXT: s_mov_b32 s7, -1.0 -; GFX10-NEXT: v_writelane_b32 v40, s8, 4 -; GFX10-NEXT: s_mov_b32 s8, 0.5 -; GFX10-NEXT: v_writelane_b32 v40, s30, 5 -; GFX10-NEXT: v_writelane_b32 v40, s31, 6 +; GFX10-NEXT: s_mov_b32 s2, 4.0 +; GFX10-NEXT: s_mov_b32 s3, -1.0 +; GFX10-NEXT: s_mov_b32 s4, 0.5 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 6 -; GFX10-NEXT: v_readlane_b32 s30, v40, 5 -; GFX10-NEXT: v_readlane_b32 s8, v40, 4 -; GFX10-NEXT: v_readlane_b32 s7, v40, 3 -; GFX10-NEXT: v_readlane_b32 s6, v40, 2 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 7 +; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -11397,32 +11187,24 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 7 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v5f32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, 
external_void_func_v5f32_inreg@abs32@lo -; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: s_mov_b32 s35, external_void_func_v5f32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s34, external_void_func_v5f32_inreg@abs32@lo +; GFX11-NEXT: s_mov_b32 s0, 1.0 +; GFX11-NEXT: s_mov_b32 s1, 2.0 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1.0 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2.0 -; GFX11-NEXT: v_writelane_b32 v40, s6, 2 -; GFX11-NEXT: s_mov_b32 s6, 4.0 -; GFX11-NEXT: v_writelane_b32 v40, s7, 3 -; GFX11-NEXT: s_mov_b32 s7, -1.0 -; GFX11-NEXT: v_writelane_b32 v40, s8, 4 -; GFX11-NEXT: s_mov_b32 s8, 0.5 -; GFX11-NEXT: v_writelane_b32 v40, s30, 5 -; GFX11-NEXT: v_writelane_b32 v40, s31, 6 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_mov_b32 s2, 4.0 +; GFX11-NEXT: s_mov_b32 s3, -1.0 +; GFX11-NEXT: s_mov_b32 s4, 0.5 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 6 -; GFX11-NEXT: v_readlane_b32 s30, v40, 5 -; GFX11-NEXT: v_readlane_b32 s8, v40, 4 -; GFX11-NEXT: v_readlane_b32 s7, v40, 3 -; GFX11-NEXT: v_readlane_b32 s6, v40, 2 -; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 7 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -11440,31 +11222,23 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: 
s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 7 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v5f32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v5f32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v5f32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v5f32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1.0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 4.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s7, -1.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 -; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 0.5 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 5 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 6 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 6 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 5 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 4.0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, -1.0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0.5 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 7 +; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -11486,22 +11260,18 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 4 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_f64_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_f64_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s5, 0x40100000 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_mov_b32 s1, 0x40100000 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 3 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 3 -; GFX9-NEXT: v_readlane_b32 s30, v40, 2 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 4 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -11519,22 +11289,18 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 4 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: 
s_mov_b32 s35, external_void_func_f64_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_f64_inreg@abs32@lo +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_mov_b32 s1, 0x40100000 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 0x40100000 -; GFX10-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-NEXT: v_readlane_b32 s30, v40, 2 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 4 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -11552,23 +11318,19 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 4 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_f64_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_f64_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s3, external_void_func_f64_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s2, external_void_func_f64_inreg@abs32@lo +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_mov_b32 s1, 0x40100000 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 0x40100000 -; 
GFX11-NEXT: v_writelane_b32 v40, s30, 2 -; GFX11-NEXT: v_writelane_b32 v40, s31, 3 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 3 -; GFX11-NEXT: v_readlane_b32 s30, v40, 2 -; GFX11-NEXT: v_readlane_b32 s5, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 4 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -11586,22 +11348,18 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_f64_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_f64_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_f64_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_f64_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 0x40100000 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40100000 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 -; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -11623,28 +11381,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 6 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v40, s6, 2 -; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: v_writelane_b32 v40, s30, 4 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2f64_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2f64_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s5, 2.0 -; GFX9-NEXT: s_mov_b32 s6, 0 -; GFX9-NEXT: s_mov_b32 s7, 0x40100000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_mov_b32 s1, 2.0 +; GFX9-NEXT: s_mov_b32 s3, 0x40100000 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 5 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 5 -; GFX9-NEXT: v_readlane_b32 s30, v40, 4 -; GFX9-NEXT: v_readlane_b32 s7, v40, 3 -; GFX9-NEXT: v_readlane_b32 s6, v40, 2 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 -; GFX9-NEXT: 
v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 6 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -11662,28 +11412,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 6 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2f64_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2f64_inreg@abs32@lo +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_mov_b32 s1, 2.0 +; GFX10-NEXT: s_mov_b32 s3, 0x40100000 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2.0 -; GFX10-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-NEXT: s_mov_b32 s6, 0 -; GFX10-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-NEXT: s_mov_b32 s7, 0x40100000 -; GFX10-NEXT: v_writelane_b32 v40, s30, 4 -; GFX10-NEXT: v_writelane_b32 v40, s31, 5 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 5 -; GFX10-NEXT: v_readlane_b32 s30, v40, 4 -; GFX10-NEXT: v_readlane_b32 s7, v40, 3 -; GFX10-NEXT: v_readlane_b32 s6, v40, 2 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 6 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: 
buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -11701,29 +11443,21 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 6 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2f64_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2f64_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s35, external_void_func_v2f64_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s34, external_void_func_v2f64_inreg@abs32@lo +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_mov_b32 s1, 2.0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_mov_b32 s3, 0x40100000 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2.0 -; GFX11-NEXT: v_writelane_b32 v40, s6, 2 -; GFX11-NEXT: s_mov_b32 s6, 0 -; GFX11-NEXT: v_writelane_b32 v40, s7, 3 -; GFX11-NEXT: s_mov_b32 s7, 0x40100000 -; GFX11-NEXT: v_writelane_b32 v40, s30, 4 -; GFX11-NEXT: v_writelane_b32 v40, s31, 5 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 5 -; GFX11-NEXT: v_readlane_b32 s30, v40, 4 -; GFX11-NEXT: v_readlane_b32 s7, v40, 3 -; GFX11-NEXT: v_readlane_b32 s6, v40, 2 -; GFX11-NEXT: v_readlane_b32 s5, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 6 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte 
Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -11741,28 +11475,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2f64_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2f64_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v2f64_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v2f64_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 0x40100000 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 0x40100000 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-SCRATCH-NEXT: 
v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -11784,34 +11510,26 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 8 +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v40, s6, 2 -; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: v_writelane_b32 v40, s8, 4 -; GFX9-NEXT: v_writelane_b32 v40, s9, 5 -; GFX9-NEXT: v_writelane_b32 v40, s30, 6 +; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3f64_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3f64_inreg@abs32@lo +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_mov_b32 s1, 2.0 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_mov_b32 s3, 0x40100000 ; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s5, 2.0 -; GFX9-NEXT: s_mov_b32 s6, 0 -; GFX9-NEXT: s_mov_b32 s7, 0x40100000 -; GFX9-NEXT: s_mov_b32 s8, 0 -; GFX9-NEXT: s_mov_b32 s9, 0x40200000 +; GFX9-NEXT: s_mov_b32 s5, 0x40200000 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 7 +; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 7 -; GFX9-NEXT: v_readlane_b32 s30, v40, 6 -; GFX9-NEXT: v_readlane_b32 s9, v40, 5 -; GFX9-NEXT: v_readlane_b32 s8, v40, 4 -; GFX9-NEXT: v_readlane_b32 s7, v40, 3 -; GFX9-NEXT: v_readlane_b32 s6, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 +; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 
1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 8 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -11829,34 +11547,26 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 8 +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3f64_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3f64_inreg@abs32@lo -; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_mov_b32 s1, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: s_mov_b32 s3, 0x40100000 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2.0 -; GFX10-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-NEXT: s_mov_b32 s6, 0 -; GFX10-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-NEXT: s_mov_b32 s7, 0x40100000 -; GFX10-NEXT: v_writelane_b32 v40, s8, 4 -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: v_writelane_b32 v40, s9, 5 -; GFX10-NEXT: s_mov_b32 s9, 0x40200000 -; GFX10-NEXT: v_writelane_b32 v40, s30, 6 -; GFX10-NEXT: v_writelane_b32 v40, s31, 7 +; GFX10-NEXT: s_mov_b32 s5, 0x40200000 +; GFX10-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 7 -; GFX10-NEXT: v_readlane_b32 s30, v40, 6 -; GFX10-NEXT: v_readlane_b32 s9, v40, 5 -; GFX10-NEXT: v_readlane_b32 s8, v40, 4 -; GFX10-NEXT: v_readlane_b32 s7, v40, 3 -; GFX10-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-NEXT: 
v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 8 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -11874,35 +11584,27 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 8 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3f64_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3f64_inreg@abs32@lo -; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 +; GFX11-NEXT: s_mov_b32 s35, external_void_func_v3f64_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s34, external_void_func_v3f64_inreg@abs32@lo +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_mov_b32 s1, 2.0 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_mov_b32 s3, 0x40100000 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2.0 -; GFX11-NEXT: v_writelane_b32 v40, s6, 2 -; GFX11-NEXT: s_mov_b32 s6, 0 -; GFX11-NEXT: v_writelane_b32 v40, s7, 3 -; GFX11-NEXT: s_mov_b32 s7, 0x40100000 -; GFX11-NEXT: v_writelane_b32 v40, s8, 4 -; GFX11-NEXT: s_mov_b32 s8, 0 -; GFX11-NEXT: v_writelane_b32 v40, s9, 5 -; GFX11-NEXT: s_mov_b32 s9, 0x40200000 -; GFX11-NEXT: v_writelane_b32 v40, s30, 6 -; GFX11-NEXT: v_writelane_b32 v40, s31, 7 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_mov_b32 s5, 0x40200000 +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 
s31, v40, 7 -; GFX11-NEXT: v_readlane_b32 s30, v40, 6 -; GFX11-NEXT: v_readlane_b32 s9, v40, 5 -; GFX11-NEXT: v_readlane_b32 s8, v40, 4 -; GFX11-NEXT: v_readlane_b32 s7, v40, 3 -; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 8 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -11920,34 +11622,26 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 8 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3f64_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3f64_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v3f64_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v3f64_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 0x40100000 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 0x40100000 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 -; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 0 
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5 -; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 0x40200000 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 6 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 7 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 7 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 6 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40200000 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 8 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -11969,19 +11663,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 3 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: s_load_dword s0, s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i16_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i16_inreg@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 2 +; 
GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 2 -; GFX9-NEXT: v_readlane_b32 s30, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 3 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -11999,19 +11691,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 3 -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0 +; GFX10-NEXT: s_load_dword s0, s[34:35], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i16_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i16_inreg@abs32@lo -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-NEXT: v_readlane_b32 s30, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 3 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -12029,20 +11719,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX11-NEXT: 
s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, external_void_func_v2i16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s2, external_void_func_v2i16_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i16_inreg@abs32@lo -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 -; GFX11-NEXT: v_writelane_b32 v40, s31, 2 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 2 -; GFX11-NEXT: v_readlane_b32 s30, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 3 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -12060,19 +11748,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v2i16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v2i16_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 
s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i16_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -12095,21 +11781,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 4 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 2 -; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i16_inreg@abs32@hi -; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i16_inreg@abs32@lo +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s37, external_void_func_v3i16_inreg@abs32@hi +; GFX9-NEXT: s_mov_b32 s36, external_void_func_v3i16_inreg@abs32@lo ; GFX9-NEXT: 
s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 3 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 3 -; GFX9-NEXT: v_readlane_b32 s30, v40, 2 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 4 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s1, s35 +; GFX9-NEXT: s_mov_b32 s0, s34 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -12127,21 +11812,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 4 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX10-NEXT: s_mov_b32 s37, external_void_func_v3i16_inreg@abs32@hi +; GFX10-NEXT: s_mov_b32 s36, external_void_func_v3i16_inreg@abs32@lo ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 -; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i16_inreg@abs32@hi -; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i16_inreg@abs32@lo -; GFX10-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-NEXT: v_writelane_b32 v40, s31, 3 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-NEXT: v_readlane_b32 s30, v40, 2 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 4 +; GFX10-NEXT: v_writelane_b32 v40, 
s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s1, s35 +; GFX10-NEXT: s_mov_b32 s0, s34 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -12159,22 +11843,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 4 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, external_void_func_v3i16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s2, external_void_func_v3i16_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i16_inreg@abs32@lo -; GFX11-NEXT: v_writelane_b32 v40, s30, 2 -; GFX11-NEXT: v_writelane_b32 v40, s31, 3 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 3 -; GFX11-NEXT: v_readlane_b32 s30, v40, 2 -; GFX11-NEXT: v_readlane_b32 s5, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 4 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: 
scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -12192,21 +11872,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v3i16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v3i16_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i16_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -12229,21 +11905,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { 
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 4 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 2 -; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3f16_inreg@abs32@hi -; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3f16_inreg@abs32@lo +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s37, external_void_func_v3f16_inreg@abs32@hi +; GFX9-NEXT: s_mov_b32 s36, external_void_func_v3f16_inreg@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 3 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 3 -; GFX9-NEXT: v_readlane_b32 s30, v40, 2 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 4 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s1, s35 +; GFX9-NEXT: s_mov_b32 s0, s34 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -12261,21 +11936,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 4 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX10-NEXT: s_mov_b32 
s37, external_void_func_v3f16_inreg@abs32@hi +; GFX10-NEXT: s_mov_b32 s36, external_void_func_v3f16_inreg@abs32@lo ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 -; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3f16_inreg@abs32@hi -; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3f16_inreg@abs32@lo -; GFX10-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-NEXT: v_writelane_b32 v40, s31, 3 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-NEXT: v_readlane_b32 s30, v40, 2 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 4 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s1, s35 +; GFX10-NEXT: s_mov_b32 s0, s34 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -12293,22 +11967,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 4 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, external_void_func_v3f16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s2, external_void_func_v3f16_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s1, 
external_void_func_v3f16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3f16_inreg@abs32@lo -; GFX11-NEXT: v_writelane_b32 v40, s30, 2 -; GFX11-NEXT: v_writelane_b32 v40, s31, 3 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 3 -; GFX11-NEXT: v_readlane_b32 s30, v40, 2 -; GFX11-NEXT: v_readlane_b32 s5, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 4 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -12326,21 +11996,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v3f16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v3f16_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3f16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3f16_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -12363,22 +12029,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 4 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i16_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i16_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s4, 0x20001 -; GFX9-NEXT: s_mov_b32 s5, 3 +; GFX9-NEXT: s_mov_b32 s0, 0x20001 +; GFX9-NEXT: s_mov_b32 s1, 3 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 3 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 3 -; GFX9-NEXT: v_readlane_b32 s30, v40, 2 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 4 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: 
s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -12396,22 +12058,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 4 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i16_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i16_inreg@abs32@lo +; GFX10-NEXT: s_mov_b32 s0, 0x20001 +; GFX10-NEXT: s_mov_b32 s1, 3 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 0x20001 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 3 -; GFX10-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-NEXT: v_readlane_b32 s30, v40, 2 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 4 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -12429,23 +12087,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 4 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, 
external_void_func_v3i16_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s3, external_void_func_v3i16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s2, external_void_func_v3i16_inreg@abs32@lo +; GFX11-NEXT: s_mov_b32 s0, 0x20001 +; GFX11-NEXT: s_mov_b32 s1, 3 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 0x20001 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 3 -; GFX11-NEXT: v_writelane_b32 v40, s30, 2 -; GFX11-NEXT: v_writelane_b32 v40, s31, 3 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 3 -; GFX11-NEXT: v_readlane_b32 s30, v40, 2 -; GFX11-NEXT: v_readlane_b32 s5, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 4 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -12463,22 +12117,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v3i16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v3i16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 0x20001 +; 
GFX10-SCRATCH-NEXT: s_mov_b32 s1, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -12500,22 +12150,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 4 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3f16_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3f16_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s4, 0x40003c00 -; GFX9-NEXT: s_movk_i32 s5, 0x4400 +; GFX9-NEXT: s_mov_b32 s0, 0x40003c00 +; GFX9-NEXT: s_movk_i32 s1, 0x4400 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: 
v_writelane_b32 v40, s31, 3 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 3 -; GFX9-NEXT: v_readlane_b32 s30, v40, 2 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 4 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -12533,22 +12179,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 4 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3f16_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3f16_inreg@abs32@lo +; GFX10-NEXT: s_mov_b32 s0, 0x40003c00 +; GFX10-NEXT: s_movk_i32 s1, 0x4400 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 0x40003c00 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_movk_i32 s5, 0x4400 -; GFX10-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-NEXT: v_readlane_b32 s30, v40, 2 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 4 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 
4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -12566,23 +12208,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 4 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3f16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3f16_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s3, external_void_func_v3f16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s2, external_void_func_v3f16_inreg@abs32@lo +; GFX11-NEXT: s_mov_b32 s0, 0x40003c00 +; GFX11-NEXT: s_movk_i32 s1, 0x4400 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 0x40003c00 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_movk_i32 s5, 0x4400 -; GFX11-NEXT: v_writelane_b32 v40, s30, 2 -; GFX11-NEXT: v_writelane_b32 v40, s31, 3 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 3 -; GFX11-NEXT: v_readlane_b32 s30, v40, 2 -; GFX11-NEXT: v_readlane_b32 s5, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 4 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -12600,22 +12238,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3f16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3f16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v3f16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v3f16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 0x40003c00 +; GFX10-SCRATCH-NEXT: s_movk_i32 s1, 0x4400 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x40003c00 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_movk_i32 s5, 0x4400 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -12637,21 +12271,20 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 4 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; 
GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 2 -; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i16_inreg@abs32@hi -; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i16_inreg@abs32@lo +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s37, external_void_func_v4i16_inreg@abs32@hi +; GFX9-NEXT: s_mov_b32 s36, external_void_func_v4i16_inreg@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 3 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 3 -; GFX9-NEXT: v_readlane_b32 s30, v40, 2 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 4 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s34 +; GFX9-NEXT: s_mov_b32 s1, s35 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -12669,21 +12302,20 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 4 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX10-NEXT: s_mov_b32 s37, external_void_func_v4i16_inreg@abs32@hi +; GFX10-NEXT: s_mov_b32 s36, external_void_func_v4i16_inreg@abs32@lo ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i16_inreg@abs32@hi -; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i16_inreg@abs32@lo -; GFX10-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-NEXT: v_writelane_b32 v40, s31, 3 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-NEXT: v_readlane_b32 s30, v40, 2 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 4 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s0, s34 +; GFX10-NEXT: s_mov_b32 s1, s35 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -12701,22 +12333,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 4 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, external_void_func_v4i16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s2, external_void_func_v4i16_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i16_inreg@abs32@lo -; GFX11-NEXT: v_writelane_b32 v40, s30, 2 -; GFX11-NEXT: v_writelane_b32 v40, s31, 3 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: 
v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 3 -; GFX11-NEXT: v_readlane_b32 s30, v40, 2 -; GFX11-NEXT: v_readlane_b32 s5, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 4 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -12734,21 +12362,17 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v4i16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v4i16_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i16_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -12771,22 +12395,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 4 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i16_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i16_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s4, 0x20001 -; GFX9-NEXT: s_mov_b32 s5, 0x40003 +; GFX9-NEXT: s_mov_b32 s0, 0x20001 +; GFX9-NEXT: s_mov_b32 s1, 0x40003 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 3 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 3 -; GFX9-NEXT: v_readlane_b32 s30, v40, 2 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 4 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -12804,22 +12424,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, 
off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 4 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i16_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i16_inreg@abs32@lo +; GFX10-NEXT: s_mov_b32 s0, 0x20001 +; GFX10-NEXT: s_mov_b32 s1, 0x40003 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 0x20001 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 0x40003 -; GFX10-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-NEXT: v_readlane_b32 s30, v40, 2 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 4 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -12837,23 +12453,19 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 4 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i16_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s3, external_void_func_v4i16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s2, external_void_func_v4i16_inreg@abs32@lo +; GFX11-NEXT: s_mov_b32 s0, 0x20001 +; GFX11-NEXT: s_mov_b32 s1, 0x40003 +; 
GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 0x20001 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 0x40003 -; GFX11-NEXT: v_writelane_b32 v40, s30, 2 -; GFX11-NEXT: v_writelane_b32 v40, s31, 3 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 3 -; GFX11-NEXT: v_readlane_b32 s30, v40, 2 -; GFX11-NEXT: v_readlane_b32 s5, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 4 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -12871,22 +12483,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v4i16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v4i16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 0x20001 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 0x40003 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; 
GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40003 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -12908,19 +12516,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 3 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: s_load_dword s0, s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2f16_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2f16_inreg@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 2 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 2 -; GFX9-NEXT: v_readlane_b32 s30, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 3 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 
s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -12938,19 +12544,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 3 -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0 +; GFX10-NEXT: s_load_dword s0, s[34:35], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2f16_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2f16_inreg@abs32@lo -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-NEXT: v_readlane_b32 s30, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 3 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -12968,20 +12572,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, external_void_func_v2f16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s2, 
external_void_func_v2f16_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2f16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2f16_inreg@abs32@lo -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 -; GFX11-NEXT: v_writelane_b32 v40, s31, 2 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 2 -; GFX11-NEXT: v_readlane_b32 s30, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 3 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -12999,19 +12601,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v2f16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v2f16_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2f16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2f16_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-SCRATCH-NEXT: 
v_writelane_b32 v40, s31, 2 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -13034,21 +12634,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 4 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 2 -; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i32_inreg@abs32@hi -; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i32_inreg@abs32@lo +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s37, external_void_func_v2i32_inreg@abs32@hi +; GFX9-NEXT: s_mov_b32 s36, external_void_func_v2i32_inreg@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 3 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 3 -; GFX9-NEXT: v_readlane_b32 s30, v40, 2 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 4 +; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s34 +; GFX9-NEXT: s_mov_b32 s1, s35 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -13066,21 +12665,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 4 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX10-NEXT: s_mov_b32 s37, external_void_func_v2i32_inreg@abs32@hi +; GFX10-NEXT: s_mov_b32 s36, external_void_func_v2i32_inreg@abs32@lo ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 -; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i32_inreg@abs32@hi -; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i32_inreg@abs32@lo -; GFX10-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-NEXT: v_writelane_b32 v40, s31, 3 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-NEXT: v_readlane_b32 s30, v40, 2 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 4 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s0, s34 +; GFX10-NEXT: s_mov_b32 s1, s35 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; 
GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -13098,22 +12696,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 4 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, external_void_func_v2i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s2, external_void_func_v2i32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i32_inreg@abs32@lo -; GFX11-NEXT: v_writelane_b32 v40, s30, 2 -; GFX11-NEXT: v_writelane_b32 v40, s31, 3 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 3 -; GFX11-NEXT: v_readlane_b32 s30, v40, 2 -; GFX11-NEXT: v_readlane_b32 s5, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 4 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -13131,21 +12725,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: 
s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v2i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v2i32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -13168,22 +12758,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 4 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; 
GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s4, 1 -; GFX9-NEXT: s_mov_b32 s5, 2 +; GFX9-NEXT: s_mov_b32 s0, 1 +; GFX9-NEXT: s_mov_b32 s1, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 3 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 3 -; GFX9-NEXT: v_readlane_b32 s30, v40, 2 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 4 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -13201,22 +12787,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 4 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i32_inreg@abs32@lo +; GFX10-NEXT: s_mov_b32 s0, 1 +; GFX10-NEXT: s_mov_b32 s1, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2 -; GFX10-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-NEXT: v_readlane_b32 
s30, v40, 2 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 4 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -13234,23 +12816,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 4 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i32_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s3, external_void_func_v2i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s2, external_void_func_v2i32_inreg@abs32@lo +; GFX11-NEXT: s_mov_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s1, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2 -; GFX11-NEXT: v_writelane_b32 v40, s30, 2 -; GFX11-NEXT: v_writelane_b32 v40, s31, 3 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 3 -; GFX11-NEXT: v_readlane_b32 s30, v40, 2 -; GFX11-NEXT: v_readlane_b32 s5, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 4 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte 
Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -13268,22 +12846,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v2i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v2i32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -13305,25 +12879,19 @@ define amdgpu_gfx void 
@test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 5 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v40, s6, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 3 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s4, 3 -; GFX9-NEXT: s_mov_b32 s5, 4 -; GFX9-NEXT: s_mov_b32 s6, 5 +; GFX9-NEXT: s_mov_b32 s0, 3 +; GFX9-NEXT: s_mov_b32 s1, 4 +; GFX9-NEXT: s_mov_b32 s2, 5 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 4 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 4 -; GFX9-NEXT: v_readlane_b32 s30, v40, 3 -; GFX9-NEXT: v_readlane_b32 s6, v40, 2 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 5 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -13341,25 +12909,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 5 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i32_inreg@abs32@lo +; 
GFX10-NEXT: s_mov_b32 s0, 3 +; GFX10-NEXT: s_mov_b32 s1, 4 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_mov_b32 s2, 5 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 3 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 4 -; GFX10-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-NEXT: s_mov_b32 s6, 5 -; GFX10-NEXT: v_writelane_b32 v40, s30, 3 -; GFX10-NEXT: v_writelane_b32 v40, s31, 4 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 4 -; GFX10-NEXT: v_readlane_b32 s30, v40, 3 -; GFX10-NEXT: v_readlane_b32 s6, v40, 2 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 5 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -13377,26 +12939,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 5 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i32_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s35, external_void_func_v3i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s34, external_void_func_v3i32_inreg@abs32@lo +; GFX11-NEXT: s_mov_b32 s0, 3 +; GFX11-NEXT: s_mov_b32 s1, 4 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_mov_b32 s2, 5 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 3 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; 
GFX11-NEXT: s_mov_b32 s5, 4 -; GFX11-NEXT: v_writelane_b32 v40, s6, 2 -; GFX11-NEXT: s_mov_b32 s6, 5 -; GFX11-NEXT: v_writelane_b32 v40, s30, 3 -; GFX11-NEXT: v_writelane_b32 v40, s31, 4 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 4 -; GFX11-NEXT: v_readlane_b32 s30, v40, 3 -; GFX11-NEXT: v_readlane_b32 s6, v40, 2 -; GFX11-NEXT: v_readlane_b32 s5, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 5 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -13414,25 +12970,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 5 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v3i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v3i32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 5 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 
v40, s6, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 5 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 4 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 4 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -13454,28 +13004,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 6 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v40, s6, 2 -; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: v_writelane_b32 v40, s30, 4 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i32_i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i32_i32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s4, 3 -; GFX9-NEXT: s_mov_b32 s5, 4 -; GFX9-NEXT: s_mov_b32 s6, 5 -; GFX9-NEXT: s_mov_b32 s7, 6 +; GFX9-NEXT: s_mov_b32 s0, 3 +; GFX9-NEXT: s_mov_b32 s1, 4 +; GFX9-NEXT: s_mov_b32 s2, 5 +; GFX9-NEXT: s_mov_b32 s3, 6 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 5 +; GFX9-NEXT: 
v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 5 -; GFX9-NEXT: v_readlane_b32 s30, v40, 4 -; GFX9-NEXT: v_readlane_b32 s7, v40, 3 -; GFX9-NEXT: v_readlane_b32 s6, v40, 2 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 6 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -13493,28 +13035,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 6 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i32_i32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i32_i32_inreg@abs32@lo +; GFX10-NEXT: s_mov_b32 s0, 3 +; GFX10-NEXT: s_mov_b32 s1, 4 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_mov_b32 s2, 5 +; GFX10-NEXT: s_mov_b32 s3, 6 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 3 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 4 -; GFX10-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-NEXT: s_mov_b32 s6, 5 -; GFX10-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-NEXT: s_mov_b32 s7, 6 -; GFX10-NEXT: v_writelane_b32 v40, s30, 4 -; GFX10-NEXT: v_writelane_b32 v40, s31, 5 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 5 -; GFX10-NEXT: v_readlane_b32 s30, v40, 4 -; GFX10-NEXT: v_readlane_b32 s7, v40, 3 -; GFX10-NEXT: v_readlane_b32 s6, v40, 2 -; GFX10-NEXT: v_readlane_b32 s5, 
v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 6 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -13532,29 +13066,21 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 6 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i32_i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i32_i32_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s35, external_void_func_v3i32_i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s34, external_void_func_v3i32_i32_inreg@abs32@lo +; GFX11-NEXT: s_mov_b32 s0, 3 +; GFX11-NEXT: s_mov_b32 s1, 4 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_mov_b32 s2, 5 +; GFX11-NEXT: s_mov_b32 s3, 6 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 3 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 4 -; GFX11-NEXT: v_writelane_b32 v40, s6, 2 -; GFX11-NEXT: s_mov_b32 s6, 5 -; GFX11-NEXT: v_writelane_b32 v40, s7, 3 -; GFX11-NEXT: s_mov_b32 s7, 6 -; GFX11-NEXT: v_writelane_b32 v40, s30, 4 -; GFX11-NEXT: v_writelane_b32 v40, s31, 5 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 5 -; GFX11-NEXT: v_readlane_b32 s30, v40, 4 -; GFX11-NEXT: v_readlane_b32 s7, v40, 3 -; GFX11-NEXT: v_readlane_b32 s6, v40, 2 -; GFX11-NEXT: v_readlane_b32 s5, v40, 1 -; GFX11-NEXT: v_readlane_b32 
s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 6 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -13572,28 +13098,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i32_i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i32_i32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v3i32_i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v3i32_i32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 5 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 6 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 5 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 6 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 -; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -13615,25 +13133,22 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 6 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v40, s6, 2 -; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 4 +; GFX9-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s36 +; GFX9-NEXT: s_mov_b32 s1, s37 +; GFX9-NEXT: s_mov_b32 s2, s38 +; GFX9-NEXT: s_mov_b32 s3, s39 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 5 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 5 -; GFX9-NEXT: v_readlane_b32 s30, v40, 4 -; GFX9-NEXT: v_readlane_b32 s7, v40, 3 -; GFX9-NEXT: v_readlane_b32 s6, v40, 2 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 
s34, v40, 6 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -13651,25 +13166,22 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 6 -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo -; GFX10-NEXT: v_writelane_b32 v40, s30, 4 -; GFX10-NEXT: v_writelane_b32 v40, s31, 5 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s0, s36 +; GFX10-NEXT: s_mov_b32 s1, s37 +; GFX10-NEXT: s_mov_b32 s2, s38 +; GFX10-NEXT: s_mov_b32 s3, s39 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 5 -; GFX10-NEXT: v_readlane_b32 s30, v40, 4 -; GFX10-NEXT: v_readlane_b32 s7, v40, 3 -; GFX10-NEXT: v_readlane_b32 s6, v40, 2 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 6 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, 
s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -13687,26 +13199,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 6 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: v_writelane_b32 v40, s6, 2 -; GFX11-NEXT: v_writelane_b32 v40, s7, 3 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i32_inreg@abs32@lo -; GFX11-NEXT: v_writelane_b32 v40, s30, 4 -; GFX11-NEXT: v_writelane_b32 v40, s31, 5 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 5 -; GFX11-NEXT: v_readlane_b32 s30, v40, 4 -; GFX11-NEXT: v_readlane_b32 s7, v40, 3 -; GFX11-NEXT: v_readlane_b32 s6, v40, 2 -; GFX11-NEXT: v_readlane_b32 s5, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 6 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -13724,25 +13228,17 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 
4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -13765,28 +13261,20 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 
s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 6 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v40, s6, 2 -; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: v_writelane_b32 v40, s30, 4 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s4, 1 -; GFX9-NEXT: s_mov_b32 s5, 2 -; GFX9-NEXT: s_mov_b32 s6, 3 -; GFX9-NEXT: s_mov_b32 s7, 4 +; GFX9-NEXT: s_mov_b32 s0, 1 +; GFX9-NEXT: s_mov_b32 s1, 2 +; GFX9-NEXT: s_mov_b32 s2, 3 +; GFX9-NEXT: s_mov_b32 s3, 4 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 5 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 5 -; GFX9-NEXT: v_readlane_b32 s30, v40, 4 -; GFX9-NEXT: v_readlane_b32 s7, v40, 3 -; GFX9-NEXT: v_readlane_b32 s6, v40, 2 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 6 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -13804,28 +13292,20 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 6 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 
s34, external_void_func_v4i32_inreg@abs32@lo +; GFX10-NEXT: s_mov_b32 s0, 1 +; GFX10-NEXT: s_mov_b32 s1, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_mov_b32 s2, 3 +; GFX10-NEXT: s_mov_b32 s3, 4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2 -; GFX10-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-NEXT: s_mov_b32 s6, 3 -; GFX10-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-NEXT: s_mov_b32 s7, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 4 -; GFX10-NEXT: v_writelane_b32 v40, s31, 5 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 5 -; GFX10-NEXT: v_readlane_b32 s30, v40, 4 -; GFX10-NEXT: v_readlane_b32 s7, v40, 3 -; GFX10-NEXT: v_readlane_b32 s6, v40, 2 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 6 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -13843,29 +13323,21 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 6 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i32_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo +; GFX11-NEXT: s_mov_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s1, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 
+; GFX11-NEXT: s_mov_b32 s2, 3 +; GFX11-NEXT: s_mov_b32 s3, 4 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2 -; GFX11-NEXT: v_writelane_b32 v40, s6, 2 -; GFX11-NEXT: s_mov_b32 s6, 3 -; GFX11-NEXT: v_writelane_b32 v40, s7, 3 -; GFX11-NEXT: s_mov_b32 s7, 4 -; GFX11-NEXT: v_writelane_b32 v40, s30, 4 -; GFX11-NEXT: v_writelane_b32 v40, s31, 5 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 5 -; GFX11-NEXT: v_readlane_b32 s30, v40, 4 -; GFX11-NEXT: v_readlane_b32 s7, v40, 3 -; GFX11-NEXT: v_readlane_b32 s6, v40, 2 -; GFX11-NEXT: v_readlane_b32 s5, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 6 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -13883,28 +13355,20 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2 +; GFX10-SCRATCH-NEXT: 
v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 4 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -13926,31 +13390,23 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 7 +; GFX9-NEXT: v_writelane_b32 v40, s34, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v40, s6, 2 -; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: v_writelane_b32 v40, s8, 4 -; GFX9-NEXT: 
v_writelane_b32 v40, s30, 5 +; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v5i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v5i32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s4, 1 -; GFX9-NEXT: s_mov_b32 s5, 2 -; GFX9-NEXT: s_mov_b32 s6, 3 -; GFX9-NEXT: s_mov_b32 s7, 4 -; GFX9-NEXT: s_mov_b32 s8, 5 +; GFX9-NEXT: s_mov_b32 s0, 1 +; GFX9-NEXT: s_mov_b32 s1, 2 +; GFX9-NEXT: s_mov_b32 s2, 3 +; GFX9-NEXT: s_mov_b32 s3, 4 +; GFX9-NEXT: s_mov_b32 s4, 5 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 6 +; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 6 -; GFX9-NEXT: v_readlane_b32 s30, v40, 5 -; GFX9-NEXT: v_readlane_b32 s8, v40, 4 -; GFX9-NEXT: v_readlane_b32 s7, v40, 3 -; GFX9-NEXT: v_readlane_b32 s6, v40, 2 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s31, v40, 2 +; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 7 +; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -13965,34 +13421,26 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX10-NEXT: s_mov_b32 s34, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 7 -; GFX10-NEXT: s_mov_b32 s35, external_void_func_v5i32_inreg@abs32@hi -; GFX10-NEXT: s_mov_b32 s34, external_void_func_v5i32_inreg@abs32@lo -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 
2 -; GFX10-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-NEXT: s_mov_b32 s6, 3 -; GFX10-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-NEXT: s_mov_b32 s7, 4 -; GFX10-NEXT: v_writelane_b32 v40, s8, 4 -; GFX10-NEXT: s_mov_b32 s8, 5 -; GFX10-NEXT: v_writelane_b32 v40, s30, 5 -; GFX10-NEXT: v_writelane_b32 v40, s31, 6 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: s_mov_b32 s35, external_void_func_v5i32_inreg@abs32@hi +; GFX10-NEXT: s_mov_b32 s34, external_void_func_v5i32_inreg@abs32@lo +; GFX10-NEXT: s_mov_b32 s0, 1 +; GFX10-NEXT: s_mov_b32 s1, 2 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s2, 3 +; GFX10-NEXT: s_mov_b32 s3, 4 +; GFX10-NEXT: s_mov_b32 s4, 5 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 6 -; GFX10-NEXT: v_readlane_b32 s30, v40, 5 -; GFX10-NEXT: v_readlane_b32 s8, v40, 4 -; GFX10-NEXT: v_readlane_b32 s7, v40, 3 -; GFX10-NEXT: v_readlane_b32 s6, v40, 2 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 7 +; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -14010,32 +13458,24 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 7 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v5i32_inreg@abs32@hi -; GFX11-NEXT: 
s_mov_b32 s0, external_void_func_v5i32_inreg@abs32@lo -; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: s_mov_b32 s35, external_void_func_v5i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s34, external_void_func_v5i32_inreg@abs32@lo +; GFX11-NEXT: s_mov_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s1, 2 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1 -; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2 -; GFX11-NEXT: v_writelane_b32 v40, s6, 2 -; GFX11-NEXT: s_mov_b32 s6, 3 -; GFX11-NEXT: v_writelane_b32 v40, s7, 3 -; GFX11-NEXT: s_mov_b32 s7, 4 -; GFX11-NEXT: v_writelane_b32 v40, s8, 4 -; GFX11-NEXT: s_mov_b32 s8, 5 -; GFX11-NEXT: v_writelane_b32 v40, s30, 5 -; GFX11-NEXT: v_writelane_b32 v40, s31, 6 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_mov_b32 s2, 3 +; GFX11-NEXT: s_mov_b32 s3, 4 +; GFX11-NEXT: s_mov_b32 s4, 5 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 6 -; GFX11-NEXT: v_readlane_b32 s30, v40, 5 -; GFX11-NEXT: v_readlane_b32 s8, v40, 4 -; GFX11-NEXT: v_readlane_b32 s7, v40, 3 -; GFX11-NEXT: v_readlane_b32 s6, v40, 2 -; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 7 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -14053,31 +13493,23 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 
exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 7 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v5i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v5i32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v5i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v5i32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 -; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 5 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 5 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 6 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 6 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 5 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 5 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 7 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, 
v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -14099,35 +13531,36 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 10 +; GFX9-NEXT: v_writelane_b32 v40, s34, 6 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: v_writelane_b32 v40, s8, 4 -; GFX9-NEXT: v_writelane_b32 v40, s9, 5 -; GFX9-NEXT: v_writelane_b32 v40, s10, 6 -; GFX9-NEXT: v_writelane_b32 v40, s11, 7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 8 +; GFX9-NEXT: s_load_dwordx8 s[36:43], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 9 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s36 +; GFX9-NEXT: s_mov_b32 s1, s37 +; GFX9-NEXT: s_mov_b32 s2, s38 +; GFX9-NEXT: s_mov_b32 s3, s39 +; GFX9-NEXT: s_mov_b32 s4, s40 +; GFX9-NEXT: s_mov_b32 s5, s41 +; GFX9-NEXT: s_mov_b32 s6, s42 +; GFX9-NEXT: s_mov_b32 s7, s43 +; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 9 -; GFX9-NEXT: v_readlane_b32 s30, v40, 8 -; GFX9-NEXT: v_readlane_b32 s11, v40, 7 -; GFX9-NEXT: v_readlane_b32 s10, v40, 6 -; GFX9-NEXT: v_readlane_b32 s9, v40, 5 -; GFX9-NEXT: v_readlane_b32 
s8, v40, 4 +; GFX9-NEXT: v_readlane_b32 s31, v40, 5 +; GFX9-NEXT: v_readlane_b32 s30, v40, 4 ; GFX9-NEXT: v_readlane_b32 s7, v40, 3 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 10 +; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -14145,35 +13578,36 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 10 +; GFX10-NEXT: v_writelane_b32 v40, s34, 6 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-NEXT: v_writelane_b32 v40, s8, 4 -; GFX10-NEXT: v_writelane_b32 v40, s9, 5 -; GFX10-NEXT: v_writelane_b32 v40, s10, 6 -; GFX10-NEXT: v_writelane_b32 v40, s11, 7 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[34:35], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[36:43], s[34:35], 0x0 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo -; GFX10-NEXT: v_writelane_b32 v40, s30, 8 -; GFX10-NEXT: v_writelane_b32 v40, s31, 9 +; GFX10-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-NEXT: v_writelane_b32 v40, s31, 5 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s0, s36 +; GFX10-NEXT: s_mov_b32 s1, s37 +; GFX10-NEXT: s_mov_b32 s2, s38 +; GFX10-NEXT: s_mov_b32 s3, s39 +; GFX10-NEXT: s_mov_b32 s4, s40 +; GFX10-NEXT: s_mov_b32 s5, s41 +; GFX10-NEXT: s_mov_b32 s6, s42 +; GFX10-NEXT: s_mov_b32 s7, 
s43 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 9 -; GFX10-NEXT: v_readlane_b32 s30, v40, 8 -; GFX10-NEXT: v_readlane_b32 s11, v40, 7 -; GFX10-NEXT: v_readlane_b32 s10, v40, 6 -; GFX10-NEXT: v_readlane_b32 s9, v40, 5 -; GFX10-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-NEXT: v_readlane_b32 s30, v40, 4 ; GFX10-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 10 +; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -14191,36 +13625,28 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 10 +; GFX11-NEXT: v_writelane_b32 v40, s0, 6 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 -; GFX11-NEXT: v_writelane_b32 v40, s8, 4 -; GFX11-NEXT: v_writelane_b32 v40, s9, 5 -; GFX11-NEXT: v_writelane_b32 v40, s10, 6 -; GFX11-NEXT: v_writelane_b32 v40, s11, 7 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v8i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v8i32_inreg@abs32@lo -; GFX11-NEXT: v_writelane_b32 v40, s30, 8 -; GFX11-NEXT: v_writelane_b32 v40, s31, 9 -; GFX11-NEXT: 
s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 4 +; GFX11-NEXT: v_writelane_b32 v40, s31, 5 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 9 -; GFX11-NEXT: v_readlane_b32 s30, v40, 8 -; GFX11-NEXT: v_readlane_b32 s11, v40, 7 -; GFX11-NEXT: v_readlane_b32 s10, v40, 6 -; GFX11-NEXT: v_readlane_b32 s9, v40, 5 -; GFX11-NEXT: v_readlane_b32 s8, v40, 4 +; GFX11-NEXT: v_readlane_b32 s31, v40, 5 +; GFX11-NEXT: v_readlane_b32 s30, v40, 4 ; GFX11-NEXT: v_readlane_b32 s7, v40, 3 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 10 +; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -14238,35 +13664,27 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 10 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s10, 6 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s11, 7 
; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v8i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v8i32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 8 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 9 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 8 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s11, v40, 7 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s10, v40, 6 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-SCRATCH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 10 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -14290,40 +13708,32 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 10 +; GFX9-NEXT: v_writelane_b32 v40, s34, 6 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; 
GFX9-NEXT: v_writelane_b32 v40, s8, 4 -; GFX9-NEXT: v_writelane_b32 v40, s9, 5 -; GFX9-NEXT: v_writelane_b32 v40, s10, 6 -; GFX9-NEXT: v_writelane_b32 v40, s11, 7 -; GFX9-NEXT: v_writelane_b32 v40, s30, 8 +; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s4, 1 -; GFX9-NEXT: s_mov_b32 s5, 2 -; GFX9-NEXT: s_mov_b32 s6, 3 -; GFX9-NEXT: s_mov_b32 s7, 4 -; GFX9-NEXT: s_mov_b32 s8, 5 -; GFX9-NEXT: s_mov_b32 s9, 6 -; GFX9-NEXT: s_mov_b32 s10, 7 -; GFX9-NEXT: s_mov_b32 s11, 8 +; GFX9-NEXT: s_mov_b32 s0, 1 +; GFX9-NEXT: s_mov_b32 s1, 2 +; GFX9-NEXT: s_mov_b32 s2, 3 +; GFX9-NEXT: s_mov_b32 s3, 4 +; GFX9-NEXT: s_mov_b32 s4, 5 +; GFX9-NEXT: s_mov_b32 s5, 6 +; GFX9-NEXT: s_mov_b32 s6, 7 +; GFX9-NEXT: s_mov_b32 s7, 8 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 9 +; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 9 -; GFX9-NEXT: v_readlane_b32 s30, v40, 8 -; GFX9-NEXT: v_readlane_b32 s11, v40, 7 -; GFX9-NEXT: v_readlane_b32 s10, v40, 6 -; GFX9-NEXT: v_readlane_b32 s9, v40, 5 -; GFX9-NEXT: v_readlane_b32 s8, v40, 4 +; GFX9-NEXT: v_readlane_b32 s31, v40, 5 +; GFX9-NEXT: v_readlane_b32 s30, v40, 4 ; GFX9-NEXT: v_readlane_b32 s7, v40, 3 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 10 +; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -14341,40 +13751,32 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 
exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 10 +; GFX10-NEXT: v_writelane_b32 v40, s34, 6 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo -; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_mov_b32 s0, 1 +; GFX10-NEXT: s_mov_b32 s1, 2 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s2, 3 +; GFX10-NEXT: s_mov_b32 s3, 4 +; GFX10-NEXT: s_mov_b32 s4, 5 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2 +; GFX10-NEXT: s_mov_b32 s5, 6 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-NEXT: s_mov_b32 s6, 3 +; GFX10-NEXT: s_mov_b32 s6, 7 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-NEXT: s_mov_b32 s7, 4 -; GFX10-NEXT: v_writelane_b32 v40, s8, 4 -; GFX10-NEXT: s_mov_b32 s8, 5 -; GFX10-NEXT: v_writelane_b32 v40, s9, 5 -; GFX10-NEXT: s_mov_b32 s9, 6 -; GFX10-NEXT: v_writelane_b32 v40, s10, 6 -; GFX10-NEXT: s_mov_b32 s10, 7 -; GFX10-NEXT: v_writelane_b32 v40, s11, 7 -; GFX10-NEXT: s_mov_b32 s11, 8 -; GFX10-NEXT: v_writelane_b32 v40, s30, 8 -; GFX10-NEXT: v_writelane_b32 v40, s31, 9 +; GFX10-NEXT: s_mov_b32 s7, 8 +; GFX10-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-NEXT: v_writelane_b32 v40, s31, 5 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 9 -; GFX10-NEXT: v_readlane_b32 s30, v40, 8 -; GFX10-NEXT: v_readlane_b32 s11, v40, 7 -; GFX10-NEXT: v_readlane_b32 s10, v40, 6 -; GFX10-NEXT: v_readlane_b32 s9, v40, 5 -; GFX10-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-NEXT: v_readlane_b32 s30, v40, 4 ; GFX10-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 10 +; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: 
buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -14392,41 +13794,33 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 10 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v8i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v8i32_inreg@abs32@lo -; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s0, 6 +; GFX11-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo +; GFX11-NEXT: s_mov_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s1, 2 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: s_mov_b32 s2, 3 +; GFX11-NEXT: s_mov_b32 s3, 4 +; GFX11-NEXT: s_mov_b32 s4, 5 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2 +; GFX11-NEXT: s_mov_b32 s5, 6 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 -; GFX11-NEXT: s_mov_b32 s6, 3 +; GFX11-NEXT: s_mov_b32 s6, 7 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 -; GFX11-NEXT: s_mov_b32 s7, 4 -; GFX11-NEXT: v_writelane_b32 v40, s8, 4 -; GFX11-NEXT: s_mov_b32 s8, 5 -; GFX11-NEXT: v_writelane_b32 v40, s9, 5 -; GFX11-NEXT: s_mov_b32 s9, 6 -; GFX11-NEXT: v_writelane_b32 v40, s10, 6 -; GFX11-NEXT: s_mov_b32 s10, 7 -; GFX11-NEXT: v_writelane_b32 v40, s11, 7 -; GFX11-NEXT: s_mov_b32 s11, 8 -; GFX11-NEXT: v_writelane_b32 v40, s30, 8 -; GFX11-NEXT: v_writelane_b32 v40, s31, 9 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_mov_b32 s7, 8 +; GFX11-NEXT: v_writelane_b32 v40, s30, 4 +; GFX11-NEXT: v_writelane_b32 v40, s31, 5 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 9 -; GFX11-NEXT: v_readlane_b32 
s30, v40, 8 -; GFX11-NEXT: v_readlane_b32 s11, v40, 7 -; GFX11-NEXT: v_readlane_b32 s10, v40, 6 -; GFX11-NEXT: v_readlane_b32 s9, v40, 5 -; GFX11-NEXT: v_readlane_b32 s8, v40, 4 +; GFX11-NEXT: v_readlane_b32 s31, v40, 5 +; GFX11-NEXT: v_readlane_b32 s30, v40, 4 ; GFX11-NEXT: v_readlane_b32 s7, v40, 3 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 10 +; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -14444,40 +13838,32 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 10 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v8i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v8i32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 +; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 5 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 6 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 7 ; GFX10-SCRATCH-NEXT: v_writelane_b32 
v40, s7, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 -; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 5 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5 -; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 6 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s10, 6 -; GFX10-SCRATCH-NEXT: s_mov_b32 s10, 7 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s11, 7 -; GFX10-SCRATCH-NEXT: s_mov_b32 s11, 8 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 8 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 9 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 8 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s11, v40, 7 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s10, v40, 6 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 10 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -14499,38 +13885,47 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 18 +; GFX9-NEXT: v_writelane_b32 v40, s34, 14 ; GFX9-NEXT: v_writelane_b32 v40, 
s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 ; GFX9-NEXT: v_writelane_b32 v40, s9, 5 ; GFX9-NEXT: v_writelane_b32 v40, s10, 6 ; GFX9-NEXT: v_writelane_b32 v40, s11, 7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s12, 8 ; GFX9-NEXT: v_writelane_b32 v40, s13, 9 ; GFX9-NEXT: v_writelane_b32 v40, s14, 10 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s15, 11 -; GFX9-NEXT: v_writelane_b32 v40, s16, 12 -; GFX9-NEXT: v_writelane_b32 v40, s17, 13 -; GFX9-NEXT: v_writelane_b32 v40, s18, 14 -; GFX9-NEXT: v_writelane_b32 v40, s19, 15 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 16 +; GFX9-NEXT: v_writelane_b32 v40, s30, 12 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v16i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v16i32_inreg@abs32@lo +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s36 +; GFX9-NEXT: s_mov_b32 s1, s37 +; GFX9-NEXT: s_mov_b32 s2, s38 +; GFX9-NEXT: s_mov_b32 s3, s39 +; GFX9-NEXT: s_mov_b32 s4, s40 +; GFX9-NEXT: s_mov_b32 s5, s41 +; GFX9-NEXT: s_mov_b32 s6, s42 +; GFX9-NEXT: s_mov_b32 s7, s43 +; GFX9-NEXT: s_mov_b32 s8, s44 +; GFX9-NEXT: s_mov_b32 s9, s45 +; GFX9-NEXT: s_mov_b32 s10, s46 +; GFX9-NEXT: s_mov_b32 s11, s47 +; GFX9-NEXT: s_mov_b32 s12, s48 +; GFX9-NEXT: s_mov_b32 s13, s49 +; GFX9-NEXT: s_mov_b32 s14, s50 +; GFX9-NEXT: s_mov_b32 s15, s51 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 17 +; GFX9-NEXT: v_writelane_b32 v40, s31, 13 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 17 -; GFX9-NEXT: v_readlane_b32 s30, v40, 16 -; GFX9-NEXT: v_readlane_b32 s19, v40, 15 -; GFX9-NEXT: 
v_readlane_b32 s18, v40, 14 -; GFX9-NEXT: v_readlane_b32 s17, v40, 13 -; GFX9-NEXT: v_readlane_b32 s16, v40, 12 +; GFX9-NEXT: v_readlane_b32 s31, v40, 13 +; GFX9-NEXT: v_readlane_b32 s30, v40, 12 ; GFX9-NEXT: v_readlane_b32 s15, v40, 11 ; GFX9-NEXT: v_readlane_b32 s14, v40, 10 ; GFX9-NEXT: v_readlane_b32 s13, v40, 9 @@ -14543,7 +13938,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 18 +; GFX9-NEXT: v_readlane_b32 s34, v40, 14 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -14561,38 +13956,47 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 18 +; GFX10-NEXT: v_writelane_b32 v40, s34, 14 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x0 +; GFX10-NEXT: s_mov_b32 s35, external_void_func_v16i32_inreg@abs32@hi +; GFX10-NEXT: s_mov_b32 s34, external_void_func_v16i32_inreg@abs32@lo ; GFX10-NEXT: v_writelane_b32 v40, s8, 4 ; GFX10-NEXT: v_writelane_b32 v40, s9, 5 ; GFX10-NEXT: v_writelane_b32 v40, s10, 6 ; GFX10-NEXT: v_writelane_b32 v40, s11, 7 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s0, s36 +; GFX10-NEXT: s_mov_b32 s1, s37 +; GFX10-NEXT: s_mov_b32 s2, s38 +; GFX10-NEXT: s_mov_b32 s3, s39 ; GFX10-NEXT: v_writelane_b32 v40, s12, 8 +; GFX10-NEXT: s_mov_b32 
s4, s40 +; GFX10-NEXT: s_mov_b32 s5, s41 +; GFX10-NEXT: s_mov_b32 s6, s42 +; GFX10-NEXT: s_mov_b32 s7, s43 ; GFX10-NEXT: v_writelane_b32 v40, s13, 9 +; GFX10-NEXT: s_mov_b32 s8, s44 +; GFX10-NEXT: s_mov_b32 s9, s45 +; GFX10-NEXT: s_mov_b32 s10, s46 +; GFX10-NEXT: s_mov_b32 s11, s47 ; GFX10-NEXT: v_writelane_b32 v40, s14, 10 +; GFX10-NEXT: s_mov_b32 s12, s48 +; GFX10-NEXT: s_mov_b32 s13, s49 +; GFX10-NEXT: s_mov_b32 s14, s50 ; GFX10-NEXT: v_writelane_b32 v40, s15, 11 -; GFX10-NEXT: v_writelane_b32 v40, s16, 12 -; GFX10-NEXT: v_writelane_b32 v40, s17, 13 -; GFX10-NEXT: v_writelane_b32 v40, s18, 14 -; GFX10-NEXT: v_writelane_b32 v40, s19, 15 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0 -; GFX10-NEXT: s_mov_b32 s35, external_void_func_v16i32_inreg@abs32@hi -; GFX10-NEXT: s_mov_b32 s34, external_void_func_v16i32_inreg@abs32@lo -; GFX10-NEXT: v_writelane_b32 v40, s30, 16 -; GFX10-NEXT: v_writelane_b32 v40, s31, 17 +; GFX10-NEXT: s_mov_b32 s15, s51 +; GFX10-NEXT: v_writelane_b32 v40, s30, 12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 13 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 17 -; GFX10-NEXT: v_readlane_b32 s30, v40, 16 -; GFX10-NEXT: v_readlane_b32 s19, v40, 15 -; GFX10-NEXT: v_readlane_b32 s18, v40, 14 -; GFX10-NEXT: v_readlane_b32 s17, v40, 13 -; GFX10-NEXT: v_readlane_b32 s16, v40, 12 +; GFX10-NEXT: v_readlane_b32 s31, v40, 13 +; GFX10-NEXT: v_readlane_b32 s30, v40, 12 ; GFX10-NEXT: v_readlane_b32 s15, v40, 11 ; GFX10-NEXT: v_readlane_b32 s14, v40, 10 ; GFX10-NEXT: v_readlane_b32 s13, v40, 9 @@ -14605,7 +14009,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 18 +; GFX10-NEXT: v_readlane_b32 s34, v40, 14 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], 
s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -14623,8 +14027,10 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 18 +; GFX11-NEXT: v_writelane_b32 v40, s0, 14 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s35, external_void_func_v16i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s34, external_void_func_v16i32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 @@ -14638,24 +14044,14 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s13, 9 ; GFX11-NEXT: v_writelane_b32 v40, s14, 10 ; GFX11-NEXT: v_writelane_b32 v40, s15, 11 -; GFX11-NEXT: v_writelane_b32 v40, s16, 12 -; GFX11-NEXT: v_writelane_b32 v40, s17, 13 -; GFX11-NEXT: v_writelane_b32 v40, s18, 14 -; GFX11-NEXT: v_writelane_b32 v40, s19, 15 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v16i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v16i32_inreg@abs32@lo -; GFX11-NEXT: v_writelane_b32 v40, s30, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 17 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_load_b512 s[0:15], s[0:1], 0x0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 13 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 17 -; GFX11-NEXT: v_readlane_b32 s30, v40, 16 -; GFX11-NEXT: v_readlane_b32 s19, v40, 15 -; GFX11-NEXT: v_readlane_b32 s18, v40, 14 -; GFX11-NEXT: v_readlane_b32 s17, v40, 13 -; GFX11-NEXT: v_readlane_b32 s16, v40, 12 +; GFX11-NEXT: v_readlane_b32 s31, v40, 13 +; 
GFX11-NEXT: v_readlane_b32 s30, v40, 12 ; GFX11-NEXT: v_readlane_b32 s15, v40, 11 ; GFX11-NEXT: v_readlane_b32 s14, v40, 10 ; GFX11-NEXT: v_readlane_b32 s13, v40, 9 @@ -14668,7 +14064,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 18 +; GFX11-NEXT: v_readlane_b32 s0, v40, 14 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -14686,8 +14082,10 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 18 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 14 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v16i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v16i32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 @@ -14701,23 +14099,13 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s13, 9 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s14, 10 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s15, 11 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s16, 12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s17, 13 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s18, 14 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s19, 15 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v16i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: 
s_mov_b32 s0, external_void_func_v16i32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 17 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 17 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s19, v40, 15 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s18, v40, 14 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s17, v40, 13 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s16, v40, 12 +; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[0:15], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 13 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 13 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 12 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s15, v40, 11 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s14, v40, 10 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s13, v40, 9 @@ -14730,7 +14118,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 18 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 14 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -14771,49 +14159,47 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX9-NEXT: v_writelane_b32 v40, s17, 13 ; GFX9-NEXT: v_writelane_b32 v40, s18, 14 ; GFX9-NEXT: v_writelane_b32 v40, s19, 15 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s20, 16 ; GFX9-NEXT: v_writelane_b32 v40, s21, 17 ; GFX9-NEXT: v_writelane_b32 v40, s22, 18 ; GFX9-NEXT: v_writelane_b32 v40, s23, 19 ; GFX9-NEXT: v_writelane_b32 v40, s24, 20 -; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x40 -; GFX9-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s25, 21 ; GFX9-NEXT: v_writelane_b32 v40, s26, 22 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s27, 23 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s28, 24 +; GFX9-NEXT: v_writelane_b32 v40, s29, 25 +; GFX9-NEXT: v_writelane_b32 v40, s30, 26 +; GFX9-NEXT: v_writelane_b32 v40, s31, 27 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 +; GFX9-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x0 +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: s_mov_b32 s53, external_void_func_v32i32_inreg@abs32@hi +; GFX9-NEXT: s_mov_b32 s52, external_void_func_v32i32_inreg@abs32@lo ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s46 -; GFX9-NEXT: v_writelane_b32 v40, s29, 25 -; GFX9-NEXT: v_mov_b32_e32 v1, s47 -; GFX9-NEXT: v_mov_b32_e32 v2, s48 -; GFX9-NEXT: v_mov_b32_e32 v3, s49 +; GFX9-NEXT: v_mov_b32_e32 v0, s30 +; GFX9-NEXT: v_mov_b32_e32 v1, s31 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 -; GFX9-NEXT: v_mov_b32_e32 v0, s50 -; GFX9-NEXT: v_writelane_b32 v40, s30, 26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 -; GFX9-NEXT: v_mov_b32_e32 v0, s51 -; GFX9-NEXT: s_mov_b32 s35, external_void_func_v32i32_inreg@abs32@hi -; GFX9-NEXT: s_mov_b32 s34, external_void_func_v32i32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s20, s36 -; GFX9-NEXT: s_mov_b32 s21, s37 -; GFX9-NEXT: s_mov_b32 s22, s38 -; GFX9-NEXT: s_mov_b32 s23, s39 -; GFX9-NEXT: s_mov_b32 s24, s40 -; GFX9-NEXT: s_mov_b32 s25, s41 -; GFX9-NEXT: s_mov_b32 s26, s42 -; GFX9-NEXT: s_mov_b32 s27, s43 -; GFX9-NEXT: s_mov_b32 s28, s44 -; 
GFX9-NEXT: s_mov_b32 s29, s45 -; GFX9-NEXT: v_writelane_b32 v40, s31, 27 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: s_mov_b32 s0, s36 +; GFX9-NEXT: s_mov_b32 s1, s37 +; GFX9-NEXT: s_mov_b32 s2, s38 +; GFX9-NEXT: s_mov_b32 s3, s39 +; GFX9-NEXT: s_mov_b32 s4, s40 +; GFX9-NEXT: s_mov_b32 s5, s41 +; GFX9-NEXT: s_mov_b32 s6, s42 +; GFX9-NEXT: s_mov_b32 s7, s43 +; GFX9-NEXT: s_mov_b32 s8, s44 +; GFX9-NEXT: s_mov_b32 s9, s45 +; GFX9-NEXT: s_mov_b32 s10, s46 +; GFX9-NEXT: s_mov_b32 s11, s47 +; GFX9-NEXT: s_mov_b32 s12, s48 +; GFX9-NEXT: s_mov_b32 s13, s49 +; GFX9-NEXT: s_mov_b32 s14, s50 +; GFX9-NEXT: s_mov_b32 s15, s51 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[52:53] ; GFX9-NEXT: v_readlane_b32 s31, v40, 27 ; GFX9-NEXT: v_readlane_b32 s30, v40, 26 ; GFX9-NEXT: v_readlane_b32 s29, v40, 25 @@ -14879,47 +14265,46 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s17, 13 ; GFX10-NEXT: v_writelane_b32 v40, s18, 14 ; GFX10-NEXT: v_writelane_b32 v40, s19, 15 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x40 -; GFX10-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0 -; GFX10-NEXT: s_mov_b32 s35, external_void_func_v32i32_inreg@abs32@hi -; GFX10-NEXT: s_mov_b32 s34, external_void_func_v32i32_inreg@abs32@lo ; GFX10-NEXT: v_writelane_b32 v40, s20, 16 ; GFX10-NEXT: v_writelane_b32 v40, s21, 17 ; GFX10-NEXT: v_writelane_b32 v40, s22, 18 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s46 ; GFX10-NEXT: v_writelane_b32 v40, s23, 19 -; GFX10-NEXT: v_mov_b32_e32 v1, s47 -; GFX10-NEXT: v_mov_b32_e32 v2, s48 -; GFX10-NEXT: v_mov_b32_e32 v3, s49 -; GFX10-NEXT: s_mov_b32 s20, s36 ; GFX10-NEXT: v_writelane_b32 v40, s24, 20 -; GFX10-NEXT: s_mov_b32 s21, s37 -; GFX10-NEXT: s_mov_b32 s22, s38 -; GFX10-NEXT: s_mov_b32 s23, s39 -; GFX10-NEXT: s_mov_b32 s24, s40 ; GFX10-NEXT: 
v_writelane_b32 v40, s25, 21 -; GFX10-NEXT: s_mov_b32 s25, s41 -; GFX10-NEXT: v_mov_b32_e32 v4, s50 -; GFX10-NEXT: v_mov_b32_e32 v5, s51 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 -; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 -; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 -; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 -; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; GFX10-NEXT: v_writelane_b32 v40, s26, 22 -; GFX10-NEXT: s_mov_b32 s26, s42 ; GFX10-NEXT: v_writelane_b32 v40, s27, 23 -; GFX10-NEXT: s_mov_b32 s27, s43 ; GFX10-NEXT: v_writelane_b32 v40, s28, 24 -; GFX10-NEXT: s_mov_b32 s28, s44 ; GFX10-NEXT: v_writelane_b32 v40, s29, 25 -; GFX10-NEXT: s_mov_b32 s29, s45 ; GFX10-NEXT: v_writelane_b32 v40, s30, 26 ; GFX10-NEXT: v_writelane_b32 v40, s31, 27 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 +; GFX10-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x0 +; GFX10-NEXT: s_mov_b32 s35, external_void_func_v32i32_inreg@abs32@hi +; GFX10-NEXT: s_mov_b32 s34, external_void_func_v32i32_inreg@abs32@lo +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s30 +; GFX10-NEXT: v_mov_b32_e32 v1, s31 +; GFX10-NEXT: s_mov_b32 s4, s40 +; GFX10-NEXT: s_mov_b32 s5, s41 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GFX10-NEXT: s_mov_b32 s6, s42 +; GFX10-NEXT: s_mov_b32 s7, s43 +; GFX10-NEXT: s_mov_b32 s8, s44 +; GFX10-NEXT: s_mov_b32 s9, s45 +; GFX10-NEXT: s_mov_b32 s10, s46 +; GFX10-NEXT: s_mov_b32 s11, s47 +; GFX10-NEXT: s_mov_b32 s12, s48 +; GFX10-NEXT: s_mov_b32 s13, s49 +; GFX10-NEXT: s_mov_b32 s14, s50 +; GFX10-NEXT: s_mov_b32 s15, s51 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 s0, s36 +; GFX10-NEXT: s_mov_b32 s1, s37 +; GFX10-NEXT: s_mov_b32 s2, s38 +; 
GFX10-NEXT: s_mov_b32 s3, s39 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 27 ; GFX10-NEXT: v_readlane_b32 s30, v40, 26 @@ -14970,8 +14355,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s0, 28 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_add_i32 s2, s32, 16 +; GFX11-NEXT: s_mov_b32 s35, external_void_func_v32i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s34, external_void_func_v32i32_inreg@abs32@lo ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -14988,42 +14373,26 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s17, 13 ; GFX11-NEXT: v_writelane_b32 v40, s18, 14 ; GFX11-NEXT: v_writelane_b32 v40, s19, 15 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b512 s[36:51], s[0:1], 0x40 -; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v32i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v32i32_inreg@abs32@lo ; GFX11-NEXT: v_writelane_b32 v40, s20, 16 ; GFX11-NEXT: v_writelane_b32 v40, s21, 17 ; GFX11-NEXT: v_writelane_b32 v40, s22, 18 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, s50 :: v_dual_mov_b32 v5, s51 ; GFX11-NEXT: v_writelane_b32 v40, s23, 19 -; GFX11-NEXT: v_dual_mov_b32 v0, s46 :: v_dual_mov_b32 v1, s47 -; GFX11-NEXT: v_dual_mov_b32 v2, s48 :: v_dual_mov_b32 v3, s49 ; GFX11-NEXT: v_writelane_b32 v40, s24, 20 -; GFX11-NEXT: s_mov_b32 s20, s36 -; GFX11-NEXT: s_mov_b32 s21, s37 -; GFX11-NEXT: s_mov_b32 s22, s38 -; GFX11-NEXT: s_mov_b32 s23, s39 ; GFX11-NEXT: v_writelane_b32 v40, s25, 21 -; GFX11-NEXT: s_mov_b32 s24, s40 -; GFX11-NEXT: s_mov_b32 s25, s41 -; GFX11-NEXT: scratch_store_b64 off, v[4:5], 
s2 -; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: v_writelane_b32 v40, s26, 22 -; GFX11-NEXT: s_mov_b32 s26, s42 ; GFX11-NEXT: v_writelane_b32 v40, s27, 23 -; GFX11-NEXT: s_mov_b32 s27, s43 ; GFX11-NEXT: v_writelane_b32 v40, s28, 24 -; GFX11-NEXT: s_mov_b32 s28, s44 ; GFX11-NEXT: v_writelane_b32 v40, s29, 25 -; GFX11-NEXT: s_mov_b32 s29, s45 ; GFX11-NEXT: v_writelane_b32 v40, s30, 26 ; GFX11-NEXT: v_writelane_b32 v40, s31, 27 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b512 s[16:31], s[0:1], 0x40 +; GFX11-NEXT: s_load_b512 s[0:15], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s30 :: v_dual_mov_b32 v1, s31 +; GFX11-NEXT: scratch_store_b64 off, v[0:1], s32 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX11-NEXT: v_readlane_b32 s31, v40, 27 ; GFX11-NEXT: v_readlane_b32 s30, v40, 26 ; GFX11-NEXT: v_readlane_b32 s29, v40, 25 @@ -15071,9 +14440,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 28 -; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: s_add_i32 s2, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -15090,44 +14458,29 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s17, 13 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s18, 14 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s19, 15 -; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[36:51], 
s[0:1], 0x40 -; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v32i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v32i32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s20, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s21, 17 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s22, 18 -; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, s50 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s23, 19 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, s51 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s46 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s47 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, s48 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s24, 20 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, s49 -; GFX10-SCRATCH-NEXT: s_mov_b32 s20, s36 -; GFX10-SCRATCH-NEXT: s_mov_b32 s21, s37 -; GFX10-SCRATCH-NEXT: s_mov_b32 s22, s38 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s25, 21 -; GFX10-SCRATCH-NEXT: s_mov_b32 s23, s39 -; GFX10-SCRATCH-NEXT: s_mov_b32 s24, s40 -; GFX10-SCRATCH-NEXT: s_mov_b32 s25, s41 -; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[4:5], s2 -; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s26, 22 -; GFX10-SCRATCH-NEXT: s_mov_b32 s26, s42 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s27, 23 -; GFX10-SCRATCH-NEXT: s_mov_b32 s27, s43 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s28, 24 -; GFX10-SCRATCH-NEXT: s_mov_b32 s28, s44 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s29, 25 -; GFX10-SCRATCH-NEXT: s_mov_b32 s29, s45 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 26 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 27 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 +; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s35, 
external_void_func_v32i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v32i32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s30 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s31 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[0:1], s32 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 27 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 26 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s29, v40, 25 @@ -15196,55 +14549,53 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX9-NEXT: v_writelane_b32 v40, s16, 12 ; GFX9-NEXT: v_writelane_b32 v40, s17, 13 ; GFX9-NEXT: v_writelane_b32 v40, s18, 14 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s19, 15 ; GFX9-NEXT: v_writelane_b32 v40, s20, 16 ; GFX9-NEXT: v_writelane_b32 v40, s21, 17 ; GFX9-NEXT: v_writelane_b32 v40, s22, 18 ; GFX9-NEXT: v_writelane_b32 v40, s23, 19 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s52, s[34:35], 0x0 -; GFX9-NEXT: ; kill: killed $sgpr34_sgpr35 -; GFX9-NEXT: ; kill: killed $sgpr34_sgpr35 -; GFX9-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x40 -; GFX9-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s24, 20 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s25, 21 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s26, 22 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s52 ; GFX9-NEXT: v_writelane_b32 v40, s27, 23 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 -; GFX9-NEXT: v_mov_b32_e32 v0, s46 ; GFX9-NEXT: v_writelane_b32 v40, s28, 24 -; GFX9-NEXT: v_mov_b32_e32 v1, s47 -; GFX9-NEXT: v_mov_b32_e32 v2, s48 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_store_dword v2, off, 
s[0:3], s32 offset:8 -; GFX9-NEXT: v_mov_b32_e32 v0, s49 ; GFX9-NEXT: v_writelane_b32 v40, s29, 25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 -; GFX9-NEXT: v_mov_b32_e32 v0, s50 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s52, s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s30, 26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 -; GFX9-NEXT: v_mov_b32_e32 v0, s51 -; GFX9-NEXT: s_mov_b32 s35, external_void_func_v32i32_i32_inreg@abs32@hi -; GFX9-NEXT: s_mov_b32 s34, external_void_func_v32i32_i32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s20, s36 -; GFX9-NEXT: s_mov_b32 s21, s37 -; GFX9-NEXT: s_mov_b32 s22, s38 -; GFX9-NEXT: s_mov_b32 s23, s39 -; GFX9-NEXT: s_mov_b32 s24, s40 -; GFX9-NEXT: s_mov_b32 s25, s41 -; GFX9-NEXT: s_mov_b32 s26, s42 -; GFX9-NEXT: s_mov_b32 s27, s43 -; GFX9-NEXT: s_mov_b32 s28, s44 -; GFX9-NEXT: s_mov_b32 s29, s45 ; GFX9-NEXT: v_writelane_b32 v40, s31, 27 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 +; GFX9-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x0 +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s52 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, s30 +; GFX9-NEXT: s_mov_b32 s53, external_void_func_v32i32_i32_inreg@abs32@hi +; GFX9-NEXT: v_mov_b32_e32 v1, s31 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_mov_b32 s52, external_void_func_v32i32_i32_inreg@abs32@lo +; GFX9-NEXT: s_mov_b32 s0, s36 +; GFX9-NEXT: s_mov_b32 s1, s37 +; GFX9-NEXT: s_mov_b32 s2, s38 +; GFX9-NEXT: s_mov_b32 s3, s39 +; GFX9-NEXT: s_mov_b32 s4, s40 +; GFX9-NEXT: s_mov_b32 s5, s41 +; GFX9-NEXT: s_mov_b32 s6, s42 +; GFX9-NEXT: s_mov_b32 s7, s43 +; GFX9-NEXT: s_mov_b32 s8, s44 +; GFX9-NEXT: s_mov_b32 s9, s45 +; 
GFX9-NEXT: s_mov_b32 s10, s46 +; GFX9-NEXT: s_mov_b32 s11, s47 +; GFX9-NEXT: s_mov_b32 s12, s48 +; GFX9-NEXT: s_mov_b32 s13, s49 +; GFX9-NEXT: s_mov_b32 s14, s50 +; GFX9-NEXT: s_mov_b32 s15, s51 +; GFX9-NEXT: ; kill: killed $sgpr34_sgpr35 +; GFX9-NEXT: ; kill: killed $sgpr34_sgpr35 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[52:53] ; GFX9-NEXT: v_readlane_b32 s31, v40, 27 ; GFX9-NEXT: v_readlane_b32 s30, v40, 26 ; GFX9-NEXT: v_readlane_b32 s29, v40, 25 @@ -15310,52 +14661,51 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX10-NEXT: v_writelane_b32 v40, s17, 13 ; GFX10-NEXT: v_writelane_b32 v40, s18, 14 ; GFX10-NEXT: v_writelane_b32 v40, s19, 15 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s52, s[34:35], 0x0 -; GFX10-NEXT: ; meta instruction -; GFX10-NEXT: ; meta instruction -; GFX10-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x40 -; GFX10-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0 -; GFX10-NEXT: s_mov_b32 s35, external_void_func_v32i32_i32_inreg@abs32@hi -; GFX10-NEXT: s_mov_b32 s34, external_void_func_v32i32_i32_inreg@abs32@lo ; GFX10-NEXT: v_writelane_b32 v40, s20, 16 ; GFX10-NEXT: v_writelane_b32 v40, s21, 17 ; GFX10-NEXT: v_writelane_b32 v40, s22, 18 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s52 -; GFX10-NEXT: v_mov_b32_e32 v1, s47 ; GFX10-NEXT: v_writelane_b32 v40, s23, 19 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 -; GFX10-NEXT: v_mov_b32_e32 v0, s46 -; GFX10-NEXT: v_mov_b32_e32 v2, s48 -; GFX10-NEXT: v_mov_b32_e32 v3, s49 ; GFX10-NEXT: v_writelane_b32 v40, s24, 20 -; GFX10-NEXT: s_mov_b32 s20, s36 -; GFX10-NEXT: s_mov_b32 s21, s37 -; GFX10-NEXT: s_mov_b32 s22, s38 -; GFX10-NEXT: s_mov_b32 s23, s39 ; GFX10-NEXT: v_writelane_b32 v40, s25, 21 -; GFX10-NEXT: s_mov_b32 s24, s40 -; GFX10-NEXT: s_mov_b32 s25, s41 -; GFX10-NEXT: v_mov_b32_e32 v4, s50 -; GFX10-NEXT: v_mov_b32_e32 v5, s51 ; GFX10-NEXT: v_writelane_b32 v40, s26, 22 -; 
GFX10-NEXT: s_mov_b32 s26, s42 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 -; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 -; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 -; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 -; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; GFX10-NEXT: v_writelane_b32 v40, s27, 23 -; GFX10-NEXT: s_mov_b32 s27, s43 ; GFX10-NEXT: v_writelane_b32 v40, s28, 24 -; GFX10-NEXT: s_mov_b32 s28, s44 ; GFX10-NEXT: v_writelane_b32 v40, s29, 25 -; GFX10-NEXT: s_mov_b32 s29, s45 ; GFX10-NEXT: v_writelane_b32 v40, s30, 26 ; GFX10-NEXT: v_writelane_b32 v40, s31, 27 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_load_dword s52, s[34:35], 0x0 +; GFX10-NEXT: ; meta instruction +; GFX10-NEXT: ; meta instruction +; GFX10-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 +; GFX10-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x0 +; GFX10-NEXT: s_mov_b32 s35, external_void_func_v32i32_i32_inreg@abs32@hi +; GFX10-NEXT: s_mov_b32 s34, external_void_func_v32i32_i32_inreg@abs32@lo +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s52 +; GFX10-NEXT: v_mov_b32_e32 v1, s30 +; GFX10-NEXT: v_mov_b32_e32 v2, s31 +; GFX10-NEXT: s_mov_b32 s4, s40 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 +; GFX10-NEXT: s_mov_b32 s5, s41 +; GFX10-NEXT: s_mov_b32 s6, s42 +; GFX10-NEXT: s_mov_b32 s7, s43 +; GFX10-NEXT: s_mov_b32 s8, s44 +; GFX10-NEXT: s_mov_b32 s9, s45 +; GFX10-NEXT: s_mov_b32 s10, s46 +; GFX10-NEXT: s_mov_b32 s11, s47 +; GFX10-NEXT: s_mov_b32 s12, s48 +; GFX10-NEXT: s_mov_b32 s13, s49 +; GFX10-NEXT: s_mov_b32 s14, s50 +; GFX10-NEXT: s_mov_b32 s15, s51 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 s0, s36 +; GFX10-NEXT: s_mov_b32 s1, 
s37 +; GFX10-NEXT: s_mov_b32 s2, s38 +; GFX10-NEXT: s_mov_b32 s3, s39 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 27 ; GFX10-NEXT: v_readlane_b32 s30, v40, 26 @@ -15406,8 +14756,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX11-NEXT: v_writelane_b32 v40, s0, 28 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_add_i32 s3, s32, 16 +; GFX11-NEXT: s_mov_b32 s35, external_void_func_v32i32_i32_inreg@abs32@hi +; GFX11-NEXT: s_add_i32 s36, s32, 8 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -15424,46 +14774,30 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX11-NEXT: v_writelane_b32 v40, s17, 13 ; GFX11-NEXT: v_writelane_b32 v40, s18, 14 ; GFX11-NEXT: v_writelane_b32 v40, s19, 15 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-NEXT: s_load_b512 s[36:51], s[0:1], 0x40 -; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v32i32_i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v32i32_i32_inreg@abs32@lo ; GFX11-NEXT: v_writelane_b32 v40, s20, 16 ; GFX11-NEXT: v_writelane_b32 v40, s21, 17 ; GFX11-NEXT: v_writelane_b32 v40, s22, 18 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v5, s51 ; GFX11-NEXT: v_writelane_b32 v40, s23, 19 -; GFX11-NEXT: v_dual_mov_b32 v4, s50 :: v_dual_mov_b32 v1, s47 -; GFX11-NEXT: v_dual_mov_b32 v0, s46 :: v_dual_mov_b32 v3, s49 ; GFX11-NEXT: v_writelane_b32 v40, s24, 20 -; GFX11-NEXT: v_mov_b32_e32 v2, s48 -; GFX11-NEXT: s_add_i32 s2, s32, 24 -; GFX11-NEXT: s_mov_b32 s20, s36 -; GFX11-NEXT: s_mov_b32 s21, s37 ; GFX11-NEXT: v_writelane_b32 v40, s25, 21 -; GFX11-NEXT: s_mov_b32 s22, s38 -; 
GFX11-NEXT: s_mov_b32 s23, s39 -; GFX11-NEXT: s_mov_b32 s24, s40 -; GFX11-NEXT: s_mov_b32 s25, s41 ; GFX11-NEXT: v_writelane_b32 v40, s26, 22 -; GFX11-NEXT: s_mov_b32 s26, s42 -; GFX11-NEXT: scratch_store_b32 off, v6, s2 -; GFX11-NEXT: scratch_store_b64 off, v[4:5], s3 -; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: v_writelane_b32 v40, s27, 23 -; GFX11-NEXT: s_mov_b32 s27, s43 ; GFX11-NEXT: v_writelane_b32 v40, s28, 24 -; GFX11-NEXT: s_mov_b32 s28, s44 ; GFX11-NEXT: v_writelane_b32 v40, s29, 25 -; GFX11-NEXT: s_mov_b32 s29, s45 ; GFX11-NEXT: v_writelane_b32 v40, s30, 26 ; GFX11-NEXT: v_writelane_b32 v40, s31, 27 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b32 s34, s[0:1], 0x0 +; GFX11-NEXT: s_load_b512 s[16:31], s[0:1], 0x40 +; GFX11-NEXT: s_load_b512 s[0:15], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, s34 :: v_dual_mov_b32 v1, s31 +; GFX11-NEXT: v_mov_b32_e32 v0, s30 +; GFX11-NEXT: s_mov_b32 s34, external_void_func_v32i32_i32_inreg@abs32@lo +; GFX11-NEXT: scratch_store_b32 off, v2, s36 +; GFX11-NEXT: scratch_store_b64 off, v[0:1], s32 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX11-NEXT: v_readlane_b32 s31, v40, 27 ; GFX11-NEXT: v_readlane_b32 s30, v40, 26 ; GFX11-NEXT: v_readlane_b32 s29, v40, 25 @@ -15511,13 +14845,17 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 28 -; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_load_dword s36, s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: s_add_i32 s3, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 
v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, s36 +; GFX10-SCRATCH-NEXT: s_add_i32 s36, s32, 8 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s10, 6 @@ -15530,50 +14868,29 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s17, 13 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s18, 14 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s19, 15 -; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_clause 0x2 -; GFX10-SCRATCH-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: ; meta instruction -; GFX10-SCRATCH-NEXT: ; meta instruction -; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x40 -; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v32i32_i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v32i32_i32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s20, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s21, 17 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s22, 18 -; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, s2 -; GFX10-SCRATCH-NEXT: s_add_i32 s2, s32, 24 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, s50 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s23, 19 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, s51 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s46 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s47 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, s48 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s24, 20 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, s49 -; GFX10-SCRATCH-NEXT: s_mov_b32 s20, s36 -; GFX10-SCRATCH-NEXT: s_mov_b32 s21, s37 -; GFX10-SCRATCH-NEXT: s_mov_b32 s22, s38 ; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s25, 21 -; GFX10-SCRATCH-NEXT: s_mov_b32 s23, s39 -; GFX10-SCRATCH-NEXT: s_mov_b32 s24, s40 -; GFX10-SCRATCH-NEXT: s_mov_b32 s25, s41 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v6, s2 -; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[4:5], s3 -; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s26, 22 -; GFX10-SCRATCH-NEXT: s_mov_b32 s26, s42 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s27, 23 -; GFX10-SCRATCH-NEXT: s_mov_b32 s27, s43 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s28, 24 -; GFX10-SCRATCH-NEXT: s_mov_b32 s28, s44 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s29, 25 -; GFX10-SCRATCH-NEXT: s_mov_b32 s29, s45 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 26 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 27 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 +; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v32i32_i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v32i32_i32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s30 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s31 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v2, s36 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[0:1], s32 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 27 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 26 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s29, v40, 25 @@ -17397,6 +16714,7 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg) ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_bf16@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_bf16@abs32@lo +; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; 
GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -17423,6 +16741,7 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg) ; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_bf16@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_bf16@abs32@lo +; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 @@ -17442,18 +16761,18 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg) ; GFX11-LABEL: test_call_external_void_func_bf16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s1, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: s_or_saveexec_b32 s2, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_bf16@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_bf16@abs32@lo +; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_writelane_b32 v40, s1, 2 +; GFX11-NEXT: s_mov_b32 s3, external_void_func_bf16@abs32@hi +; GFX11-NEXT: s_mov_b32 s2, external_void_func_bf16@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 @@ -17469,19 +16788,19 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg) ; GFX10-SCRATCH-LABEL: test_call_external_void_func_bf16_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_bf16@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_bf16@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s1, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_bf16@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_bf16@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 @@ -17511,6 +16830,7 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v1bf16@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v1bf16@abs32@lo +; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -17537,6 +16857,7 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg ; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v1bf16@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v1bf16@abs32@lo +; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: 
v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 @@ -17556,18 +16877,18 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg ; GFX11-LABEL: test_call_external_void_func_v1bf16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s1, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: s_or_saveexec_b32 s2, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v1bf16@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v1bf16@abs32@lo +; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_writelane_b32 v40, s1, 2 +; GFX11-NEXT: s_mov_b32 s3, external_void_func_v1bf16@abs32@hi +; GFX11-NEXT: s_mov_b32 s2, external_void_func_v1bf16@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 @@ -17583,19 +16904,19 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v1bf16_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, 
s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v1bf16@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v1bf16@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s1, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v1bf16@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v1bf16@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 @@ -17625,6 +16946,7 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2bf16@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2bf16@abs32@lo +; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -17651,6 +16973,7 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg ; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2bf16@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2bf16@abs32@lo +; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 @@ -17670,18 +16993,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg ; GFX11-LABEL: test_call_external_void_func_v2bf16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s1, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: 
s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: s_or_saveexec_b32 s2, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2bf16@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2bf16@abs32@lo +; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_writelane_b32 v40, s1, 2 +; GFX11-NEXT: s_mov_b32 s3, external_void_func_v2bf16@abs32@hi +; GFX11-NEXT: s_mov_b32 s2, external_void_func_v2bf16@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 @@ -17697,19 +17020,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2bf16_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2bf16@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2bf16@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s1, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v2bf16@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v2bf16@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 
s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 @@ -17739,6 +17062,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3bf16@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3bf16@abs32@lo +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -17765,8 +17090,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre ; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3bf16@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3bf16@abs32@lo -; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 @@ -17784,18 +17111,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre ; GFX11-LABEL: test_call_external_void_func_v3bf16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s2, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: s_or_saveexec_b32 s3, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s1, 
external_void_func_v3bf16@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3bf16@abs32@lo +; GFX11-NEXT: s_mov_b32 exec_lo, s3 +; GFX11-NEXT: v_writelane_b32 v40, s2, 2 +; GFX11-NEXT: s_mov_b32 s3, external_void_func_v3bf16@abs32@hi +; GFX11-NEXT: s_mov_b32 s2, external_void_func_v3bf16@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 @@ -17811,19 +17138,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3bf16_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s3, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3bf16@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3bf16@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s2, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v3bf16@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v3bf16@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX10-SCRATCH-NEXT: 
v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 @@ -17853,6 +17180,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4bf16@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4bf16@abs32@lo +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -17879,8 +17208,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre ; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4bf16@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4bf16@abs32@lo -; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 @@ -17898,18 +17229,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre ; GFX11-LABEL: test_call_external_void_func_v4bf16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s2, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: s_or_saveexec_b32 s3, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4bf16@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4bf16@abs32@lo +; GFX11-NEXT: s_mov_b32 exec_lo, s3 +; GFX11-NEXT: v_writelane_b32 v40, s2, 2 +; GFX11-NEXT: s_mov_b32 s3, external_void_func_v4bf16@abs32@hi 
+; GFX11-NEXT: s_mov_b32 s2, external_void_func_v4bf16@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 @@ -17925,19 +17256,19 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4bf16_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s3, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4bf16@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4bf16@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s2, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v4bf16@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v4bf16@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 @@ -17967,6 +17298,10 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre ; 
GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v8bf16@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v8bf16@abs32@lo +; GFX9-NEXT: s_mov_b32 s3, s7 +; GFX9-NEXT: s_mov_b32 s2, s6 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -17993,8 +17328,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre ; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v8bf16@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v8bf16@abs32@lo -; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_mov_b32 s3, s7 +; GFX10-NEXT: s_mov_b32 s2, s6 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 @@ -18012,18 +17351,18 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre ; GFX11-LABEL: test_call_external_void_func_v8bf16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s34, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: s_or_saveexec_b32 s35, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v8bf16@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v8bf16@abs32@lo +; GFX11-NEXT: s_mov_b32 exec_lo, s35 +; GFX11-NEXT: v_writelane_b32 v40, s34, 2 +; GFX11-NEXT: s_mov_b32 s35, external_void_func_v8bf16@abs32@hi +; GFX11-NEXT: s_mov_b32 s34, external_void_func_v8bf16@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 
; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 @@ -18039,19 +17378,19 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8bf16_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s34, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v8bf16@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v8bf16@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v8bf16@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v8bf16@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 @@ -18077,16 +17416,32 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 
4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 6 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v16bf16@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v16bf16@abs32@lo +; GFX9-NEXT: s_mov_b32 s3, s7 +; GFX9-NEXT: s_mov_b32 s2, s6 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s4, s8 +; GFX9-NEXT: s_mov_b32 s5, s9 +; GFX9-NEXT: s_mov_b32 s6, s10 +; GFX9-NEXT: s_mov_b32 s7, s11 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 5 +; GFX9-NEXT: v_readlane_b32 s30, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -18104,16 +17459,32 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 6 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v16bf16@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v16bf16@abs32@lo +; GFX10-NEXT: s_mov_b32 s3, 
s7 +; GFX10-NEXT: s_mov_b32 s2, s6 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s4, s8 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, s9 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: s_mov_b32 s6, s10 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: s_mov_b32 s7, s11 +; GFX10-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-NEXT: v_writelane_b32 v40, s31, 5 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -18126,18 +17497,18 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in ; GFX11-LABEL: test_call_external_void_func_v16bf16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s34, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: s_or_saveexec_b32 s35, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v16bf16@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v16bf16@abs32@lo +; GFX11-NEXT: s_mov_b32 exec_lo, s35 +; GFX11-NEXT: v_writelane_b32 v40, s34, 2 
+; GFX11-NEXT: s_mov_b32 s35, external_void_func_v16bf16@abs32@hi +; GFX11-NEXT: s_mov_b32 s34, external_void_func_v16bf16@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 @@ -18153,19 +17524,19 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v16bf16_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s34, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v16bf16@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v16bf16@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v16bf16@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v16bf16@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 diff --git 
a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll index 7799b95..25c6840 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -847,11 +847,11 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GCN-LABEL: test_indirect_call_vgpr_ptr_inreg_arg: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s5, s33 +; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 @@ -885,19 +885,19 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GCN-NEXT: v_writelane_b32 v40, s61, 29 ; GCN-NEXT: v_writelane_b32 v40, s62, 30 ; GCN-NEXT: v_writelane_b32 v40, s63, 31 -; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: s_movk_i32 s4, 0x7b +; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s8, v0 -; GCN-NEXT: v_readfirstlane_b32 s9, v1 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GCN-NEXT: s_and_saveexec_b64 s[10:11], vcc -; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GCN-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-NEXT: v_readfirstlane_b32 s7, v1 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1] +; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GCN-NEXT: s_movk_i32 s0, 0x7b +; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_xor_b64 exec, exec, s[10:11] +; GCN-NEXT: s_xor_b64 exec, exec, s[8:9] ; GCN-NEXT: s_cbranch_execnz .LBB6_1 ; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_readlane_b32 s63, v40, 31 ; 
GCN-NEXT: v_readlane_b32 s62, v40, 30 ; GCN-NEXT: v_readlane_b32 s61, v40, 29 @@ -930,22 +930,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s5 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_indirect_call_vgpr_ptr_inreg_arg: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s5, s33 +; GISEL-NEXT: s_mov_b32 s10, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 -; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GISEL-NEXT: s_mov_b64 exec, s[6:7] +; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 @@ -979,19 +979,19 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s61, 29 ; GISEL-NEXT: v_writelane_b32 v40, s62, 30 ; GISEL-NEXT: v_writelane_b32 v40, s63, 31 -; GISEL-NEXT: s_mov_b64 s[6:7], exec -; GISEL-NEXT: s_movk_i32 s4, 0x7b +; GISEL-NEXT: s_mov_b64 s[4:5], exec ; GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 -; GISEL-NEXT: v_readfirstlane_b32 s8, v0 -; GISEL-NEXT: v_readfirstlane_b32 s9, v1 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GISEL-NEXT: s_and_saveexec_b64 s[10:11], vcc -; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GISEL-NEXT: v_readfirstlane_b32 s7, v1 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 
s[6:7], v[0:1] +; GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GISEL-NEXT: s_movk_i32 s0, 0x7b +; GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr0 -; GISEL-NEXT: s_xor_b64 exec, exec, s[10:11] +; GISEL-NEXT: s_xor_b64 exec, exec, s[8:9] ; GISEL-NEXT: s_cbranch_execnz .LBB6_1 ; GISEL-NEXT: ; %bb.2: -; GISEL-NEXT: s_mov_b64 exec, s[6:7] +; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: v_readlane_b32 s63, v40, 31 ; GISEL-NEXT: v_readlane_b32 s62, v40, 30 ; GISEL-NEXT: v_readlane_b32 s61, v40, 29 @@ -1024,11 +1024,11 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 -; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GISEL-NEXT: s_mov_b64 exec, s[6:7] +; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s5 +; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void %fptr(i32 inreg 123) diff --git a/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll b/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll index 0139c52..27e9d0f 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll @@ -10,7 +10,7 @@ define amdgpu_gfx void @example(<4 x i32> inreg %rsrc, ptr addrspace(5) %src, i3 ; CHECK-NEXT: scratch_load_b32 v2, v0, off ; CHECK-NEXT: scratch_load_b32 v3, v3, off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_b64 v[2:3], v1, s[4:7], 0 offen +; CHECK-NEXT: buffer_store_b64 v[2:3], v1, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] %x0 = load i32, ptr addrspace(5) %src diff --git a/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll b/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll index cdaac14..ec1de02 
100644 --- a/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll @@ -6,7 +6,7 @@ define amdgpu_gfx i32 @sink_scratch_pointer(ptr addrspace(5) %stack, i32 inreg % ; GCN-LABEL: sink_scratch_pointer: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_cbranch_scc0 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb2 ; GCN-NEXT: scratch_load_b32 v0, v0, off offset:-4 @@ -21,7 +21,7 @@ define amdgpu_gfx i32 @sink_scratch_pointer(ptr addrspace(5) %stack, i32 inreg % ; GISEL-LABEL: sink_scratch_pointer: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GISEL-NEXT: s_cmp_lg_u32 s0, 0 ; GISEL-NEXT: s_cbranch_scc0 .LBB0_2 ; GISEL-NEXT: ; %bb.1: ; %bb2 ; GISEL-NEXT: scratch_load_b32 v0, v0, off offset:-4 -- cgit v1.1 From 7564566779eb07e9daf41a351b09cf7607871845 Mon Sep 17 00:00:00 2001 From: Jonas Paulsson Date: Wed, 20 Mar 2024 11:58:46 -0400 Subject: Reapply "Move assertion for AdjustsStack from PEI to MachineVerifier (#85698)" - The check is now actually done in both PEI and the MachineVerifier. - More .mir tests trivially updated with "adjustsStack: true" as needed. 
--- llvm/test/CodeGen/AMDGPU/fold-restore-undef-use.mir | 2 ++ llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir | 1 + llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir | 1 + llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir | 2 ++ llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir | 1 + llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir | 1 + llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir | 1 + 7 files changed, 9 insertions(+) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/fold-restore-undef-use.mir b/llvm/test/CodeGen/AMDGPU/fold-restore-undef-use.mir index 3616d61..5ef8a94 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-restore-undef-use.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-restore-undef-use.mir @@ -8,6 +8,8 @@ --- name: restore_undef_copy_use tracksRegLiveness: true +frameInfo: + adjustsStack: true machineFunctionInfo: maxKernArgAlign: 1 isEntryFunction: true diff --git a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir index bdd89a9..dde84af 100644 --- a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir @@ -13,6 +13,7 @@ name: greedy_fail_alloc_sgpr1024_spill tracksRegLiveness: true frameInfo: + adjustsStack: true hasCalls: true machineFunctionInfo: explicitKernArgSize: 16 diff --git a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir index 2ccc241..fdfc9b0 100644 --- a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir +++ b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir @@ -24,6 +24,7 @@ registers: - { id: 10, class: sreg_64_xexec, preferred-register: '$vcc' } frameInfo: maxAlignment: 1 + adjustsStack: true hasCalls: true machineFunctionInfo: maxKernArgAlign: 1 diff --git 
a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir index c0d1999..0903770 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir @@ -181,6 +181,8 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true +frameInfo: + adjustsStack: true liveins: - { reg: '$vgpr0', virtual-reg: '%0' } - { reg: '$vgpr1', virtual-reg: '%1' } diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir index efbdbca..c6ccbd9 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir @@ -78,6 +78,7 @@ name: sgpr_spill_wrong_stack_id tracksRegLiveness: true frameInfo: + adjustsStack: true hasCalls: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir index 3558298..f8ec6bb 100644 --- a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir +++ b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir @@ -21,6 +21,7 @@ name: kernel tracksRegLiveness: true frameInfo: + adjustsStack: true hasCalls: true machineFunctionInfo: isEntryFunction: true diff --git a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir index 3d9db68..6659e95 100644 --- a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir @@ -20,6 +20,7 @@ name: undef_identity_copy tracksRegLiveness: true frameInfo: maxAlignment: 4 + adjustsStack: true hasCalls: true machineFunctionInfo: isEntryFunction: true -- cgit v1.1 From a2dfc9ac7da23ccf0077081c8825a23aed1df0c0 Mon Sep 17 00:00:00 2001 From: paperchalice Date: Fri, 22 Mar 2024 08:49:29 +0800 
Subject: [NewPM][AMDGPU] Add AMDGPUPassRegistry.def (#86095) Move the pass registry to a separate file, prepare for porting dag-isel. --- llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll | 4 ++-- llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll | 4 ++-- llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll index 538ef42..b7a91f6 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck --check-prefixes=IR,IR-ITERATIVE %s -; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=DPP -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck --check-prefixes=IR,IR-DPP %s +; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck --check-prefixes=IR,IR-ITERATIVE %s +; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck --check-prefixes=IR,IR-DPP %s ; Tests various combinations of uniform/divergent address and uniform/divergent value inputs of various types for atomic operations. ; Optimization remains same for Iterative and DPP strategies when value in uniform. 
These different scan/reduction diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll index fab24e1..86e3d93 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck -check-prefix=IR-ITERATIVE %s -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-atomic-optimizer-strategy=DPP -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck -check-prefix=IR-DPP %s +; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck -check-prefix=IR-ITERATIVE %s +; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck -check-prefix=IR-DPP %s declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_kernel void @global_atomic_fadd_uni_value(ptr addrspace(1) %ptr) #0 { ; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_value( diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll index cc7a45c..e70d734 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck --check-prefixes=IR,IR-ITERATIVE %s -; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=DPP -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck --check-prefixes=IR,IR-DPP %s +; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck 
--check-prefixes=IR,IR-ITERATIVE %s +; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck --check-prefixes=IR,IR-DPP %s ; Tests various combinations of uniform/divergent address and uniform/divergent value inputs of various types for atomic operations. ; Optimization remains same for Iterative and DPP strategies when value in uniform. These different scan/reduction -- cgit v1.1 From e1a8120a63cdb6c9567b0f68d9a0390e4f5da184 Mon Sep 17 00:00:00 2001 From: Pravin Jagtap Date: Fri, 22 Mar 2024 09:25:06 +0530 Subject: [AMDGPU] Support double type in atomic optimizer. (#84307) Presently the atomic optimizer supports only 32-bit operations. Plan is to extend the atomic optimizer for 64-bit operations for compute and graphics. This patch extends support for double type for `uniform values` only. Going forward, will extend the support for divergent values. Adding support for divergent values requires extending/legalizing readfirstlane, readlane, writelane, etc ops for 64-bit operations to avoid `bitcast` noise that we have currently. 
--------- Authored-by: Pravin Jagtap --- .../AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll | 274 +- llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll | 270 +- .../AMDGPU/global_atomic_optimizer_fp_rtn.ll | 560 ++ .../AMDGPU/global_atomics_optimizer_fp_no_rtn.ll | 420 ++ .../CodeGen/AMDGPU/global_atomics_scan_fadd.ll | 5578 ++++++++++++++++++++ .../CodeGen/AMDGPU/global_atomics_scan_fmax.ll | 3960 ++++++++++++++ .../CodeGen/AMDGPU/global_atomics_scan_fmin.ll | 3960 ++++++++++++++ .../CodeGen/AMDGPU/global_atomics_scan_fsub.ll | 5576 +++++++++++++++++++ 8 files changed, 20490 insertions(+), 108 deletions(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index 255c6de..1a76f8c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -1090,18 +1090,29 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB39_3 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start +; 
GFX90A-NEXT: .LBB39_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol @@ -1109,20 +1120,31 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_cbranch_execnz .LBB39_2 +; GFX90A-NEXT: .LBB39_3: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB39_2 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: .LBB39_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst @@ -1132,26 +1154,47 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(1) 
%ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB40_2 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: .LBB40_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB40_2 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: .LBB40_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst @@ -1161,18 +1204,29 @@ main_body: define amdgpu_kernel void 
@global_atomic_fadd_f64_noret_pat_system(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB41_3 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB41_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol @@ -1180,20 +1234,31 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_cbranch_execnz .LBB41_2 +; GFX90A-NEXT: .LBB41_3: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX940: ; %bb.0: ; %main_body +; 
GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB41_2 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: .LBB41_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst @@ -1203,26 +1268,47 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB42_2 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: .LBB42_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: 
global_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB42_2 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: .LBB42_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst @@ -1394,37 +1480,59 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrspace(1) %ptr) { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB49_3 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] 
op_sel:[0,1] -; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB49_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_cbranch_execnz .LBB49_2 +; GFX90A-NEXT: .LBB49_3: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB49_2 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: .LBB49_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst @@ -1866,23 +1974,44 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr) #1 { ; GFX90A-LABEL: 
local_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB65_2 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: .LBB65_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB65_2 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: .LBB65_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst @@ -1892,23 +2021,44 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3) %ptr) #0 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; 
GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB66_2 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: .LBB66_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB66_2 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: .LBB66_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst @@ -1918,44 +2068,66 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrspace(3) %ptr) #4 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: 
v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB67_3 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s0 -; GFX90A-NEXT: ds_read_b64 v[0:1], v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, s0 +; GFX90A-NEXT: ds_read_b64 v[2:3], v4 ; GFX90A-NEXT: s_mov_b64 s[0:1], 0 -; GFX90A-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB67_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_add_f64 v[4:5], v[0:1], 4.0 -; GFX90A-NEXT: ds_cmpst_rtn_b64 v[4:5], v2, v[0:1], v[4:5] +; GFX90A-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1] +; GFX90A-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[0:1] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_cbranch_execnz .LBB67_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_cbranch_execnz .LBB67_2 +; GFX90A-NEXT: .LBB67_3: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB67_3 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-NEXT: 
s_bcnt1_i32_b64 s1, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s0 -; GFX940-NEXT: ds_read_b64 v[0:1], v2 +; GFX940-NEXT: v_mov_b32_e32 v4, s0 +; GFX940-NEXT: ds_read_b64 v[2:3], v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB67_2: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_add_f64 v[4:5], v[0:1], 4.0 -; GFX940-NEXT: ds_cmpst_rtn_b64 v[4:5], v2, v[0:1], v[4:5] +; GFX940-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1] +; GFX940-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[0:1] +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[6:7] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB67_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_cbranch_execnz .LBB67_2 +; GFX940-NEXT: .LBB67_3: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index 767d347..a948fab 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -1181,18 +1181,28 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: 
s_cbranch_execz .LBB42_3 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s6 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB42_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol @@ -1200,20 +1210,30 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_cbranch_execnz .LBB42_2 +; GFX90A-NEXT: .LBB42_3: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB42_2 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: 
v_mov_b32_e32 v2, 0 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: .LBB42_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst @@ -1223,26 +1243,45 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB43_2 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: .LBB43_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB43_2 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: .LBB43_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst @@ -1252,18 +1291,28 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB44_3 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s6 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB44_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol @@ -1271,20 +1320,30 @@ define amdgpu_kernel void 
@global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_cbranch_execnz .LBB44_2 +; GFX90A-NEXT: .LBB44_3: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB44_2 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: .LBB44_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst @@ -1294,26 +1353,45 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB45_2 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, 
s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: .LBB45_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB45_2 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: .LBB45_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst @@ -1485,37 +1563,57 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrspace(1) %ptr) { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB52_3 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 
v[0:1], s6 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB52_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_cbranch_execnz .LBB52_2 +; GFX90A-NEXT: .LBB52_3: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB52_2 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: 
buffer_inv sc1 +; GFX940-NEXT: .LBB52_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst @@ -2020,23 +2118,42 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr) #1 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB70_2 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: .LBB70_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB70_2 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: .LBB70_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst @@ -2046,23 +2163,42 @@ main_body: 
define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3) %ptr) #0 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB71_2 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: .LBB71_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB71_2 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: .LBB71_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst @@ -2072,46 +2208,66 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrspace(3) %ptr) #4 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX90A: ; 
%bb.0: ; %main_body -; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x24 -; GFX90A-NEXT: s_mov_b64 s[0:1], 0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB72_3 +; GFX90A-NEXT: ; %bb.1: +; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[2:3] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s2 -; GFX90A-NEXT: ds_read_b64 v[0:1], v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NEXT: ds_read_b64 v[2:3], v0 +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX90A-NEXT: s_mov_b64 s[0:1], 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: .LBB72_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_add_f64 v[4:5], v[0:1], 4.0 -; GFX90A-NEXT: ds_cmpst_rtn_b64 v[4:5], v2, v[0:1], v[4:5] +; GFX90A-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1] +; GFX90A-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[0:1] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_cbranch_execnz .LBB72_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_cbranch_execnz .LBB72_2 +; GFX90A-NEXT: .LBB72_3: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dword s2, s[0:1], 0x24 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: s_mov_b64 
s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB72_3 +; GFX940-NEXT: ; %bb.1: +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-NEXT: ds_read_b64 v[0:1], v0 -; GFX940-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX940-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NEXT: ds_read_b64 v[2:3], v0 +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: .LBB72_2: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_add_f64 v[4:5], v[0:1], 4.0 -; GFX940-NEXT: ds_cmpst_rtn_b64 v[4:5], v2, v[0:1], v[4:5] +; GFX940-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1] +; GFX940-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[0:1] +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[6:7] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB72_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_cbranch_execnz .LBB72_2 +; GFX940-NEXT: .LBB72_3: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll index b7a91f6..b717280 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll @@ 
-1058,6 +1058,566 @@ define amdgpu_ps float @global_atomic_fadd_div_address_div_value_system_scope_st ret float %result } +define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double inreg %val) #0 { +; IR-LABEL: @global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe( +; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]] +; IR: 2: +; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) +; IR-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to double +; IR-NEXT: [[TMP12:%.*]] = fmul double [[VAL:%.*]], [[TMP11]] +; IR-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR: 14: +; IR-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] syncscope("agent") monotonic, align 4 +; IR-NEXT: br label [[TMP16]] +; IR: 16: +; IR-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] +; IR-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64 +; IR-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 +; IR-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 +; IR-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 +; IR-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) +; IR-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) +; IR-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 +; IR-NEXT: [[TMP25:%.*]] = 
insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1 +; IR-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double +; IR-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP8]] to double +; IR-NEXT: [[TMP28:%.*]] = fmul double [[VAL]], [[TMP27]] +; IR-NEXT: [[TMP29:%.*]] = fadd double [[TMP26]], [[TMP28]] +; IR-NEXT: br label [[TMP30]] +; IR: 30: +; IR-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ] +; IR-NEXT: ret double [[TMP31]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4 + ret double %result +} + +define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 { +; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4 + ret double %result +} + +define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double inreg %val) #1 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 
@llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-ITERATIVE: 14: +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] syncscope("one-as") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64 +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP28:%.*]] = call double 
@llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[TMP30]] +; IR-ITERATIVE: 30: +; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP31]] +; +; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-DPP: 14: +; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] 
syncscope("one-as") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP16]] +; IR-DPP: 16: +; IR-DPP-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] +; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64 +; IR-DPP-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 +; IR-DPP-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 +; IR-DPP-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 +; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 +; IR-DPP-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1 +; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double +; IR-DPP-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: br label [[TMP30]] +; IR-DPP: 30: +; IR-DPP-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ] +; IR-DPP-NEXT: ret double [[TMP31]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double %val) #1 { +; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) 
[[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, double inreg %val) #2 { +; IR-ITERATIVE-LABEL: @global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-ITERATIVE: 14: +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] syncscope("agent") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; 
IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64 +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[TMP30]] +; IR-ITERATIVE: 30: +; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP31]] +; +; IR-DPP-LABEL: @global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; 
IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-DPP: 14: +; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] syncscope("agent") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP16]] +; IR-DPP: 16: +; IR-DPP-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] +; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64 +; IR-DPP-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 +; IR-DPP-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 +; IR-DPP-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 +; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 +; IR-DPP-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1 +; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double +; IR-DPP-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata 
!"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: br label [[TMP30]] +; IR-DPP: 30: +; IR-DPP-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ] +; IR-DPP-NEXT: ret double [[TMP31]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 { +; IR-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fsub ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double inreg %val) #0 { +; IR-LABEL: @global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe( +; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP26:%.*]] +; IR: 2: +; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; 
IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR: 10: +; IR-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: br label [[TMP12]] +; IR: 12: +; IR-NEXT: [[TMP13:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ] +; IR-NEXT: [[TMP14:%.*]] = bitcast double [[TMP13]] to i64 +; IR-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +; IR-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP14]], 32 +; IR-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 +; IR-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP15]]) +; IR-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP17]]) +; IR-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 +; IR-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP19]], i32 1 +; IR-NEXT: [[TMP22:%.*]] = bitcast <2 x i32> [[TMP21]] to double +; IR-NEXT: [[TMP23:%.*]] = uitofp i32 [[TMP8]] to double +; IR-NEXT: [[TMP24:%.*]] = select i1 [[TMP9]], double 0x7FF0000000000000, double [[VAL]] +; IR-NEXT: [[TMP25:%.*]] = call double @llvm.minnum.f64(double [[TMP22]], double [[TMP24]]) +; IR-NEXT: br label [[TMP26]] +; IR: 26: +; IR-NEXT: [[TMP27:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP25]], [[TMP12]] ] +; IR-NEXT: ret double [[TMP27]] +; + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 { +; IR-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") 
monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic__fmax_double_uni_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double inreg %val) #1{ +; IR-ITERATIVE-LABEL: @global_atomic__fmax_double_uni_address_uni_value_agent_scope_unsafe_structfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP26:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP12]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast double [[TMP13]] to i64 +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP14]], 32 +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP15]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP17]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] 
= insertelement <2 x i32> [[TMP20]], i32 [[TMP19]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast <2 x i32> [[TMP21]] to double +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = select i1 [[TMP9]], double 0xFFF0000000000000, double [[VAL]] +; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP22]], double [[TMP24]], metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[TMP26]] +; IR-ITERATIVE: 26: +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP25]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP27]] +; +; IR-DPP-LABEL: @global_atomic__fmax_double_uni_address_uni_value_agent_scope_unsafe_structfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP26:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR-DPP: 10: +; IR-DPP-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP12]] +; IR-DPP: 12: +; IR-DPP-NEXT: [[TMP13:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ] +; IR-DPP-NEXT: [[TMP14:%.*]] = bitcast double [[TMP13]] to i64 +; IR-DPP-NEXT: [[TMP15:%.*]] 
= trunc i64 [[TMP14]] to i32 +; IR-DPP-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP14]], 32 +; IR-DPP-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 +; IR-DPP-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP15]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP17]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 +; IR-DPP-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP19]], i32 1 +; IR-DPP-NEXT: [[TMP22:%.*]] = bitcast <2 x i32> [[TMP21]] to double +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = select i1 [[TMP9]], double 0xFFF0000000000000, double [[VAL]] +; IR-DPP-NEXT: [[TMP25:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP22]], double [[TMP24]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: br label [[TMP26]] +; IR-DPP: 26: +; IR-DPP-NEXT: [[TMP27:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP25]], [[TMP12]] ] +; IR-DPP-NEXT: ret double [[TMP27]] +; + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double %val) #1{ +; IR-LABEL: @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, double inreg %val) #2 { +; IR-ITERATIVE-LABEL: 
@global_atomic_fadd_double_uni_address_uni_value_system_scope_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-ITERATIVE: 14: +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64 +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call 
i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[TMP30]] +; IR-ITERATIVE: 30: +; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP31]] +; +; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_uni_value_system_scope_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = trunc 
i64 [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-DPP: 14: +; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP16]] +; IR-DPP: 16: +; IR-DPP-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] +; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64 +; IR-DPP-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 +; IR-DPP-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 +; IR-DPP-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 +; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 +; IR-DPP-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1 +; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double +; IR-DPP-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; 
IR-DPP-NEXT: br label [[TMP30]] +; IR-DPP: 30: +; IR-DPP-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ] +; IR-DPP-NEXT: ret double [[TMP31]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4 + ret double %result +} + +define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 { +; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] monotonic, align 4 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4 + ret double %result +} + +define amdgpu_ps double @global_atomic_fadd_double_div_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr, double inreg %val) #0 { +; IR-LABEL: @global_atomic_fadd_double_div_address_uni_value_agent_scope_unsafe( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4 + ret double %result +} + +define amdgpu_ps double @global_atomic_fadd_double_div_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr, double %val) #0 { +; IR-LABEL: @global_atomic_fadd_double_div_address_div_value_agent_scope_unsafe( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4 + ret double %result +} + +define amdgpu_ps double @global_atomic_fadd_double_div_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, double inreg %val) #1 { +; IR-LABEL: 
@global_atomic_fadd_double_div_address_uni_value_one_as_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic_fadd_double_div_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, double %val) #1 { +; IR-LABEL: @global_atomic_fadd_double_div_address_div_value_one_as_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic_fsub_double_div_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr, double inreg %val) #2 { +; IR-LABEL: @global_atomic_fsub_double_div_address_uni_value_agent_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic_fsub_double_div_address_div_value_agent_scope_strictfp(ptr addrspace(1) %ptr, double %val) #2 { +; IR-LABEL: @global_atomic_fsub_double_div_address_div_value_agent_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fsub ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic_fmin_double_div_address_uni_value_agent_scope(ptr 
addrspace(1) %ptr, double inreg %val) #0 { +; IR-LABEL: @global_atomic_fmin_double_div_address_uni_value_agent_scope( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic_fmin_double_div_address_div_value_agent_scope(ptr addrspace(1) %ptr, double %val) #0 { +; IR-LABEL: @global_atomic_fmin_double_div_address_div_value_agent_scope( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic__fmax_double_div_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, double inreg %val) #1{ +; IR-LABEL: @global_atomic__fmax_double_div_address_uni_value_agent_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic__fmax_double_div_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, double %val) #1{ +; IR-LABEL: @global_atomic__fmax_double_div_address_div_value_agent_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret double %result +} + +define amdgpu_ps double 
@global_atomic_fadd_double_div_address_uni_value_system_scope_strictfp(ptr addrspace(1) %ptr, double inreg %val) #2 { +; IR-LABEL: @global_atomic_fadd_double_div_address_uni_value_system_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] monotonic, align 4 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4 + ret double %result +} + +define amdgpu_ps double @global_atomic_fadd_double_div_address_div_value_system_scope_strictfp(ptr addrspace(1) %ptr, double %val) #2 { +; IR-LABEL: @global_atomic_fadd_double_div_address_div_value_system_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] monotonic, align 4 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4 + ret double %result +} + attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } attributes #1 = { strictfp "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } attributes #2 = { strictfp } diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll index e70d734..b9234f4 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll @@ -864,6 +864,426 @@ define amdgpu_ps void @global_atomic_fadd_div_address_div_value_system_scope_str ret void } +define amdgpu_ps void @global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double inreg %val) #0 { +; IR-LABEL: @global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe( +; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR: 2: +; IR-NEXT: [[TMP3:%.*]] 
= call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) +; IR-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to double +; IR-NEXT: [[TMP12:%.*]] = fmul double [[VAL:%.*]], [[TMP11]] +; IR-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR: 14: +; IR-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] syncscope("agent") monotonic, align 4 +; IR-NEXT: br label [[TMP16]] +; IR: 16: +; IR-NEXT: br label [[TMP17]] +; IR: 17: +; IR-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_ps void @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 { +; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_ps void @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double inreg %val) #1 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label 
[[TMP17:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-ITERATIVE: 14: +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] syncscope("one-as") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: br label [[TMP17]] +; IR-ITERATIVE: 17: +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] 
to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-DPP: 14: +; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] syncscope("one-as") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP16]] +; IR-DPP: 16: +; IR-DPP-NEXT: br label [[TMP17]] +; IR-DPP: 17: +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double %val) #1 { +; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8 +; IR-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, double inreg %val) #2 { +; IR-ITERATIVE-LABEL: @global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = 
call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-ITERATIVE: 14: +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] syncscope("agent") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: br label [[TMP17]] +; IR-ITERATIVE: 17: +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; 
IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-DPP: 14: +; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] syncscope("agent") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP16]] +; IR-DPP: 16: +; IR-DPP-NEXT: br label [[TMP17]] +; IR-DPP: 17: +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 { +; IR-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret void +; + %result = atomicrmw fsub ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double inreg %val) #0 { +; IR-LABEL: 
@global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe( +; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] +; IR: 2: +; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR: 10: +; IR-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: br label [[TMP12]] +; IR: 12: +; IR-NEXT: br label [[TMP13]] +; IR: 13: +; IR-NEXT: ret void +; + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 { +; IR-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret void +; + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double inreg %val) #1{ +; IR-ITERATIVE-LABEL: @global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe_structfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] +; IR-ITERATIVE: 2: +; 
IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP12]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: br label [[TMP13]] +; IR-ITERATIVE: 13: +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe_structfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR-DPP: 10: +; IR-DPP-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP12]] +; IR-DPP: 12: +; IR-DPP-NEXT: br label [[TMP13]] 
+; IR-DPP: 13: +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double %val) #1{ +; IR-LABEL: @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret void +; + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fadd_double_uni_address_uni_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, double inreg %val) #2 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_uni_value_system_scope_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double 
[[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-ITERATIVE: 14: +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: br label [[TMP17]] +; IR-ITERATIVE: 17: +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_uni_value_system_scope_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-DPP: 14: +; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] monotonic, align 4 +; IR-DPP-NEXT: br label 
[[TMP16]] +; IR-DPP: 16: +; IR-DPP-NEXT: br label [[TMP17]] +; IR-DPP: 17: +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4 + ret void +} + +define amdgpu_ps void @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 { +; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] monotonic, align 4 +; IR-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4 + ret void +} + +define amdgpu_ps void @global_atomic_fadd_double_div_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr, double inreg %val) #0 { +; IR-LABEL: @global_atomic_fadd_double_div_address_uni_value_agent_scope_unsafe( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_ps void @global_atomic_fadd_double_div_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr, double %val) #0 { +; IR-LABEL: @global_atomic_fadd_double_div_address_div_value_agent_scope_unsafe( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_ps void @global_atomic_fadd_double_div_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, double inreg %val) #1 { +; IR-LABEL: @global_atomic_fadd_double_div_address_uni_value_one_as_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, 
align 8 +; IR-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fadd_double_div_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, double %val) #1 { +; IR-LABEL: @global_atomic_fadd_double_div_address_div_value_one_as_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8 +; IR-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fsub_double_div_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr, double inreg %val) #2 { +; IR-LABEL: @global_atomic_fsub_double_div_address_uni_value_agent_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fsub_double_div_address_div_value_agent_scope_strictfp(ptr addrspace(1) %ptr, double %val) #2 { +; IR-LABEL: @global_atomic_fsub_double_div_address_div_value_agent_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret void +; + %result = atomicrmw fsub ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fmin_double_div_address_uni_value_agent_scope(ptr addrspace(1) %ptr, double inreg %val) #0 { +; IR-LABEL: @global_atomic_fmin_double_div_address_uni_value_agent_scope( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret void +; + %result = atomicrmw fmin ptr 
addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fmin_double_div_address_div_value_agent_scope(ptr addrspace(1) %ptr, double %val) #0 { +; IR-LABEL: @global_atomic_fmin_double_div_address_div_value_agent_scope( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret void +; + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fmax_double_div_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, double inreg %val) #1{ +; IR-LABEL: @global_atomic_fmax_double_div_address_uni_value_agent_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret void +; + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fmax_double_div_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, double %val) #1{ +; IR-LABEL: @global_atomic_fmax_double_div_address_div_value_agent_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret void +; + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fadd_double_div_address_uni_value_system_scope_strictfp(ptr addrspace(1) %ptr, double inreg %val) #2 { +; IR-LABEL: @global_atomic_fadd_double_div_address_uni_value_system_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] monotonic, align 4 +; IR-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4 + ret void +} + +define 
amdgpu_ps void @global_atomic_fadd_double_div_address_div_value_system_scope_strictfp(ptr addrspace(1) %ptr, double %val) #2 { +; IR-LABEL: @global_atomic_fadd_double_div_address_div_value_system_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] monotonic, align 4 +; IR-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4 + ret void +} + attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } attributes #1 = { strictfp "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } attributes #2 = { strictfp } diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 96c615b..4f00d48 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -13,6 +13,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s declare float @div.float.value() +declare double @div.double.value() define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: @@ -5408,6 +5409,5583 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop ret void } +define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; 
GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB9_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dwordx2 s[2:3], s[36:37], 0x0 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX7LESS-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 +; GFX7LESS-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:8 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 
v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b32 s12, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2 +; GFX7LESS-NEXT: .LBB9_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s42, -1 +; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-NEXT: s_add_u32 s40, s40, s3 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_mov_b32 s33, s2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB9_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, 
s0 +; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-NEXT: s_mov_b32 s12, s33 +; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB9_2 +; GFX9-NEXT: .LBB9_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s42, -1 +; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-NEXT: s_mov_b32 s33, s2 +; GFX1064-NEXT: s_mov_b64 
s[2:3], exec +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB9_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1064-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-NEXT: s_mov_b32 s12, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: 
v_mov_b32_e32 v2, s36 +; GFX1064-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1064-NEXT: .LBB9_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s33, s2 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s42, -1 +; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-NEXT: s_mov_b32 s38, 0 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB9_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], 
v[41:42] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-NEXT: s_mov_b32 s12, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1032-NEXT: .LBB9_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s33, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB9_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1164-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b32 s12, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: 
v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1164-NEXT: .LBB9_3: +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-NEXT: s_mov_b32 s38, 0 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB9_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: 
v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b32 s12, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1132-NEXT: .LBB9_3: +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s42, -1 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-DPP-NEXT: s_mov_b32 s33, s2 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX9-DPP-NEXT: s_load_dwordx2 
s[36:37], s[34:35], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s33 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-DPP-NEXT: 
s_cbranch_execnz .LBB9_2 +; GFX9-DPP-NEXT: .LBB9_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; 
GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1064-DPP-NEXT: .LBB9_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 +; 
GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-DPP-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1032-DPP-NEXT: .LBB9_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1164-DPP-NEXT: .LBB9_3: +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-DPP-NEXT: 
s_mov_b32 s38, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], 
s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1132-DPP-NEXT: .LBB9_3: +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, 
v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[40:41] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; 
GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: s_mov_b64 
s[2:3], s[50:51] +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: v_mov_b32_e32 v43, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] 
+; GFX9-NEXT: s_cbranch_execnz .LBB10_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mov_b32_e32 v43, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: 
s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: 
s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_mov_b32_e32 v43, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, 
__atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: v_mov_b32_e32 v43, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: v_mov_b32_e32 v41, v1 +; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: 
v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 +; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-NEXT: s_mov_b32 s44, 0 +; 
GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, 
SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; 
GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_addc_u32 s49, 
s49, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: 
buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; 
GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, 
__atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; 
GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: 
s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 
s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 
v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.float.value() + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s3 +; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB11_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX7LESS-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_2 +; GFX7LESS-NEXT: .LBB11_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB11_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB11_2 +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB11_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1064-NEXT: 
global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1064-NEXT: .LBB11_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB11_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: 
s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1032-NEXT: .LBB11_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1164-NEXT: scratch_store_b32 off, v1, off +; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_cbranch_execz .LBB11_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB11_2 
+; GFX1164-NEXT: .LBB11_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1132-NEXT: scratch_store_b32 off, v1, off +; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_cbranch_execz .LBB11_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1132-NEXT: .LBB11_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; 
GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX9-DPP-NEXT: .LBB11_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, 
SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1064-DPP-NEXT: .LBB11_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; 
GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1032-DPP-NEXT: .LBB11_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_clause 0x1 +; 
GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off +; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1164-DPP-NEXT: .LBB11_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, 
s2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off +; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1132-DPP-NEXT: .LBB11_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: 
global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; 
GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_add_u32 s8, s2, 44 +; GFX9-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_getpc_b64 s[2:3] +; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-NEXT: 
v_mov_b32_e32 v5, v3 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB12_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, 
v[2:3], v[4:5] +; GFX1064-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) 
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: 
global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: 
s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b32 s12, s6 +; GFX9-DPP-NEXT: s_mov_b32 s13, s7 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 
0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: 
global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-DPP-NEXT: 
s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; 
GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.double.value() strictfp + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic + ret void +} + +define amdgpu_kernel void 
@global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr) #2{ +; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s3 +; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; 
GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2 +; GFX7LESS-NEXT: .LBB13_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-NEXT: .LBB13_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: 
global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1064-NEXT: .LBB13_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; 
GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1032-NEXT: .LBB13_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1164-NEXT: scratch_store_b32 off, v1, off +; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1164-NEXT: 
v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1164-NEXT: .LBB13_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1132-NEXT: scratch_store_b32 off, v1, off +; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1132-NEXT: 
v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1132-NEXT: .LBB13_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-DPP-NEXT: 
v_mov_b32_e32 v1, 0xc3300000 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-DPP-NEXT: .LBB13_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: 
v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1064-DPP-NEXT: .LBB13_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1032-DPP-NEXT: .LBB13_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off +; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1164-DPP-NEXT: .LBB13_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off +; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: 
s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1132-DPP-NEXT: .LBB13_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-NEXT: s_add_u32 s2, s2, 
div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_add_u32 s8, s2, 44 +; GFX9-NEXT: 
s_addc_u32 s9, s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_getpc_b64 s[2:3] +; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB14_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; 
GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-NEXT: s_addc_u32 s9, s3, 0 +; 
GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: 
s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 
s[30:31], s[6:7] +; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b32 s12, s6 +; GFX9-DPP-NEXT: s_mov_b32 s13, s7 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; 
GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; 
GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: 
v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; 
GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], 
s[6:7] +; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.double.value() + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s3, s3, 
div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_add_u32 s8, s2, 44 +; GFX9-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], 
s[2:3], 0x24 +; GFX9-NEXT: s_getpc_b64 s[2:3] +; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB15_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: 
s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; 
GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; 
GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-NEXT: 
global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b32 s12, s6 +; GFX9-DPP-NEXT: s_mov_b32 s13, s7 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; 
GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; 
GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; 
GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; 
GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-DPP-NEXT: 
global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.float.value() strictfp + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp(ptr addrspace(1) %ptr) #2 { +; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB16_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], 
s[34:35], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s1, 0x43300000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dwordx2 s[2:3], s[36:37], 0x0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX7LESS-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 +; GFX7LESS-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:8 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b32 s12, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 
+; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2 +; GFX7LESS-NEXT: .LBB16_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s42, -1 +; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-NEXT: s_add_u32 s40, s40, s3 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_mov_b32 s33, s2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB16_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, 
__atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-NEXT: s_mov_b32 s12, s33 +; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB16_2 +; GFX9-NEXT: .LBB16_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s42, -1 +; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-NEXT: s_mov_b32 s33, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 
0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB16_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1064-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-NEXT: s_mov_b32 s12, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, 
off, s[40:43], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1064-NEXT: .LBB16_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s33, s2 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s42, -1 +; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-NEXT: s_mov_b32 s38, 0 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB16_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, 
__atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-NEXT: s_mov_b32 s12, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1032-NEXT: .LBB16_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:20 +; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:16 +; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:16 +; 
GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_cbranch_execz .LBB16_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s2 +; GFX1164-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b32 s12, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1164-NEXT: .LBB16_3: +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:20 +; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:16 +; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:16 +; GFX1132-NEXT: s_mov_b32 s38, 0 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_cbranch_execz .LBB16_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b32 s12, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1132-NEXT: .LBB16_3: +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s42, -1 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-DPP-NEXT: s_add_u32 s40, 
s40, s3 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-DPP-NEXT: s_mov_b32 s33, s2 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s33 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36 +; 
GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX9-DPP-NEXT: .LBB16_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] 
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1064-DPP-NEXT: .LBB16_3: +; 
GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1032-DPP-NEXT: .LBB16_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:20 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:16 +; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; 
GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36 +; 
GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1164-DPP-NEXT: .LBB16_3: +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:20 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:16 +; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16 +; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], 4.0, 
v[0:1] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1132-DPP-NEXT: .LBB16_3: +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 monotonic, align 4 + 
ret void +} + +define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp(ptr addrspace(1) %ptr) #2 { +; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 +; 
GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[40:41] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7LESS-NEXT: ; 
%bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: v_mov_b32_e32 v43, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: 
s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_cbranch_execnz .LBB17_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 
s9, s35, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mov_b32_e32 v43, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; 
GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; 
GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_mov_b32_e32 v43, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: s_mov_b32 
s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: v_mov_b32_e32 v43, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; 
GFX1164-NEXT: v_mov_b32_e32 v41, v1 +; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 +; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 
0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; 
GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; 
GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: 
v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: 
s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 
v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; 
GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0 +; 
GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: 
v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; 
GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.float.value() strictfp + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue monotonic, align 4 + ret void +} + attributes #0 = { 
"denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } attributes #1 = { strictfp "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } attributes #2 = { strictfp} diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 3cc5a4c..622be43 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -13,6 +13,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s declare float @div.float.value() +declare float @div.double.value() define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: @@ -3550,6 +3551,3965 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop ret void } +define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB6_3 +; GFX7LESS-NEXT: ; 
%bb.1: +; GFX7LESS-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:8 +; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b32 s12, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: 
v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_2 +; GFX7LESS-NEXT: .LBB6_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s42, -1 +; GFX9-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: s_add_u32 s40, s40, s3 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_cbranch_execz .LBB6_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-NEXT: s_mov_b32 s33, s2 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-NEXT: s_mov_b32 s12, s33 +; GFX9-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; 
GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB6_2 +; GFX9-NEXT: .LBB6_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s42, -1 +; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB6_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-NEXT: s_mov_b32 s33, s2 +; GFX1064-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; 
GFX1064-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-NEXT: s_mov_b32 s12, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1064-NEXT: .LBB6_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; 
GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s42, -1 +; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-NEXT: s_mov_b32 s38, 0 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB6_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-NEXT: s_mov_b32 s33, s2 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-NEXT: s_mov_b32 s12, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; 
GFX1032-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1032-NEXT: .LBB6_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB6_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s2 +; GFX1164-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, 
__atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b32 s12, s33 +; GFX1164-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1164-NEXT: .LBB6_3: +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-NEXT: s_mov_b32 s38, 0 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB6_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: 
v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b32 s12, s33 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1132-NEXT: .LBB6_3: +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s41, 
SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s42, -1 +; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s33, s2 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s33 +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: 
buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX9-DPP-NEXT: .LBB6_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; 
=>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1064-DPP-NEXT: .LBB6_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; 
GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, 
off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1032-DPP-NEXT: .LBB6_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; 
GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1164-DPP-NEXT: .LBB6_3: +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: 
v_mov_b32_e32 v40, v0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1132-DPP-NEXT: .LBB6_3: +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 
10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX7LESS-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 
v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: 
s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: 
s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_cbranch_execnz .LBB7_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: 
s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s48, 
SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 
s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: 
s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off 
offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-NEXT: s_mov_b32 
s44, 0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 
s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, 
__atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s48, 
s48, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, 
__atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 
s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 
+; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; 
GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: 
s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 
s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-DPP-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.double.value() + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB8_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, v1 
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_2 +; GFX7LESS-NEXT: .LBB8_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB8_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB8_2 +; GFX9-NEXT: .LBB8_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: 
s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB8_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1064-NEXT: .LBB8_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB8_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1032-NEXT: s_waitcnt 
vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1032-NEXT: .LBB8_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB8_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1164-NEXT: .LBB8_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB8_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1132-NEXT: .LBB8_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DPP-NEXT: 
v_mov_b32_e32 v3, s5 +; GFX9-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX9-DPP-NEXT: .LBB8_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] 
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1064-DPP-NEXT: .LBB8_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1032-DPP-NEXT: .LBB8_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; 
GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1164-DPP-NEXT: .LBB8_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1132-DPP-NEXT: .LBB8_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: 
v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_add_u32 s8, s2, 44 +; GFX9-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_getpc_b64 s[2:3] +; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; 
GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB9_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, 
div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-NEXT: 
s_addc_u32 s9, s3, 0 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, 
s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: 
v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; 
GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b32 s12, s6 +; GFX9-DPP-NEXT: s_mov_b32 s13, s7 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: 
s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; 
GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; 
GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.double.value() + %result = atomicrmw fmax ptr addrspace(1) %ptr, double 
%divValue syncscope("one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB10_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: 
buffer_store_dword v1, off, s[40:43], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:8 +; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b32 s12, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_2 +; GFX7LESS-NEXT: .LBB10_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s42, -1 +; GFX9-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: s_add_u32 s40, s40, s3 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_cbranch_execz .LBB10_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-NEXT: s_mov_b32 s33, s2 +; GFX9-NEXT: s_mov_b64 
s[38:39], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-NEXT: s_mov_b32 s12, s33 +; GFX9-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB10_2 +; GFX9-NEXT: .LBB10_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mov_b32_e32 v40, 
v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s42, -1 +; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB10_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-NEXT: s_mov_b32 s33, s2 +; GFX1064-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-NEXT: s_mov_b32 s12, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 
offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1064-NEXT: .LBB10_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s42, -1 +; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-NEXT: s_mov_b32 s38, 0 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB10_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-NEXT: s_mov_b32 s33, s2 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-NEXT: s_add_u32 
s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-NEXT: s_mov_b32 s12, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1032-NEXT: .LBB10_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB10_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s2 +; GFX1164-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b32 s12, s33 +; GFX1164-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; 
GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1164-NEXT: .LBB10_3: +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-NEXT: s_mov_b32 s38, 0 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB10_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b32 s12, s33 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; 
GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1132-NEXT: .LBB10_3: +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s42, -1 +; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s33, s2 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt 
vmcnt(0) +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s33 +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX9-DPP-NEXT: .LBB10_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s41, 
SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_store_dword 
v4, off, s[40:43], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1064-DPP-NEXT: .LBB10_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-DPP-NEXT: 
.LBB10_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1032-DPP-NEXT: .LBB10_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: 
global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: 
scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1164-DPP-NEXT: .LBB10_3: +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: 
v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1132-DPP-NEXT: .LBB10_3: +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, 
SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX7LESS-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; 
GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: 
s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 
s[4:5], s[38:39] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_cbranch_execnz .LBB11_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: 
s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 
s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; 
GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; 
GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB11_1: ; 
%atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; 
GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], 
s[4:5] +; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; 
GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], 
s[36:37] +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, 
s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], 
s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; 
GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: 
s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: 
s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.double.value() + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue monotonic, align 4 + ret void +} + attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } !llvm.module.flags = !{!0} diff --git 
a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index 314c52a..49d415c 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -13,6 +13,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s declare float @div.float.value() +declare float @div.double.value() define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: @@ -3550,6 +3551,3965 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop ret void } +define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB6_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0 +; 
GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:8 +; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b32 s12, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_2 +; GFX7LESS-NEXT: .LBB6_3: +; GFX7LESS-NEXT: s_endpgm +; +; 
GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s42, -1 +; GFX9-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: s_add_u32 s40, s40, s3 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_cbranch_execz .LBB6_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-NEXT: s_mov_b32 s33, s2 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-NEXT: s_mov_b32 s12, s33 +; GFX9-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: 
buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB6_2 +; GFX9-NEXT: .LBB6_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s42, -1 +; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB6_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-NEXT: s_mov_b32 s33, s2 +; GFX1064-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; 
GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-NEXT: s_mov_b32 s12, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1064-NEXT: .LBB6_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s42, -1 +; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; 
GFX1032-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-NEXT: s_mov_b32 s38, 0 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB6_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-NEXT: s_mov_b32 s33, s2 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-NEXT: s_mov_b32 s12, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-NEXT: s_clause 0x1 +; 
GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1032-NEXT: .LBB6_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB6_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s2 +; GFX1164-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; 
GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b32 s12, s33 +; GFX1164-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1164-NEXT: .LBB6_3: +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-NEXT: s_mov_b32 s38, 0 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB6_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) 
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b32 s12, s33 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1132-NEXT: .LBB6_3: +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s42, -1 +; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX9-DPP-NEXT: 
v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s33, s2 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s33 +; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX9-DPP-NEXT: .LBB6_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; 
GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1064-DPP-NEXT: .LBB6_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s42, -1 +; 
GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-DPP-NEXT: 
v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1032-DPP-NEXT: .LBB6_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 
v[3:4], v[1:2], v[1:2] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1164-DPP-NEXT: .LBB6_3: +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; 
GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1132-DPP-NEXT: .LBB6_3: +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; 
GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX7LESS-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, 
s[42:43] +; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_cbranch_execnz .LBB7_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: 
+; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, 
__atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_mov_b64 
s[34:35], s[2:3] +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], 
s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: 
s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: 
s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; 
GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] 
+; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 
0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 
44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; 
GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: 
s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: 
v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 
s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.double.value() + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB8_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; GFX7LESS-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_2 +; GFX7LESS-NEXT: .LBB8_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB8_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB8_2 +; GFX9-NEXT: .LBB8_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB8_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1064-NEXT: .LBB8_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB8_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, 
s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1032-NEXT: .LBB8_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB8_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1164-NEXT: .LBB8_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB8_3 
+; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1132-NEXT: .LBB8_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX9-DPP-NEXT: 
global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX9-DPP-NEXT: .LBB8_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1064-DPP-NEXT: .LBB8_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; 
GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1032-DPP-NEXT: .LBB8_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1164-DPP-NEXT: .LBB8_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1132-DPP-NEXT: 
global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1132-DPP-NEXT: .LBB8_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; 
GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_add_u32 s8, s2, 44 +; GFX9-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_getpc_b64 s[2:3] +; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: 
v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB9_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; 
GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, 
div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 
0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; 
GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 
s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b32 s12, s6 +; GFX9-DPP-NEXT: s_mov_b32 s13, s7 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 +; 
GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; 
GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-DPP-NEXT: 
s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.double.value() + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: 
global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB10_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:8 +; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt 
expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b32 s12, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_2 +; GFX7LESS-NEXT: .LBB10_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s42, -1 +; GFX9-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: s_add_u32 s40, s40, s3 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_cbranch_execz .LBB10_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-NEXT: s_mov_b32 s33, s2 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 
v1, s0 +; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-NEXT: s_mov_b32 s12, s33 +; GFX9-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB10_2 +; GFX9-NEXT: .LBB10_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s42, -1 +; 
GFX1064-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB10_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-NEXT: s_mov_b32 s33, s2 +; GFX1064-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-NEXT: s_mov_b32 s12, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-NEXT: v_mov_b32_e32 v3, s37 +; 
GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1064-NEXT: .LBB10_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s42, -1 +; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-NEXT: s_mov_b32 s38, 0 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB10_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-NEXT: s_mov_b32 s33, s2 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 
s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-NEXT: s_mov_b32 s12, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1032-NEXT: .LBB10_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB10_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[36:37], 
s[34:35], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s2 +; GFX1164-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b32 s12, s33 +; GFX1164-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1164-NEXT: .LBB10_3: +; GFX1164-NEXT: 
s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-NEXT: s_mov_b32 s38, 0 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB10_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b32 s12, s33 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1132-NEXT: .LBB10_3: +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s42, -1 +; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s33, s2 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; 
GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s33 +; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX9-DPP-NEXT: .LBB10_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: s_add_u32 s40, 
s40, s3 +; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36 +; 
GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1064-DPP-NEXT: .LBB10_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; 
GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1032-DPP-NEXT: .LBB10_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], 
s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37 +; 
GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1164-DPP-NEXT: .LBB10_3: +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, 
__atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1132-DPP-NEXT: .LBB10_3: +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; 
GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX7LESS-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, 
__atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 
+; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 
+; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_cbranch_execnz .LBB11_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 
s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 
+; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; 
GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: 
buffer_load_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: 
s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, 
div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 
+; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 
s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; 
GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, 
s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; 
GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: 
s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: 
s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, 
__atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.double.value() + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue monotonic, align 4 + ret void +} + attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } !llvm.module.flags = !{!0} diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index bc9125e..7a7ddbe 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ 
b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -13,6 +13,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s declare float @div.float.value() +declare double @div.double.value() define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: @@ -5616,6 +5617,5581 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop ret void } +define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB9_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dwordx2 s[2:3], s[36:37], 0x0 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX7LESS-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 +; GFX7LESS-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:8 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b32 s12, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2 +; GFX7LESS-NEXT: .LBB9_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: +; 
GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s42, -1 +; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-NEXT: s_add_u32 s40, s40, s3 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_mov_b32 s33, s2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB9_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-NEXT: s_mov_b32 s12, s33 +; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; 
GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB9_2 +; GFX9-NEXT: .LBB9_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s42, -1 +; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-NEXT: s_mov_b32 s33, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB9_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1064-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: .LBB9_2: ; %atomicrmw.start 
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-NEXT: s_mov_b32 s12, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1064-NEXT: .LBB9_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s33, s2 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1032-NEXT: s_mov_b32 
s40, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s42, -1 +; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-NEXT: s_mov_b32 s38, 0 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB9_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-NEXT: s_mov_b32 s12, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; 
GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1032-NEXT: .LBB9_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s33, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB9_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1164-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; 
GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b32 s12, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1164-NEXT: .LBB9_3: +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-NEXT: s_mov_b32 s38, 0 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB9_3 +; GFX1132-NEXT: ; %bb.1: +; 
GFX1132-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b32 s12, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-NEXT: 
s_cbranch_execnz .LBB9_2 +; GFX1132-NEXT: .LBB9_3: +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s42, -1 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-DPP-NEXT: s_mov_b32 s33, s2 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-DPP-NEXT: 
buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s33 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX9-DPP-NEXT: .LBB9_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; 
GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-DPP-NEXT: 
s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1064-DPP-NEXT: .LBB9_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1032-DPP-NEXT: 
s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1032-DPP-NEXT: .LBB9_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-DPP-NEXT: s_mov_b64 
s[34:35], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: 
v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1164-DPP-NEXT: .LBB9_3: +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1132-DPP-NEXT: .LBB9_3: +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 
0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[40:41] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; 
GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; 
GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: v_mov_b32_e32 v43, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; 
GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_cbranch_execnz .LBB10_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 
s41, s6 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mov_b32_e32 v43, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; 
GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[0:1], 
s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_mov_b32_e32 v43, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: 
v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: v_mov_b32_e32 v43, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: v_mov_b32_e32 v41, v1 +; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: 
.LBB10_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[38:39], 
s[0:1] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 +; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; 
GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: v_or3_b32 v42, 
v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-DPP-NEXT: 
s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; 
GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: v_or3_b32 
v42, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, 
s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1164-DPP-NEXT: 
global_load_b64 v[1:2], v43, s[42:43] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, 
s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] 
+; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.float.value() + %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; 
GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s3 +; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB11_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_2 +; GFX7LESS-NEXT: .LBB11_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: 
global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB11_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB11_2 +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 
+; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB11_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1064-NEXT: .LBB11_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 
vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB11_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1032-NEXT: .LBB11_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1164-NEXT: scratch_store_b32 off, v1, off +; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_cbranch_execz .LBB11_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[0:1], 
0xc3300000, v[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1164-NEXT: .LBB11_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1132-NEXT: scratch_store_b32 off, v1, off +; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_cbranch_execz .LBB11_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 
v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1132-NEXT: .LBB11_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX9-DPP-NEXT: .LBB11_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1064-DPP-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1064-DPP-NEXT: .LBB11_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: 
v_mov_b32_e32 v3, s5 +; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1032-DPP-NEXT: .LBB11_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off +; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-DPP-NEXT: s_mov_b64 
s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1164-DPP-NEXT: .LBB11_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off +; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: 
v_dual_mov_b32 v3, s5 +; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1132-DPP-NEXT: .LBB11_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") monotonic + ret void +} +define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: 
v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_add_u32 s8, s2, 44 +; GFX9-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_getpc_b64 s[2:3] +; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s3, s3, 
div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB12_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: 
s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, 
div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; 
GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB12_1: ; %atomicrmw.start +; 
GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b32 s12, s6 +; GFX9-DPP-NEXT: s_mov_b32 s13, s7 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: 
global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; 
GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] 
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; 
GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 
+; GFX1132-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.double.value() strictfp + %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr) #2{ +; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s3 +; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: 
s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2 +; GFX7LESS-NEXT: .LBB13_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-NEXT: .LBB13_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: v_mov_b32_e32 
v3, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1064-NEXT: .LBB13_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1032-NEXT: s_waitcnt 
vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1032-NEXT: .LBB13_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1164-NEXT: scratch_store_b32 off, v1, off +; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, 
v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1164-NEXT: .LBB13_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1132-NEXT: scratch_store_b32 off, v1, off +; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: 
s_cbranch_execnz .LBB13_2 +; GFX1132-NEXT: .LBB13_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-DPP-NEXT: .LBB13_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: 
global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1064-DPP-NEXT: .LBB13_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: +; 
GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1032-DPP-NEXT: .LBB13_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-DPP-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off +; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1164-DPP-NEXT: .LBB13_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: 
s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off +; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1132-DPP-NEXT: .LBB13_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic + ret void +} + +define amdgpu_kernel void 
@global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 +; 
GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_add_u32 s8, s2, 44 +; GFX9-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_getpc_b64 s[2:3] +; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] 
glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB14_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], 
s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1032-NEXT: 
global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execnz .LBB14_1 +; 
GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, 
s36, s9 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b32 s12, s6 +; GFX9-DPP-NEXT: s_mov_b32 s13, s7 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: 
s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: 
global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-DPP-NEXT: s_or_b32 s0, 
vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: 
global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.double.value() + %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic + ret void +} + +define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr) 
#1 { +; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: 
s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_add_u32 s8, s2, 44 +; GFX9-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_getpc_b64 s[2:3] +; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], 
v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB15_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: 
v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; 
GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: 
s_endpgm +; +; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_addc_u32 s37, 
s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b32 s12, s6 +; GFX9-DPP-NEXT: s_mov_b32 s13, s7 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; 
GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: 
global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-DPP-NEXT: s_or_b32 
s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: 
global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.float.value() strictfp + %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic + ret void +} +define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp(ptr addrspace(1) 
%ptr) #2 { +; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB16_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s1, 0x43300000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dwordx2 s[2:3], s[36:37], 0x0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX7LESS-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 +; GFX7LESS-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:8 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 +; GFX7LESS-NEXT: 
s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b32 s12, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2 +; GFX7LESS-NEXT: .LBB16_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s42, -1 +; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-NEXT: s_add_u32 s40, s40, s3 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_mov_b32 s33, s2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; 
GFX9-NEXT: s_cbranch_execz .LBB16_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-NEXT: s_mov_b32 s12, s33 +; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: 
v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB16_2 +; GFX9-NEXT: .LBB16_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s42, -1 +; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-NEXT: s_mov_b32 s33, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB16_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1064-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; 
GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-NEXT: s_mov_b32 s12, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1064-NEXT: .LBB16_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s33, s2 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s42, -1 +; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-NEXT: s_mov_b32 s38, 0 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; 
GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB16_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-NEXT: s_mov_b32 s12, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_load_dword v2, 
off, s[40:43], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1032-NEXT: .LBB16_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:20 +; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:16 +; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:16 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_cbranch_execz .LBB16_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s2 +; GFX1164-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], 
-v[41:42] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b32 s12, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1164-NEXT: .LBB16_3: +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:20 +; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:16 +; GFX1132-NEXT: 
scratch_load_b64 v[0:1], off, off offset:16 +; GFX1132-NEXT: s_mov_b32 s38, 0 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_cbranch_execz .LBB16_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b32 s12, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 
v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1132-NEXT: .LBB16_3: +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s42, -1 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-DPP-NEXT: s_mov_b32 s33, s2 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[3:4], 
v[1:2], -v[41:42] +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s33 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX9-DPP-NEXT: .LBB16_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s2 +; 
GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; 
GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1064-DPP-NEXT: .LBB16_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-DPP-NEXT: 
v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, 
v0 +; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1032-DPP-NEXT: .LBB16_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:20 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:16 +; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1164-DPP-NEXT: .LBB16_3: +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 +; 
GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:20 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:16 +; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16 +; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 
off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1132-DPP-NEXT: .LBB16_3: +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp(ptr addrspace(1) %ptr) #2 { +; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, 
div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[40:41] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; 
GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2 +; 
GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: v_mov_b32_e32 v43, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; 
GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_cbranch_execnz .LBB17_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mov_b32_e32 v43, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: 
v_mov_b32_e32 v41, v1 +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: 
s_cbranch_execnz .LBB17_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_mov_b32_e32 v43, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; 
GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: 
s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: v_mov_b32_e32 v43, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: v_mov_b32_e32 v41, v1 +; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], 
s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: v_dual_mov_b32 
v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 +; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; 
GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; 
GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 
s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1032-DPP-NEXT: 
s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1164-DPP: ; %bb.0: +; 
GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-DPP-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 
0x24 +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 
v2, s42 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.float.value() strictfp + %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue monotonic, align 4 + ret void +} + attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } attributes #1 = { strictfp "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } attributes #2 = { strictfp} -- cgit v1.1 From d365a45cb3eaa640b09874fb7984a6a69683c773 Mon Sep 17 00:00:00 2001 From: Evgenii Kudriashov Date: Sat, 23 Mar 2024 15:12:44 +0300 Subject: [GlobalISel] Introduce G_TRAP, G_DEBUGTRAP, G_UBSANTRAP (#84941) Here we introduce three new GMIR instructions to cover a set of trap intrinsics. The idea behind it is that generic intrinsics shouldn't be used with G_INTRINSIC opcode. These new instructions can match perfectly with existing trap ISD nodes. It allows X86, AArch64, RISCV and Mips to reuse SelectionDAG patterns for selection and avoid manual selection. However AMDGPU is an exception. It selects traps during legalization regardless SelectionDAG or GlobalISel. Since there are not many places where traps are used, this change attempts to clean up all the usages of G_INTRINSIC with trap intrinsics. 
So, there is no stage when both G_TRAP and G_INTRINSIC_W_SIDE_EFFECTS(@llvm.trap) are allowed. --- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap.mir | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap.mir index b4bc648..305eca7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap.mir @@ -24,7 +24,7 @@ body: | bb.0: %0:_(s8) = G_CONSTANT i8 0 %1:_(p1) = G_CONSTANT i64 0 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.trap) + G_TRAP bb.1: G_STORE %0, %1 :: (store 1, addrspace 1) @@ -55,7 +55,7 @@ body: | ; GCN-NEXT: S_ENDPGM 0 bb.0: %0:_(s8) = G_CONSTANT i8 0 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.trap) + G_TRAP %1:_(p1) = G_CONSTANT i64 0 bb.1: -- cgit v1.1 From babbdad15b8049a6a78087d15a163d897f07d320 Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Mon, 25 Mar 2024 09:23:40 +0100 Subject: [AMDGPU] Handle non-register operands for S_SUB/ADD_U64_PSEUDO (#86104) This pseudo uses SSrc_b64 so it allows either an immediate or a register, but the lowering crashed on immediate operands. 
--- llvm/test/CodeGen/AMDGPU/add_sub_u64_pseudos.mir | 68 ++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/add_sub_u64_pseudos.mir (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/add_sub_u64_pseudos.mir b/llvm/test/CodeGen/AMDGPU/add_sub_u64_pseudos.mir new file mode 100644 index 0000000..cba114c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/add_sub_u64_pseudos.mir @@ -0,0 +1,68 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=finalize-isel -o - %s | FileCheck -check-prefix=GFX11 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=finalize-isel -o - %s | FileCheck -check-prefix=GFX12 %s + +--- +name: reg_ops +tracksRegLiveness: true +body: | + bb.0: + ; GFX11-LABEL: name: reg_ops + ; GFX11: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[DEF1]].sub0 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[DEF1]].sub1 + ; GFX11-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], [[COPY2]], implicit-def $scc + ; GFX11-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY1]], [[COPY3]], implicit-def $scc, implicit $scc + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; + ; GFX12-LABEL: name: reg_ops + ; GFX12: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX12-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX12-NEXT: [[S_ADD_U64_:%[0-9]+]]:sreg_64 = S_ADD_U64 [[DEF]], [[DEF1]] + %0:sreg_64 = IMPLICIT_DEF + %1:sreg_64 = IMPLICIT_DEF + %2:sreg_64 = S_ADD_U64_PSEUDO %0, %1, implicit-def $scc +... 
+ +--- +name: lhs_imm +tracksRegLiveness: true +body: | + bb.0: + ; GFX11-LABEL: name: lhs_imm + ; GFX11: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub1 + ; GFX11-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 6565, [[COPY]], implicit-def $scc + ; GFX11-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 0, [[COPY1]], implicit-def $scc, implicit $scc + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; + ; GFX12-LABEL: name: lhs_imm + ; GFX12: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX12-NEXT: [[S_ADD_U64_:%[0-9]+]]:sreg_64 = S_ADD_U64 6565, [[DEF]] + %0:sreg_64 = IMPLICIT_DEF + %1:sreg_64 = S_ADD_U64_PSEUDO 6565, %0, implicit-def $scc +... + +--- +name: rhs_imm +tracksRegLiveness: true +body: | + bb.0: + ; GFX11-LABEL: name: rhs_imm + ; GFX11: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub1 + ; GFX11-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], 6565, implicit-def $scc + ; GFX11-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY1]], 0, implicit-def $scc, implicit $scc + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; + ; GFX12-LABEL: name: rhs_imm + ; GFX12: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX12-NEXT: [[S_ADD_U64_:%[0-9]+]]:sreg_64 = S_ADD_U64 [[DEF]], 6565 + %0:sreg_64 = IMPLICIT_DEF + %1:sreg_64 = S_ADD_U64_PSEUDO %0, 6565, implicit-def $scc +... 
-- cgit v1.1 From 75e528fdd9594ecb6fdb5d9e7bee1506f7e43be0 Mon Sep 17 00:00:00 2001 From: David Stuttard Date: Mon, 25 Mar 2024 09:01:46 +0000 Subject: [AMDGPU] Extend zero initialization of return values for TFE (#85759) buffer_load instructions that use TFE also need to zero initialize return values similar to how the image instructions currently work. Add support for this with standard zero init of all results + zero init of just TFE flag when enable-prt-strict-null subtarget feature is disabled. --- .../llvm.amdgcn.struct.buffer.load.format.ll | 1 + .../llvm.amdgcn.struct.ptr.buffer.load.format.ll | 1 + .../CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll | 111 ++++++-- .../llvm.amdgcn.struct.buffer.load.format.ll | 317 ++++++++++++++++++++- .../llvm.amdgcn.struct.ptr.buffer.load.format.ll | 280 ++++++++++++++++++ 5 files changed, 679 insertions(+), 31 deletions(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll index 686b849..06bd45a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s +; Note that TFE instructions don't have the result initialization to zero due to stopping before finalize-isel - which is where that's inserted define amdgpu_ps float @struct_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, 
i32 inreg %soffset) { ; GFX8-LABEL: name: struct_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll index 9edc2455..1e3f94a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s +; Note that TFE instructions don't have the result initialization to zero due to stopping before finalize-isel - which is where that's inserted define amdgpu_ps float @struct_ptr_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_ptr_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll index 1348315..7b1f55e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll @@ -22,18 +22,36 @@ main_body: define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %fragid) { ; GFX11-LABEL: load_2dmsaa_both: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_msaa_load v[0:4], v[0:2], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x98,0x02,0x60,0xf0,0x00,0x00,0x60,0x00] -; GFX11-NEXT: v_mov_b32_e32 v5, 0 ; encoding: [0x80,0x02,0x0a,0x7e] +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0 ; encoding: 
[0x00,0x01,0x10,0xca,0x80,0x00,0x08,0x05] +; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x06,0x07] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; encoding: [0x42,0x02,0x87,0xbf] +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e] +; GFX11-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 ; encoding: [0x08,0x01,0x10,0xca,0x09,0x01,0x00,0x00] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; encoding: [0x93,0x01,0x87,0xbf] +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 ; encoding: [0x0a,0x01,0x10,0xca,0x0b,0x01,0x02,0x02] +; GFX11-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e] +; GFX11-NEXT: image_msaa_load v[0:4], v[5:7], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x98,0x02,0x60,0xf0,0x05,0x00,0x60,0x00] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] -; GFX11-NEXT: global_store_b32 v5, v4, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x05,0x04,0x08,0x00] +; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x08,0x04,0x08,0x00] ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: load_2dmsaa_both: ; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: image_msaa_load v[0:4], [v0, v1, v2], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x0e,0x20,0x86,0xe4,0x00,0x01,0x00,0x00,0x00,0x01,0x02,0x00] -; GFX12-NEXT: v_mov_b32_e32 v5, 0 ; encoding: [0x80,0x02,0x0a,0x7e] +; GFX12-NEXT: v_dual_mov_b32 v7, v0 :: v_dual_mov_b32 v8, 0 ; encoding: [0x00,0x01,0x10,0xca,0x80,0x00,0x08,0x07] +; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v6, v1 ; encoding: 
[0x02,0x01,0x10,0xca,0x01,0x01,0x06,0x05] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; encoding: [0x22,0x01,0x87,0xbf] +; GFX12-NEXT: v_dual_mov_b32 v9, v8 :: v_dual_mov_b32 v10, v8 ; encoding: [0x08,0x01,0x10,0xca,0x08,0x01,0x0a,0x09] +; GFX12-NEXT: v_dual_mov_b32 v11, v8 :: v_dual_mov_b32 v12, v8 ; encoding: [0x08,0x01,0x10,0xca,0x08,0x01,0x0c,0x0b] +; GFX12-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 ; encoding: [0x08,0x01,0x10,0xca,0x09,0x01,0x00,0x00] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; encoding: [0x92,0x01,0x87,0xbf] +; GFX12-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 ; encoding: [0x0a,0x01,0x10,0xca,0x0b,0x01,0x02,0x02] +; GFX12-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e] +; GFX12-NEXT: image_msaa_load v[0:4], [v7, v6, v5], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x0e,0x20,0x86,0xe4,0x00,0x01,0x00,0x00,0x07,0x06,0x05,0x00] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] -; GFX12-NEXT: global_store_b32 v5, v4, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x02,0x05,0x00,0x00,0x00] +; GFX12-NEXT: global_store_b32 v8, v4, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x02,0x08,0x00,0x00,0x00] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32i32.i32(i32 2, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0) @@ -63,18 +81,37 @@ main_body: define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) { ; GFX11-LABEL: load_2darraymsaa_tfe: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_msaa_load v[0:4], v[0:3], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x9c,0x08,0x60,0xf0,0x00,0x00,0x20,0x00] -; GFX11-NEXT: v_mov_b32_e32 v5, 0 ; encoding: [0x80,0x02,0x0a,0x7e] +; 
GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v8, v3 ; encoding: [0x80,0x00,0x10,0xca,0x03,0x01,0x08,0x09] +; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x06,0x07] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; encoding: [0x42,0x02,0x87,0xbf] +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v10, v9 ; encoding: [0x00,0x01,0x10,0xca,0x09,0x01,0x0a,0x05] +; GFX11-NEXT: v_mov_b32_e32 v11, v9 ; encoding: [0x09,0x03,0x16,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v12, v9 ; encoding: [0x09,0x03,0x18,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v13, v9 ; encoding: [0x09,0x03,0x1a,0x7e] +; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10 ; encoding: [0x09,0x01,0x10,0xca,0x0a,0x01,0x00,0x00] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; encoding: [0x93,0x01,0x87,0xbf] +; GFX11-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 ; encoding: [0x0b,0x01,0x10,0xca,0x0c,0x01,0x02,0x02] +; GFX11-NEXT: v_mov_b32_e32 v4, v13 ; encoding: [0x0d,0x03,0x08,0x7e] +; GFX11-NEXT: image_msaa_load v[0:4], v[5:8], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x9c,0x08,0x60,0xf0,0x05,0x00,0x20,0x00] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] -; GFX11-NEXT: global_store_b32 v5, v4, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x05,0x04,0x08,0x00] +; GFX11-NEXT: global_store_b32 v9, v4, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x09,0x04,0x08,0x00] ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: load_2darraymsaa_tfe: ; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: image_msaa_load v[0:4], [v0, v1, v2, v3], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x0f,0x20,0x06,0xe6,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x03] -; GFX12-NEXT: v_mov_b32_e32 v5, 0 ; encoding: [0x80,0x02,0x0a,0x7e] +; GFX12-NEXT: v_mov_b32_e32 v9, 0 ; encoding: [0x80,0x02,0x12,0x7e] +; GFX12-NEXT: 
v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v6, v2 ; encoding: [0x03,0x01,0x10,0xca,0x02,0x01,0x06,0x05] +; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v0 ; encoding: [0x01,0x01,0x10,0xca,0x00,0x01,0x08,0x07] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; encoding: [0x23,0x01,0x87,0xbf] +; GFX12-NEXT: v_dual_mov_b32 v10, v9 :: v_dual_mov_b32 v11, v9 ; encoding: [0x09,0x01,0x10,0xca,0x09,0x01,0x0a,0x0a] +; GFX12-NEXT: v_dual_mov_b32 v12, v9 :: v_dual_mov_b32 v13, v9 ; encoding: [0x09,0x01,0x10,0xca,0x09,0x01,0x0c,0x0c] +; GFX12-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10 ; encoding: [0x09,0x01,0x10,0xca,0x0a,0x01,0x00,0x00] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; encoding: [0x92,0x01,0x87,0xbf] +; GFX12-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 ; encoding: [0x0b,0x01,0x10,0xca,0x0c,0x01,0x02,0x02] +; GFX12-NEXT: v_mov_b32_e32 v4, v13 ; encoding: [0x0d,0x03,0x08,0x7e] +; GFX12-NEXT: image_msaa_load v[0:4], [v8, v7, v6, v5], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x0f,0x20,0x06,0xe6,0x00,0x00,0x00,0x00,0x08,0x07,0x06,0x05] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] -; GFX12-NEXT: global_store_b32 v5, v4, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x02,0x05,0x00,0x00,0x00] +; GFX12-NEXT: global_store_b32 v9, v4, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x02,0x09,0x00,0x00,0x00] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32i32.i32(i32 8, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0) @@ -155,18 +192,31 @@ main_body: define amdgpu_ps <4 x half> @load_2dmsaa_tfe_d16(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %fragid) { ; GFX11-LABEL: load_2dmsaa_tfe_d16: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_msaa_load v[0:2], v[0:2], 
s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe d16 ; encoding: [0x98,0x01,0x62,0xf0,0x00,0x00,0x20,0x00] -; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] +; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v6, 0 ; encoding: [0x00,0x01,0x10,0xca,0x80,0x00,0x06,0x03] +; GFX11-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x04,0x05] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; encoding: [0x22,0x01,0x87,0xbf] +; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; encoding: [0x06,0x03,0x0e,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v8, v6 ; encoding: [0x06,0x03,0x10,0x7e] +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 ; encoding: [0x06,0x01,0x10,0xca,0x07,0x01,0x00,0x00] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; encoding: [0x02,0x00,0x87,0xbf] +; GFX11-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e] +; GFX11-NEXT: image_msaa_load v[0:2], v[3:5], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe d16 ; encoding: [0x98,0x01,0x62,0xf0,0x03,0x00,0x20,0x00] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] -; GFX11-NEXT: global_store_b32 v3, v2, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x03,0x02,0x08,0x00] +; GFX11-NEXT: global_store_b32 v6, v2, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x06,0x02,0x08,0x00] ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: load_2dmsaa_tfe_d16: ; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: image_msaa_load v[0:2], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe d16 ; encoding: [0x2e,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x00] -; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] +; GFX12-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, 0 ; encoding: [0x00,0x01,0x10,0xca,0x80,0x00,0x06,0x05] +; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x04,0x03] +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; encoding: [0x92,0x00,0x87,0xbf] +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 ; encoding: [0x06,0x01,0x10,0xca,0x06,0x01,0x08,0x07] +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 ; encoding: [0x06,0x01,0x10,0xca,0x07,0x01,0x00,0x00] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; encoding: [0x02,0x00,0x87,0xbf] +; GFX12-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e] +; GFX12-NEXT: image_msaa_load v[0:2], [v5, v4, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe d16 ; encoding: [0x2e,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x05,0x04,0x03,0x00] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] -; GFX12-NEXT: global_store_b32 v3, v2, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x03,0x00,0x00,0x00] +; GFX12-NEXT: global_store_b32 v6, v2, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x06,0x00,0x00,0x00] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call {<4 x half>,i32} @llvm.amdgcn.image.msaa.load.2dmsaa.v4f16i32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0) @@ -196,18 +246,31 @@ main_body: define amdgpu_ps <4 x half> @load_2darraymsaa_tfe_d16(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) { ; GFX11-LABEL: load_2darraymsaa_tfe_d16: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_msaa_load v[0:2], v[0:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe d16 ; encoding: [0x9c,0x01,0x62,0xf0,0x00,0x00,0x20,0x00] -; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] +; GFX11-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, 0 ; encoding: [0x00,0x01,0x10,0xca,0x80,0x00,0x06,0x06] +; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v5, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x04,0x04] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; encoding: 
[0x22,0x01,0x87,0xbf] +; GFX11-NEXT: v_mov_b32_e32 v8, v7 ; encoding: [0x07,0x03,0x10,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v9, v7 ; encoding: [0x07,0x03,0x12,0x7e] +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; encoding: [0x07,0x01,0x10,0xca,0x08,0x01,0x00,0x00] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; encoding: [0x02,0x00,0x87,0xbf] +; GFX11-NEXT: v_mov_b32_e32 v2, v9 ; encoding: [0x09,0x03,0x04,0x7e] +; GFX11-NEXT: image_msaa_load v[0:2], [v6, v5, v4, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe d16 ; encoding: [0x9d,0x01,0x62,0xf0,0x06,0x00,0x20,0x00,0x05,0x04,0x03,0x00] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] -; GFX11-NEXT: global_store_b32 v3, v2, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x03,0x02,0x08,0x00] +; GFX11-NEXT: global_store_b32 v7, v2, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x07,0x02,0x08,0x00] ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: load_2darraymsaa_tfe_d16: ; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: image_msaa_load v[0:2], [v0, v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe d16 ; encoding: [0x2f,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x03] -; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] +; GFX12-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, 0 ; encoding: [0x00,0x01,0x10,0xca,0x80,0x00,0x06,0x06] +; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v5, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x04,0x04] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; encoding: [0x92,0x00,0x87,0xbf] +; GFX12-NEXT: v_dual_mov_b32 v8, v7 :: v_dual_mov_b32 v9, v7 ; encoding: [0x07,0x01,0x10,0xca,0x07,0x01,0x08,0x08] +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; encoding: [0x07,0x01,0x10,0xca,0x08,0x01,0x00,0x00] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; encoding: [0x02,0x00,0x87,0xbf] +; GFX12-NEXT: v_mov_b32_e32 v2, v9 ; encoding: 
[0x09,0x03,0x04,0x7e] +; GFX12-NEXT: image_msaa_load v[0:2], [v6, v5, v4, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe d16 ; encoding: [0x2f,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x06,0x05,0x04,0x03] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] -; GFX12-NEXT: global_store_b32 v3, v2, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x03,0x00,0x00,0x00] +; GFX12-NEXT: global_store_b32 v7, v2, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x07,0x00,0x00,0x00] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call {<4 x half>,i32} @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f16i32.i32(i32 1, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll index 00be32b..ba3d306 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll @@ -2,6 +2,7 @@ ;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefixes=GFX6 %s ;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefixes=GFX8PLUS %s ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefixes=GFX11 %s +;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-enable-prt-strict-null -verify-machineinstrs | FileCheck --check-prefixes=NOPRT %s ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck --check-prefixes=GFX12,GFX12-SDAG %s ;RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck --check-prefixes=GFX12,GFX12-GISEL %s @@ -34,6 +35,16 @@ define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load: +; NOPRT: ; %bb.0: 
; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v8, 0 +; NOPRT-NEXT: s_clause 0x2 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v8, s[0:3], 0 idxen +; NOPRT-NEXT: buffer_load_format_xyzw v[4:7], v8, s[0:3], 0 idxen glc +; NOPRT-NEXT: buffer_load_format_xyzw v[8:11], v8, s[0:3], 0 idxen slc +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v8, 0 @@ -75,6 +86,13 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_immoffs: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:42 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_immoffs: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -146,6 +164,25 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { ; GFX11-NEXT: v_add_f32_e32 v2, v10, v2 ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_immoffs_large: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v8, 0 +; NOPRT-NEXT: s_movk_i32 s4, 0x7ffc +; NOPRT-NEXT: s_clause 0x1 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v8, s[0:3], 60 idxen offset:4092 +; NOPRT-NEXT: buffer_load_format_xyzw v[4:7], v8, s[0:3], s4 idxen offset:4092 +; NOPRT-NEXT: s_mov_b32 s4, 0x8ffc +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: v_add_f32_e32 v1, v1, v5 +; NOPRT-NEXT: buffer_load_format_xyzw v[8:11], v8, s[0:3], s4 idxen offset:4 +; NOPRT-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v3, v3, v7 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v1, v9, v1 +; NOPRT-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; NOPRT-NEXT: v_dual_add_f32 v0, v8, 
v0 :: v_dual_add_f32 v3, v11, v3 +; NOPRT-NEXT: v_add_f32_e32 v2, v10, v2 +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_immoffs_large: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v8, 0 @@ -196,6 +233,13 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_12bit(<4 x i32> inreg) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_voffset_large_12bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:4092 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_voffset_large_12bit: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -235,6 +279,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_13bit(<4 x i32> inreg) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_voffset_large_13bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_voffset_large_13bit: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -274,6 +327,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_16bit(<4 x i32> inreg) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_voffset_large_16bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, 0xf000 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 +; 
NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_voffset_large_16bit: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -313,6 +375,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_23bit(<4 x i32> inreg) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_voffset_large_23bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, 0x7ff000 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_voffset_large_23bit: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -352,6 +423,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_24bit(<4 x i32> inreg) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_voffset_large_24bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, 0xfff000 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-SDAG-LABEL: buffer_load_voffset_large_24bit: ; GFX12-SDAG: ; %bb.0: ; %main_body ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x800000 :: v_dual_mov_b32 v0, 0 @@ -389,6 +469,12 @@ define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_idx: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to 
shader part epilog +; ; GFX12-LABEL: buffer_load_idx: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], null idxen @@ -427,6 +513,15 @@ define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_ofs: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_ofs: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0 @@ -466,6 +561,15 @@ define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_ofs_imm: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_ofs_imm: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0 @@ -497,6 +601,12 @@ define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_both: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_both: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: 
buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], null idxen offen @@ -529,6 +639,13 @@ define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_both_reversed: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v2, v0 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], 0 idxen offen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_both_reversed: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v2, v0 @@ -562,6 +679,13 @@ define amdgpu_ps float @buffer_load_x(<4 x i32> inreg %rsrc) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_x: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_x: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -595,6 +719,13 @@ define amdgpu_ps float @buffer_load_x_i32(<4 x i32> inreg %rsrc) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_x_i32: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_x_i32: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -629,6 +760,13 @@ define amdgpu_ps <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_xy: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_xy v[0:1], v0, s[0:3], 0 idxen +; NOPRT-NEXT: s_waitcnt 
vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_xy: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -644,7 +782,12 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX6-LABEL: buffer_load_v4i32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 -; GFX6-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe +; GFX6-NEXT: v_mov_b32_e32 v7, 2 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: buffer_load_format_xyzw v[2:6], v7, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s0, s2 @@ -658,7 +801,12 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX8PLUS-LABEL: buffer_load_v4i32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 -; GFX8PLUS-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe +; GFX8PLUS-NEXT: v_mov_b32_e32 v7, 2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v6, v2 +; GFX8PLUS-NEXT: buffer_load_format_xyzw v[2:6], v7, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX8PLUS-NEXT: v_mov_b32_e32 v0, v6 @@ -667,22 +815,40 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; ; GFX11-LABEL: buffer_load_v4i32_tfe: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v7, 2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v2 +; GFX11-NEXT: buffer_load_format_xyzw v[2:6], v7, s[0:3], 0 
idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_v4i32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v2, 2 +; NOPRT-NEXT: v_mov_b32_e32 v6, 0 +; NOPRT-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b128 v[0:1], v[2:5], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v6 +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_v4i32_tfe: ; GFX12: ; %bb.0: -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], null idxen tfe +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v7, 2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v6, v2 +; GFX12-NEXT: buffer_load_format_xyzw v[2:6], v7, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-NEXT: v_mov_b32_e32 v0, v6 ; GFX12-NEXT: ; return to shader part epilog - %load = call { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %load = call { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(<4 x i32> %rsrc, i32 2, i32 0, i32 0, i32 0) %data = extractvalue { <4 x i32>, i32 } %load, 0 store <4 x i32> %data, ptr addrspace(1) %out %status = extractvalue { <4 x i32>, i32 } %load, 1 @@ -694,6 +860,10 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX6-LABEL: buffer_load_v4f32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 ; GFX6-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe ; 
GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -708,6 +878,10 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX8PLUS-LABEL: buffer_load_v4f32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v6, v2 ; GFX8PLUS-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx4 v[0:1], v[2:5] @@ -718,15 +892,32 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX11-LABEL: buffer_load_v4f32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v2 ; GFX11-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_v4f32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v6, 0 +; NOPRT-NEXT: buffer_load_format_xyzw v[2:6], v6, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b128 v[0:1], v[2:5], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v6 +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_v4f32_tfe: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v6, v2 ; GFX12-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off @@ -744,6 +935,9 @@ define amdgpu_cs float 
@buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX6-LABEL: buffer_load_v3i32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -759,6 +953,9 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX8PLUS-LABEL: buffer_load_v3i32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2 ; GFX8PLUS-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx3 v[0:1], v[2:4] @@ -769,15 +966,31 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX11-LABEL: buffer_load_v3i32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 ; GFX11-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off ; GFX11-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_v3i32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v5, 0 +; NOPRT-NEXT: buffer_load_format_xyz v[2:5], v5, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b96 v[0:1], v[2:4], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v5 +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_v3i32_tfe: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: 
v_mov_b32_e32 v5, v2 ; GFX12-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off @@ -795,6 +1008,9 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX6-LABEL: buffer_load_v3f32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -810,6 +1026,9 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX8PLUS-LABEL: buffer_load_v3f32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2 ; GFX8PLUS-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx3 v[0:1], v[2:4] @@ -820,15 +1039,31 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX11-LABEL: buffer_load_v3f32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 ; GFX11-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off ; GFX11-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_v3f32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v5, 0 +; NOPRT-NEXT: buffer_load_format_xyz v[2:5], v5, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b96 v[0:1], v[2:4], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v5 +; NOPRT-NEXT: ; return to shader part epilog +; ; 
GFX12-LABEL: buffer_load_v3f32_tfe: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v5, v2 ; GFX12-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off @@ -846,6 +1081,9 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX6-LABEL: buffer_load_v2i32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -860,6 +1098,8 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX8PLUS-LABEL: buffer_load_v2i32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 ; GFX8PLUS-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -870,15 +1110,29 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX11-LABEL: buffer_load_v2i32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 ; GFX11-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_v2i32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v4, 0 +; NOPRT-NEXT: buffer_load_format_xy v[2:4], v4, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; 
NOPRT-NEXT: global_store_b64 v[0:1], v[2:3], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v4 +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_v2i32_tfe: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -896,6 +1150,9 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX6-LABEL: buffer_load_v2f32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -910,6 +1167,8 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX8PLUS-LABEL: buffer_load_v2f32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 ; GFX8PLUS-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -920,15 +1179,29 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX11-LABEL: buffer_load_v2f32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 ; GFX11-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_v2f32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v4, 0 +; 
NOPRT-NEXT: buffer_load_format_xy v[2:4], v4, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b64 v[0:1], v[2:3], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v4 +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_v2f32_tfe: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -946,6 +1219,7 @@ define amdgpu_cs float @buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrspace ; GFX6-LABEL: buffer_load_i32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 ; GFX6-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -960,6 +1234,7 @@ define amdgpu_cs float @buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrspace ; GFX8PLUS-LABEL: buffer_load_i32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 ; GFX8PLUS-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dword v[0:1], v2 @@ -970,15 +1245,28 @@ define amdgpu_cs float @buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrspace ; GFX11-LABEL: buffer_load_i32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_i32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v3, 0 +; NOPRT-NEXT: buffer_load_format_x v[2:3], v3, s[0:3], 0 idxen tfe +; 
NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b32 v[0:1], v2, off +; NOPRT-NEXT: v_mov_b32_e32 v0, v3 +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_i32_tfe: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off @@ -996,6 +1284,7 @@ define amdgpu_cs float @buffer_load_f32_tfe(<4 x i32> inreg %rsrc, ptr addrspace ; GFX6-LABEL: buffer_load_f32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 ; GFX6-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -1010,6 +1299,7 @@ define amdgpu_cs float @buffer_load_f32_tfe(<4 x i32> inreg %rsrc, ptr addrspace ; GFX8PLUS-LABEL: buffer_load_f32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 ; GFX8PLUS-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dword v[0:1], v2 @@ -1020,15 +1310,28 @@ define amdgpu_cs float @buffer_load_f32_tfe(<4 x i32> inreg %rsrc, ptr addrspace ; GFX11-LABEL: buffer_load_f32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_f32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v3, 0 +; NOPRT-NEXT: buffer_load_format_x v[2:3], v3, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b32 v[0:1], v2, off +; NOPRT-NEXT: 
v_mov_b32_e32 v0, v3 +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_f32_tfe: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.ll index b0bd4e4..c5202b8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.ll @@ -2,6 +2,7 @@ ;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefixes=GFX6 %s ;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefixes=GFX8PLUS %s ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefixes=GFX11 %s +;RUN: llc < %s -mtriple=amdgcn -mattr=-enable-prt-strict-null -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefixes=NOPRT %s define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(ptr addrspace(8) inreg) { ; GFX6-LABEL: buffer_load: @@ -31,6 +32,16 @@ define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(ptr addrsp ; GFX11-NEXT: buffer_load_format_xyzw v[8:11], v8, s[0:3], 0 idxen slc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v8, 0 +; NOPRT-NEXT: s_clause 0x2 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v8, s[0:3], 0 idxen +; NOPRT-NEXT: buffer_load_format_xyzw v[4:7], v8, s[0:3], 0 idxen glc +; NOPRT-NEXT: buffer_load_format_xyzw v[8:11], v8, s[0:3], 0 idxen slc +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data 
= call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0) %data_glc = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1) @@ -62,6 +73,13 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs(ptr addrspace(8) inreg) { ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:42 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_immoffs: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:42 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 42, i32 0, i32 0) ret <4 x float> %data @@ -126,6 +144,25 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs_large(ptr addrspace(8) inreg) ; GFX11-NEXT: v_dual_add_f32 v0, v8, v0 :: v_dual_add_f32 v3, v11, v3 ; GFX11-NEXT: v_add_f32_e32 v2, v10, v2 ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_immoffs_large: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v8, 0 +; NOPRT-NEXT: s_movk_i32 s4, 0x7ffc +; NOPRT-NEXT: s_clause 0x1 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v8, s[0:3], 60 idxen offset:4092 +; NOPRT-NEXT: buffer_load_format_xyzw v[4:7], v8, s[0:3], s4 idxen offset:4092 +; NOPRT-NEXT: s_mov_b32 s4, 0x8ffc +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: v_add_f32_e32 v1, v1, v5 +; NOPRT-NEXT: buffer_load_format_xyzw v[8:11], v8, s[0:3], s4 idxen offset:4 +; NOPRT-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v3, v3, v7 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v1, v9, v1 +; NOPRT-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; NOPRT-NEXT: v_dual_add_f32 v0, v8, v0 :: 
v_dual_add_f32 v3, v11, v3 +; NOPRT-NEXT: v_add_f32_e32 v2, v10, v2 +; NOPRT-NEXT: ; return to shader part epilog main_body: %d.0 = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 4092, i32 60, i32 0) %d.1 = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 4092, i32 32764, i32 0) @@ -156,6 +193,13 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_12bit(ptr addrspace(8) i ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:4092 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_voffset_large_12bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:4092 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 4092, i32 0, i32 0) ret <4 x float> %data @@ -188,6 +232,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_13bit(ptr addrspace(8) i ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_voffset_large_13bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 8188, i32 0, i32 0) ret <4 x float> %data @@ -220,6 +273,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_16bit(ptr 
addrspace(8) i ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_voffset_large_16bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, 0xf000 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 65532, i32 0, i32 0) ret <4 x float> %data @@ -252,6 +314,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_23bit(ptr addrspace(8) i ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_voffset_large_23bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, 0x7ff000 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 8388604, i32 0, i32 0) ret <4 x float> %data @@ -284,6 +355,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_24bit(ptr addrspace(8) i ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_voffset_large_24bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, 0xfff000 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 16777212, i32 0, i32 0) ret <4 x float> %data @@ -307,6 +387,12 @@ define amdgpu_ps <4 x float> @buffer_load_idx(ptr addrspace(8) inreg, i32) { ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_idx: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %1, i32 0, i32 0, i32 0) ret <4 x float> %data @@ -339,6 +425,15 @@ define amdgpu_ps <4 x float> @buffer_load_ofs(ptr addrspace(8) inreg, i32) { ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_ofs: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 %1, i32 0, i32 0) ret <4 x float> %data @@ -371,6 +466,15 @@ define amdgpu_ps <4 x float> @buffer_load_ofs_imm(ptr addrspace(8) inreg, i32) { ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60 
; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_ofs_imm: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %ofs = add i32 %1, 60 %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 %ofs, i32 0, i32 0) @@ -395,6 +499,12 @@ define amdgpu_ps <4 x float> @buffer_load_both(ptr addrspace(8) inreg, i32, i32) ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_both: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %1, i32 %2, i32 0, i32 0) ret <4 x float> %data @@ -421,6 +531,13 @@ define amdgpu_ps <4 x float> @buffer_load_both_reversed(ptr addrspace(8) inreg, ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], 0 idxen offen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_both_reversed: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v2, v0 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], 0 idxen offen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %2, i32 %1, i32 0, i32 0) ret <4 x float> %data @@ -447,6 +564,13 @@ define amdgpu_ps float 
@buffer_load_x(ptr addrspace(8) inreg %rsrc) { ; GFX11-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_x: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call float @llvm.amdgcn.struct.ptr.buffer.load.format.f32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) ret float %data @@ -473,6 +597,13 @@ define amdgpu_ps float @buffer_load_x_i32(ptr addrspace(8) inreg %rsrc) { ; GFX11-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_x_i32: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call i32 @llvm.amdgcn.struct.ptr.buffer.load.format.i32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %fdata = bitcast i32 %data to float @@ -500,6 +631,13 @@ define amdgpu_ps <2 x float> @buffer_load_xy(ptr addrspace(8) inreg %rsrc) { ; GFX11-NEXT: buffer_load_format_xy v[0:1], v0, s[0:3], 0 idxen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_xy: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_xy v[0:1], v0, s[0:3], 0 idxen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <2 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v2f32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) ret <2 x float> %data @@ -509,6 +647,10 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX6-LABEL: buffer_load_v4i32_tfe: ; 
GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 ; GFX6-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -523,6 +665,10 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX8PLUS-LABEL: buffer_load_v4i32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v6, v2 ; GFX8PLUS-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx4 v[0:1], v[2:5] @@ -533,11 +679,25 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX11-LABEL: buffer_load_v4i32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v2 ; GFX11-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_v4i32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v6, 0 +; NOPRT-NEXT: buffer_load_format_xyzw v[2:6], v6, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b128 v[0:1], v[2:5], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v6 +; NOPRT-NEXT: ; return to shader part epilog %load = call { <4 x i32>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v4i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <4 x i32>, i32 } %load, 0 store <4 x i32> 
%data, ptr addrspace(1) %out @@ -550,6 +710,10 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX6-LABEL: buffer_load_v4f32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 ; GFX6-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -564,6 +728,10 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX8PLUS-LABEL: buffer_load_v4f32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v6, v2 ; GFX8PLUS-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx4 v[0:1], v[2:5] @@ -574,11 +742,25 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX11-LABEL: buffer_load_v4f32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v2 ; GFX11-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_v4f32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v6, 0 +; NOPRT-NEXT: buffer_load_format_xyzw v[2:6], v6, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b128 v[0:1], v[2:5], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v6 +; NOPRT-NEXT: ; return to shader part epilog %load = call { <4 x float>, i32 } 
@llvm.amdgcn.struct.ptr.buffer.load.format.sl_v4f32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <4 x float>, i32 } %load, 0 store <4 x float> %data, ptr addrspace(1) %out @@ -591,6 +773,9 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX6-LABEL: buffer_load_v3i32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -606,6 +791,9 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX8PLUS-LABEL: buffer_load_v3i32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2 ; GFX8PLUS-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx3 v[0:1], v[2:4] @@ -616,11 +804,24 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX11-LABEL: buffer_load_v3i32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 ; GFX11-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off ; GFX11-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_v3i32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v5, 0 +; NOPRT-NEXT: buffer_load_format_xyz v[2:5], v5, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b96 v[0:1], v[2:4], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v5 +; NOPRT-NEXT: ; return to 
shader part epilog %load = call { <3 x i32>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v3i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <3 x i32>, i32 } %load, 0 store <3 x i32> %data, ptr addrspace(1) %out @@ -633,6 +834,9 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX6-LABEL: buffer_load_v3f32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -648,6 +852,9 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX8PLUS-LABEL: buffer_load_v3f32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2 ; GFX8PLUS-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx3 v[0:1], v[2:4] @@ -658,11 +865,24 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX11-LABEL: buffer_load_v3f32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 ; GFX11-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off ; GFX11-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_v3f32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v5, 0 +; NOPRT-NEXT: buffer_load_format_xyz v[2:5], v5, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b96 v[0:1], v[2:4], off +; NOPRT-NEXT: 
v_mov_b32_e32 v0, v5 +; NOPRT-NEXT: ; return to shader part epilog %load = call { <3 x float>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v3f32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <3 x float>, i32 } %load, 0 store <3 x float> %data, ptr addrspace(1) %out @@ -675,6 +895,9 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX6-LABEL: buffer_load_v2i32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -689,6 +912,8 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX8PLUS-LABEL: buffer_load_v2i32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 ; GFX8PLUS-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -699,11 +924,23 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX11-LABEL: buffer_load_v2i32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 ; GFX11-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_v2i32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v4, 0 +; NOPRT-NEXT: buffer_load_format_xy v[2:4], v4, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b64 v[0:1], v[2:3], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v4 +; 
NOPRT-NEXT: ; return to shader part epilog %load = call { <2 x i32>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v2i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <2 x i32>, i32 } %load, 0 store <2 x i32> %data, ptr addrspace(1) %out @@ -716,6 +953,9 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX6-LABEL: buffer_load_v2f32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -730,6 +970,8 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX8PLUS-LABEL: buffer_load_v2f32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 ; GFX8PLUS-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -740,11 +982,23 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX11-LABEL: buffer_load_v2f32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 ; GFX11-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_v2f32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v4, 0 +; NOPRT-NEXT: buffer_load_format_xy v[2:4], v4, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b64 v[0:1], v[2:3], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v4 +; NOPRT-NEXT: ; return to shader 
part epilog %load = call { <2 x float>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v2f32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <2 x float>, i32 } %load, 0 store <2 x float> %data, ptr addrspace(1) %out @@ -757,6 +1011,7 @@ define amdgpu_cs float @buffer_load_i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad ; GFX6-LABEL: buffer_load_i32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 ; GFX6-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -771,6 +1026,7 @@ define amdgpu_cs float @buffer_load_i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad ; GFX8PLUS-LABEL: buffer_load_i32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 ; GFX8PLUS-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dword v[0:1], v2 @@ -781,11 +1037,22 @@ define amdgpu_cs float @buffer_load_i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad ; GFX11-LABEL: buffer_load_i32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_i32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v3, 0 +; NOPRT-NEXT: buffer_load_format_x v[2:3], v3, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b32 v[0:1], v2, off +; NOPRT-NEXT: v_mov_b32_e32 v0, v3 +; NOPRT-NEXT: ; return to shader part epilog %load = call { i32, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { i32, i32 } %load, 0 
store i32 %data, ptr addrspace(1) %out @@ -798,6 +1065,7 @@ define amdgpu_cs float @buffer_load_f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad ; GFX6-LABEL: buffer_load_f32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 ; GFX6-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -812,6 +1080,7 @@ define amdgpu_cs float @buffer_load_f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad ; GFX8PLUS-LABEL: buffer_load_f32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 ; GFX8PLUS-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dword v[0:1], v2 @@ -822,11 +1091,22 @@ define amdgpu_cs float @buffer_load_f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad ; GFX11-LABEL: buffer_load_f32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_f32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v3, 0 +; NOPRT-NEXT: buffer_load_format_x v[2:3], v3, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b32 v[0:1], v2, off +; NOPRT-NEXT: v_mov_b32_e32 v0, v3 +; NOPRT-NEXT: ; return to shader part epilog %load = call { float, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_f32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { float, i32 } %load, 0 store float %data, ptr addrspace(1) %out -- cgit v1.1 From 06cfbe3cfd44cd2ca9eb970b8c0e5f4911468440 Mon Sep 17 00:00:00 2001 From: David Stuttard Date: Mon, 25 Mar 2024 14:44:22 +0000 Subject: [AMDPU] Add support 
for idxen and bothen buffer load/store merging in SILoadStoreOptimizer (#86285) Added more buffer instruction merging support --- llvm/test/CodeGen/AMDGPU/merge-buffer-gfx12.mir | 1154 +++++++++++++++++++++++ llvm/test/CodeGen/AMDGPU/merge-buffer.mir | 1130 ++++++++++++++++++++++ llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir | 28 + 3 files changed, 2312 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/merge-buffer-gfx12.mir create mode 100644 llvm/test/CodeGen/AMDGPU/merge-buffer.mir (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/merge-buffer-gfx12.mir b/llvm/test/CodeGen/AMDGPU/merge-buffer-gfx12.mir new file mode 100644 index 0000000..d7f5d1a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/merge-buffer-gfx12.mir @@ -0,0 +1,1154 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefixes=GFX12 %s + +--- +name: buffer_load_dword_dwordx3 +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dwordx3 + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]].sub0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]].sub1_sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = 
COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) +... +--- + +name: buffer_load_dwordx3_dword +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dwordx3_dword + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_96 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]].sub0_sub1_sub2 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]].sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 16, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dwordx2_dwordx2 +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dwordx2_dwordx2 + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]].sub0_sub1 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]].sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dwordx2 +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dwordx2 + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET]].sub0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET]].sub1_sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dwordx2_dword +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dwordx2_dword + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET]].sub0_sub1 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET]].sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + + +name: buffer_load_dword_dword +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dword + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET]].sub0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_32 +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_32 + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET]].sub0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET]].sub1 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 16, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]].sub0_sub1_sub2 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]].sub3 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2 + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0 + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 36, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET]].sub0_sub1 + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed 
[[BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET]].sub2 + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0 + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 16, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %10:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %11:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 24, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %12:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 28, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %13:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 36, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %14:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 40, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %15:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 44, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +# +# buffer_store_dword +# + +name: buffer_store_dword_xyz +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-LABEL: name: buffer_store_dword_xyz + ; GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2 + ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[REG_SEQUENCE1]], %subreg.sub1_sub2_sub3 + ; GFX12-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2 + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %7:vgpr_32, %13:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, 
addrspace 4) + BUFFER_STORE_DWORDX3_VBUFFER_OFFSET_exact %14:vreg_96, %13:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) +... +--- + +name: buffer_store_dwordx3_dword +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-LABEL: name: buffer_store_dwordx3_dword + ; GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2 + ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1_sub2, [[COPY]], %subreg.sub3 + ; GFX12-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, 
%subreg.sub2 + BUFFER_STORE_DWORDX3_VBUFFER_OFFSET_exact %14:vreg_96, %13:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %7:vgpr_32, %13:sgpr_128, $sgpr_null, 16, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) +... +--- + +name: buffer_store_dwordx2_dwordx2 +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-LABEL: name: buffer_store_dwordx2_dwordx2 + ; GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 + ; GFX12-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_OFFSET_exact killed [[REG_SEQUENCE3]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 
+ %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1 + %15:vreg_64 = REG_SEQUENCE %6:vgpr_32, %subreg.sub0, %7:vgpr_32, %subreg.sub1 + BUFFER_STORE_DWORDX2_VBUFFER_OFFSET_exact %14:vreg_64, %13:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + BUFFER_STORE_DWORDX2_VBUFFER_OFFSET_exact %15:vreg_64, %13:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) +... +--- + +name: buffer_store_dword_dwordx2 +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-LABEL: name: buffer_store_dword_dwordx2 + ; GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[REG_SEQUENCE1]], %subreg.sub1_sub2 + ; GFX12-NEXT: BUFFER_STORE_DWORDX3_VBUFFER_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + %7:vgpr_32 = 
COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1 + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %7:vgpr_32, %13:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORDX2_VBUFFER_OFFSET_exact %14:vreg_64, %13:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) +... +--- + +name: buffer_store_dwordx2_dword +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-LABEL: name: buffer_store_dwordx2_dword + ; GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, 
[[COPY]], %subreg.sub2 + ; GFX12-NEXT: BUFFER_STORE_DWORDX3_VBUFFER_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1 + BUFFER_STORE_DWORDX2_VBUFFER_OFFSET_exact %14:vreg_64, %13:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %7:vgpr_32, %13:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) +... +--- + +name: buffer_store_dword_dword +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-LABEL: name: buffer_store_dword_dword + ; GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; 
GFX12-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %6:vgpr_32, %13:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %7:vgpr_32, %13:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) +... +--- + +name: buffer_store_dword_32 +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX12-LABEL: name: buffer_store_dword_32 + ; GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; GFX12-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2 + ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], $sgpr_null, 16, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + ; GFX12-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX12-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2 + ; GFX12-NEXT: BUFFER_STORE_DWORDX3_VBUFFER_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], $sgpr_null, 36, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + %12:vgpr_32 = COPY $vgpr8 + %11:vgpr_32 = COPY $vgpr7 + %10:vgpr_32 = COPY $vgpr6 + %9:vgpr_32 = COPY $vgpr5 + %8:vgpr_32 = COPY $vgpr4 + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + 
BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %4:vgpr_32, %13:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %5:vgpr_32, %13:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %6:vgpr_32, %13:sgpr_128, $sgpr_null, 16, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %7:vgpr_32, %13:sgpr_128, $sgpr_null, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %8:vgpr_32, %13:sgpr_128, $sgpr_null, 24, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %9:vgpr_32, %13:sgpr_128, $sgpr_null, 28, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %10:vgpr_32, %13:sgpr_128, $sgpr_null, 36, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %11:vgpr_32, %13:sgpr_128, $sgpr_null, 40, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %12:vgpr_32, %13:sgpr_128, $sgpr_null, 44, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_not_merged_swizzled_0 +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_not_merged_swizzled_0 + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_not_merged_swizzled_1 +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_not_merged_swizzled_1 + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 8, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 8, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_merge_across_swizzle +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_merge_across_swizzle + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET]].sub0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET]].sub1 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 12, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %4:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %6:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %4:sgpr_128, $sgpr_null, 12, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %4:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_not_merge_across_swizzled_store +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_not_merge_across_swizzled_store + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 6, 0, 1, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %6:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %4:vgpr_32, %5:sgpr_128, $sgpr_null, 6, 0, 1, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_merge_across_swizzled_store +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_merge_across_swizzled_store + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET]].sub1 + ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 12, 0, 1, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %6:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %4:vgpr_32, %5:sgpr_128, $sgpr_null, 12, 0, 1, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dword_idxen +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dword_idxen + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dwordx2_idxen +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dwordx2_idxen + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN]].sub1_sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dwordx2_dwordx2_idxen +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dwordx2_dwordx2_idxen + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN]].sub0_sub1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN]].sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN %4, %5:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dwordx3_idxen +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dwordx3_idxen + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN]].sub1_sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN %4, %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dword_bothen +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dword_bothen + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dwordx2_bothen +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dwordx2_bothen + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN]].sub1_sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dwordx2_dwordx2_bothen +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dwordx2_dwordx2_bothen + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub0_sub1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN %4, %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dwordx3_bothen +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dwordx3_bothen + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub1_sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dword_idxen_exact +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dword_idxen_exact + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dwordx2_idxen_exact +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dwordx2_idxen_exact + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN_exact:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN_exact]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN_exact]].sub1_sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dwordx2_dwordx2_idxen_exact +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dwordx2_dwordx2_idxen_exact + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN_exact:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN_exact]].sub0_sub1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN_exact]].sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dwordx3_idxen_exact +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dwordx3_idxen_exact + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN_exact:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN_exact]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN_exact]].sub1_sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dword_dword_idxen_exact +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dword_dword_idxen_exact + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN_exact:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN_exact]].sub0_sub1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN_exact]].sub2 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dword_dword_idxen_exact_swizzled_0 +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dword_dword_idxen_exact_swizzled_0 + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load 
(s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dword_bothen_exact +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dword_bothen_exact + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dwordx2_bothen_exact +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dwordx2_bothen_exact + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN_exact:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN_exact]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN_exact]].sub1_sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dwordx2_dwordx2_bothen_exact +body: | + bb.0.entry: + + ; GFX12-LABEL: name: buffer_load_dwordx2_dwordx2_bothen_exact + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN_exact:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN_exact]].sub0_sub1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN_exact]].sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dwordx3_bothen_exact +body: | + bb.0.entry: + + ; GFX12-LABEL: name: buffer_load_dword_dwordx3_bothen_exact + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN_exact:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN_exact]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN_exact]].sub1_sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dword_dword_bothen_exact +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dword_dword_bothen_exact + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN_exact:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN_exact]].sub0_sub1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN_exact]].sub2 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dword_dword_bothen_exact_swizzled_0 +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dword_dword_bothen_exact_swizzled_0 + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: 
(dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dwordx2_dwordx2_bothen_exact_diff_vaddr +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dwordx2_dwordx2_bothen_exact_diff_vaddr + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr1 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact1:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:vreg_64 = COPY $vgpr1 + %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact %4, %6:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact %5, %6:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dwordx2_dwordx2_bothen_exact_diff_srsrc +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dwordx2_dwordx2_bothen_exact_diff_srsrc + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact1:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:sgpr_32 = COPY $sgpr4 + %5:vreg_64 = COPY $vgpr0 + %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:sgpr_128 = REG_SEQUENCE %1:sgpr_32, %subreg.sub0, %2:sgpr_32, %subreg.sub1, %3:sgpr_32, %subreg.sub2, %4:sgpr_32, %subreg.sub3 + %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact %5, %6:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact %5, %7:sgpr_128, $sgpr_null, 12, 0, 0, implicit 
$exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dwordx2_dwordx2_idxen_exact_diff_vaddr +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dwordx2_dwordx2_idxen_exact_diff_vaddr + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact1:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:vgpr_32 = COPY $vgpr1 + %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact %4, %6:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact %5, %6:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dwordx2_dwordx2_idxen_exact_diff_srsrc +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dwordx2_dwordx2_idxen_exact_diff_srsrc + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact1:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:sgpr_32 = COPY $sgpr4 + %5:vgpr_32 = COPY $vgpr0 + %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:sgpr_128 = REG_SEQUENCE %1:sgpr_32, %subreg.sub0, %2:sgpr_32, %subreg.sub1, %3:sgpr_32, %subreg.sub2, %4:sgpr_32, %subreg.sub3 + %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact %5, %6:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact %5, %7:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: 
(dereferenceable load (s32), align 1, addrspace 4) +... diff --git a/llvm/test/CodeGen/AMDGPU/merge-buffer.mir b/llvm/test/CodeGen/AMDGPU/merge-buffer.mir new file mode 100644 index 0000000..1c6d429 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/merge-buffer.mir @@ -0,0 +1,1130 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefixes=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefixes=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefixes=GCN %s + +--- +name: buffer_load_dword_dwordx3 +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dwordx3 + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_OFFSET]].sub1_sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 
1, addrspace 4) + %8:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) +... +--- + +name: buffer_load_dwordx3_dword +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dwordx3_dword + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_96 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0_sub1_sub2 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 16, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dwordx2_dwordx2 +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dwordx2_dwordx2 + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0_sub1 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %5:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dwordx2 +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dwordx2 + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX3_OFFSET:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFSET]].sub0 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_OFFSET]].sub1_sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dwordx2_dword +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dwordx2_dword + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX3_OFFSET:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX3_OFFSET]].sub0_sub1 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX3_OFFSET]].sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + + +name: buffer_load_dword_dword +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dword + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFSET]].sub0 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_OFFSET]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_32 +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_32 + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFSET]].sub0 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_OFFSET]].sub1 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE]], 0, 16, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0_sub1_sub2 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2 + ; GCN-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0 + ; GCN-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX3_OFFSET:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET [[REG_SEQUENCE]], 0, 36, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GCN-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX3_OFFSET]].sub0_sub1 + ; GCN-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX3_OFFSET]].sub2 + ; GCN-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0 + ; GCN-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1 + %0:sgpr_32 = 
COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 16, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %10:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %11:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 24, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %12:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 28, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %13:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 36, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %14:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 40, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %15:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 44, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +# +# buffer_store_dword +# + +name: buffer_store_dword_xyz +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-LABEL: name: buffer_store_dword_xyz + ; GCN: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2 + ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[REG_SEQUENCE1]], %subreg.sub1_sub2_sub3 + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2 + BUFFER_STORE_DWORD_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORDX3_OFFSET_exact %14:vreg_96, 
%13:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) +... +--- + +name: buffer_store_dwordx3_dword +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-LABEL: name: buffer_store_dwordx3_dword + ; GCN: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2 + ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1_sub2, [[COPY]], %subreg.sub3 + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2 + BUFFER_STORE_DWORDX3_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable store 
(s96), align 1, addrspace 4) + BUFFER_STORE_DWORD_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 16, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) +... +--- + +name: buffer_store_dwordx2_dwordx2 +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-LABEL: name: buffer_store_dwordx2_dwordx2 + ; GCN: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[REG_SEQUENCE3]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, 
%subreg.sub0, %5:vgpr_32, %subreg.sub1 + %15:vreg_64 = REG_SEQUENCE %6:vgpr_32, %subreg.sub0, %7:vgpr_32, %subreg.sub1 + BUFFER_STORE_DWORDX2_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + BUFFER_STORE_DWORDX2_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) +... +--- + +name: buffer_store_dword_dwordx2 +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-LABEL: name: buffer_store_dword_dwordx2 + ; GCN: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, %10:vreg_64, %subreg.sub1_sub2 + ; GCN-NEXT: BUFFER_STORE_DWORDX3_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, 
%2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1 + BUFFER_STORE_DWORD_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORDX2_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) +... +--- + +name: buffer_store_dwordx2_dword +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-LABEL: name: buffer_store_dwordx2_dword + ; GCN: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2 + ; GCN-NEXT: BUFFER_STORE_DWORDX3_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 
= COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1 + BUFFER_STORE_DWORDX2_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + BUFFER_STORE_DWORD_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) +... +--- + +name: buffer_store_dword_dword +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-LABEL: name: buffer_store_dword_dword + ; GCN: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GCN-NEXT: BUFFER_STORE_DWORDX2_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, 
%subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + BUFFER_STORE_DWORD_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) +... +--- + +name: buffer_store_dword_32 +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GCN-LABEL: name: buffer_store_dword_32 + ; GCN: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3 + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; GCN-NEXT: BUFFER_STORE_DWORDX2_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], 
%subreg.sub1 + ; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2 + ; GCN-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], 0, 16, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + ; GCN-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GCN-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2 + ; GCN-NEXT: BUFFER_STORE_DWORDX3_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + %12:vgpr_32 = COPY $vgpr8 + %11:vgpr_32 = COPY $vgpr7 + %10:vgpr_32 = COPY $vgpr6 + %9:vgpr_32 = COPY $vgpr5 + %8:vgpr_32 = COPY $vgpr4 + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + BUFFER_STORE_DWORD_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 
24, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_not_merged_swizzled_0 +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_not_merged_swizzled_0 + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: 
(dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_not_merged_swizzled_1 +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_not_merged_swizzled_1 + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], 0, 8, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 8, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_merge_across_swizzle +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_merge_across_swizzle + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFSET]].sub0 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_OFFSET]].sub1 + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], 0, 12, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %4:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %4:sgpr_128, 0, 12, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %7:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %4:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_merge_across_swizzled_store +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_merge_across_swizzled_store + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 6, 0, 1, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_OFFSET_exact %4:vgpr_32, %5:sgpr_128, 0, 6, 0, 1, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + %7:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dword_idxen +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dword_idxen + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_IDXEN]].sub0 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_IDXEN]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dwordx2_idxen +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dwordx2_idxen + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX3_IDXEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_IDXEN]].sub0 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_IDXEN]].sub1_sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dwordx2_dwordx2_idxen +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dwordx2_dwordx2_idxen + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_IDXEN]].sub0_sub1 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_IDXEN]].sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN %4, %5:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dwordx3_idxen +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dwordx3_idxen + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_IDXEN]].sub0 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_IDXEN]].sub1_sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_96 = BUFFER_LOAD_DWORDX3_IDXEN %4, %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dword_bothen +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dword_bothen + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_BOTHEN]].sub0 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_BOTHEN]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dwordx2_bothen +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dwordx2_bothen + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX3_BOTHEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_BOTHEN]].sub0 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_BOTHEN]].sub1_sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dwordx2_dwordx2_bothen +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dwordx2_dwordx2_bothen + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_BOTHEN]].sub0_sub1 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_BOTHEN]].sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN %4, %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dwordx3_bothen +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dwordx3_bothen + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_BOTHEN]].sub0 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_BOTHEN]].sub1_sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_96 = BUFFER_LOAD_DWORDX3_BOTHEN %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dword_idxen_exact +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dword_idxen_exact + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_IDXEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_IDXEN_exact]].sub0 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_IDXEN_exact]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dwordx2_idxen_exact +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dwordx2_idxen_exact + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX3_IDXEN_exact:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_IDXEN_exact]].sub0 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_IDXEN_exact]].sub1_sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dwordx2_dwordx2_idxen_exact +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dwordx2_dwordx2_idxen_exact + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN_exact:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_IDXEN_exact]].sub0_sub1 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_IDXEN_exact]].sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact %4, %5:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dwordx3_idxen_exact +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dwordx3_idxen_exact + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN_exact:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_IDXEN_exact]].sub0 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_IDXEN_exact]].sub1_sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_96 = BUFFER_LOAD_DWORDX3_IDXEN_exact %4, %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dword_dword_idxen_exact +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dword_dword_idxen_exact + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX3_IDXEN_exact:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX3_IDXEN_exact]].sub0_sub1 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX3_IDXEN_exact]].sub2 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dword_dword_idxen_exact_swizzled_0 +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dword_dword_idxen_exact_swizzled_0 + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_IDXEN_exact:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_IDXEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_IDXEN_exact]].sub0 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_IDXEN_exact]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dword_bothen_exact +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dword_bothen_exact + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_BOTHEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_BOTHEN_exact]].sub0 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_BOTHEN_exact]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dwordx2_bothen_exact +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dwordx2_bothen_exact + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX3_BOTHEN_exact:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_BOTHEN_exact]].sub0 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_BOTHEN_exact]].sub1_sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dwordx2_dwordx2_bothen_exact +body: | + bb.0.entry: + + ; GCN-LABEL: name: buffer_load_dwordx2_dwordx2_bothen_exact + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN_exact:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_BOTHEN_exact]].sub0_sub1 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_BOTHEN_exact]].sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact %4, %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dwordx3_bothen_exact +body: | + bb.0.entry: + + ; GCN-LABEL: name: buffer_load_dword_dwordx3_bothen_exact + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN_exact:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_BOTHEN_exact]].sub0 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_BOTHEN_exact]].sub1_sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_96 = BUFFER_LOAD_DWORDX3_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dword_dword_bothen_exact +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dword_dword_bothen_exact + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX3_BOTHEN_exact:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX3_BOTHEN_exact]].sub0_sub1 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX3_BOTHEN_exact]].sub2 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dword_dword_dword_bothen_exact_swizzled_0 +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dword_dword_bothen_exact_swizzled_0 + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_BOTHEN_exact:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_BOTHEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_BOTHEN_exact]].sub0 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_BOTHEN_exact]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dwordx2_dwordx2_bothen_exact_diff_vaddr +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dwordx2_dwordx2_bothen_exact_diff_vaddr + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr1 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_BOTHEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_BOTHEN_exact1:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:vreg_64 = COPY $vgpr1 + %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %8:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact %4, %6:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact %5, %6:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dwordx2_dwordx2_bothen_exact_diff_srsrc +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dwordx2_dwordx2_bothen_exact_diff_srsrc + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_BOTHEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_BOTHEN_exact1:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:sgpr_32 = COPY $sgpr4 + %5:vreg_64 = COPY $vgpr0 + %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:sgpr_128 = REG_SEQUENCE %1:sgpr_32, %subreg.sub0, %2:sgpr_32, %subreg.sub1, %3:sgpr_32, %subreg.sub2, %4:sgpr_32, %subreg.sub3 + %8:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact %5, %6:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact %5, %7:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dwordx2_dwordx2_idxen_exact_diff_vaddr +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dwordx2_dwordx2_idxen_exact_diff_vaddr + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_IDXEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_IDXEN_exact1:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:vgpr_32 = COPY $vgpr1 + %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %8:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact %4, %6:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact %5, %6:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: buffer_load_dwordx2_dwordx2_idxen_exact_diff_srsrc +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dwordx2_dwordx2_idxen_exact_diff_srsrc + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_IDXEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_IDXEN_exact1:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:sgpr_32 = COPY $sgpr4 + %5:vgpr_32 = COPY $vgpr0 + %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:sgpr_128 = REG_SEQUENCE %1:sgpr_32, %subreg.sub0, %2:sgpr_32, %subreg.sub1, %3:sgpr_32, %subreg.sub2, %4:sgpr_32, %subreg.sub3 + %8:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact %5, %6:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact %5, %7:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
diff --git a/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir b/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir index c86b5ad..9766b42 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir @@ -7,9 +7,37 @@ # GFX9 tests # +--- name: gfx9_tbuffer_load_x_xyz body: | bb.0.entry: + ; GFX9-LABEL: name: gfx9_tbuffer_load_x_xyz + ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub1_sub2_sub3 + ; + ; GFX10-LABEL: name: gfx9_tbuffer_load_x_xyz + ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 8, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; + ; GFX11-LABEL: 
name: gfx9_tbuffer_load_x_xyz + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 8, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) %0:sgpr_32 = COPY $sgpr0 %1:sgpr_32 = COPY $sgpr1 %2:sgpr_32 = COPY $sgpr2 -- cgit v1.1 From b7611370491873722e08e4ce9374312d0c936af1 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Mon, 25 Mar 2024 13:11:58 -0700 Subject: [AMDGPU] Use correct VGPR threshold for flagging ExcessRP regions in unified register file case (#85860) `ST.getMaxNumVGPRs(MF)` lowers to `AMDGPUBaseInfo.cpp:getTotalNumVGPRs` which returns 512 for gfx90a. This is subsequently limited by `AMDGPUBaseInfo:getAddressableNumVGPRs()`, which also returns 512 for gfx90a. The ISA states we can have a total of 512 registers, but a maximum of only 256 of each of AGPR and VGPR (gfx90a 3.6.4). Therefore, in unified register file case, `ST.getMaxNumVGPRs(MF)` calculates the maximum number of combined VGPR + AGPR. But, it is currently used as the limit for accvgpr and as the limit for archvgpr. This patch uses it as the combined limit, and accounts for the maximum addressable arch/acc VGPRs when calculating the per RegClass limits. It is not unreasonable to think other clients of getTotalNumVGPRs are using it in the wrong way. 
--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir | 2 ++ 1 file changed, 2 insertions(+) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir index 091b29c..e93595b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir @@ -4,6 +4,8 @@ --- | define amdgpu_kernel void @single-wave-phase-2b(ptr addrspace(3) noalias %in0, ptr addrspace(3) noalias %in1, ptr addrspace(3) noalias %in2, ptr addrspace(3) noalias %in3, ptr addrspace(3) noalias %in4, ptr addrspace(3) noalias %in5, ptr addrspace(3) noalias %in6, ptr addrspace(3) noalias %in7, ptr addrspace(3) noalias %in8, ptr addrspace(3) noalias %in9, ptr addrspace(3) noalias %in10, ptr addrspace(3) noalias %in11, ptr addrspace(7) noalias %in12, ptr addrspace(7) noalias %in13, ptr addrspace(7) noalias %in14, ptr addrspace(7) noalias %in15, ptr addrspace(7) noalias %in16, ptr addrspace(7) noalias %in17, ptr addrspace(7) noalias %in18, ptr addrspace(7) noalias %in19, ptr addrspace(7) noalias %in20, ptr addrspace(7) noalias %in21, ptr addrspace(7) noalias %in22, ptr addrspace(7) noalias %in23, ptr addrspace(7) noalias %in24, ptr addrspace(7) noalias %in25, ptr addrspace(7) noalias %in26, ptr addrspace(7) noalias %in27, ptr addrspace(7) noalias %in28, ptr addrspace(7) noalias %in29) #0 { ret void } + attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" } + !0 = distinct !{!0} !1 = !{!1, !0} ... -- cgit v1.1 From 350bda4419e15e5d68a87667988458546fa2e0c2 Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Mon, 25 Mar 2024 16:55:22 -0700 Subject: AMDGPU: Rename intrinsics and remove f16/bf16 versions for load transpose (#86313) Rename the intrinsics to close to the instruction mnemonic names: Use global_load_tr_b64 and global_load_tr_b128 instead of global_load_tr. 
This patch also removes f16/bf16 versions of builtins/intrinsics. To simplify the design, we should avoid enumerating all possible types in implementing builtins. We can always use bitcast. --- .../AMDGPU/llvm.amdgcn.global.load.tr-w32.ll | 146 ++++----------------- .../AMDGPU/llvm.amdgcn.global.load.tr-w64.ll | 146 ++++----------------- 2 files changed, 58 insertions(+), 234 deletions(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll index b4415c1..f6197e0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll @@ -1,132 +1,44 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-SDAG-W32 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-GISEL-W32 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s -declare <2 x i32> @llvm.amdgcn.global.load.tr.v2i32.p1(ptr addrspace(1)) -declare <8 x i16> @llvm.amdgcn.global.load.tr.v8i16.p1(ptr addrspace(1)) -declare <8 x half> @llvm.amdgcn.global.load.tr.v8f16.p1(ptr addrspace(1)) -declare <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16.p1(ptr addrspace(1)) +declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1)) +declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1)) define amdgpu_kernel void @global_load_tr_b64(ptr 
addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-SDAG-W32-LABEL: global_load_tr_b64: -; GFX12-SDAG-W32: ; %bb.0: ; %entry -; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32 -; GFX12-SDAG-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-W32-NEXT: s_nop 0 -; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-W32-NEXT: s_endpgm -; -; GFX12-GISEL-W32-LABEL: global_load_tr_b64: -; GFX12-GISEL-W32: ; %bb.0: ; %entry -; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32 -; GFX12-GISEL-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-W32-NEXT: s_nop 0 -; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-W32-NEXT: s_endpgm +; GFX12-LABEL: global_load_tr_b64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %val = call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32.p1(ptr addrspace(1) %gep) + %val = call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1) %gep) store <2 x i32> %val, ptr addrspace(1) %use ret void } -define amdgpu_kernel void @global_load_tr_b128_i16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-SDAG-W32-LABEL: global_load_tr_b128_i16: -; GFX12-SDAG-W32: ; 
%bb.0: ; %entry -; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-SDAG-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 -; GFX12-SDAG-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] -; GFX12-SDAG-W32-NEXT: s_nop 0 -; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-W32-NEXT: s_endpgm -; -; GFX12-GISEL-W32-LABEL: global_load_tr_b128_i16: -; GFX12-GISEL-W32: ; %bb.0: ; %entry -; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-GISEL-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 -; GFX12-GISEL-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] -; GFX12-GISEL-W32-NEXT: s_nop 0 -; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-W32-NEXT: s_endpgm +define amdgpu_kernel void @global_load_tr_b128(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX12-LABEL: global_load_tr_b128: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %val = call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16.p1(ptr addrspace(1) %gep) + %val = call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1) %gep) store <8 x i16> %val, ptr addrspace(1) %use ret void } - -define amdgpu_kernel void @global_load_tr_b128_half(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-SDAG-W32-LABEL: global_load_tr_b128_half: -; GFX12-SDAG-W32: ; 
%bb.0: ; %entry -; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-SDAG-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 -; GFX12-SDAG-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] -; GFX12-SDAG-W32-NEXT: s_nop 0 -; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-W32-NEXT: s_endpgm -; -; GFX12-GISEL-W32-LABEL: global_load_tr_b128_half: -; GFX12-GISEL-W32: ; %bb.0: ; %entry -; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-GISEL-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 -; GFX12-GISEL-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] -; GFX12-GISEL-W32-NEXT: s_nop 0 -; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-W32-NEXT: s_endpgm -entry: - %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %val = call <8 x half> @llvm.amdgcn.global.load.tr.v8f16.p1(ptr addrspace(1) %gep) - store <8 x half> %val, ptr addrspace(1) %use - ret void -} - -define amdgpu_kernel void @global_load_tr_b128_bfloat(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-SDAG-W32-LABEL: global_load_tr_b128_bfloat: -; GFX12-SDAG-W32: ; %bb.0: ; %entry -; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-SDAG-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 -; GFX12-SDAG-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] -; GFX12-SDAG-W32-NEXT: s_nop 0 -; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-W32-NEXT: s_endpgm -; -; GFX12-GISEL-W32-LABEL: global_load_tr_b128_bfloat: -; GFX12-GISEL-W32: ; %bb.0: ; %entry -; GFX12-GISEL-W32-NEXT: s_load_b128 
s[0:3], s[0:1], 0x24 -; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-GISEL-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 -; GFX12-GISEL-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] -; GFX12-GISEL-W32-NEXT: s_nop 0 -; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-W32-NEXT: s_endpgm -entry: - %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %val = call <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16.p1(ptr addrspace(1) %gep) - store <8 x bfloat> %val, ptr addrspace(1) %use - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll index 7ad1416..a2dc366 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll @@ -1,132 +1,44 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-SDAG-W64 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-GISEL-W64 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s -declare i32 @llvm.amdgcn.global.load.tr.i32.p1(ptr addrspace(1)) -declare <4 x i16> @llvm.amdgcn.global.load.tr.v4i16.p1(ptr addrspace(1)) -declare <4 x half> @llvm.amdgcn.global.load.tr.v4f16.p1(ptr addrspace(1)) -declare <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16.p1(ptr addrspace(1)) +declare i32 
@llvm.amdgcn.global.load.tr.b64.i32.p1(ptr addrspace(1)) +declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1)) define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-SDAG-W64-LABEL: global_load_tr_b64: -; GFX12-SDAG-W64: ; %bb.0: ; %entry -; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32 -; GFX12-SDAG-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX12-SDAG-W64-NEXT: s_nop 0 -; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-W64-NEXT: s_endpgm -; -; GFX12-GISEL-W64-LABEL: global_load_tr_b64: -; GFX12-GISEL-W64: ; %bb.0: ; %entry -; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32 -; GFX12-GISEL-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX12-GISEL-W64-NEXT: s_nop 0 -; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-W64-NEXT: s_endpgm +; GFX12-LABEL: global_load_tr_b64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %val = call i32 @llvm.amdgcn.global.load.tr.i32.p1(ptr addrspace(1) %gep) + %val = call i32 @llvm.amdgcn.global.load.tr.b64.i32.p1(ptr addrspace(1) %gep) store i32 %val, ptr addrspace(1) %use ret void } -define amdgpu_kernel void 
@global_load_tr_b128_i16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-SDAG-W64-LABEL: global_load_tr_b128_i16: -; GFX12-SDAG-W64: ; %bb.0: ; %entry -; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 -; GFX12-SDAG-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-W64-NEXT: s_nop 0 -; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-W64-NEXT: s_endpgm -; -; GFX12-GISEL-W64-LABEL: global_load_tr_b128_i16: -; GFX12-GISEL-W64: ; %bb.0: ; %entry -; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 -; GFX12-GISEL-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-W64-NEXT: s_nop 0 -; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-W64-NEXT: s_endpgm +define amdgpu_kernel void @global_load_tr_b128(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX12-LABEL: global_load_tr_b128: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %val = call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16.p1(ptr addrspace(1) %gep) + %val = call <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1) %gep) store <4 x i16> %val, ptr addrspace(1) %use ret void } - -define amdgpu_kernel void 
@global_load_tr_b128_half(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-SDAG-W64-LABEL: global_load_tr_b128_half: -; GFX12-SDAG-W64: ; %bb.0: ; %entry -; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 -; GFX12-SDAG-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-W64-NEXT: s_nop 0 -; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-W64-NEXT: s_endpgm -; -; GFX12-GISEL-W64-LABEL: global_load_tr_b128_half: -; GFX12-GISEL-W64: ; %bb.0: ; %entry -; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 -; GFX12-GISEL-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-W64-NEXT: s_nop 0 -; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-W64-NEXT: s_endpgm -entry: - %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %val = call <4 x half> @llvm.amdgcn.global.load.tr.v4f16.p1(ptr addrspace(1) %gep) - store <4 x half> %val, ptr addrspace(1) %use - ret void -} - -define amdgpu_kernel void @global_load_tr_b128_bfloat(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-SDAG-W64-LABEL: global_load_tr_b128_bfloat: -; GFX12-SDAG-W64: ; %bb.0: ; %entry -; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 -; GFX12-SDAG-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-W64-NEXT: s_nop 0 -; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-W64-NEXT: 
s_endpgm -; -; GFX12-GISEL-W64-LABEL: global_load_tr_b128_bfloat: -; GFX12-GISEL-W64: ; %bb.0: ; %entry -; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 -; GFX12-GISEL-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-W64-NEXT: s_nop 0 -; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-W64-NEXT: s_endpgm -entry: - %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %val = call <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16.p1(ptr addrspace(1) %gep) - store <4 x bfloat> %val, ptr addrspace(1) %use - ret void -} -- cgit v1.1 From 14c30189fb8782535ac9a5a52160e3fc62e7e78c Mon Sep 17 00:00:00 2001 From: Bevin Hansson <59652494+bevin-hansson@users.noreply.github.com> Date: Tue, 26 Mar 2024 10:08:22 +0100 Subject: [ExpandLargeFpConvert] Fix incorrect values in fp-to-int conversion. (#86514) The IR for a double-to-i129 conversion looks like this in one of the blocks in compiler-rt: %cmp5.i = icmp ult i16 %3, -129, !dbg !24 But in ExpandLargeFpConvert, it looks like: %13 = icmp ult i129 %12, 4294967167, !dbg !19 ExpandLargeFpConvert is wrong; the value should have been signed before negating, but instead we get a very large unsigned value. Another value in the same pass also has this issue. 
--- llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 400 ++++++++++++++++----------------- 1 file changed, 196 insertions(+), 204 deletions(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index a69418d..66bf0d5 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -22,35 +22,31 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v7, vcc ; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v7, vcc +; SDAG-NEXT: s_movk_i32 s4, 0xff7f ; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v7, vcc -; SDAG-NEXT: s_mov_b64 s[6:7], 0xffffff7f -; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] -; SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], -1, v[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; SDAG-NEXT: v_and_b32_e32 v0, 1, v0 -; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SDAG-NEXT: s_mov_b32 s5, -1 +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3] +; SDAG-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] ; SDAG-NEXT: s_cbranch_execz .LBB0_7 ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 -; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; SDAG-NEXT: v_add_co_u32_e32 v9, vcc, -1, v0 -; SDAG-NEXT: v_addc_co_u32_e64 v10, s[6:7], 0, -1, vcc -; SDAG-NEXT: s_mov_b64 s[6:7], 0x432 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_add_co_u32_e64 v9, 
s[4:5], -1, v0 +; SDAG-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, -1, s[4:5] +; SDAG-NEXT: s_mov_b64 s[4:5], 0x432 ; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v5 -; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v11, -1, 1, s[4:5] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v11, -1, 1, vcc ; SDAG-NEXT: v_or_b32_e32 v5, 0x100000, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] ; SDAG-NEXT: s_cbranch_execz .LBB0_4 ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else @@ -59,37 +55,37 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[4:5] -; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v7 +; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v7 ; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v1, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v11, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v1, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v11, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 ; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v11, v2, 0 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v2, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 
v1 ; SDAG-NEXT: v_mul_lo_u32 v6, v11, v6 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v11, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v11, v[2:3] ; SDAG-NEXT: v_mul_lo_u32 v10, v10, v12 ; SDAG-NEXT: v_add3_u32 v5, v5, v6, v13 ; SDAG-NEXT: v_mov_b32_e32 v6, v2 ; SDAG-NEXT: v_mov_b32_e32 v2, v3 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v12, v8, v[1:2] -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v9, v12, v[4:5] -; SDAG-NEXT: v_add_co_u32_e32 v5, vcc, v6, v2 -; SDAG-NEXT: v_addc_co_u32_e64 v6, s[6:7], 0, 0, vcc +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v8, v[1:2] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v12, v[4:5] +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] ; SDAG-NEXT: v_mul_lo_u32 v9, v9, v7 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[6:7], v7, v8, v[5:6] +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v8, v[5:6] ; SDAG-NEXT: ; implicit-def: $vgpr11 ; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: v_add3_u32 v4, v10, v4, v9 -; SDAG-NEXT: v_add_co_u32_e32 v2, vcc, v5, v3 -; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v3 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v4, s[4:5] ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; SDAG-NEXT: ; implicit-def: $vgpr9 @@ -100,37 +96,37 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x433, v6 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[4:5] -; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 -; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v6, v0, v4, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v5, v1, v5, s[6:7] -; SDAG-NEXT: 
v_mad_u64_u32 v[0:1], s[6:7], v6, v11, 0 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v11, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v11, v[1:2] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v11, v[1:2] ; SDAG-NEXT: v_mov_b32_e32 v7, v4 ; SDAG-NEXT: v_mov_b32_e32 v4, v2 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v6, v8, v[3:4] -; SDAG-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[6:7], 0, 0, vcc -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v8, v[2:3] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v9, v6, v[2:3] -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v10, v6, v[3:4] +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v8, v[3:4] +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v7, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v8, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v6, v[3:4] ; SDAG-NEXT: v_mad_i32_i24 v3, v9, v5, v3 ; SDAG-NEXT: .LBB0_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] ; SDAG-NEXT: .LBB0_7: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[10:11] +; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; SDAG-NEXT: v_mov_b32_e32 v0, v2 ; SDAG-NEXT: v_mov_b32_e32 v1, v2 ; SDAG-NEXT: ; %bb.9: ; %Flow3 -; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB0_10: ; %fp-to-i-cleanup ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: s_setpc_b64 s[30:31] @@ -158,14 +154,16 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6 ; GISEL-NEXT: 
v_mov_b32_e32 v2, 0xffffff80 ; GISEL-NEXT: v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc -; GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GISEL-NEXT: v_mov_b32_e32 v3, -1 ; GISEL-NEXT: v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7] ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3] ; GISEL-NEXT: v_addc_co_u32_e64 v9, s[6:7], 0, -1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GISEL-NEXT: v_cmp_le_u64_e32 vcc, -1, v[8:9] ; GISEL-NEXT: v_cmp_lt_i64_e64 s[4:5], -1, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, -1, v[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 @@ -396,35 +394,31 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v7, vcc ; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v7, vcc +; SDAG-NEXT: s_movk_i32 s4, 0xff7f ; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v7, vcc -; SDAG-NEXT: s_mov_b64 s[6:7], 0xffffff7f -; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] -; SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], -1, v[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; SDAG-NEXT: v_and_b32_e32 v0, 1, v0 -; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SDAG-NEXT: s_mov_b32 s5, -1 +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3] +; SDAG-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SDAG-NEXT: ; implicit-def: 
$vgpr2_vgpr3 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] ; SDAG-NEXT: s_cbranch_execz .LBB1_7 ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 -; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; SDAG-NEXT: v_add_co_u32_e32 v9, vcc, -1, v0 -; SDAG-NEXT: v_addc_co_u32_e64 v10, s[6:7], 0, -1, vcc -; SDAG-NEXT: s_mov_b64 s[6:7], 0x432 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 +; SDAG-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, -1, s[4:5] +; SDAG-NEXT: s_mov_b64 s[4:5], 0x432 ; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v5 -; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v11, -1, 1, s[4:5] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v11, -1, 1, vcc ; SDAG-NEXT: v_or_b32_e32 v5, 0x100000, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] ; SDAG-NEXT: s_cbranch_execz .LBB1_4 ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else @@ -433,37 +427,37 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[4:5] -; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v7 +; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v7 ; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v1, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc -; SDAG-NEXT: 
v_cndmask_b32_e32 v7, 0, v1, vcc -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v11, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v1, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v11, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 ; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v11, v2, 0 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v2, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: v_mul_lo_u32 v6, v11, v6 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v11, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v11, v[2:3] ; SDAG-NEXT: v_mul_lo_u32 v10, v10, v12 ; SDAG-NEXT: v_add3_u32 v5, v5, v6, v13 ; SDAG-NEXT: v_mov_b32_e32 v6, v2 ; SDAG-NEXT: v_mov_b32_e32 v2, v3 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v12, v8, v[1:2] -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v9, v12, v[4:5] -; SDAG-NEXT: v_add_co_u32_e32 v5, vcc, v6, v2 -; SDAG-NEXT: v_addc_co_u32_e64 v6, s[6:7], 0, 0, vcc +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v8, v[1:2] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v12, v[4:5] +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] ; SDAG-NEXT: v_mul_lo_u32 v9, v9, v7 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[6:7], v7, v8, v[5:6] +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v8, v[5:6] ; SDAG-NEXT: ; implicit-def: $vgpr11 ; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: v_add3_u32 v4, v10, v4, v9 -; SDAG-NEXT: v_add_co_u32_e32 v2, vcc, v5, v3 -; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v3 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v4, s[4:5] ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; SDAG-NEXT: ; implicit-def: $vgpr9 @@ -474,37 +468,37 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x433, v6 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], 
v2, v[4:5] -; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 -; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v6, v0, v4, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v5, v1, v5, s[6:7] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v11, 0 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v11, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v11, v[1:2] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v11, v[1:2] ; SDAG-NEXT: v_mov_b32_e32 v7, v4 ; SDAG-NEXT: v_mov_b32_e32 v4, v2 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v6, v8, v[3:4] -; SDAG-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[6:7], 0, 0, vcc -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v8, v[2:3] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v9, v6, v[2:3] -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v10, v6, v[3:4] +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v8, v[3:4] +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v7, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v8, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v6, v[3:4] ; SDAG-NEXT: v_mad_i32_i24 v3, v9, v5, v3 ; SDAG-NEXT: .LBB1_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] ; SDAG-NEXT: .LBB1_7: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[10:11] +; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v3, 
v0, v1, vcc ; SDAG-NEXT: v_mov_b32_e32 v0, v2 ; SDAG-NEXT: v_mov_b32_e32 v1, v2 ; SDAG-NEXT: ; %bb.9: ; %Flow3 -; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB1_10: ; %fp-to-i-cleanup ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: s_setpc_b64 s[30:31] @@ -532,14 +526,16 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6 ; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff80 ; GISEL-NEXT: v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc -; GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GISEL-NEXT: v_mov_b32_e32 v3, -1 ; GISEL-NEXT: v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7] ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3] ; GISEL-NEXT: v_addc_co_u32_e64 v9, s[6:7], 0, -1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GISEL-NEXT: v_cmp_le_u64_e32 vcc, -1, v[8:9] ; GISEL-NEXT: v_cmp_lt_i64_e64 s[4:5], -1, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, -1, v[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 @@ -769,36 +765,32 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc ; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc +; SDAG-NEXT: s_movk_i32 s4, 0xff7f ; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc -; SDAG-NEXT: s_mov_b64 s[6:7], 0xffffff7f -; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] -; SDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v4 -; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: v_cndmask_b32_e32 
v0, v1, v0, vcc -; SDAG-NEXT: v_and_b32_e32 v0, 1, v0 -; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SDAG-NEXT: s_mov_b32 s5, -1 +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3] +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4 +; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] ; SDAG-NEXT: s_cbranch_execz .LBB2_7 ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 -; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; SDAG-NEXT: v_add_co_u32_e32 v9, vcc, -1, v0 -; SDAG-NEXT: v_addc_co_u32_e64 v11, s[6:7], 0, -1, vcc -; SDAG-NEXT: s_mov_b64 s[6:7], 0x95 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 +; SDAG-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, -1, s[4:5] +; SDAG-NEXT: s_mov_b64 s[4:5], 0x95 ; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v4 -; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[5:6] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6] ; SDAG-NEXT: v_mov_b32_e32 v7, 0 -; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, vcc ; SDAG-NEXT: v_or_b32_e32 v6, 0x800000, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] ; SDAG-NEXT: s_cbranch_execz .LBB2_4 ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else @@ -807,37 +799,37 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff6a, v5 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[6:7] ; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7] -; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 -; SDAG-NEXT: 
v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v4 +; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v4 ; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e32 v13, 0, v0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v13, v10, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v10, 0 ; SDAG-NEXT: v_mul_lo_u32 v14, v8, v2 ; SDAG-NEXT: v_mul_lo_u32 v15, v10, v3 ; SDAG-NEXT: v_mov_b32_e32 v6, v1 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v12, v10, v[6:7] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v10, v2, 0 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[6:7] +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0 ; SDAG-NEXT: v_mov_b32_e32 v6, v5 ; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v13, v8, v[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v8, v[4:5] ; SDAG-NEXT: v_add3_u32 v3, v3, v15, v14 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v9, v13, v[2:3] -; SDAG-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 -; SDAG-NEXT: v_addc_co_u32_e64 v6, s[6:7], 0, 0, vcc +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v13, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v5 +; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] ; SDAG-NEXT: v_mul_lo_u32 v3, v9, v12 ; SDAG-NEXT: v_mul_lo_u32 v7, v11, v13 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[6:7], v12, v8, v[5:6] +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v8, v[5:6] ; SDAG-NEXT: ; implicit-def: $vgpr10 ; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 -; SDAG-NEXT: 
v_add_co_u32_e32 v2, vcc, v5, v1 -; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: v_mov_b32_e32 v1, v4 ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 @@ -847,29 +839,29 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x96, v5 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7] -; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 -; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v3, v10, 0 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 +; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v6, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[12:13], v3, v8, v[1:2] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, v[1:2] ; SDAG-NEXT: v_mov_b32_e32 v1, v5 -; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[12:13], v9, v3, v[1:2] +; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v9, v3, v[1:2] ; SDAG-NEXT: v_mov_b32_e32 v1, v4 ; SDAG-NEXT: .LBB2_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: .LBB2_7: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[10:11] +; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; SDAG-NEXT: v_mov_b32_e32 v0, v2 ; SDAG-NEXT: v_mov_b32_e32 v1, v2 ; SDAG-NEXT: ; %bb.9: ; %Flow3 -; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; 
SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB2_10: ; %fp-to-i-cleanup ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: s_setpc_b64 s[30:31] @@ -897,14 +889,16 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v6 ; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff80 ; GISEL-NEXT: v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc -; GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GISEL-NEXT: v_mov_b32_e32 v3, -1 ; GISEL-NEXT: v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7] ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3] ; GISEL-NEXT: v_addc_co_u32_e64 v9, s[6:7], 0, -1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GISEL-NEXT: v_cmp_le_u64_e32 vcc, -1, v[8:9] ; GISEL-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, -1, v[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 @@ -1129,36 +1123,32 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc ; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc +; SDAG-NEXT: s_movk_i32 s4, 0xff7f ; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc -; SDAG-NEXT: s_mov_b64 s[6:7], 0xffffff7f -; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] -; SDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v4 -; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; SDAG-NEXT: v_and_b32_e32 v0, 1, v0 -; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SDAG-NEXT: s_mov_b32 s5, -1 +; SDAG-NEXT: v_cmp_lt_u64_e64 
s[4:5], s[4:5], v[0:1] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3] +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4 +; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] ; SDAG-NEXT: s_cbranch_execz .LBB3_7 ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 -; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; SDAG-NEXT: v_add_co_u32_e32 v9, vcc, -1, v0 -; SDAG-NEXT: v_addc_co_u32_e64 v11, s[6:7], 0, -1, vcc -; SDAG-NEXT: s_mov_b64 s[6:7], 0x95 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 +; SDAG-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, -1, s[4:5] +; SDAG-NEXT: s_mov_b64 s[4:5], 0x95 ; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v4 -; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[5:6] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6] ; SDAG-NEXT: v_mov_b32_e32 v7, 0 -; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, vcc ; SDAG-NEXT: v_or_b32_e32 v6, 0x800000, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] ; SDAG-NEXT: s_cbranch_execz .LBB3_4 ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else @@ -1167,37 +1157,37 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff6a, v5 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[6:7] ; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7] -; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 -; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v4 +; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 
0, v4 ; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e32 v13, 0, v0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v13, v10, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v10, 0 ; SDAG-NEXT: v_mul_lo_u32 v14, v8, v2 ; SDAG-NEXT: v_mul_lo_u32 v15, v10, v3 ; SDAG-NEXT: v_mov_b32_e32 v6, v1 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v12, v10, v[6:7] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v10, v2, 0 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[6:7] +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0 ; SDAG-NEXT: v_mov_b32_e32 v6, v5 ; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v13, v8, v[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v8, v[4:5] ; SDAG-NEXT: v_add3_u32 v3, v3, v15, v14 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v9, v13, v[2:3] -; SDAG-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 -; SDAG-NEXT: v_addc_co_u32_e64 v6, s[6:7], 0, 0, vcc +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v13, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v5 +; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] ; SDAG-NEXT: v_mul_lo_u32 v3, v9, v12 ; SDAG-NEXT: v_mul_lo_u32 v7, v11, v13 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[6:7], v12, v8, v[5:6] +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v8, v[5:6] ; SDAG-NEXT: ; implicit-def: $vgpr10 ; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 -; SDAG-NEXT: v_add_co_u32_e32 v2, vcc, v5, v1 -; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 
v6, v3, s[4:5] ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: v_mov_b32_e32 v1, v4 ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 @@ -1207,29 +1197,29 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x96, v5 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7] -; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 -; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v3, v10, 0 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 +; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v6, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[12:13], v3, v8, v[1:2] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, v[1:2] ; SDAG-NEXT: v_mov_b32_e32 v1, v5 -; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[12:13], v9, v3, v[1:2] +; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v9, v3, v[1:2] ; SDAG-NEXT: v_mov_b32_e32 v1, v4 ; SDAG-NEXT: .LBB3_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: .LBB3_7: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[10:11] +; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; SDAG-NEXT: v_mov_b32_e32 v0, v2 ; SDAG-NEXT: v_mov_b32_e32 v1, v2 ; SDAG-NEXT: ; %bb.9: ; %Flow3 -; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB3_10: ; %fp-to-i-cleanup ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1257,14 +1247,16 @@ 
define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v6 ; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff80 ; GISEL-NEXT: v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc -; GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GISEL-NEXT: v_mov_b32_e32 v3, -1 ; GISEL-NEXT: v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7] ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3] ; GISEL-NEXT: v_addc_co_u32_e64 v9, s[6:7], 0, -1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GISEL-NEXT: v_cmp_le_u64_e32 vcc, -1, v[8:9] ; GISEL-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, -1, v[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -- cgit v1.1 From 4d315ff382de912e5129b417c997116851088d4b Mon Sep 17 00:00:00 2001 From: David Green Date: Tue, 26 Mar 2024 09:11:35 +0000 Subject: [GlobalISel] Add CTLZ known bits. (#86436) Replicated from SDAG. 
--- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir index e9f8180..fed277d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir @@ -64,9 +64,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[COPY]](s32) - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[CTLZ_ZERO_UNDEF]], [[C]] - ; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[CTLZ_ZERO_UNDEF]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s16) = G_CTLZ_ZERO_UNDEF %0 %2:_(s32) = G_ZEXT %1 -- cgit v1.1 From 256343a0e919bc09f65a8ee26751b561fa2dbfc1 Mon Sep 17 00:00:00 2001 From: Thomas Symalla <5754458+tsymalla@users.noreply.github.com> Date: Tue, 26 Mar 2024 11:01:08 +0100 Subject: Revert "Update amdgpu_gfx functions to use s0-s3 for inreg SGPR arguments on targets using scratch instructions for stack #78226" (#86273) Reverts llvm/llvm-project#81394 This reverts commit 3ac243bc0d7922d083af2cf025247b5698556062. It is not handling RSrc registers s0-s3 correctly. This leads to a broken test, where it expects s0-s3 as function argument and uses it as RSrc register as well. We need to re-visit the patch, but apparently we only want to have s0-s3 as argument registers if we don't need them as RSrc registers. 
--- .../GlobalISel/irtranslator-call-non-fixed.ll | 10 +- .../CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll | 10 +- llvm/test/CodeGen/AMDGPU/bf16.ll | 2 +- .../test/CodeGen/AMDGPU/combine_andor_with_cmps.ll | 24 +- llvm/test/CodeGen/AMDGPU/function-args-inreg.ll | 133 - .../CodeGen/AMDGPU/gfx-callable-argument-types.ll | 4377 +++++++++++--------- llvm/test/CodeGen/AMDGPU/indirect-call.ll | 60 +- llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll | 2 +- llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll | 4 +- 9 files changed, 2559 insertions(+), 2063 deletions(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll index fad833c0..5effd24 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll @@ -50,10 +50,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg - ; CHECK-NEXT: $sgpr0 = COPY [[C]](s32) + ; CHECK-NEXT: $sgpr4 = COPY [[C]](s32) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>) - ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; CHECK-NEXT: SI_RETURN call amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg 42) @@ -99,11 +99,11 @@ define amdgpu_gfx void 
@test_gfx_call_external_void_func_struct_i8_i32_inreg() # ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16) - ; CHECK-NEXT: $sgpr0 = COPY [[ANYEXT1]](s32) - ; CHECK-NEXT: $sgpr1 = COPY [[LOAD2]](s32) + ; CHECK-NEXT: $sgpr4 = COPY [[ANYEXT1]](s32) + ; CHECK-NEXT: $sgpr5 = COPY [[LOAD2]](s32) ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>) - ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; CHECK-NEXT: SI_RETURN %ptr0 = load ptr addrspace(1), ptr addrspace(4) undef diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll index 7567060..392b0ae 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll @@ -942,10 +942,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg - ; CHECK-NEXT: $sgpr0 = COPY [[C]](s32) + ; CHECK-NEXT: $sgpr4 = COPY [[C]](s32) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>) - ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent 
G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; CHECK-NEXT: SI_RETURN call amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg 42) @@ -3984,11 +3984,11 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() # ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16) - ; CHECK-NEXT: $sgpr0 = COPY [[ANYEXT1]](s32) - ; CHECK-NEXT: $sgpr1 = COPY [[LOAD2]](s32) + ; CHECK-NEXT: $sgpr4 = COPY [[ANYEXT1]](s32) + ; CHECK-NEXT: $sgpr5 = COPY [[LOAD2]](s32) ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>) - ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; CHECK-NEXT: SI_RETURN %ptr0 = load ptr addrspace(1), ptr addrspace(4) undef diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index e369f7e..9865883 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -3337,7 +3337,7 @@ define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) ; GFX11-LABEL: test_inreg_arg_store: ; GFX11: ; %bb.0: ; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-NEXT: s_setpc_b64 s[30:31] store bfloat %in, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll index e1e3220..10d71a3 100644 --- a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll +++ b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll @@ -472,7 +472,7 @@ define amdgpu_gfx void @test34(i32 inreg %arg1, i32 inreg %arg2) { ; GCN-LABEL: test34: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_min_i32 s0, s0, s1 +; GCN-NEXT: s_min_i32 s0, s4, s5 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_cmpk_lt_i32 s0, 0x3e9 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -492,7 +492,7 @@ define amdgpu_gfx void @test35(i32 inreg %arg1, i32 inreg %arg2) { ; GCN-LABEL: test35: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_max_i32 s0, s0, s1 +; GCN-NEXT: s_max_i32 s0, s4, s5 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_cmpk_gt_i32 s0, 0x3e8 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -512,9 +512,9 @@ define amdgpu_gfx void @test36(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3 ; GCN-LABEL: test36: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_min_u32 s0, s0, s1 +; GCN-NEXT: s_min_u32 s0, s4, s5 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_cmp_lt_u32 s0, s2 +; GCN-NEXT: s_cmp_lt_u32 s0, s6 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_cselect_b32 s0, -1, 0 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 @@ -532,9 +532,9 @@ define amdgpu_gfx void @test37(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3 ; GCN-LABEL: test37: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_max_i32 s0, s0, s1 +; GCN-NEXT: s_max_i32 s0, s4, s5 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_cmp_ge_i32 s0, s2 +; 
GCN-NEXT: s_cmp_ge_i32 s0, s6 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_cselect_b32 s0, -1, 0 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 @@ -552,7 +552,7 @@ define amdgpu_gfx void @test38(i32 inreg %arg1, i32 inreg %arg2) { ; GCN-LABEL: test38: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_max_u32 s0, s0, s1 +; GCN-NEXT: s_max_u32 s0, s4, s5 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_cmpk_lt_u32 s0, 0x3e9 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -572,7 +572,7 @@ define amdgpu_gfx void @test39(i32 inreg %arg1, i32 inreg %arg2) { ; GCN-LABEL: test39: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_min_i32 s0, s0, s1 +; GCN-NEXT: s_min_i32 s0, s4, s5 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_cmpk_gt_i32 s0, 0x3e7 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -592,9 +592,9 @@ define amdgpu_gfx void @test40(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3 ; GCN-LABEL: test40: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_max_i32 s0, s0, s1 +; GCN-NEXT: s_max_i32 s0, s4, s5 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_cmp_le_i32 s0, s2 +; GCN-NEXT: s_cmp_le_i32 s0, s6 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_cselect_b32 s0, -1, 0 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 @@ -612,9 +612,9 @@ define amdgpu_gfx void @test41(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3 ; GCN-LABEL: test41: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_min_u32 s0, s0, s1 +; GCN-NEXT: s_min_u32 s0, s4, s5 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_cmp_ge_u32 s0, s2 +; GCN-NEXT: s_cmp_ge_u32 s0, s6 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_cselect_b32 s0, -1, 0 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll index 27845b6..44a9127 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll +++ 
b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll @@ -2176,93 +2176,6 @@ define void @void_func_a5i32_inreg([5 x i32] inreg %arg0, ptr addrspace(1) %ptr) declare void @extern() define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %ptr) { -; GFX9-LABEL: void_func_a13i32_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s27, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[28:29] -; GFX9-NEXT: v_mov_b32_e32 v2, s26 -; GFX9-NEXT: global_store_dword v[0:1], v2, off offset:48 -; GFX9-NEXT: v_mov_b32_e32 v5, s25 -; GFX9-NEXT: v_mov_b32_e32 v4, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s22 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:32 -; GFX9-NEXT: v_writelane_b32 v40, s27, 2 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, extern@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, extern@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload 
-; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: void_func_a13i32_inreg: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s23, s33 -; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s24, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s24 -; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v3, s19 -; GFX11-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v9, s17 -; GFX11-NEXT: s_getpc_b64 s[18:19] -; GFX11-NEXT: s_add_u32 s18, s18, extern@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s19, s19, extern@gotpcrel32@hi+12 -; GFX11-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v7, s7 -; GFX11-NEXT: s_load_b64 s[16:17], s[18:19], 0x0 -; GFX11-NEXT: v_writelane_b32 v40, s23, 2 -; GFX11-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v5, s21 -; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v13, s3 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v11, s1 -; GFX11-NEXT: v_mov_b32_e32 v10, s0 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: global_store_b32 v[0:1], v14, off offset:48 -; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off offset:32 -; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 -; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 -; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, 
s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] store [13 x i32] %arg0, ptr addrspace(1) %ptr call void @extern() ret void @@ -2290,52 +2203,6 @@ define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %p ; FIXME: Should still fail define void @void_func_a16i32_inreg__noimplicit([16 x i32] inreg %arg0, ptr addrspace(1) %ptr) { -; GFX9-LABEL: void_func_a16i32_inreg__noimplicit: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:48 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-NEXT: v_mov_b32_e32 v4, s14 -; GFX9-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-NEXT: v_mov_b32_e32 v2, s12 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:32 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s11 -; GFX9-NEXT: v_mov_b32_e32 v4, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: void_func_a16i32_inreg__noimplicit: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, s15 :: v_dual_mov_b32 v4, s14 -; GFX11-NEXT: v_dual_mov_b32 v3, s13 :: v_dual_mov_b32 v2, s12 -; GFX11-NEXT: v_dual_mov_b32 v9, s11 :: v_dual_mov_b32 v8, s10 -; GFX11-NEXT: v_dual_mov_b32 v7, s9 :: v_dual_mov_b32 v6, s8 -; GFX11-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 -; GFX11-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, 
s4 -; GFX11-NEXT: v_dual_mov_b32 v17, s3 :: v_dual_mov_b32 v16, s2 -; GFX11-NEXT: v_dual_mov_b32 v15, s1 :: v_dual_mov_b32 v14, s0 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off offset:48 -; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:32 -; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off offset:16 -; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off -; GFX11-NEXT: s_setpc_b64 s[30:31] store [16 x i32] %arg0, ptr addrspace(1) %ptr ret void } diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index 3e1db5f..a118fa3 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -9567,17 +9567,19 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 3 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_i8_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_i8_inreg@abs32@lo -; GFX9-NEXT: s_movk_i32 s0, 0x7b +; GFX9-NEXT: s_movk_i32 s4, 0x7b ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 2 +; GFX9-NEXT: v_readlane_b32 s30, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte 
Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -9595,17 +9597,19 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 3 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_i8_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_i8_inreg@abs32@lo -; GFX10-NEXT: s_movk_i32 s0, 0x7b ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x7b +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -9623,18 +9627,20 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_i8_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_i8_inreg@abs32@lo -; GFX11-NEXT: s_movk_i32 s0, 0x7b +; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_i8_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_i8_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_movk_i32 s4, 0x7b +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -9652,17 +9658,19 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_i8_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_i8_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_movk_i32 s0, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i8_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i8_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b +; GFX10-SCRATCH-NEXT: 
v_writelane_b32 v40, s30, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -9684,17 +9692,19 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 3 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_i16_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_i16_inreg@abs32@lo -; GFX9-NEXT: s_movk_i32 s0, 0x7b +; GFX9-NEXT: s_movk_i32 s4, 0x7b ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 2 +; GFX9-NEXT: v_readlane_b32 s30, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -9712,17 +9722,19 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; 
GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 3 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_i16_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_i16_inreg@abs32@lo -; GFX10-NEXT: s_movk_i32 s0, 0x7b ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x7b +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -9740,18 +9752,20 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_i16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_i16_inreg@abs32@lo -; GFX11-NEXT: s_movk_i32 s0, 0x7b +; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_i16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_i16_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_movk_i32 s4, 0x7b +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: 
v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -9769,17 +9783,19 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_i16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_i16_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_movk_i32 s0, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i16_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 
s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -9801,17 +9817,19 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 3 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_i32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 42 +; GFX9-NEXT: s_mov_b32 s4, 42 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 2 +; GFX9-NEXT: v_readlane_b32 s30, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -9829,17 +9847,19 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 3 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_i32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_i32_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 
s0, 42 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 42 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -9857,18 +9877,20 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_i32_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 42 +; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_i32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 42 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; 
GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -9886,17 +9908,19 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_i32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 42 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 42 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -9918,18 +9942,22 @@ define amdgpu_gfx void 
@test_call_external_void_func_i64_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_i64_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_i64_inreg@abs32@lo -; GFX9-NEXT: s_movk_i32 s0, 0x7b -; GFX9-NEXT: s_mov_b32 s1, 0 +; GFX9-NEXT: s_movk_i32 s4, 0x7b +; GFX9-NEXT: s_mov_b32 s5, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 +; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -9947,18 +9975,22 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_i64_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_i64_inreg@abs32@lo -; GFX10-NEXT: s_movk_i32 s0, 0x7b -; GFX10-NEXT: s_mov_b32 s1, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: 
v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x7b +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -9976,19 +10008,23 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_i64_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_i64_inreg@abs32@lo -; GFX11-NEXT: s_movk_i32 s0, 0x7b -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_i64_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_i64_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_movk_i32 s4, 0x7b +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; 
GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -10006,18 +10042,22 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_i64_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_i64_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_movk_i32 s0, 0x7b -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i64_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i64_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: 
v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -10039,25 +10079,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s34, 6 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s36 -; GFX9-NEXT: s_mov_b32 s1, s37 -; GFX9-NEXT: s_mov_b32 s2, s38 -; GFX9-NEXT: s_mov_b32 s3, s39 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v40, 5 +; GFX9-NEXT: v_readlane_b32 s30, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: 
v_readlane_b32 s34, v40, 6 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -10073,23 +10116,26 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 6 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s36 -; GFX10-NEXT: s_mov_b32 s1, s37 -; GFX10-NEXT: s_mov_b32 s2, s38 -; GFX10-NEXT: s_mov_b32 s3, s39 +; GFX10-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-NEXT: v_writelane_b32 v40, s31, 5 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; 
GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -10107,19 +10153,27 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s0, 6 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i64_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i64_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s30, 4 +; GFX11-NEXT: v_writelane_b32 v40, s31, 5 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 5 +; GFX11-NEXT: v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -10137,18 +10191,26 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 
4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i64_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i64_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -10171,20 +10233,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 
{ ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 6 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 1 -; GFX9-NEXT: s_mov_b32 s1, 2 -; GFX9-NEXT: s_mov_b32 s2, 3 -; GFX9-NEXT: s_mov_b32 s3, 4 +; GFX9-NEXT: s_mov_b32 s4, 1 +; GFX9-NEXT: s_mov_b32 s5, 2 +; GFX9-NEXT: s_mov_b32 s6, 3 +; GFX9-NEXT: s_mov_b32 s7, 4 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 5 +; GFX9-NEXT: v_readlane_b32 s30, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -10202,20 +10272,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 6 ; GFX10-NEXT: s_mov_b32 s35, 
external_void_func_v2i64_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 1 -; GFX10-NEXT: s_mov_b32 s1, 2 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_mov_b32 s2, 3 -; GFX10-NEXT: s_mov_b32 s3, 4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: s_mov_b32 s6, 3 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: s_mov_b32 s7, 4 +; GFX10-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-NEXT: v_writelane_b32 v40, s31, 5 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -10233,21 +10311,29 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 1 -; GFX11-NEXT: s_mov_b32 s1, 2 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: s_mov_b32 s2, 3 -; GFX11-NEXT: s_mov_b32 s3, 4 +; GFX11-NEXT: v_writelane_b32 v40, s0, 6 +; 
GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i64_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i64_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: s_mov_b32 s6, 3 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_mov_b32 s7, 4 +; GFX11-NEXT: v_writelane_b32 v40, s30, 4 +; GFX11-NEXT: v_writelane_b32 v40, s31, 5 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 5 +; GFX11-NEXT: v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -10265,20 +10351,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 4 +; GFX10-SCRATCH-NEXT: 
v_writelane_b32 v40, s0, 6 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i64_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i64_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -10300,29 +10394,32 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 4 -; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 8 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: 
v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s8, 4 +; GFX9-NEXT: v_writelane_b32 v40, s9, 5 +; GFX9-NEXT: v_writelane_b32 v40, s30, 6 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i64_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i64_inreg@abs32@lo -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s36 -; GFX9-NEXT: s_mov_b32 s1, s37 -; GFX9-NEXT: s_mov_b32 s2, s38 -; GFX9-NEXT: s_mov_b32 s3, s39 -; GFX9-NEXT: s_mov_b32 s4, 1 -; GFX9-NEXT: s_mov_b32 s5, 2 +; GFX9-NEXT: s_mov_b32 s8, 1 +; GFX9-NEXT: s_mov_b32 s9, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 3 +; GFX9-NEXT: v_writelane_b32 v40, s31, 7 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 3 -; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 7 +; GFX9-NEXT: v_readlane_b32 s30, v40, 6 +; GFX9-NEXT: v_readlane_b32 s9, v40, 5 +; GFX9-NEXT: v_readlane_b32 s8, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 4 +; GFX9-NEXT: v_readlane_b32 s34, v40, 8 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -10340,29 +10437,32 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 4 +; GFX10-NEXT: v_writelane_b32 v40, s34, 8 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; 
GFX10-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0 -; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i64_inreg@abs32@hi ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i64_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s4, 1 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2 -; GFX10-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-NEXT: v_writelane_b32 v40, s31, 3 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s36 -; GFX10-NEXT: s_mov_b32 s1, s37 -; GFX10-NEXT: s_mov_b32 s2, s38 -; GFX10-NEXT: s_mov_b32 s3, s39 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 +; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i64_inreg@abs32@hi +; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i64_inreg@abs32@lo +; GFX10-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-NEXT: s_mov_b32 s8, 1 +; GFX10-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-NEXT: s_mov_b32 s9, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 6 +; GFX10-NEXT: v_writelane_b32 v40, s31, 7 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 7 +; GFX10-NEXT: v_readlane_b32 s30, v40, 6 +; GFX10-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 4 +; GFX10-NEXT: v_readlane_b32 s34, v40, 8 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -10380,25 +10480,33 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill 
; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 4 +; GFX11-NEXT: v_writelane_b32 v40, s0, 8 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v3i64_inreg@abs32@hi -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v3i64_inreg@abs32@lo -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2 -; GFX11-NEXT: v_writelane_b32 v40, s30, 2 -; GFX11-NEXT: v_writelane_b32 v40, s31, 3 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i64_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i64_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s8, 4 +; GFX11-NEXT: s_mov_b32 s8, 1 +; GFX11-NEXT: v_writelane_b32 v40, s9, 5 +; GFX11-NEXT: s_mov_b32 s9, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 6 +; GFX11-NEXT: v_writelane_b32 v40, s31, 7 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 3 -; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 7 +; GFX11-NEXT: v_readlane_b32 s30, v40, 6 +; GFX11-NEXT: v_readlane_b32 s9, v40, 5 +; GFX11-NEXT: v_readlane_b32 s8, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 4 +; GFX11-NEXT: v_readlane_b32 s0, v40, 8 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -10416,24 +10524,32 @@ define amdgpu_gfx void 
@test_call_external_void_func_v3i64_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 8 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v3i64_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v3i64_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i64_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i64_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 7 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 7 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 8 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -10458,35 +10574,38 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 6 -; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 10 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: v_writelane_b32 v40, s30, 4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_writelane_b32 v40, s8, 4 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s9, 5 +; GFX9-NEXT: v_writelane_b32 v40, s10, 6 +; GFX9-NEXT: v_writelane_b32 v40, s11, 7 +; GFX9-NEXT: v_writelane_b32 v40, s30, 8 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i64_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i64_inreg@abs32@lo -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s36 -; GFX9-NEXT: s_mov_b32 s1, s37 -; GFX9-NEXT: s_mov_b32 s2, s38 -; GFX9-NEXT: s_mov_b32 s3, s39 -; GFX9-NEXT: s_mov_b32 s4, 1 -; GFX9-NEXT: s_mov_b32 s5, 2 -; GFX9-NEXT: s_mov_b32 s6, 3 -; GFX9-NEXT: s_mov_b32 s7, 4 +; GFX9-NEXT: s_mov_b32 s8, 1 +; GFX9-NEXT: s_mov_b32 s9, 2 +; GFX9-NEXT: s_mov_b32 s10, 3 +; GFX9-NEXT: s_mov_b32 s11, 4 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 5 +; GFX9-NEXT: v_writelane_b32 v40, s31, 9 ; 
GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 5 -; GFX9-NEXT: v_readlane_b32 s30, v40, 4 +; GFX9-NEXT: v_readlane_b32 s31, v40, 9 +; GFX9-NEXT: v_readlane_b32 s30, v40, 8 +; GFX9-NEXT: v_readlane_b32 s11, v40, 7 +; GFX9-NEXT: v_readlane_b32 s10, v40, 6 +; GFX9-NEXT: v_readlane_b32 s9, v40, 5 +; GFX9-NEXT: v_readlane_b32 s8, v40, 4 ; GFX9-NEXT: v_readlane_b32 s7, v40, 3 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 6 +; GFX9-NEXT: v_readlane_b32 s34, v40, 10 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -10504,35 +10623,38 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 6 +; GFX10-NEXT: v_writelane_b32 v40, s34, 10 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0 -; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i64_inreg@abs32@hi ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i64_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s4, 1 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-NEXT: s_mov_b32 s6, 3 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-NEXT: s_mov_b32 s7, 4 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s36 -; GFX10-NEXT: s_mov_b32 s1, s37 -; GFX10-NEXT: v_writelane_b32 v40, s30, 4 -; GFX10-NEXT: s_mov_b32 s2, s38 -; GFX10-NEXT: s_mov_b32 s3, s39 -; GFX10-NEXT: v_writelane_b32 v40, s31, 5 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 +; GFX10-NEXT: s_mov_b32 s35, 
external_void_func_v4i64_inreg@abs32@hi +; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i64_inreg@abs32@lo +; GFX10-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-NEXT: s_mov_b32 s8, 1 +; GFX10-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-NEXT: s_mov_b32 s9, 2 +; GFX10-NEXT: v_writelane_b32 v40, s10, 6 +; GFX10-NEXT: s_mov_b32 s10, 3 +; GFX10-NEXT: v_writelane_b32 v40, s11, 7 +; GFX10-NEXT: s_mov_b32 s11, 4 +; GFX10-NEXT: v_writelane_b32 v40, s30, 8 +; GFX10-NEXT: v_writelane_b32 v40, s31, 9 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 5 -; GFX10-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-NEXT: v_readlane_b32 s31, v40, 9 +; GFX10-NEXT: v_readlane_b32 s30, v40, 8 +; GFX10-NEXT: v_readlane_b32 s11, v40, 7 +; GFX10-NEXT: v_readlane_b32 s10, v40, 6 +; GFX10-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-NEXT: v_readlane_b32 s8, v40, 4 ; GFX10-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 6 +; GFX10-NEXT: v_readlane_b32 s34, v40, 10 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -10550,31 +10672,39 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 6 +; GFX11-NEXT: v_writelane_b32 v40, s0, 10 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v4i64_inreg@abs32@hi -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v4i64_inreg@abs32@lo -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: 
v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 -; GFX11-NEXT: s_mov_b32 s6, 3 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 -; GFX11-NEXT: s_mov_b32 s7, 4 -; GFX11-NEXT: v_writelane_b32 v40, s30, 4 -; GFX11-NEXT: v_writelane_b32 v40, s31, 5 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i64_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i64_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s8, 4 +; GFX11-NEXT: s_mov_b32 s8, 1 +; GFX11-NEXT: v_writelane_b32 v40, s9, 5 +; GFX11-NEXT: s_mov_b32 s9, 2 +; GFX11-NEXT: v_writelane_b32 v40, s10, 6 +; GFX11-NEXT: s_mov_b32 s10, 3 +; GFX11-NEXT: v_writelane_b32 v40, s11, 7 +; GFX11-NEXT: s_mov_b32 s11, 4 +; GFX11-NEXT: v_writelane_b32 v40, s30, 8 +; GFX11-NEXT: v_writelane_b32 v40, s31, 9 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 5 -; GFX11-NEXT: v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s31, v40, 9 +; GFX11-NEXT: v_readlane_b32 s30, v40, 8 +; GFX11-NEXT: v_readlane_b32 s11, v40, 7 +; GFX11-NEXT: v_readlane_b32 s10, v40, 6 +; GFX11-NEXT: v_readlane_b32 s9, v40, 5 +; GFX11-NEXT: v_readlane_b32 s8, v40, 4 ; GFX11-NEXT: v_readlane_b32 s7, v40, 3 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 6 +; GFX11-NEXT: v_readlane_b32 s0, v40, 10 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -10592,30 +10722,38 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 10 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v4i64_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v4i64_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i64_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i64_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s10, 6 +; GFX10-SCRATCH-NEXT: s_mov_b32 s10, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s11, 7 +; GFX10-SCRATCH-NEXT: s_mov_b32 s11, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 9 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 8 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s11, v40, 7 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s10, v40, 6 +; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 10 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -10639,17 +10777,19 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 3 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_f16_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_f16_inreg@abs32@lo -; GFX9-NEXT: s_movk_i32 s0, 0x4400 +; GFX9-NEXT: s_movk_i32 s4, 0x4400 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 2 +; GFX9-NEXT: v_readlane_b32 s30, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -10667,17 +10807,19 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte 
Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 3 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_f16_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_f16_inreg@abs32@lo -; GFX10-NEXT: s_movk_i32 s0, 0x4400 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x4400 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -10695,18 +10837,20 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_f16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_f16_inreg@abs32@lo -; GFX11-NEXT: s_movk_i32 s0, 0x4400 +; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_f16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_f16_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_movk_i32 s4, 
0x4400 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -10724,17 +10868,19 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_f16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_f16_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_movk_i32 s0, 0x4400 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_f16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_f16_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x4400 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 +; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -10756,17 +10902,19 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 3 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_f32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_f32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 4.0 +; GFX9-NEXT: s_mov_b32 s4, 4.0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 2 +; GFX9-NEXT: v_readlane_b32 s30, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -10784,17 +10932,19 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 3 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_f32_inreg@abs32@hi ; 
GFX10-NEXT: s_mov_b32 s34, external_void_func_f32_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 4.0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 4.0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -10812,18 +10962,20 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_f32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_f32_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 4.0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_f32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_f32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 4.0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, 
v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -10841,17 +10993,19 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_f32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_f32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 4.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_f32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_f32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 4.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; 
GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -10873,18 +11027,22 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2f32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2f32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 1.0 -; GFX9-NEXT: s_mov_b32 s1, 2.0 +; GFX9-NEXT: s_mov_b32 s4, 1.0 +; GFX9-NEXT: s_mov_b32 s5, 2.0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 +; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -10902,18 +11060,22 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2f32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2f32_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 1.0 -; GFX10-NEXT: 
s_mov_b32 s1, 2.0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1.0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -10931,19 +11093,23 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_v2f32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v2f32_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 1.0 -; GFX11-NEXT: s_mov_b32 s1, 2.0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2f32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2f32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1.0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2.0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; 
GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -10961,24 +11127,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v2f32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v2f32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1.0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2f32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2f32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload -; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 +; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v2f32_inreg(<2 x float> inreg ) @@ -10994,19 +11164,25 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 5 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 3 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3f32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3f32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 1.0 -; GFX9-NEXT: s_mov_b32 s1, 2.0 -; GFX9-NEXT: s_mov_b32 s2, 4.0 +; GFX9-NEXT: s_mov_b32 s4, 1.0 +; GFX9-NEXT: s_mov_b32 s5, 2.0 +; GFX9-NEXT: s_mov_b32 s6, 4.0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 
1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 4 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 4 +; GFX9-NEXT: v_readlane_b32 s30, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 5 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -11024,19 +11200,25 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 5 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3f32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3f32_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 1.0 -; GFX10-NEXT: s_mov_b32 s1, 2.0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_mov_b32 s2, 4.0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1.0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: s_mov_b32 s6, 4.0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 3 +; GFX10-NEXT: v_writelane_b32 v40, s31, 4 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 4 +; GFX10-NEXT: v_readlane_b32 s30, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 
0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 5 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -11054,20 +11236,26 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v3f32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v3f32_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 1.0 -; GFX11-NEXT: s_mov_b32 s1, 2.0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: s_mov_b32 s2, 4.0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 5 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3f32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3f32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1.0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2.0 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: s_mov_b32 s6, 4.0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 3 +; GFX11-NEXT: v_writelane_b32 v40, s31, 4 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 4 +; GFX11-NEXT: v_readlane_b32 s30, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 5 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ 
-11085,19 +11273,25 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v3f32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v3f32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1.0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 4.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 5 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3f32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3f32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 4.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 4 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 5 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload 
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -11119,23 +11313,31 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 3 +; GFX9-NEXT: v_writelane_b32 v40, s34, 7 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v40, s8, 4 +; GFX9-NEXT: v_writelane_b32 v40, s30, 5 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v5f32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v5f32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 1.0 -; GFX9-NEXT: s_mov_b32 s1, 2.0 -; GFX9-NEXT: s_mov_b32 s2, 4.0 -; GFX9-NEXT: s_mov_b32 s3, -1.0 -; GFX9-NEXT: s_mov_b32 s4, 0.5 +; GFX9-NEXT: s_mov_b32 s4, 1.0 +; GFX9-NEXT: s_mov_b32 s5, 2.0 +; GFX9-NEXT: s_mov_b32 s6, 4.0 +; GFX9-NEXT: s_mov_b32 s7, -1.0 +; GFX9-NEXT: s_mov_b32 s8, 0.5 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 2 +; GFX9-NEXT: v_writelane_b32 v40, s31, 6 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 2 -; GFX9-NEXT: v_readlane_b32 s30, v40, 1 +; GFX9-NEXT: v_readlane_b32 s31, v40, 6 +; GFX9-NEXT: v_readlane_b32 s30, v40, 5 +; GFX9-NEXT: v_readlane_b32 s8, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 3 +; GFX9-NEXT: v_readlane_b32 s34, v40, 7 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -11153,23 +11355,31 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() 
#0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: v_writelane_b32 v40, s34, 7 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v5f32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v5f32_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 1.0 -; GFX10-NEXT: s_mov_b32 s1, 2.0 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s2, 4.0 -; GFX10-NEXT: s_mov_b32 s3, -1.0 -; GFX10-NEXT: s_mov_b32 s4, 0.5 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1.0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: s_mov_b32 s6, 4.0 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: s_mov_b32 s7, -1.0 +; GFX10-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-NEXT: s_mov_b32 s8, 0.5 +; GFX10-NEXT: v_writelane_b32 v40, s30, 5 +; GFX10-NEXT: v_writelane_b32 v40, s31, 6 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-NEXT: v_readlane_b32 s31, v40, 6 +; GFX10-NEXT: v_readlane_b32 s30, v40, 5 +; GFX10-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 3 +; GFX10-NEXT: v_readlane_b32 s34, v40, 7 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -11187,24 +11397,32 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: 
scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 3 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v5f32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v5f32_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 1.0 -; GFX11-NEXT: s_mov_b32 s1, 2.0 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s2, 4.0 -; GFX11-NEXT: s_mov_b32 s3, -1.0 -; GFX11-NEXT: s_mov_b32 s4, 0.5 +; GFX11-NEXT: v_writelane_b32 v40, s0, 7 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v5f32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v5f32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 -; GFX11-NEXT: v_writelane_b32 v40, s31, 2 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1.0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2.0 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: s_mov_b32 s6, 4.0 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_mov_b32 s7, -1.0 +; GFX11-NEXT: v_writelane_b32 v40, s8, 4 +; GFX11-NEXT: s_mov_b32 s8, 0.5 +; GFX11-NEXT: v_writelane_b32 v40, s30, 5 +; GFX11-NEXT: v_writelane_b32 v40, s31, 6 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 2 -; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s31, v40, 6 +; GFX11-NEXT: v_readlane_b32 s30, v40, 5 +; GFX11-NEXT: v_readlane_b32 s8, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 3 +; GFX11-NEXT: v_readlane_b32 s0, v40, 7 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -11222,23 
+11440,31 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v5f32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v5f32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1.0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 4.0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, -1.0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0.5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 7 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v5f32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v5f32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 4.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s7, -1.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 0.5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 6 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 +; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 7 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -11260,18 +11486,22 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_f64_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_f64_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: s_mov_b32 s1, 0x40100000 +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s5, 0x40100000 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 +; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -11289,18 +11519,22 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; 
GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_f64_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_f64_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: s_mov_b32 s1, 0x40100000 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 0x40100000 +; GFX10-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -11318,19 +11552,23 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_f64_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_f64_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s1, 0x40100000 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_f64_inreg@abs32@hi +; GFX11-NEXT: 
s_mov_b32 s0, external_void_func_f64_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 0x40100000 +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -11348,18 +11586,22 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_f64_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_f64_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 0x40100000 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_f64_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_f64_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40100000 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -11381,20 +11623,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 6 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2f64_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2f64_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s2, 0 -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: s_mov_b32 s1, 2.0 -; GFX9-NEXT: s_mov_b32 s3, 0x40100000 +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s5, 2.0 +; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: s_mov_b32 s7, 0x40100000 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_swappc_b64 
s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 5 +; GFX9-NEXT: v_readlane_b32 s30, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -11412,20 +11662,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 6 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2f64_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2f64_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s2, 0 -; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_mov_b32 s1, 2.0 -; GFX10-NEXT: s_mov_b32 s3, 0x40100000 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: s_mov_b32 s7, 0x40100000 +; GFX10-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-NEXT: v_writelane_b32 v40, s31, 5 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, 
v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -11443,21 +11701,29 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v2f64_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v2f64_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s1, 2.0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_mov_b32 s3, 0x40100000 +; GFX11-NEXT: v_writelane_b32 v40, s0, 6 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2f64_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2f64_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2.0 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_mov_b32 s7, 0x40100000 +; GFX11-NEXT: v_writelane_b32 v40, s30, 4 +; GFX11-NEXT: v_writelane_b32 v40, s31, 5 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 5 +; GFX11-NEXT: v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 
s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -11475,20 +11741,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v2f64_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v2f64_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 0x40100000 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2f64_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2f64_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 0x40100000 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -11510,26 +11784,34 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 4 +; GFX9-NEXT: v_writelane_b32 v40, s34, 8 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v40, s8, 4 +; GFX9-NEXT: v_writelane_b32 v40, s9, 5 +; GFX9-NEXT: v_writelane_b32 v40, s30, 6 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3f64_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3f64_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: s_mov_b32 s1, 2.0 -; GFX9-NEXT: s_mov_b32 s2, 0 -; GFX9-NEXT: s_mov_b32 s3, 0x40100000 ; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s5, 0x40200000 +; GFX9-NEXT: s_mov_b32 s5, 2.0 +; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: s_mov_b32 s7, 0x40100000 +; GFX9-NEXT: s_mov_b32 s8, 0 +; GFX9-NEXT: s_mov_b32 s9, 0x40200000 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 3 +; GFX9-NEXT: v_writelane_b32 v40, s31, 7 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 3 -; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 7 +; GFX9-NEXT: 
v_readlane_b32 s30, v40, 6 +; GFX9-NEXT: v_readlane_b32 s9, v40, 5 +; GFX9-NEXT: v_readlane_b32 s8, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 4 +; GFX9-NEXT: v_readlane_b32 s34, v40, 8 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -11547,26 +11829,34 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 4 +; GFX10-NEXT: v_writelane_b32 v40, s34, 8 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3f64_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3f64_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: s_mov_b32 s1, 2.0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s2, 0 -; GFX10-NEXT: s_mov_b32 s3, 0x40100000 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 0x40200000 -; GFX10-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: s_mov_b32 s7, 0x40100000 +; GFX10-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-NEXT: s_mov_b32 s9, 0x40200000 +; GFX10-NEXT: v_writelane_b32 v40, s30, 6 +; GFX10-NEXT: v_writelane_b32 v40, s31, 7 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; 
GFX10-NEXT: v_readlane_b32 s31, v40, 7 +; GFX10-NEXT: v_readlane_b32 s30, v40, 6 +; GFX10-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 4 +; GFX10-NEXT: v_readlane_b32 s34, v40, 8 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -11584,27 +11874,35 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 4 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v3f64_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v3f64_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s1, 2.0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 8 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3f64_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3f64_inreg@abs32@lo +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_mov_b32 s3, 0x40100000 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 0x40200000 -; GFX11-NEXT: v_writelane_b32 v40, s30, 2 -; GFX11-NEXT: v_writelane_b32 v40, s31, 3 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: s_mov_b32 s5, 2.0 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_mov_b32 s7, 0x40100000 +; GFX11-NEXT: v_writelane_b32 v40, s8, 4 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: v_writelane_b32 v40, s9, 5 +; GFX11-NEXT: s_mov_b32 s9, 
0x40200000 +; GFX11-NEXT: v_writelane_b32 v40, s30, 6 +; GFX11-NEXT: v_writelane_b32 v40, s31, 7 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 3 -; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 7 +; GFX11-NEXT: v_readlane_b32 s30, v40, 6 +; GFX11-NEXT: v_readlane_b32 s9, v40, 5 +; GFX11-NEXT: v_readlane_b32 s8, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 4 +; GFX11-NEXT: v_readlane_b32 s0, v40, 8 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -11622,26 +11920,34 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v3f64_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v3f64_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 8 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3f64_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3f64_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 0x40100000 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40200000 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 0x40100000 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 0x40200000 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 7 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 7 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 8 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -11663,17 +11969,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_load_dword s0, s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 3 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; 
GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i16_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i16_inreg@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 2 +; GFX9-NEXT: v_readlane_b32 s30, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -11691,17 +11999,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_load_dword s0, s[34:35], 0x0 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i16_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i16_inreg@abs32@lo -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: 
buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -11719,18 +12029,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_v2i16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v2i16_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 3 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i16_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -11748,17 +12060,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: 
s_mov_b32 s3, external_void_func_v2i16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v2i16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -11781,20 +12095,21 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s37, external_void_func_v3i16_inreg@abs32@hi -; GFX9-NEXT: s_mov_b32 s36, external_void_func_v3i16_inreg@abs32@lo +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: s_load_dwordx2 
s[4:5], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i16_inreg@abs32@hi +; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i16_inreg@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s1, s35 -; GFX9-NEXT: s_mov_b32 s0, s34 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_writelane_b32 v40, s31, 3 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 +; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -11812,20 +12127,21 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX10-NEXT: s_mov_b32 s37, external_void_func_v3i16_inreg@abs32@hi -; GFX10-NEXT: s_mov_b32 s36, external_void_func_v3i16_inreg@abs32@lo +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s1, s35 -; GFX10-NEXT: s_mov_b32 s0, s34 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 
v40, s5, 1 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 +; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i16_inreg@abs32@hi +; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i16_inreg@abs32@lo +; GFX10-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -11843,18 +12159,22 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_v3i16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v3i16_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i16_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; 
GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -11872,17 +12192,21 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v3i16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v3i16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; 
GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -11905,20 +12229,21 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s37, external_void_func_v3f16_inreg@abs32@hi -; GFX9-NEXT: s_mov_b32 s36, external_void_func_v3f16_inreg@abs32@lo +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3f16_inreg@abs32@hi +; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3f16_inreg@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s1, s35 -; GFX9-NEXT: s_mov_b32 s0, s34 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_writelane_b32 v40, s31, 3 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 +; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -11936,20 +12261,21 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: 
s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX10-NEXT: s_mov_b32 s37, external_void_func_v3f16_inreg@abs32@hi -; GFX10-NEXT: s_mov_b32 s36, external_void_func_v3f16_inreg@abs32@lo +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s1, s35 -; GFX10-NEXT: s_mov_b32 s0, s34 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 +; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3f16_inreg@abs32@hi +; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3f16_inreg@abs32@lo +; GFX10-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -11967,18 +12293,22 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_v3f16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v3f16_inreg@abs32@lo +; GFX11-NEXT: 
v_writelane_b32 v40, s0, 4 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3f16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3f16_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -11996,17 +12326,21 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v3f16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v3f16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3f16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3f16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -12029,18 +12363,22 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i16_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i16_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 0x20001 -; GFX9-NEXT: s_mov_b32 s1, 3 +; GFX9-NEXT: s_mov_b32 s4, 0x20001 +; GFX9-NEXT: s_mov_b32 s5, 3 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; 
GFX9-NEXT: v_readlane_b32 s31, v40, 3 +; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -12058,18 +12396,22 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i16_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i16_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 0x20001 -; GFX10-NEXT: s_mov_b32 s1, 3 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 3 +; GFX10-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -12087,19 +12429,23 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte 
Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_v3i16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v3i16_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 0x20001 -; GFX11-NEXT: s_mov_b32 s1, 3 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i16_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0x20001 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 3 +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -12117,18 +12463,22 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v3i16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v3i16_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 0x20001 -; GFX10-SCRATCH-NEXT: 
s_mov_b32 s1, 3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i16_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -12150,18 +12500,22 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3f16_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3f16_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 
0x40003c00 -; GFX9-NEXT: s_movk_i32 s1, 0x4400 +; GFX9-NEXT: s_mov_b32 s4, 0x40003c00 +; GFX9-NEXT: s_movk_i32 s5, 0x4400 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 +; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -12179,18 +12533,22 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3f16_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3f16_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 0x40003c00 -; GFX10-NEXT: s_movk_i32 s1, 0x4400 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0x40003c00 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_movk_i32 s5, 0x4400 +; GFX10-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; 
GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -12208,19 +12566,23 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_v3f16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v3f16_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 0x40003c00 -; GFX11-NEXT: s_movk_i32 s1, 0x4400 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3f16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3f16_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0x40003c00 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_movk_i32 s5, 0x4400 +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -12238,18 +12600,22 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { 
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v3f16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v3f16_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 0x40003c00 -; GFX10-SCRATCH-NEXT: s_movk_i32 s1, 0x4400 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3f16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3f16_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x40003c00 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_movk_i32 s5, 0x4400 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -12271,20 +12637,21 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded 
Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s37, external_void_func_v4i16_inreg@abs32@hi -; GFX9-NEXT: s_mov_b32 s36, external_void_func_v4i16_inreg@abs32@lo +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i16_inreg@abs32@hi +; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i16_inreg@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s34 -; GFX9-NEXT: s_mov_b32 s1, s35 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_writelane_b32 v40, s31, 3 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 +; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -12302,20 +12669,21 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX10-NEXT: s_mov_b32 s37, external_void_func_v4i16_inreg@abs32@hi -; GFX10-NEXT: s_mov_b32 s36, external_void_func_v4i16_inreg@abs32@lo +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; 
GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s34 -; GFX10-NEXT: s_mov_b32 s1, s35 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 +; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i16_inreg@abs32@hi +; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i16_inreg@abs32@lo +; GFX10-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -12333,18 +12701,22 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_v4i16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v4i16_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 
s1, external_void_func_v4i16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i16_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -12362,19 +12734,23 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v4i16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v4i16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: 
s_mov_b32 s1, external_void_func_v4i16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -12395,18 +12771,22 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i16_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i16_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 0x20001 -; GFX9-NEXT: s_mov_b32 s1, 0x40003 +; GFX9-NEXT: s_mov_b32 s4, 0x20001 +; GFX9-NEXT: s_mov_b32 s5, 0x40003 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 +; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; 
GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -12424,18 +12804,22 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i16_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i16_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 0x20001 -; GFX10-NEXT: s_mov_b32 s1, 0x40003 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 0x40003 +; GFX10-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -12453,19 +12837,23 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: 
s_mov_b32 s3, external_void_func_v4i16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v4i16_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 0x20001 -; GFX11-NEXT: s_mov_b32 s1, 0x40003 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i16_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0x20001 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 0x40003 +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -12483,18 +12871,22 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v4i16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v4i16_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 0x20001 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 0x40003 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: 
v_writelane_b32 v40, s0, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i16_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40003 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -12516,17 +12908,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_load_dword s0, s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 3 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2f16_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2f16_inreg@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 
v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 2 +; GFX9-NEXT: v_readlane_b32 s30, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -12544,17 +12938,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_load_dword s0, s[34:35], 0x0 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2f16_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2f16_inreg@abs32@lo -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -12572,18 +12968,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; 
GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_v2f16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v2f16_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 3 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2f16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2f16_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -12601,17 +12999,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v2f16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v2f16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 ; GFX10-SCRATCH-NEXT: 
s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2f16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2f16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -12634,20 +13034,21 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s37, external_void_func_v2i32_inreg@abs32@hi -; GFX9-NEXT: s_mov_b32 s36, external_void_func_v2i32_inreg@abs32@lo +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i32_inreg@abs32@hi +; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i32_inreg@abs32@lo ; 
GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s34 -; GFX9-NEXT: s_mov_b32 s1, s35 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_writelane_b32 v40, s31, 3 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 +; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -12665,20 +13066,21 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX10-NEXT: s_mov_b32 s37, external_void_func_v2i32_inreg@abs32@hi -; GFX10-NEXT: s_mov_b32 s36, external_void_func_v2i32_inreg@abs32@lo +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s34 -; GFX10-NEXT: s_mov_b32 s1, s35 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 +; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i32_inreg@abs32@hi +; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i32_inreg@abs32@lo +; 
GFX10-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -12696,18 +13098,22 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_v2i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v2i32_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i32_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: 
scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -12725,17 +13131,21 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v2i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v2i32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -12758,18 +13168,22 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() 
#0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 1 -; GFX9-NEXT: s_mov_b32 s1, 2 +; GFX9-NEXT: s_mov_b32 s4, 1 +; GFX9-NEXT: s_mov_b32 s5, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 +; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -12787,18 +13201,22 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i32_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 1 -; GFX10-NEXT: s_mov_b32 s1, 2 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: 
v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -12816,19 +13234,23 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_v2i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v2i32_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 1 -; GFX11-NEXT: s_mov_b32 s1, 2 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; 
GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -12846,18 +13268,22 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v2i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v2i32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: 
v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -12879,19 +13305,25 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 5 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 3 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 3 -; GFX9-NEXT: s_mov_b32 s1, 4 -; GFX9-NEXT: s_mov_b32 s2, 5 +; GFX9-NEXT: s_mov_b32 s4, 3 +; GFX9-NEXT: s_mov_b32 s5, 4 +; GFX9-NEXT: s_mov_b32 s6, 5 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 4 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 4 +; GFX9-NEXT: v_readlane_b32 s30, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 5 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -12909,19 +13341,25 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: 
s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 5 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i32_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 3 -; GFX10-NEXT: s_mov_b32 s1, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_mov_b32 s2, 5 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 3 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 4 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: s_mov_b32 s6, 5 +; GFX10-NEXT: v_writelane_b32 v40, s30, 3 +; GFX10-NEXT: v_writelane_b32 v40, s31, 4 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 4 +; GFX10-NEXT: v_readlane_b32 s30, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 5 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -12939,20 +13377,26 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v3i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v3i32_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 3 -; GFX11-NEXT: s_mov_b32 s1, 4 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: s_mov_b32 s2, 5 +; GFX11-NEXT: v_writelane_b32 v40, s0, 5 +; 
GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 3 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 4 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: s_mov_b32 s6, 5 +; GFX11-NEXT: v_writelane_b32 v40, s30, 3 +; GFX11-NEXT: v_writelane_b32 v40, s31, 4 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 4 +; GFX11-NEXT: v_readlane_b32 s30, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 5 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -12970,19 +13414,25 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v3i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v3i32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 5 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 
external_void_func_v3i32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 4 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 5 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -13004,20 +13454,28 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 6 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i32_i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i32_i32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 3 
-; GFX9-NEXT: s_mov_b32 s1, 4 -; GFX9-NEXT: s_mov_b32 s2, 5 -; GFX9-NEXT: s_mov_b32 s3, 6 +; GFX9-NEXT: s_mov_b32 s4, 3 +; GFX9-NEXT: s_mov_b32 s5, 4 +; GFX9-NEXT: s_mov_b32 s6, 5 +; GFX9-NEXT: s_mov_b32 s7, 6 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 5 +; GFX9-NEXT: v_readlane_b32 s30, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -13035,20 +13493,28 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 6 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i32_i32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i32_i32_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 3 -; GFX10-NEXT: s_mov_b32 s1, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_mov_b32 s2, 5 -; GFX10-NEXT: s_mov_b32 s3, 6 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 3 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 4 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: s_mov_b32 s6, 5 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: s_mov_b32 s7, 6 +; GFX10-NEXT: v_writelane_b32 
v40, s30, 4 +; GFX10-NEXT: v_writelane_b32 v40, s31, 5 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -13066,21 +13532,29 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v3i32_i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v3i32_i32_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 3 -; GFX11-NEXT: s_mov_b32 s1, 4 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: s_mov_b32 s2, 5 -; GFX11-NEXT: s_mov_b32 s3, 6 +; GFX11-NEXT: v_writelane_b32 v40, s0, 6 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i32_i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i32_i32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 3 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 4 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: s_mov_b32 s6, 5 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_mov_b32 s7, 6 +; GFX11-NEXT: v_writelane_b32 v40, s30, 4 +; GFX11-NEXT: v_writelane_b32 v40, s31, 5 +; GFX11-NEXT: s_swappc_b64 
s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 5 +; GFX11-NEXT: v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -13098,20 +13572,28 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v3i32_i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v3i32_i32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 5 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i32_i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i32_i32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 
s5, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -13133,22 +13615,25 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 6 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s36 -; GFX9-NEXT: s_mov_b32 s1, s37 -; GFX9-NEXT: s_mov_b32 s2, s38 -; GFX9-NEXT: s_mov_b32 s3, s39 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_swappc_b64 
s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 5 +; GFX9-NEXT: v_readlane_b32 s30, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -13166,22 +13651,25 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 6 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s36 -; GFX10-NEXT: s_mov_b32 s1, s37 -; GFX10-NEXT: s_mov_b32 s2, s38 -; GFX10-NEXT: s_mov_b32 s3, s39 +; GFX10-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-NEXT: v_writelane_b32 v40, s31, 5 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-NEXT: v_readlane_b32 s30, v40, 
4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -13199,18 +13687,26 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 6 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i32_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s30, 4 +; GFX11-NEXT: v_writelane_b32 v40, s31, 5 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 5 +; GFX11-NEXT: v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; 
GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -13228,17 +13724,25 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; 
GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -13261,20 +13765,28 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 6 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 1 -; GFX9-NEXT: s_mov_b32 s1, 2 -; GFX9-NEXT: s_mov_b32 s2, 3 -; GFX9-NEXT: s_mov_b32 s3, 4 +; GFX9-NEXT: s_mov_b32 s4, 1 +; GFX9-NEXT: s_mov_b32 s5, 2 +; GFX9-NEXT: s_mov_b32 s6, 3 +; GFX9-NEXT: s_mov_b32 s7, 4 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 5 +; GFX9-NEXT: v_readlane_b32 s30, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -13292,20 +13804,28 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword 
v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 6 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 1 -; GFX10-NEXT: s_mov_b32 s1, 2 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_mov_b32 s2, 3 -; GFX10-NEXT: s_mov_b32 s3, 4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: s_mov_b32 s6, 3 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: s_mov_b32 s7, 4 +; GFX10-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-NEXT: v_writelane_b32 v40, s31, 5 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -13323,21 +13843,29 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, 
external_void_func_v4i32_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 1 -; GFX11-NEXT: s_mov_b32 s1, 2 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: s_mov_b32 s2, 3 -; GFX11-NEXT: s_mov_b32 s3, 4 +; GFX11-NEXT: v_writelane_b32 v40, s0, 6 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: s_mov_b32 s6, 3 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_mov_b32 s7, 4 +; GFX11-NEXT: v_writelane_b32 v40, s30, 4 +; GFX11-NEXT: v_writelane_b32 v40, s31, 5 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 5 +; GFX11-NEXT: v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -13355,20 +13883,28 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, 
external_void_func_v4i32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -13390,23 +13926,31 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill 
; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 3 +; GFX9-NEXT: v_writelane_b32 v40, s34, 7 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v40, s8, 4 +; GFX9-NEXT: v_writelane_b32 v40, s30, 5 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v5i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v5i32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 1 -; GFX9-NEXT: s_mov_b32 s1, 2 -; GFX9-NEXT: s_mov_b32 s2, 3 -; GFX9-NEXT: s_mov_b32 s3, 4 -; GFX9-NEXT: s_mov_b32 s4, 5 +; GFX9-NEXT: s_mov_b32 s4, 1 +; GFX9-NEXT: s_mov_b32 s5, 2 +; GFX9-NEXT: s_mov_b32 s6, 3 +; GFX9-NEXT: s_mov_b32 s7, 4 +; GFX9-NEXT: s_mov_b32 s8, 5 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 2 +; GFX9-NEXT: v_writelane_b32 v40, s31, 6 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 2 -; GFX9-NEXT: v_readlane_b32 s30, v40, 1 +; GFX9-NEXT: v_readlane_b32 s31, v40, 6 +; GFX9-NEXT: v_readlane_b32 s30, v40, 5 +; GFX9-NEXT: v_readlane_b32 s8, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 3 +; GFX9-NEXT: v_readlane_b32 s34, v40, 7 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -13424,23 +13968,31 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: v_writelane_b32 v40, s34, 7 ; GFX10-NEXT: s_mov_b32 s35, 
external_void_func_v5i32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v5i32_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 1 -; GFX10-NEXT: s_mov_b32 s1, 2 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s2, 3 -; GFX10-NEXT: s_mov_b32 s3, 4 -; GFX10-NEXT: s_mov_b32 s4, 5 -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: s_mov_b32 s6, 3 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: s_mov_b32 s7, 4 +; GFX10-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-NEXT: s_mov_b32 s8, 5 +; GFX10-NEXT: v_writelane_b32 v40, s30, 5 +; GFX10-NEXT: v_writelane_b32 v40, s31, 6 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-NEXT: v_readlane_b32 s31, v40, 6 +; GFX10-NEXT: v_readlane_b32 s30, v40, 5 +; GFX10-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 3 +; GFX10-NEXT: v_readlane_b32 s34, v40, 7 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -13458,24 +14010,32 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 3 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v5i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, 
external_void_func_v5i32_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 1 -; GFX11-NEXT: s_mov_b32 s1, 2 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s2, 3 -; GFX11-NEXT: s_mov_b32 s3, 4 -; GFX11-NEXT: s_mov_b32 s4, 5 +; GFX11-NEXT: v_writelane_b32 v40, s0, 7 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v5i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v5i32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 -; GFX11-NEXT: v_writelane_b32 v40, s31, 2 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: s_mov_b32 s6, 3 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_mov_b32 s7, 4 +; GFX11-NEXT: v_writelane_b32 v40, s8, 4 +; GFX11-NEXT: s_mov_b32 s8, 5 +; GFX11-NEXT: v_writelane_b32 v40, s30, 5 +; GFX11-NEXT: v_writelane_b32 v40, s31, 6 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 2 -; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s31, v40, 6 +; GFX11-NEXT: v_readlane_b32 s30, v40, 5 +; GFX11-NEXT: v_readlane_b32 s8, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 3 +; GFX11-NEXT: v_readlane_b32 s0, v40, 7 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -13493,23 +14053,31 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v5i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v5i32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 4 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 7 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v5i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v5i32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 6 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 7 ; 
GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -13531,36 +14099,35 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 6 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 10 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v40, s8, 4 +; GFX9-NEXT: v_writelane_b32 v40, s9, 5 +; GFX9-NEXT: v_writelane_b32 v40, s10, 6 +; GFX9-NEXT: v_writelane_b32 v40, s11, 7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx8 s[36:43], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 4 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 8 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s36 -; GFX9-NEXT: s_mov_b32 s1, s37 -; GFX9-NEXT: s_mov_b32 s2, s38 -; GFX9-NEXT: s_mov_b32 s3, s39 -; GFX9-NEXT: s_mov_b32 s4, s40 -; GFX9-NEXT: s_mov_b32 s5, s41 -; GFX9-NEXT: s_mov_b32 s6, s42 -; GFX9-NEXT: s_mov_b32 s7, s43 -; GFX9-NEXT: v_writelane_b32 v40, s31, 5 +; GFX9-NEXT: v_writelane_b32 v40, s31, 9 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 5 -; GFX9-NEXT: v_readlane_b32 s30, v40, 4 +; GFX9-NEXT: v_readlane_b32 s31, v40, 9 +; GFX9-NEXT: v_readlane_b32 s30, v40, 8 +; GFX9-NEXT: v_readlane_b32 s11, v40, 7 +; GFX9-NEXT: v_readlane_b32 s10, v40, 
6 +; GFX9-NEXT: v_readlane_b32 s9, v40, 5 +; GFX9-NEXT: v_readlane_b32 s8, v40, 4 ; GFX9-NEXT: v_readlane_b32 s7, v40, 3 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 6 +; GFX9-NEXT: v_readlane_b32 s34, v40, 10 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -13578,36 +14145,35 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 6 +; GFX10-NEXT: v_writelane_b32 v40, s34, 10 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-NEXT: v_writelane_b32 v40, s10, 6 +; GFX10-NEXT: v_writelane_b32 v40, s11, 7 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx8 s[36:43], s[34:35], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[34:35], 0x0 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo -; GFX10-NEXT: v_writelane_b32 v40, s30, 4 -; GFX10-NEXT: v_writelane_b32 v40, s31, 5 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s36 -; GFX10-NEXT: s_mov_b32 s1, s37 -; GFX10-NEXT: s_mov_b32 s2, s38 -; GFX10-NEXT: s_mov_b32 s3, s39 -; GFX10-NEXT: s_mov_b32 s4, s40 -; GFX10-NEXT: s_mov_b32 s5, s41 -; GFX10-NEXT: s_mov_b32 s6, s42 -; GFX10-NEXT: s_mov_b32 s7, s43 +; GFX10-NEXT: v_writelane_b32 v40, s30, 8 +; GFX10-NEXT: v_writelane_b32 v40, s31, 9 ; 
GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 5 -; GFX10-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-NEXT: v_readlane_b32 s31, v40, 9 +; GFX10-NEXT: v_readlane_b32 s30, v40, 8 +; GFX10-NEXT: v_readlane_b32 s11, v40, 7 +; GFX10-NEXT: v_readlane_b32 s10, v40, 6 +; GFX10-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-NEXT: v_readlane_b32 s8, v40, 4 ; GFX10-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 6 +; GFX10-NEXT: v_readlane_b32 s34, v40, 10 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -13625,28 +14191,36 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 6 +; GFX11-NEXT: v_writelane_b32 v40, s0, 10 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: v_writelane_b32 v40, s8, 4 +; GFX11-NEXT: v_writelane_b32 v40, s9, 5 +; GFX11-NEXT: v_writelane_b32 v40, s10, 6 +; GFX11-NEXT: v_writelane_b32 v40, s11, 7 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 4 -; GFX11-NEXT: v_writelane_b32 v40, s31, 5 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v8i32_inreg@abs32@hi +; 
GFX11-NEXT: s_mov_b32 s0, external_void_func_v8i32_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s30, 8 +; GFX11-NEXT: v_writelane_b32 v40, s31, 9 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 5 -; GFX11-NEXT: v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s31, v40, 9 +; GFX11-NEXT: v_readlane_b32 s30, v40, 8 +; GFX11-NEXT: v_readlane_b32 s11, v40, 7 +; GFX11-NEXT: v_readlane_b32 s10, v40, 6 +; GFX11-NEXT: v_readlane_b32 s9, v40, 5 +; GFX11-NEXT: v_readlane_b32 s8, v40, 4 ; GFX11-NEXT: v_readlane_b32 s7, v40, 3 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 6 +; GFX11-NEXT: v_readlane_b32 s0, v40, 10 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -13664,27 +14238,35 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 10 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s10, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s11, 7 ; 
GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-SCRATCH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v8i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v8i32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 9 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 8 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s11, v40, 7 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s10, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 10 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -13708,32 +14290,40 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 6 +; GFX9-NEXT: v_writelane_b32 v40, s34, 10 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; 
GFX9-NEXT: v_writelane_b32 v40, s30, 4 +; GFX9-NEXT: v_writelane_b32 v40, s8, 4 +; GFX9-NEXT: v_writelane_b32 v40, s9, 5 +; GFX9-NEXT: v_writelane_b32 v40, s10, 6 +; GFX9-NEXT: v_writelane_b32 v40, s11, 7 +; GFX9-NEXT: v_writelane_b32 v40, s30, 8 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 1 -; GFX9-NEXT: s_mov_b32 s1, 2 -; GFX9-NEXT: s_mov_b32 s2, 3 -; GFX9-NEXT: s_mov_b32 s3, 4 -; GFX9-NEXT: s_mov_b32 s4, 5 -; GFX9-NEXT: s_mov_b32 s5, 6 -; GFX9-NEXT: s_mov_b32 s6, 7 -; GFX9-NEXT: s_mov_b32 s7, 8 +; GFX9-NEXT: s_mov_b32 s4, 1 +; GFX9-NEXT: s_mov_b32 s5, 2 +; GFX9-NEXT: s_mov_b32 s6, 3 +; GFX9-NEXT: s_mov_b32 s7, 4 +; GFX9-NEXT: s_mov_b32 s8, 5 +; GFX9-NEXT: s_mov_b32 s9, 6 +; GFX9-NEXT: s_mov_b32 s10, 7 +; GFX9-NEXT: s_mov_b32 s11, 8 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 5 +; GFX9-NEXT: v_writelane_b32 v40, s31, 9 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 5 -; GFX9-NEXT: v_readlane_b32 s30, v40, 4 +; GFX9-NEXT: v_readlane_b32 s31, v40, 9 +; GFX9-NEXT: v_readlane_b32 s30, v40, 8 +; GFX9-NEXT: v_readlane_b32 s11, v40, 7 +; GFX9-NEXT: v_readlane_b32 s10, v40, 6 +; GFX9-NEXT: v_readlane_b32 s9, v40, 5 +; GFX9-NEXT: v_readlane_b32 s8, v40, 4 ; GFX9-NEXT: v_readlane_b32 s7, v40, 3 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 6 +; GFX9-NEXT: v_readlane_b32 s34, v40, 10 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -13751,32 +14341,40 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 
exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 6 +; GFX10-NEXT: v_writelane_b32 v40, s34, 10 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 1 -; GFX10-NEXT: s_mov_b32 s1, 2 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s2, 3 -; GFX10-NEXT: s_mov_b32 s3, 4 -; GFX10-NEXT: s_mov_b32 s4, 5 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 6 +; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-NEXT: s_mov_b32 s6, 7 +; GFX10-NEXT: s_mov_b32 s6, 3 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-NEXT: s_mov_b32 s7, 8 -; GFX10-NEXT: v_writelane_b32 v40, s30, 4 -; GFX10-NEXT: v_writelane_b32 v40, s31, 5 +; GFX10-NEXT: s_mov_b32 s7, 4 +; GFX10-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-NEXT: s_mov_b32 s8, 5 +; GFX10-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-NEXT: s_mov_b32 s9, 6 +; GFX10-NEXT: v_writelane_b32 v40, s10, 6 +; GFX10-NEXT: s_mov_b32 s10, 7 +; GFX10-NEXT: v_writelane_b32 v40, s11, 7 +; GFX10-NEXT: s_mov_b32 s11, 8 +; GFX10-NEXT: v_writelane_b32 v40, s30, 8 +; GFX10-NEXT: v_writelane_b32 v40, s31, 9 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 5 -; GFX10-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-NEXT: v_readlane_b32 s31, v40, 9 +; GFX10-NEXT: v_readlane_b32 s30, v40, 8 +; GFX10-NEXT: v_readlane_b32 s11, v40, 7 +; GFX10-NEXT: v_readlane_b32 s10, v40, 6 +; GFX10-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-NEXT: v_readlane_b32 s8, v40, 4 ; GFX10-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 6 +; GFX10-NEXT: v_readlane_b32 s34, v40, 10 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: 
buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -13794,33 +14392,41 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 6 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 1 -; GFX11-NEXT: s_mov_b32 s1, 2 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s2, 3 -; GFX11-NEXT: s_mov_b32 s3, 4 -; GFX11-NEXT: s_mov_b32 s4, 5 +; GFX11-NEXT: v_writelane_b32 v40, s0, 10 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v8i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v8i32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 6 +; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 -; GFX11-NEXT: s_mov_b32 s6, 7 +; GFX11-NEXT: s_mov_b32 s6, 3 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 -; GFX11-NEXT: s_mov_b32 s7, 8 -; GFX11-NEXT: v_writelane_b32 v40, s30, 4 -; GFX11-NEXT: v_writelane_b32 v40, s31, 5 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: s_mov_b32 s7, 4 +; GFX11-NEXT: v_writelane_b32 v40, s8, 4 +; GFX11-NEXT: s_mov_b32 s8, 5 +; GFX11-NEXT: v_writelane_b32 v40, s9, 5 +; GFX11-NEXT: s_mov_b32 s9, 6 +; GFX11-NEXT: v_writelane_b32 v40, s10, 6 +; GFX11-NEXT: s_mov_b32 s10, 7 +; GFX11-NEXT: v_writelane_b32 v40, s11, 7 +; GFX11-NEXT: s_mov_b32 s11, 8 +; GFX11-NEXT: v_writelane_b32 v40, s30, 8 +; GFX11-NEXT: v_writelane_b32 v40, s31, 9 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 5 -; GFX11-NEXT: 
v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s31, v40, 9 +; GFX11-NEXT: v_readlane_b32 s30, v40, 8 +; GFX11-NEXT: v_readlane_b32 s11, v40, 7 +; GFX11-NEXT: v_readlane_b32 s10, v40, 6 +; GFX11-NEXT: v_readlane_b32 s9, v40, 5 +; GFX11-NEXT: v_readlane_b32 s8, v40, 4 ; GFX11-NEXT: v_readlane_b32 s7, v40, 3 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 6 +; GFX11-NEXT: v_readlane_b32 s0, v40, 10 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -13838,32 +14444,40 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 4 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 10 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v8i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v8i32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 6 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 7 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 ; GFX10-SCRATCH-NEXT: 
v_writelane_b32 v40, s7, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 8 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s10, 6 +; GFX10-SCRATCH-NEXT: s_mov_b32 s10, 7 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s11, 7 +; GFX10-SCRATCH-NEXT: s_mov_b32 s11, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 9 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 8 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s11, v40, 7 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s10, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 10 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -13885,47 +14499,38 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 14 +; GFX9-NEXT: v_writelane_b32 v40, s34, 18 ; GFX9-NEXT: 
v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 ; GFX9-NEXT: v_writelane_b32 v40, s9, 5 ; GFX9-NEXT: v_writelane_b32 v40, s10, 6 ; GFX9-NEXT: v_writelane_b32 v40, s11, 7 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s12, 8 ; GFX9-NEXT: v_writelane_b32 v40, s13, 9 ; GFX9-NEXT: v_writelane_b32 v40, s14, 10 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s15, 11 -; GFX9-NEXT: v_writelane_b32 v40, s30, 12 +; GFX9-NEXT: v_writelane_b32 v40, s16, 12 +; GFX9-NEXT: v_writelane_b32 v40, s17, 13 +; GFX9-NEXT: v_writelane_b32 v40, s18, 14 +; GFX9-NEXT: v_writelane_b32 v40, s19, 15 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 16 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v16i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v16i32_inreg@abs32@lo -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s36 -; GFX9-NEXT: s_mov_b32 s1, s37 -; GFX9-NEXT: s_mov_b32 s2, s38 -; GFX9-NEXT: s_mov_b32 s3, s39 -; GFX9-NEXT: s_mov_b32 s4, s40 -; GFX9-NEXT: s_mov_b32 s5, s41 -; GFX9-NEXT: s_mov_b32 s6, s42 -; GFX9-NEXT: s_mov_b32 s7, s43 -; GFX9-NEXT: s_mov_b32 s8, s44 -; GFX9-NEXT: s_mov_b32 s9, s45 -; GFX9-NEXT: s_mov_b32 s10, s46 -; GFX9-NEXT: s_mov_b32 s11, s47 -; GFX9-NEXT: s_mov_b32 s12, s48 -; GFX9-NEXT: s_mov_b32 s13, s49 -; GFX9-NEXT: s_mov_b32 s14, s50 -; GFX9-NEXT: s_mov_b32 s15, s51 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 13 +; GFX9-NEXT: v_writelane_b32 v40, s31, 17 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 13 -; GFX9-NEXT: v_readlane_b32 s30, v40, 12 +; GFX9-NEXT: v_readlane_b32 s31, v40, 17 
+; GFX9-NEXT: v_readlane_b32 s30, v40, 16 +; GFX9-NEXT: v_readlane_b32 s19, v40, 15 +; GFX9-NEXT: v_readlane_b32 s18, v40, 14 +; GFX9-NEXT: v_readlane_b32 s17, v40, 13 +; GFX9-NEXT: v_readlane_b32 s16, v40, 12 ; GFX9-NEXT: v_readlane_b32 s15, v40, 11 ; GFX9-NEXT: v_readlane_b32 s14, v40, 10 ; GFX9-NEXT: v_readlane_b32 s13, v40, 9 @@ -13938,7 +14543,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 14 +; GFX9-NEXT: v_readlane_b32 s34, v40, 18 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -13956,47 +14561,38 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 14 +; GFX10-NEXT: v_writelane_b32 v40, s34, 18 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x0 -; GFX10-NEXT: s_mov_b32 s35, external_void_func_v16i32_inreg@abs32@hi -; GFX10-NEXT: s_mov_b32 s34, external_void_func_v16i32_inreg@abs32@lo ; GFX10-NEXT: v_writelane_b32 v40, s8, 4 ; GFX10-NEXT: v_writelane_b32 v40, s9, 5 ; GFX10-NEXT: v_writelane_b32 v40, s10, 6 ; GFX10-NEXT: v_writelane_b32 v40, s11, 7 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s36 -; GFX10-NEXT: s_mov_b32 s1, s37 -; GFX10-NEXT: s_mov_b32 s2, s38 -; GFX10-NEXT: s_mov_b32 s3, s39 ; GFX10-NEXT: v_writelane_b32 v40, s12, 8 -; 
GFX10-NEXT: s_mov_b32 s4, s40 -; GFX10-NEXT: s_mov_b32 s5, s41 -; GFX10-NEXT: s_mov_b32 s6, s42 -; GFX10-NEXT: s_mov_b32 s7, s43 ; GFX10-NEXT: v_writelane_b32 v40, s13, 9 -; GFX10-NEXT: s_mov_b32 s8, s44 -; GFX10-NEXT: s_mov_b32 s9, s45 -; GFX10-NEXT: s_mov_b32 s10, s46 -; GFX10-NEXT: s_mov_b32 s11, s47 ; GFX10-NEXT: v_writelane_b32 v40, s14, 10 -; GFX10-NEXT: s_mov_b32 s12, s48 -; GFX10-NEXT: s_mov_b32 s13, s49 -; GFX10-NEXT: s_mov_b32 s14, s50 ; GFX10-NEXT: v_writelane_b32 v40, s15, 11 -; GFX10-NEXT: s_mov_b32 s15, s51 -; GFX10-NEXT: v_writelane_b32 v40, s30, 12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 13 +; GFX10-NEXT: v_writelane_b32 v40, s16, 12 +; GFX10-NEXT: v_writelane_b32 v40, s17, 13 +; GFX10-NEXT: v_writelane_b32 v40, s18, 14 +; GFX10-NEXT: v_writelane_b32 v40, s19, 15 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0 +; GFX10-NEXT: s_mov_b32 s35, external_void_func_v16i32_inreg@abs32@hi +; GFX10-NEXT: s_mov_b32 s34, external_void_func_v16i32_inreg@abs32@lo +; GFX10-NEXT: v_writelane_b32 v40, s30, 16 +; GFX10-NEXT: v_writelane_b32 v40, s31, 17 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 13 -; GFX10-NEXT: v_readlane_b32 s30, v40, 12 +; GFX10-NEXT: v_readlane_b32 s31, v40, 17 +; GFX10-NEXT: v_readlane_b32 s30, v40, 16 +; GFX10-NEXT: v_readlane_b32 s19, v40, 15 +; GFX10-NEXT: v_readlane_b32 s18, v40, 14 +; GFX10-NEXT: v_readlane_b32 s17, v40, 13 +; GFX10-NEXT: v_readlane_b32 s16, v40, 12 ; GFX10-NEXT: v_readlane_b32 s15, v40, 11 ; GFX10-NEXT: v_readlane_b32 s14, v40, 10 ; GFX10-NEXT: v_readlane_b32 s13, v40, 9 @@ -14009,7 +14605,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 14 +; GFX10-NEXT: v_readlane_b32 s34, v40, 18 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: 
buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -14027,10 +14623,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 14 +; GFX11-NEXT: v_writelane_b32 v40, s0, 18 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v16i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v16i32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 @@ -14044,14 +14638,24 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s13, 9 ; GFX11-NEXT: v_writelane_b32 v40, s14, 10 ; GFX11-NEXT: v_writelane_b32 v40, s15, 11 +; GFX11-NEXT: v_writelane_b32 v40, s16, 12 +; GFX11-NEXT: v_writelane_b32 v40, s17, 13 +; GFX11-NEXT: v_writelane_b32 v40, s18, 14 +; GFX11-NEXT: v_writelane_b32 v40, s19, 15 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b512 s[0:15], s[0:1], 0x0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 13 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v16i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v16i32_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s30, 16 +; GFX11-NEXT: v_writelane_b32 v40, s31, 17 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 13 -; GFX11-NEXT: v_readlane_b32 s30, v40, 12 +; GFX11-NEXT: v_readlane_b32 s31, v40, 17 +; GFX11-NEXT: v_readlane_b32 s30, v40, 16 +; GFX11-NEXT: v_readlane_b32 s19, v40, 15 +; GFX11-NEXT: v_readlane_b32 s18, v40, 14 +; GFX11-NEXT: 
v_readlane_b32 s17, v40, 13 +; GFX11-NEXT: v_readlane_b32 s16, v40, 12 ; GFX11-NEXT: v_readlane_b32 s15, v40, 11 ; GFX11-NEXT: v_readlane_b32 s14, v40, 10 ; GFX11-NEXT: v_readlane_b32 s13, v40, 9 @@ -14064,7 +14668,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 14 +; GFX11-NEXT: v_readlane_b32 s0, v40, 18 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -14082,10 +14686,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 14 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 18 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v16i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v16i32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 @@ -14099,13 +14701,23 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s13, 9 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s14, 10 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s15, 11 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s16, 12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s17, 13 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s18, 14 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s19, 15 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[0:15], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 12 -; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 13 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 13 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 12 +; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v16i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v16i32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 17 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 17 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 16 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s19, v40, 15 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s18, v40, 14 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s17, v40, 13 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s16, v40, 12 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s15, v40, 11 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s14, v40, 10 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s13, v40, 9 @@ -14118,7 +14730,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 14 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 18 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -14159,47 +14771,49 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX9-NEXT: v_writelane_b32 v40, s17, 13 ; GFX9-NEXT: v_writelane_b32 v40, s18, 14 ; GFX9-NEXT: v_writelane_b32 v40, s19, 15 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s20, 16 ; GFX9-NEXT: v_writelane_b32 v40, s21, 17 ; GFX9-NEXT: v_writelane_b32 v40, s22, 18 ; GFX9-NEXT: v_writelane_b32 v40, s23, 19 ; 
GFX9-NEXT: v_writelane_b32 v40, s24, 20 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x40 +; GFX9-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s25, 21 ; GFX9-NEXT: v_writelane_b32 v40, s26, 22 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s27, 23 -; GFX9-NEXT: v_writelane_b32 v40, s28, 24 -; GFX9-NEXT: v_writelane_b32 v40, s29, 25 -; GFX9-NEXT: v_writelane_b32 v40, s30, 26 -; GFX9-NEXT: v_writelane_b32 v40, s31, 27 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 -; GFX9-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_mov_b32 s53, external_void_func_v32i32_inreg@abs32@hi -; GFX9-NEXT: s_mov_b32 s52, external_void_func_v32i32_inreg@abs32@lo +; GFX9-NEXT: v_writelane_b32 v40, s28, 24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s30 -; GFX9-NEXT: v_mov_b32_e32 v1, s31 +; GFX9-NEXT: v_mov_b32_e32 v0, s46 +; GFX9-NEXT: v_writelane_b32 v40, s29, 25 +; GFX9-NEXT: v_mov_b32_e32 v1, s47 +; GFX9-NEXT: v_mov_b32_e32 v2, s48 +; GFX9-NEXT: v_mov_b32_e32 v3, s49 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_mov_b32 s0, s36 -; GFX9-NEXT: s_mov_b32 s1, s37 -; GFX9-NEXT: s_mov_b32 s2, s38 -; GFX9-NEXT: s_mov_b32 s3, s39 -; GFX9-NEXT: s_mov_b32 s4, s40 -; GFX9-NEXT: s_mov_b32 s5, s41 -; GFX9-NEXT: s_mov_b32 s6, s42 -; GFX9-NEXT: s_mov_b32 s7, s43 -; GFX9-NEXT: s_mov_b32 s8, s44 -; GFX9-NEXT: s_mov_b32 s9, s45 -; GFX9-NEXT: s_mov_b32 s10, s46 -; GFX9-NEXT: s_mov_b32 s11, s47 -; GFX9-NEXT: s_mov_b32 s12, s48 -; GFX9-NEXT: s_mov_b32 s13, s49 -; GFX9-NEXT: s_mov_b32 s14, s50 -; GFX9-NEXT: s_mov_b32 s15, s51 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[52:53] +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 +; 
GFX9-NEXT: v_mov_b32_e32 v0, s50 +; GFX9-NEXT: v_writelane_b32 v40, s30, 26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, s51 +; GFX9-NEXT: s_mov_b32 s35, external_void_func_v32i32_inreg@abs32@hi +; GFX9-NEXT: s_mov_b32 s34, external_void_func_v32i32_inreg@abs32@lo +; GFX9-NEXT: s_mov_b32 s20, s36 +; GFX9-NEXT: s_mov_b32 s21, s37 +; GFX9-NEXT: s_mov_b32 s22, s38 +; GFX9-NEXT: s_mov_b32 s23, s39 +; GFX9-NEXT: s_mov_b32 s24, s40 +; GFX9-NEXT: s_mov_b32 s25, s41 +; GFX9-NEXT: s_mov_b32 s26, s42 +; GFX9-NEXT: s_mov_b32 s27, s43 +; GFX9-NEXT: s_mov_b32 s28, s44 +; GFX9-NEXT: s_mov_b32 s29, s45 +; GFX9-NEXT: v_writelane_b32 v40, s31, 27 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 27 ; GFX9-NEXT: v_readlane_b32 s30, v40, 26 ; GFX9-NEXT: v_readlane_b32 s29, v40, 25 @@ -14265,46 +14879,47 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s17, 13 ; GFX10-NEXT: v_writelane_b32 v40, s18, 14 ; GFX10-NEXT: v_writelane_b32 v40, s19, 15 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x40 +; GFX10-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0 +; GFX10-NEXT: s_mov_b32 s35, external_void_func_v32i32_inreg@abs32@hi +; GFX10-NEXT: s_mov_b32 s34, external_void_func_v32i32_inreg@abs32@lo ; GFX10-NEXT: v_writelane_b32 v40, s20, 16 ; GFX10-NEXT: v_writelane_b32 v40, s21, 17 ; GFX10-NEXT: v_writelane_b32 v40, s22, 18 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s46 ; GFX10-NEXT: v_writelane_b32 v40, s23, 19 +; GFX10-NEXT: v_mov_b32_e32 v1, s47 +; GFX10-NEXT: v_mov_b32_e32 v2, s48 +; GFX10-NEXT: v_mov_b32_e32 v3, s49 +; GFX10-NEXT: s_mov_b32 s20, s36 ; GFX10-NEXT: v_writelane_b32 v40, s24, 20 +; GFX10-NEXT: s_mov_b32 s21, s37 +; GFX10-NEXT: s_mov_b32 s22, s38 +; GFX10-NEXT: s_mov_b32 s23, 
s39 +; GFX10-NEXT: s_mov_b32 s24, s40 ; GFX10-NEXT: v_writelane_b32 v40, s25, 21 +; GFX10-NEXT: s_mov_b32 s25, s41 +; GFX10-NEXT: v_mov_b32_e32 v4, s50 +; GFX10-NEXT: v_mov_b32_e32 v5, s51 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 +; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 +; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; GFX10-NEXT: v_writelane_b32 v40, s26, 22 +; GFX10-NEXT: s_mov_b32 s26, s42 ; GFX10-NEXT: v_writelane_b32 v40, s27, 23 +; GFX10-NEXT: s_mov_b32 s27, s43 ; GFX10-NEXT: v_writelane_b32 v40, s28, 24 +; GFX10-NEXT: s_mov_b32 s28, s44 ; GFX10-NEXT: v_writelane_b32 v40, s29, 25 +; GFX10-NEXT: s_mov_b32 s29, s45 ; GFX10-NEXT: v_writelane_b32 v40, s30, 26 ; GFX10-NEXT: v_writelane_b32 v40, s31, 27 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 -; GFX10-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x0 -; GFX10-NEXT: s_mov_b32 s35, external_void_func_v32i32_inreg@abs32@hi -; GFX10-NEXT: s_mov_b32 s34, external_void_func_v32i32_inreg@abs32@lo -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s30 -; GFX10-NEXT: v_mov_b32_e32 v1, s31 -; GFX10-NEXT: s_mov_b32 s4, s40 -; GFX10-NEXT: s_mov_b32 s5, s41 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 -; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; GFX10-NEXT: s_mov_b32 s6, s42 -; GFX10-NEXT: s_mov_b32 s7, s43 -; GFX10-NEXT: s_mov_b32 s8, s44 -; GFX10-NEXT: s_mov_b32 s9, s45 -; GFX10-NEXT: s_mov_b32 s10, s46 -; GFX10-NEXT: s_mov_b32 s11, s47 -; GFX10-NEXT: s_mov_b32 s12, s48 -; GFX10-NEXT: s_mov_b32 s13, s49 -; GFX10-NEXT: s_mov_b32 s14, s50 -; GFX10-NEXT: s_mov_b32 s15, s51 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s0, s36 -; GFX10-NEXT: 
s_mov_b32 s1, s37 -; GFX10-NEXT: s_mov_b32 s2, s38 -; GFX10-NEXT: s_mov_b32 s3, s39 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 27 ; GFX10-NEXT: v_readlane_b32 s30, v40, 26 @@ -14355,8 +14970,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s0, 28 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v32i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v32i32_inreg@abs32@lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s2, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -14373,26 +14988,42 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s17, 13 ; GFX11-NEXT: v_writelane_b32 v40, s18, 14 ; GFX11-NEXT: v_writelane_b32 v40, s19, 15 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b512 s[36:51], s[0:1], 0x40 +; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v32i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v32i32_inreg@abs32@lo ; GFX11-NEXT: v_writelane_b32 v40, s20, 16 ; GFX11-NEXT: v_writelane_b32 v40, s21, 17 ; GFX11-NEXT: v_writelane_b32 v40, s22, 18 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v4, s50 :: v_dual_mov_b32 v5, s51 ; GFX11-NEXT: v_writelane_b32 v40, s23, 19 +; GFX11-NEXT: v_dual_mov_b32 v0, s46 :: v_dual_mov_b32 v1, s47 +; GFX11-NEXT: v_dual_mov_b32 v2, s48 :: v_dual_mov_b32 v3, s49 ; GFX11-NEXT: v_writelane_b32 v40, s24, 20 +; GFX11-NEXT: s_mov_b32 s20, s36 +; GFX11-NEXT: s_mov_b32 s21, s37 +; GFX11-NEXT: s_mov_b32 s22, s38 +; GFX11-NEXT: s_mov_b32 s23, s39 ; GFX11-NEXT: v_writelane_b32 v40, s25, 21 +; GFX11-NEXT: s_mov_b32 s24, s40 +; GFX11-NEXT: s_mov_b32 
s25, s41 +; GFX11-NEXT: scratch_store_b64 off, v[4:5], s2 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: v_writelane_b32 v40, s26, 22 +; GFX11-NEXT: s_mov_b32 s26, s42 ; GFX11-NEXT: v_writelane_b32 v40, s27, 23 +; GFX11-NEXT: s_mov_b32 s27, s43 ; GFX11-NEXT: v_writelane_b32 v40, s28, 24 +; GFX11-NEXT: s_mov_b32 s28, s44 ; GFX11-NEXT: v_writelane_b32 v40, s29, 25 +; GFX11-NEXT: s_mov_b32 s29, s45 ; GFX11-NEXT: v_writelane_b32 v40, s30, 26 ; GFX11-NEXT: v_writelane_b32 v40, s31, 27 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b512 s[16:31], s[0:1], 0x40 -; GFX11-NEXT: s_load_b512 s[0:15], s[0:1], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s30 :: v_dual_mov_b32 v1, s31 -; GFX11-NEXT: scratch_store_b64 off, v[0:1], s32 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 27 ; GFX11-NEXT: v_readlane_b32 s30, v40, 26 ; GFX11-NEXT: v_readlane_b32 s29, v40, 25 @@ -14440,8 +15071,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 28 -; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: s_add_i32 s2, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -14458,29 +15090,44 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s17, 13 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s18, 14 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s19, 15 +; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SCRATCH-NEXT: s_clause 
0x1 +; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x40 +; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v32i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v32i32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s20, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s21, 17 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s22, 18 +; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, s50 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s23, 19 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, s51 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s46 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s47 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, s48 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s24, 20 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, s49 +; GFX10-SCRATCH-NEXT: s_mov_b32 s20, s36 +; GFX10-SCRATCH-NEXT: s_mov_b32 s21, s37 +; GFX10-SCRATCH-NEXT: s_mov_b32 s22, s38 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s25, 21 +; GFX10-SCRATCH-NEXT: s_mov_b32 s23, s39 +; GFX10-SCRATCH-NEXT: s_mov_b32 s24, s40 +; GFX10-SCRATCH-NEXT: s_mov_b32 s25, s41 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[4:5], s2 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s26, 22 +; GFX10-SCRATCH-NEXT: s_mov_b32 s26, s42 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s27, 23 +; GFX10-SCRATCH-NEXT: s_mov_b32 s27, s43 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s28, 24 +; GFX10-SCRATCH-NEXT: s_mov_b32 s28, s44 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s29, 25 +; GFX10-SCRATCH-NEXT: s_mov_b32 s29, s45 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 26 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 27 -; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 -; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, 
external_void_func_v32i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v32i32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s30 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s31 -; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[0:1], s32 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 27 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 26 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s29, v40, 25 @@ -14549,53 +15196,55 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX9-NEXT: v_writelane_b32 v40, s16, 12 ; GFX9-NEXT: v_writelane_b32 v40, s17, 13 ; GFX9-NEXT: v_writelane_b32 v40, s18, 14 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s19, 15 ; GFX9-NEXT: v_writelane_b32 v40, s20, 16 ; GFX9-NEXT: v_writelane_b32 v40, s21, 17 ; GFX9-NEXT: v_writelane_b32 v40, s22, 18 ; GFX9-NEXT: v_writelane_b32 v40, s23, 19 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s52, s[34:35], 0x0 +; GFX9-NEXT: ; kill: killed $sgpr34_sgpr35 +; GFX9-NEXT: ; kill: killed $sgpr34_sgpr35 +; GFX9-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x40 +; GFX9-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s24, 20 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s25, 21 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s26, 22 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s52 ; GFX9-NEXT: v_writelane_b32 v40, s27, 23 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; GFX9-NEXT: v_mov_b32_e32 v0, s46 ; GFX9-NEXT: v_writelane_b32 v40, s28, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s47 +; GFX9-NEXT: v_mov_b32_e32 v2, s48 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:4 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, s49 ; GFX9-NEXT: v_writelane_b32 v40, s29, 25 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s52, s[34:35], 0x0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; GFX9-NEXT: v_mov_b32_e32 v0, s50 ; GFX9-NEXT: v_writelane_b32 v40, s30, 26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, s51 +; GFX9-NEXT: s_mov_b32 s35, external_void_func_v32i32_i32_inreg@abs32@hi +; GFX9-NEXT: s_mov_b32 s34, external_void_func_v32i32_i32_inreg@abs32@lo +; GFX9-NEXT: s_mov_b32 s20, s36 +; GFX9-NEXT: s_mov_b32 s21, s37 +; GFX9-NEXT: s_mov_b32 s22, s38 +; GFX9-NEXT: s_mov_b32 s23, s39 +; GFX9-NEXT: s_mov_b32 s24, s40 +; GFX9-NEXT: s_mov_b32 s25, s41 +; GFX9-NEXT: s_mov_b32 s26, s42 +; GFX9-NEXT: s_mov_b32 s27, s43 +; GFX9-NEXT: s_mov_b32 s28, s44 +; GFX9-NEXT: s_mov_b32 s29, s45 ; GFX9-NEXT: v_writelane_b32 v40, s31, 27 -; GFX9-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 -; GFX9-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x0 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s52 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 -; GFX9-NEXT: v_mov_b32_e32 v0, s30 -; GFX9-NEXT: s_mov_b32 s53, external_void_func_v32i32_i32_inreg@abs32@hi -; GFX9-NEXT: v_mov_b32_e32 v1, s31 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_mov_b32 s52, external_void_func_v32i32_i32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, s36 -; GFX9-NEXT: s_mov_b32 s1, s37 -; GFX9-NEXT: s_mov_b32 s2, s38 -; GFX9-NEXT: s_mov_b32 s3, s39 -; GFX9-NEXT: s_mov_b32 s4, s40 -; GFX9-NEXT: s_mov_b32 s5, s41 -; GFX9-NEXT: s_mov_b32 s6, s42 -; GFX9-NEXT: s_mov_b32 s7, s43 -; GFX9-NEXT: s_mov_b32 s8, s44 -; GFX9-NEXT: s_mov_b32 s9, s45 -; GFX9-NEXT: s_mov_b32 s10, s46 -; GFX9-NEXT: s_mov_b32 s11, s47 -; 
GFX9-NEXT: s_mov_b32 s12, s48 -; GFX9-NEXT: s_mov_b32 s13, s49 -; GFX9-NEXT: s_mov_b32 s14, s50 -; GFX9-NEXT: s_mov_b32 s15, s51 -; GFX9-NEXT: ; kill: killed $sgpr34_sgpr35 -; GFX9-NEXT: ; kill: killed $sgpr34_sgpr35 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[52:53] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 27 ; GFX9-NEXT: v_readlane_b32 s30, v40, 26 ; GFX9-NEXT: v_readlane_b32 s29, v40, 25 @@ -14661,51 +15310,52 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX10-NEXT: v_writelane_b32 v40, s17, 13 ; GFX10-NEXT: v_writelane_b32 v40, s18, 14 ; GFX10-NEXT: v_writelane_b32 v40, s19, 15 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_load_dword s52, s[34:35], 0x0 +; GFX10-NEXT: ; meta instruction +; GFX10-NEXT: ; meta instruction +; GFX10-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x40 +; GFX10-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0 +; GFX10-NEXT: s_mov_b32 s35, external_void_func_v32i32_i32_inreg@abs32@hi +; GFX10-NEXT: s_mov_b32 s34, external_void_func_v32i32_i32_inreg@abs32@lo ; GFX10-NEXT: v_writelane_b32 v40, s20, 16 ; GFX10-NEXT: v_writelane_b32 v40, s21, 17 ; GFX10-NEXT: v_writelane_b32 v40, s22, 18 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s52 +; GFX10-NEXT: v_mov_b32_e32 v1, s47 ; GFX10-NEXT: v_writelane_b32 v40, s23, 19 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; GFX10-NEXT: v_mov_b32_e32 v0, s46 +; GFX10-NEXT: v_mov_b32_e32 v2, s48 +; GFX10-NEXT: v_mov_b32_e32 v3, s49 ; GFX10-NEXT: v_writelane_b32 v40, s24, 20 +; GFX10-NEXT: s_mov_b32 s20, s36 +; GFX10-NEXT: s_mov_b32 s21, s37 +; GFX10-NEXT: s_mov_b32 s22, s38 +; GFX10-NEXT: s_mov_b32 s23, s39 ; GFX10-NEXT: v_writelane_b32 v40, s25, 21 +; GFX10-NEXT: s_mov_b32 s24, s40 +; GFX10-NEXT: s_mov_b32 s25, s41 +; GFX10-NEXT: v_mov_b32_e32 v4, s50 +; GFX10-NEXT: v_mov_b32_e32 v5, s51 ; 
GFX10-NEXT: v_writelane_b32 v40, s26, 22 +; GFX10-NEXT: s_mov_b32 s26, s42 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 +; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 +; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; GFX10-NEXT: v_writelane_b32 v40, s27, 23 +; GFX10-NEXT: s_mov_b32 s27, s43 ; GFX10-NEXT: v_writelane_b32 v40, s28, 24 +; GFX10-NEXT: s_mov_b32 s28, s44 ; GFX10-NEXT: v_writelane_b32 v40, s29, 25 +; GFX10-NEXT: s_mov_b32 s29, s45 ; GFX10-NEXT: v_writelane_b32 v40, s30, 26 ; GFX10-NEXT: v_writelane_b32 v40, s31, 27 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s52, s[34:35], 0x0 -; GFX10-NEXT: ; meta instruction -; GFX10-NEXT: ; meta instruction -; GFX10-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 -; GFX10-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x0 -; GFX10-NEXT: s_mov_b32 s35, external_void_func_v32i32_i32_inreg@abs32@hi -; GFX10-NEXT: s_mov_b32 s34, external_void_func_v32i32_i32_inreg@abs32@lo -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s52 -; GFX10-NEXT: v_mov_b32_e32 v1, s30 -; GFX10-NEXT: v_mov_b32_e32 v2, s31 -; GFX10-NEXT: s_mov_b32 s4, s40 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 -; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 -; GFX10-NEXT: s_mov_b32 s5, s41 -; GFX10-NEXT: s_mov_b32 s6, s42 -; GFX10-NEXT: s_mov_b32 s7, s43 -; GFX10-NEXT: s_mov_b32 s8, s44 -; GFX10-NEXT: s_mov_b32 s9, s45 -; GFX10-NEXT: s_mov_b32 s10, s46 -; GFX10-NEXT: s_mov_b32 s11, s47 -; GFX10-NEXT: s_mov_b32 s12, s48 -; GFX10-NEXT: s_mov_b32 s13, s49 -; GFX10-NEXT: s_mov_b32 s14, s50 -; GFX10-NEXT: s_mov_b32 s15, s51 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: 
s_mov_b32 s0, s36 -; GFX10-NEXT: s_mov_b32 s1, s37 -; GFX10-NEXT: s_mov_b32 s2, s38 -; GFX10-NEXT: s_mov_b32 s3, s39 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 27 ; GFX10-NEXT: v_readlane_b32 s30, v40, 26 @@ -14756,8 +15406,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX11-NEXT: v_writelane_b32 v40, s0, 28 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v32i32_i32_inreg@abs32@hi -; GFX11-NEXT: s_add_i32 s36, s32, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s3, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -14774,30 +15424,46 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX11-NEXT: v_writelane_b32 v40, s17, 13 ; GFX11-NEXT: v_writelane_b32 v40, s18, 14 ; GFX11-NEXT: v_writelane_b32 v40, s19, 15 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-NEXT: s_load_b512 s[36:51], s[0:1], 0x40 +; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v32i32_i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v32i32_i32_inreg@abs32@lo ; GFX11-NEXT: v_writelane_b32 v40, s20, 16 ; GFX11-NEXT: v_writelane_b32 v40, s21, 17 ; GFX11-NEXT: v_writelane_b32 v40, s22, 18 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v5, s51 ; GFX11-NEXT: v_writelane_b32 v40, s23, 19 +; GFX11-NEXT: v_dual_mov_b32 v4, s50 :: v_dual_mov_b32 v1, s47 +; GFX11-NEXT: v_dual_mov_b32 v0, s46 :: v_dual_mov_b32 v3, s49 ; GFX11-NEXT: v_writelane_b32 v40, s24, 20 +; GFX11-NEXT: v_mov_b32_e32 v2, s48 +; GFX11-NEXT: s_add_i32 s2, s32, 24 +; GFX11-NEXT: s_mov_b32 s20, s36 +; GFX11-NEXT: s_mov_b32 s21, s37 ; GFX11-NEXT: v_writelane_b32 
v40, s25, 21 +; GFX11-NEXT: s_mov_b32 s22, s38 +; GFX11-NEXT: s_mov_b32 s23, s39 +; GFX11-NEXT: s_mov_b32 s24, s40 +; GFX11-NEXT: s_mov_b32 s25, s41 ; GFX11-NEXT: v_writelane_b32 v40, s26, 22 +; GFX11-NEXT: s_mov_b32 s26, s42 +; GFX11-NEXT: scratch_store_b32 off, v6, s2 +; GFX11-NEXT: scratch_store_b64 off, v[4:5], s3 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: v_writelane_b32 v40, s27, 23 +; GFX11-NEXT: s_mov_b32 s27, s43 ; GFX11-NEXT: v_writelane_b32 v40, s28, 24 +; GFX11-NEXT: s_mov_b32 s28, s44 ; GFX11-NEXT: v_writelane_b32 v40, s29, 25 +; GFX11-NEXT: s_mov_b32 s29, s45 ; GFX11-NEXT: v_writelane_b32 v40, s30, 26 ; GFX11-NEXT: v_writelane_b32 v40, s31, 27 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s34, s[0:1], 0x0 -; GFX11-NEXT: s_load_b512 s[16:31], s[0:1], 0x40 -; GFX11-NEXT: s_load_b512 s[0:15], s[0:1], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, s34 :: v_dual_mov_b32 v1, s31 -; GFX11-NEXT: v_mov_b32_e32 v0, s30 -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v32i32_i32_inreg@abs32@lo -; GFX11-NEXT: scratch_store_b32 off, v2, s36 -; GFX11-NEXT: scratch_store_b64 off, v[0:1], s32 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 27 ; GFX11-NEXT: v_readlane_b32 s30, v40, 26 ; GFX11-NEXT: v_readlane_b32 s29, v40, 25 @@ -14845,17 +15511,13 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 28 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_load_dword s36, s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: s_add_i32 
s3, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, s36 -; GFX10-SCRATCH-NEXT: s_add_i32 s36, s32, 8 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s10, 6 @@ -14868,29 +15530,50 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s17, 13 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s18, 14 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s19, 15 +; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SCRATCH-NEXT: s_clause 0x2 +; GFX10-SCRATCH-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: ; meta instruction +; GFX10-SCRATCH-NEXT: ; meta instruction +; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x40 +; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v32i32_i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v32i32_i32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s20, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s21, 17 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s22, 18 +; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-SCRATCH-NEXT: s_add_i32 s2, s32, 24 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, s50 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s23, 19 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, s51 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s46 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s47 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, s48 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s24, 20 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, s49 +; GFX10-SCRATCH-NEXT: s_mov_b32 s20, s36 +; GFX10-SCRATCH-NEXT: s_mov_b32 s21, s37 +; 
GFX10-SCRATCH-NEXT: s_mov_b32 s22, s38 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s25, 21 +; GFX10-SCRATCH-NEXT: s_mov_b32 s23, s39 +; GFX10-SCRATCH-NEXT: s_mov_b32 s24, s40 +; GFX10-SCRATCH-NEXT: s_mov_b32 s25, s41 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v6, s2 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[4:5], s3 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s26, 22 +; GFX10-SCRATCH-NEXT: s_mov_b32 s26, s42 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s27, 23 +; GFX10-SCRATCH-NEXT: s_mov_b32 s27, s43 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s28, 24 +; GFX10-SCRATCH-NEXT: s_mov_b32 s28, s44 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s29, 25 +; GFX10-SCRATCH-NEXT: s_mov_b32 s29, s45 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 26 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 27 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 -; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v32i32_i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v32i32_i32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s30 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s31 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v2, s36 -; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[0:1], s32 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 27 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 26 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s29, v40, 25 @@ -16714,7 +17397,6 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg) ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_bf16@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_bf16@abs32@lo -; GFX9-NEXT: s_mov_b32 
s0, s4 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -16741,7 +17423,6 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg) ; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_bf16@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_bf16@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 @@ -16761,18 +17442,18 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg) ; GFX11-LABEL: test_call_external_void_func_bf16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s1, s33 +; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s2, -1 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_writelane_b32 v40, s1, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_bf16@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_bf16@abs32@lo +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_bf16@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_bf16@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 @@ -16788,19 +17469,19 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg) ; GFX10-SCRATCH-LABEL: test_call_external_void_func_bf16_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s1, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_bf16@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_bf16@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_bf16@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_bf16@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 @@ -16830,7 +17511,6 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v1bf16@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v1bf16@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -16857,7 +17537,6 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg ; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v1bf16@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v1bf16@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: 
s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 @@ -16877,18 +17556,18 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg ; GFX11-LABEL: test_call_external_void_func_v1bf16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s1, s33 +; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s2, -1 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_writelane_b32 v40, s1, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_v1bf16@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v1bf16@abs32@lo +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v1bf16@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v1bf16@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 @@ -16904,19 +17583,19 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v1bf16_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 -; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s1, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v1bf16@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v1bf16@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v1bf16@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v1bf16@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 @@ -16946,7 +17625,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2bf16@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2bf16@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -16973,7 +17651,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg ; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2bf16@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2bf16@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 @@ -16993,18 +17670,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg ; GFX11-LABEL: test_call_external_void_func_v2bf16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s1, s33 +; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: 
s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s2, -1 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_writelane_b32 v40, s1, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_v2bf16@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v2bf16@abs32@lo +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2bf16@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2bf16@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 @@ -17020,19 +17697,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2bf16_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s1, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v2bf16@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v2bf16@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2bf16@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 
external_void_func_v2bf16@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 @@ -17062,8 +17739,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3bf16@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3bf16@abs32@lo -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -17090,10 +17765,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre ; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3bf16@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3bf16@abs32@lo -; GFX10-NEXT: s_mov_b32 s1, s5 -; GFX10-NEXT: s_mov_b32 s0, s4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 @@ -17111,18 +17784,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre ; GFX11-LABEL: test_call_external_void_func_v3bf16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s3, -1 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s3 
-; GFX11-NEXT: v_writelane_b32 v40, s2, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_v3bf16@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v3bf16@abs32@lo +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3bf16@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3bf16@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 @@ -17138,19 +17811,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3bf16_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s3, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s2, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v3bf16@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v3bf16@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3bf16@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3bf16@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] +; 
GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 @@ -17180,8 +17853,6 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4bf16@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4bf16@abs32@lo -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -17208,10 +17879,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre ; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4bf16@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4bf16@abs32@lo -; GFX10-NEXT: s_mov_b32 s1, s5 -; GFX10-NEXT: s_mov_b32 s0, s4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 @@ -17229,18 +17898,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre ; GFX11-LABEL: test_call_external_void_func_v4bf16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s3, -1 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s3 -; GFX11-NEXT: v_writelane_b32 v40, s2, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_v4bf16@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v4bf16@abs32@lo +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: 
v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4bf16@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4bf16@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 @@ -17256,19 +17925,19 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4bf16_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s3, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s2, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v4bf16@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v4bf16@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4bf16@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4bf16@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 @@ -17298,10 +17967,6 @@ 
define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v8bf16@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v8bf16@abs32@lo -; GFX9-NEXT: s_mov_b32 s3, s7 -; GFX9-NEXT: s_mov_b32 s2, s6 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -17328,12 +17993,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre ; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v8bf16@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v8bf16@abs32@lo -; GFX10-NEXT: s_mov_b32 s3, s7 -; GFX10-NEXT: s_mov_b32 s2, s6 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_mov_b32 s1, s5 -; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 @@ -17351,18 +18012,18 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre ; GFX11-LABEL: test_call_external_void_func_v8bf16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s34, s33 +; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s35, -1 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s35 -; GFX11-NEXT: v_writelane_b32 v40, s34, 2 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v8bf16@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v8bf16@abs32@lo +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v8bf16@abs32@hi +; 
GFX11-NEXT: s_mov_b32 s0, external_void_func_v8bf16@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 @@ -17378,19 +18039,19 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8bf16_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s34, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v8bf16@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v8bf16@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v8bf16@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v8bf16@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 @@ -17416,32 +18077,16 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in ; 
GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 6 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v40, s6, 2 -; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: v_writelane_b32 v40, s30, 4 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v16bf16@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v16bf16@abs32@lo -; GFX9-NEXT: s_mov_b32 s3, s7 -; GFX9-NEXT: s_mov_b32 s2, s6 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s4, s8 -; GFX9-NEXT: s_mov_b32 s5, s9 -; GFX9-NEXT: s_mov_b32 s6, s10 -; GFX9-NEXT: s_mov_b32 s7, s11 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 5 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 5 -; GFX9-NEXT: v_readlane_b32 s30, v40, 4 -; GFX9-NEXT: v_readlane_b32 s7, v40, 3 -; GFX9-NEXT: v_readlane_b32 s6, v40, 2 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 6 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -17459,32 +18104,16 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 6 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, 
external_void_func_v16bf16@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v16bf16@abs32@lo -; GFX10-NEXT: s_mov_b32 s3, s7 -; GFX10-NEXT: s_mov_b32 s2, s6 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s1, s5 -; GFX10-NEXT: s_mov_b32 s0, s4 -; GFX10-NEXT: s_mov_b32 s4, s8 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, s9 -; GFX10-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-NEXT: s_mov_b32 s6, s10 -; GFX10-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-NEXT: s_mov_b32 s7, s11 -; GFX10-NEXT: v_writelane_b32 v40, s30, 4 -; GFX10-NEXT: v_writelane_b32 v40, s31, 5 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 5 -; GFX10-NEXT: v_readlane_b32 s30, v40, 4 -; GFX10-NEXT: v_readlane_b32 s7, v40, 3 -; GFX10-NEXT: v_readlane_b32 s6, v40, 2 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 6 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -17497,18 +18126,18 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in ; GFX11-LABEL: test_call_external_void_func_v16bf16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s34, s33 +; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s35, -1 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s35 -; GFX11-NEXT: v_writelane_b32 v40, s34, 2 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v16bf16@abs32@hi -; 
GFX11-NEXT: s_mov_b32 s34, external_void_func_v16bf16@abs32@lo +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v16bf16@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v16bf16@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 @@ -17524,19 +18153,19 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v16bf16_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s34, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v16bf16@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v16bf16@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v16bf16@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v16bf16@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 
s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll index 25c6840..7799b95 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -847,11 +847,11 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GCN-LABEL: test_indirect_call_vgpr_ptr_inreg_arg: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s10, s33 +; GCN-NEXT: s_mov_b32 s5, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 @@ -885,19 +885,19 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GCN-NEXT: v_writelane_b32 v40, s61, 29 ; GCN-NEXT: v_writelane_b32 v40, s62, 30 ; GCN-NEXT: v_writelane_b32 v40, s63, 31 -; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: s_mov_b64 s[6:7], exec +; GCN-NEXT: s_movk_i32 s4, 0x7b ; GCN-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s6, v0 -; GCN-NEXT: v_readfirstlane_b32 s7, v1 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1] -; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-NEXT: s_movk_i32 s0, 0x7b -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: v_readfirstlane_b32 s8, v0 +; GCN-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GCN-NEXT: s_and_saveexec_b64 s[10:11], vcc +; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_xor_b64 exec, exec, s[8:9] +; GCN-NEXT: s_xor_b64 exec, exec, s[10:11] ; GCN-NEXT: s_cbranch_execnz .LBB6_1 ; GCN-NEXT: ; 
%bb.2: -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: v_readlane_b32 s63, v40, 31 ; GCN-NEXT: v_readlane_b32 s62, v40, 30 ; GCN-NEXT: v_readlane_b32 s61, v40, 29 @@ -930,22 +930,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s10 +; GCN-NEXT: s_mov_b32 s33, s5 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_indirect_call_vgpr_ptr_inreg_arg: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s10, s33 +; GISEL-NEXT: s_mov_b32 s5, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 -; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GISEL-NEXT: s_mov_b64 exec, s[6:7] ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 @@ -979,19 +979,19 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s61, 29 ; GISEL-NEXT: v_writelane_b32 v40, s62, 30 ; GISEL-NEXT: v_writelane_b32 v40, s63, 31 -; GISEL-NEXT: s_mov_b64 s[4:5], exec +; GISEL-NEXT: s_mov_b64 s[6:7], exec +; GISEL-NEXT: s_movk_i32 s4, 0x7b ; GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 -; GISEL-NEXT: v_readfirstlane_b32 s6, v0 -; GISEL-NEXT: v_readfirstlane_b32 s7, v1 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1] -; GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GISEL-NEXT: s_movk_i32 s0, 0x7b -; 
GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GISEL-NEXT: v_readfirstlane_b32 s8, v0 +; GISEL-NEXT: v_readfirstlane_b32 s9, v1 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GISEL-NEXT: s_and_saveexec_b64 s[10:11], vcc +; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GISEL-NEXT: ; implicit-def: $vgpr0 -; GISEL-NEXT: s_xor_b64 exec, exec, s[8:9] +; GISEL-NEXT: s_xor_b64 exec, exec, s[10:11] ; GISEL-NEXT: s_cbranch_execnz .LBB6_1 ; GISEL-NEXT: ; %bb.2: -; GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GISEL-NEXT: s_mov_b64 exec, s[6:7] ; GISEL-NEXT: v_readlane_b32 s63, v40, 31 ; GISEL-NEXT: v_readlane_b32 s62, v40, 30 ; GISEL-NEXT: v_readlane_b32 s61, v40, 29 @@ -1024,11 +1024,11 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 -; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GISEL-NEXT: s_mov_b64 exec, s[6:7] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s10 +; GISEL-NEXT: s_mov_b32 s33, s5 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void %fptr(i32 inreg 123) diff --git a/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll b/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll index 27e9d0f..0139c52 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll @@ -10,7 +10,7 @@ define amdgpu_gfx void @example(<4 x i32> inreg %rsrc, ptr addrspace(5) %src, i3 ; CHECK-NEXT: scratch_load_b32 v2, v0, off ; CHECK-NEXT: scratch_load_b32 v3, v3, off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_b64 v[2:3], v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_b64 v[2:3], v1, s[4:7], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] %x0 = load i32, ptr addrspace(5) %src diff --git 
a/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll b/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll index ec1de02..cdaac14 100644 --- a/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll @@ -6,7 +6,7 @@ define amdgpu_gfx i32 @sink_scratch_pointer(ptr addrspace(5) %stack, i32 inreg % ; GCN-LABEL: sink_scratch_pointer: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_cbranch_scc0 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb2 ; GCN-NEXT: scratch_load_b32 v0, v0, off offset:-4 @@ -21,7 +21,7 @@ define amdgpu_gfx i32 @sink_scratch_pointer(ptr addrspace(5) %stack, i32 inreg % ; GISEL-LABEL: sink_scratch_pointer: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GISEL-NEXT: s_cbranch_scc0 .LBB0_2 ; GISEL-NEXT: ; %bb.1: ; %bb2 ; GISEL-NEXT: scratch_load_b32 v0, v0, off offset:-4 -- cgit v1.1 From ef316da4a2c5954a02c92707b5cb621402b76910 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 27 Mar 2024 00:56:47 +0530 Subject: AMDGPU: Fix dead check prefixes in test --- .../CodeGen/AMDGPU/global_atomics_i32_system.ll | 564 --------------------- .../CodeGen/AMDGPU/global_atomics_i64_system.ll | 564 --------------------- 2 files changed, 1128 deletions(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index 76ec1cc..99d02ff 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -358,65 +358,6 @@ define amdgpu_gfx i32 @global_atomic_xchg_i32_ret_offset_scalar(ptr addrspace(1) ; --------------------------------------------------------------------- define void @global_atomic_xchg_f32_noret(ptr addrspace(1) %ptr, float %in) { -; GCN1-LABEL: 
global_atomic_xchg_f32_noret: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB0_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v3, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB0_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f32_noret: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB0_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v3, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB0_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f32_noret: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_load_dword v3, v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB0_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
-; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v3, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB0_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f32_noret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -450,69 +391,6 @@ define void @global_atomic_xchg_f32_noret(ptr addrspace(1) %ptr, float %in) { } define void @global_atomic_xchg_f32_noret_offset(ptr addrspace(1) %out, float %in) { -; GCN1-LABEL: global_atomic_xchg_f32_noret_offset: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_f32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: global_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB1_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v3, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB1_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f32_noret_offset: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: global_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB1_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: 
global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v3, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB1_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f32_noret_offset: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_load_dword v3, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB1_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v3, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB1_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f32_noret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -549,71 +427,6 @@ define void @global_atomic_xchg_f32_noret_offset(ptr addrspace(1) %out, float %i } define float @global_atomic_xchg_f32_ret(ptr addrspace(1) %ptr, float %in) { -; GCN1-LABEL: global_atomic_xchg_f32_ret: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_load_dword v4, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB2_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v4 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: 
global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB2_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v0, v4 -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f32_ret: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_load_dword v4, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB2_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v4 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB2_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v0, v4 -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f32_ret: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_load_dword v4, v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB2_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v4 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GCN3-NEXT: s_cbranch_execnz .LBB2_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v0, v4 -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f32_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -648,73 +461,6 @@ define float @global_atomic_xchg_f32_ret(ptr addrspace(1) %ptr, float %in) { } define float @global_atomic_xchg_f32_ret_offset(ptr addrspace(1) %out, float %in) { -; GCN1-LABEL: global_atomic_xchg_f32_ret_offset: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_f32_e32 v4, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: global_load_dword v0, v[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB3_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v0 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v0, v[4:5], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB3_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f32_ret_offset: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v4, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: global_load_dword v0, v[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB3_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v0 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, v[4:5], v[2:3] glc -; GCN2-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB3_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f32_ret_offset: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_load_dword v4, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB3_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v4 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB3_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v0, v4 -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f32_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -752,80 +498,6 @@ define float @global_atomic_xchg_f32_ret_offset(ptr addrspace(1) %out, float %in } define amdgpu_gfx void @global_atomic_xchg_f32_noret_scalar(ptr addrspace(1) inreg %ptr, float inreg %in) { -; GCN1-LABEL: global_atomic_xchg_f32_noret_scalar: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: global_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB4_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 
v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB4_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f32_noret_scalar: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: global_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB4_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB4_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f32_noret_scalar: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: global_load_dword v1, v[0:1] -; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB4_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: 
v_mov_b32_e32 v3, s5 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB4_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f32_noret_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -876,84 +548,6 @@ define amdgpu_gfx void @global_atomic_xchg_f32_noret_scalar(ptr addrspace(1) inr } define amdgpu_gfx void @global_atomic_xchg_f32_noret_offset_scalar(ptr addrspace(1) inreg %out, float inreg %in) { -; GCN1-LABEL: global_atomic_xchg_f32_noret_offset_scalar: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s34, s4, 16 -; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: global_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: .LBB5_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN1-NEXT: s_cbranch_execnz .LBB5_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f32_noret_offset_scalar: -; GCN2: 
; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s34, s4, 16 -; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: global_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: .LBB5_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN2-NEXT: s_cbranch_execnz .LBB5_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f32_noret_offset_scalar: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: global_load_dword v1, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB5_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB5_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_setpc_b64 
s[30:31] ; SI-LABEL: global_atomic_xchg_f32_noret_offset_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1007,83 +601,6 @@ define amdgpu_gfx void @global_atomic_xchg_f32_noret_offset_scalar(ptr addrspace } define amdgpu_gfx float @global_atomic_xchg_f32_ret_scalar(ptr addrspace(1) inreg %ptr, float inreg %in) { -; GCN1-LABEL: global_atomic_xchg_f32_ret_scalar: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: global_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB6_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v3, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s6 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_mov_b32_e32 v4, s5 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB6_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f32_ret_scalar: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: global_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB6_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v3, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s6 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_mov_b32_e32 v4, s5 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, 
v[3:4], v[1:2] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB6_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f32_ret_scalar: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: global_load_dword v0, v[0:1] -; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB6_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v3, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s6 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_mov_b32_e32 v4, s5 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB6_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f32_ret_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1134,87 +651,6 @@ define amdgpu_gfx float @global_atomic_xchg_f32_ret_scalar(ptr addrspace(1) inre } define amdgpu_gfx float @global_atomic_xchg_f32_ret_offset_scalar(ptr addrspace(1) inreg %out, float inreg %in) { -; GCN1-LABEL: global_atomic_xchg_f32_ret_offset_scalar: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s34, s4, 16 -; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: 
v_mov_b32_e32 v1, s35 -; GCN1-NEXT: global_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: .LBB7_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v3, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s6 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_mov_b32_e32 v4, s35 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN1-NEXT: s_cbranch_execnz .LBB7_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f32_ret_offset_scalar: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s34, s4, 16 -; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: global_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: .LBB7_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v3, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s6 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_mov_b32_e32 v4, s35 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN2-NEXT: s_cbranch_execnz .LBB7_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: 
global_atomic_xchg_f32_ret_offset_scalar: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: global_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB7_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v3, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s6 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_mov_b32_e32 v4, s5 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB7_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f32_ret_offset_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll index d137f47..380ce7f 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll @@ -372,65 +372,6 @@ define amdgpu_gfx i64 @global_atomic_xchg_i64_ret_offset_scalar(ptr addrspace(1) ; --------------------------------------------------------------------- define void @global_atomic_xchg_f64_noret(ptr addrspace(1) %ptr, double %in) { -; GCN1-LABEL: global_atomic_xchg_f64_noret: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB0_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v3, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB0_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f64_noret: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB0_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v3, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB0_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f64_noret: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_load_dword v3, v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB0_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v3, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB0_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 
exec, exec, s[4:5] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f64_noret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -464,69 +405,6 @@ define void @global_atomic_xchg_f64_noret(ptr addrspace(1) %ptr, double %in) { } define void @global_atomic_xchg_f64_noret_offset(ptr addrspace(1) %out, double %in) { -; GCN1-LABEL: global_atomic_xchg_f64_noret_offset: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_f64_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: global_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB1_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v3, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB1_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f64_noret_offset: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: global_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB1_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v3, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: 
s_cbranch_execnz .LBB1_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f64_noret_offset: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_load_dword v3, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB1_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v3, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB1_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f64_noret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -563,71 +441,6 @@ define void @global_atomic_xchg_f64_noret_offset(ptr addrspace(1) %out, double % } define double @global_atomic_xchg_f64_ret(ptr addrspace(1) %ptr, double %in) { -; GCN1-LABEL: global_atomic_xchg_f64_ret: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_load_dword v4, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB2_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v4 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB2_1 -; GCN1-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v0, v4 -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f64_ret: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_load_dword v4, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB2_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v4 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB2_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v0, v4 -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f64_ret: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_load_dword v4, v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB2_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v4 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB2_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v0, v4 -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f64_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -663,73 +476,6 @@ define 
double @global_atomic_xchg_f64_ret(ptr addrspace(1) %ptr, double %in) { } define double @global_atomic_xchg_f64_ret_offset(ptr addrspace(1) %out, double %in) { -; GCN1-LABEL: global_atomic_xchg_f64_ret_offset: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_f64_e32 v4, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: global_load_dword v0, v[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB3_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v0 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v0, v[4:5], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB3_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f64_ret_offset: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v4, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: global_load_dword v0, v[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB3_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v0 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, v[4:5], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB3_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; 
GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f64_ret_offset: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_load_dword v4, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB3_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v4 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB3_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v0, v4 -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f64_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -768,80 +514,6 @@ define double @global_atomic_xchg_f64_ret_offset(ptr addrspace(1) %out, double % } define amdgpu_gfx void @global_atomic_xchg_f64_noret_scalar(ptr addrspace(1) inreg %ptr, double inreg %in) { -; GCN1-LABEL: global_atomic_xchg_f64_noret_scalar: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: global_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB4_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; 
GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB4_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f64_noret_scalar: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: global_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB4_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB4_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f64_noret_scalar: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: global_load_dword v1, v[0:1] -; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB4_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB4_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f64_noret_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -896,84 +568,6 @@ define amdgpu_gfx void @global_atomic_xchg_f64_noret_scalar(ptr addrspace(1) inr } define amdgpu_gfx void @global_atomic_xchg_f64_noret_offset_scalar(ptr addrspace(1) inreg %out, double inreg %in) { -; GCN1-LABEL: global_atomic_xchg_f64_noret_offset_scalar: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s34, s4, 16 -; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: global_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: .LBB5_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN1-NEXT: s_cbranch_execnz .LBB5_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f64_noret_offset_scalar: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s34, s4, 16 -; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: global_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: .LBB5_1: ; 
%atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN2-NEXT: s_cbranch_execnz .LBB5_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f64_noret_offset_scalar: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: global_load_dword v1, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB5_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB5_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f64_noret_offset_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1029,83 +623,6 @@ define amdgpu_gfx void @global_atomic_xchg_f64_noret_offset_scalar(ptr addrspace } define amdgpu_gfx double @global_atomic_xchg_f64_ret_scalar(ptr addrspace(1) 
inreg %ptr, double inreg %in) { -; GCN1-LABEL: global_atomic_xchg_f64_ret_scalar: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: global_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB6_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v3, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s6 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_mov_b32_e32 v4, s5 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB6_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f64_ret_scalar: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: global_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB6_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v3, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s6 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_mov_b32_e32 v4, s5 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB6_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 
-; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f64_ret_scalar: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: global_load_dword v0, v[0:1] -; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB6_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v3, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s6 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_mov_b32_e32 v4, s5 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB6_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f64_ret_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1160,87 +677,6 @@ define amdgpu_gfx double @global_atomic_xchg_f64_ret_scalar(ptr addrspace(1) inr } define amdgpu_gfx double @global_atomic_xchg_f64_ret_offset_scalar(ptr addrspace(1) inreg %out, double inreg %in) { -; GCN1-LABEL: global_atomic_xchg_f64_ret_offset_scalar: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s34, s4, 16 -; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: global_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: .LBB7_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v3, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s6 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; 
GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_mov_b32_e32 v4, s35 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN1-NEXT: s_cbranch_execnz .LBB7_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f64_ret_offset_scalar: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s34, s4, 16 -; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: global_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: .LBB7_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v3, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s6 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_mov_b32_e32 v4, s35 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN2-NEXT: s_cbranch_execnz .LBB7_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f64_ret_offset_scalar: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: global_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB7_1: ; %atomicrmw.start -; GCN3-NEXT: ; 
=>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v3, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s6 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_mov_b32_e32 v4, s5 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB7_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f64_ret_offset_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -- cgit v1.1 From f5296df97c6bdc6cb658691e5863fdbf336d4430 Mon Sep 17 00:00:00 2001 From: "Kevin P. Neal" <52762977+kpneal@users.noreply.github.com> Date: Wed, 27 Mar 2024 10:20:00 -0400 Subject: [FPEnv][AMDGPU] Correct AMDGPUSimplifyLibCalls handling of strictfp attribute. (#86705) The AMDGPUSimplifyLibCalls pass was lowering function calls with the strictfp attribute to sequences that included function calls incorrectly lacking the attribute. This patch corrects that. The pass now also emits the correct constrained fp call instead of normal FP instructions when in a function with the strictfp attribute. Replacing non-constrained calls with constrained calls when required is still on the IRBuilder's TODO list. 
--- llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll | 12 ++++++------ llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll index 942f459..8ddaf24 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll @@ -808,7 +808,7 @@ define float @test_pown_fast_f32_nobuiltin(float %x, i32 %y) { ; CHECK-LABEL: define float @test_pown_fast_f32_nobuiltin ; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = tail call fast float @_Z4pownfi(float [[X]], i32 [[Y]]) #[[ATTR3:[0-9]+]] +; CHECK-NEXT: [[CALL:%.*]] = tail call fast float @_Z4pownfi(float [[X]], i32 [[Y]]) #[[ATTR4:[0-9]+]] ; CHECK-NEXT: ret float [[CALL]] ; entry: @@ -820,11 +820,11 @@ define float @test_pown_fast_f32_strictfp(float %x, i32 %y) #1 { ; CHECK-LABEL: define float @test_pown_fast_f32_strictfp ; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[__FABS:%.*]] = call fast float @llvm.fabs.f32(float [[X]]) -; CHECK-NEXT: [[__LOG2:%.*]] = call fast float @llvm.log2.f32(float [[__FABS]]) -; CHECK-NEXT: [[POWNI2F:%.*]] = sitofp i32 [[Y]] to float -; CHECK-NEXT: [[__YLOGX:%.*]] = fmul fast float [[__LOG2]], [[POWNI2F]] -; CHECK-NEXT: [[__EXP2:%.*]] = call fast float @llvm.exp2.f32(float [[__YLOGX]]) +; CHECK-NEXT: [[__FABS:%.*]] = call fast float @llvm.fabs.f32(float [[X]]) #[[ATTR0]] +; CHECK-NEXT: [[__LOG2:%.*]] = call fast float @llvm.log2.f32(float [[__FABS]]) #[[ATTR0]] +; CHECK-NEXT: [[POWNI2F:%.*]] = call fast float @llvm.experimental.constrained.sitofp.f32.i32(i32 [[Y]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR0]] +; CHECK-NEXT: [[__YLOGX:%.*]] = call fast float 
@llvm.experimental.constrained.fmul.f32(float [[POWNI2F]], float [[__LOG2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR0]] +; CHECK-NEXT: [[__EXP2:%.*]] = call fast float @llvm.exp2.f32(float [[__YLOGX]]) #[[ATTR0]] ; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[Y]], 31 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[X]] to i32 ; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP0]] diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll index 2ffa647..2e64a34 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll @@ -896,7 +896,7 @@ define float @test_rootn_f32__y_neg2__strictfp(float %x) #1 { ; CHECK-LABEL: define float @test_rootn_f32__y_neg2__strictfp( ; CHECK-SAME: float [[X:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call float @_Z5rsqrtf(float [[X]]) +; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call float @_Z5rsqrtf(float [[X]]) #[[ATTR0]] ; CHECK-NEXT: ret float [[__ROOTN2RSQRT]] ; entry: -- cgit v1.1 From 0a43ca731b1faedd885f86153ecc570dde602ca3 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Wed, 27 Mar 2024 17:40:58 -0400 Subject: [AMDGPU] Fix missing `IsExact` flag when expanding vector binary operator (#86712) --- .../CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll | 108 +++++++++++++++++++++ 1 file changed, 108 insertions(+) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index d900165..2ad28b8 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -10668,3 +10668,111 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x store <2 x i64> %r, ptr addrspace(1) %out ret void } + +define <2 x i32> @v_sdiv_i32_exact(<2 x i32> %num) { +; 
CHECK-LABEL: @v_sdiv_i32_exact( +; CHECK: %1 = extractelement <2 x i32> %num, i64 0 +; CHECK-NEXT: %2 = sdiv exact i32 %1, 4096 +; CHECK-NEXT: %3 = insertelement <2 x i32> poison, i32 %2, i64 0 +; CHECK-NEXT: %4 = extractelement <2 x i32> %num, i64 1 +; CHECK-NEXT: %5 = sdiv exact i32 %4, 1024 +; CHECK-NEXT: %6 = insertelement <2 x i32> %3, i32 %5, i64 1 +; CHECK-NEXT: ret <2 x i32> %6 +; +; GFX6-LABEL: v_sdiv_i32_exact: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 12, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 10, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_sdiv_i32_exact: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_ashrrev_i32_e32 v0, 12, v0 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 10, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = sdiv exact <2 x i32> %num, <i32 4096, i32 1024> + ret <2 x i32> %result +} + +define <2 x i64> @v_sdiv_i64_exact(<2 x i64> %num) { +; CHECK-LABEL: @v_sdiv_i64_exact( +; CHECK: %1 = extractelement <2 x i64> %num, i64 0 +; CHECK-NEXT: %2 = sdiv exact i64 %1, 4096 +; CHECK-NEXT: %3 = insertelement <2 x i64> poison, i64 %2, i64 0 +; CHECK-NEXT: %4 = extractelement <2 x i64> %num, i64 1 +; CHECK-NEXT: %5 = sdiv exact i64 %4, 1024 +; CHECK-NEXT: %6 = insertelement <2 x i64> %3, i64 %5, i64 1 +; CHECK-NEXT: ret <2 x i64> %6 +; +; GFX6-LABEL: v_sdiv_i64_exact: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ashr_i64 v[0:1], v[0:1], 12 +; GFX6-NEXT: v_ashr_i64 v[2:3], v[2:3], 10 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_sdiv_i64_exact: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_ashrrev_i64 v[0:1], 12, v[0:1] +; GFX9-NEXT: v_ashrrev_i64 v[2:3], 10, v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = sdiv exact <2 x i64> %num, <i64 4096, i64 1024> + ret <2 x i64> %result +} + +define <2 x i32> @v_udiv_i32_exact(<2 x i32> %num) { +; CHECK-LABEL: @v_udiv_i32_exact( +; CHECK: %1
= extractelement <2 x i32> %num, i64 0 +; CHECK-NEXT: %2 = udiv exact i32 %1, 4096 +; CHECK-NEXT: %3 = insertelement <2 x i32> poison, i32 %2, i64 0 +; CHECK-NEXT: %4 = extractelement <2 x i32> %num, i64 1 +; CHECK-NEXT: %5 = udiv exact i32 %4, 1024 +; CHECK-NEXT: %6 = insertelement <2 x i32> %3, i32 %5, i64 1 +; CHECK-NEXT: ret <2 x i32> %6 +; +; GFX6-LABEL: v_udiv_i32_exact: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 12, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 10, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_udiv_i32_exact: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 12, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = udiv exact <2 x i32> %num, <i32 4096, i32 1024> + ret <2 x i32> %result +} + +define <2 x i64> @v_udiv_i64_exact(<2 x i64> %num) { +; CHECK-LABEL: @v_udiv_i64_exact( +; CHECK: %1 = extractelement <2 x i64> %num, i64 0 +; CHECK-NEXT: %2 = udiv exact i64 %1, 4096 +; CHECK-NEXT: %3 = insertelement <2 x i64> poison, i64 %2, i64 0 +; CHECK-NEXT: %4 = extractelement <2 x i64> %num, i64 1 +; CHECK-NEXT: %5 = udiv exact i64 %4, 1024 +; CHECK-NEXT: %6 = insertelement <2 x i64> %3, i64 %5, i64 1 +; CHECK-NEXT: ret <2 x i64> %6 +; +; GFX6-LABEL: v_udiv_i64_exact: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 12 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], 10 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_udiv_i64_exact: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 12, v[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], 10, v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = udiv exact <2 x i64> %num, <i64 4096, i64 1024> + ret <2 x i64> %result +} -- cgit v1.1 From 23d45e55edb0ca4567f5876e7051ff4a649213df Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 28 Mar 2024 14:43:49 -0700 Subject:
[MCP] Remove dead copies from basic blocks with successors. (#86973) Previously we wouldn't remove dead copies from basic blocks with successors. The comment said we didn't want to trust the live-in lists. The comment is very old so I'm not sure if that's still a concern today. This patch checks the live-in lists and removes copies from MaybeDeadCopies if they are referenced by any live-ins in any successors. We only do this if the tracksLiveness property is set. If that property is not set, we retain the old behavior. --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll | 2 -- 1 file changed, 2 deletions(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll index 1ebd864..2970495 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -477,7 +477,6 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1032-NEXT: s_cbranch_execz .LBB1_3 ; GFX1032-NEXT: ; %bb.2: ; GFX1032-NEXT: v_mov_b32_e32 v0, s11 -; GFX1032-NEXT: s_mov_b32 s10, s11 ; GFX1032-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc ; GFX1032-NEXT: .LBB1_3: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 @@ -615,7 +614,6 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1132-NEXT: s_cbranch_execz .LBB1_3 ; GFX1132-NEXT: ; %bb.2: ; GFX1132-NEXT: v_mov_b32_e32 v0, s11 -; GFX1132-NEXT: s_mov_b32 s10, s11 ; GFX1132-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc ; GFX1132-NEXT: .LBB1_3: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s9 -- cgit v1.1 From 661bb9daae8730868d87b7a3cee6b9ad51e988af Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Fri, 29 Mar 2024 12:41:47 -0400 Subject: [GlobalISel] Handle div-by-pow2 (#83155) This patch adds similar handling of div-by-pow2 as in `SelectionDAG`. 
--- .../CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll | 41 +- llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll | 147 +- llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll | 644 +----- llvm/test/CodeGen/AMDGPU/div_i128.ll | 2298 +++++++++++++++++++- 4 files changed, 2390 insertions(+), 740 deletions(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll index 6eed92b..6d4aa3b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll @@ -670,36 +670,19 @@ define amdgpu_kernel void @bfe_sext_in_reg_i24(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @simplify_demanded_bfe_sdiv(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: simplify_demanded_bfe_sdiv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, 2.0 -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_mul_lo_u32 v1, v0, -2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x100001 -; GFX6-NEXT: s_ashr_i32 s2, s0, 31 -; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: s_add_i32 s0, s0, s2 -; GFX6-NEXT: s_xor_b32 s0, s0, s2 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 2, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_subrev_i32_e64 v2, s[0:1], 2, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 2, v1 -; 
GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x100001 +; GFX6-NEXT: s_ashr_i32 s4, s3, 31 +; GFX6-NEXT: s_lshr_b32 s4, s4, 31 +; GFX6-NEXT: s_add_i32 s3, s3, s4 +; GFX6-NEXT: s_ashr_i32 s3, s3, 1 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %src = load i32, ptr addrspace(1) %in, align 4 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %src, i32 1, i32 16) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll index 1061f00..2c2f8e9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll @@ -279,125 +279,27 @@ define i32 @v_sdiv_i32_pow2k_denom(i32 %num) { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, 0x45800000 -; CHECK-NEXT: v_mov_b32_e32 v3, 0xfffff000 -; CHECK-NEXT: v_mov_b32_e32 v4, 0x1000 +; CHECK-NEXT: v_lshrrev_b32_e32 v1, 20, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v3, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 -; CHECK-NEXT: v_lshlrev_b32_e32 v3, 12, v2 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, 1, v2 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] -; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, 0x1000, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, v0, 
v3, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v2 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_ashrrev_i32_e32 v0, 12, v0 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv i32 %num, 4096 ret i32 %result } define <2 x i32> @v_sdiv_v2i32_pow2k_denom(<2 x i32> %num) { -; GISEL-LABEL: v_sdiv_v2i32_pow2k_denom: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; GISEL-NEXT: v_mov_b32_e32 v3, 0x1000 -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 -; GISEL-NEXT: v_mov_b32_e32 v5, 0xfffff000 -; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v6 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GISEL-NEXT: v_mul_lo_u32 v5, v4, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v0, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v1, v4 -; GISEL-NEXT: v_lshlrev_b32_e32 v7, 12, v5 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v5 -; GISEL-NEXT: v_lshlrev_b32_e32 v9, 12, v4 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v4 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v0, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[6:7] -; GISEL-NEXT: v_subrev_i32_e32 v8, vcc, 0x1000, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] -; GISEL-NEXT: v_add_i32_e32 v8, 
vcc, 1, v4 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 -; GISEL-NEXT: s_setpc_b64 s[30:31] -; -; CGP-LABEL: v_sdiv_v2i32_pow2k_denom: -; CGP: ; %bb.0: -; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x45800000 -; CGP-NEXT: v_mov_b32_e32 v4, 0xfffff000 -; CGP-NEXT: v_mov_b32_e32 v5, 0x1000 -; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_mul_lo_u32 v4, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v0, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v1, v3 -; CGP-NEXT: v_lshlrev_b32_e32 v7, 12, v4 -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; CGP-NEXT: v_lshlrev_b32_e32 v9, 12, v3 -; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v7, vcc, v0, v5 -; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 -; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[6:7] -; CGP-NEXT: v_subrev_i32_e32 v8, vcc, 0x1000, v1 -; CGP-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[4:5] -; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4 -; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v7, vcc -; CGP-NEXT: 
v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 -; CGP-NEXT: s_setpc_b64 s[30:31] +; CHECK-LABEL: v_sdiv_v2i32_pow2k_denom: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v1 +; CHECK-NEXT: v_lshrrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: v_lshrrev_b32_e32 v3, 20, v3 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_ashrrev_i32_e32 v0, 12, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 12, v1 +; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i32> %num, ret <2 x i32> %result } @@ -884,3 +786,24 @@ define <2 x i32> @v_sdiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) { %result = sdiv <2 x i32> %num.mask, %den.mask ret <2 x i32> %result } + +define i32 @v_sdiv_i32_exact(i32 %num) { +; CHECK-LABEL: v_sdiv_i32_exact: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_ashrrev_i32_e32 v0, 12, v0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %result = sdiv exact i32 %num, 4096 + ret i32 %result +} + +define <2 x i32> @v_sdiv_v2i32_exact(<2 x i32> %num) { +; CHECK-LABEL: v_sdiv_v2i32_exact: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_ashrrev_i32_e32 v0, 12, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 10, v1 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %result = sdiv exact <2 x i32> %num, + ret <2 x i32> %result +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 84906c0..377fa24 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -999,602 +999,45 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) { ; CHECK-LABEL: 
v_sdiv_i64_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000 -; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0xfffff000 -; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v4, v3 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: 
v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 -; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 -; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 -; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 -; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x1000 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v9, 
v1 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v7, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v3, v[1:2] -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v7 -; CHECK-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v4 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: s_setpc_b64 s[30:31] - %result = sdiv i64 %num, 4096 - ret i64 %result -} - -define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { -; GISEL-LABEL: v_sdiv_v2i64_pow2k_denom: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; GISEL-NEXT: s_sub_u32 s6, 0, 0x1000 -; GISEL-NEXT: s_subb_u32 
s7, 0, 0 -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s6, v6, 0 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v7, v4 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s7, v6, v[8:9] -; GISEL-NEXT: v_mul_hi_u32 v9, v6, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v6, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, v7, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, v6, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v4 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v11, 0 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc -; GISEL-NEXT: v_mov_b32_e32 v4, v9 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v5, v[4:5] -; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s7, v11, v[9:10] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v10, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v5, v8 -; 
GISEL-NEXT: v_mul_lo_u32 v12, v11, v9 -; GISEL-NEXT: v_xor_b32_e32 v13, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v11, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v5, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v12, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v11, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_mov_b32_e32 v5, 0x1000 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, v13, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v12, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v11, v[8:9] -; GISEL-NEXT: s_sub_u32 s6, 0, 0x1000 -; GISEL-NEXT: s_subb_u32 s7, 0, 0 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v13, v8, vcc -; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v13, v8 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v8, vcc -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v9, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v11 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v6, 0 -; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v12, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v15, -1, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[1:2] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v13 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s7, v6, v[8:9] -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v14, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v9, v13, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v7, v0 -; GISEL-NEXT: v_mul_lo_u32 v13, v6, v8 -; GISEL-NEXT: v_mul_hi_u32 v15, v6, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc -; GISEL-NEXT: v_mul_hi_u32 v0, v7, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v7, v8 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v6, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 
1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v6, v0 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v7, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v9, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v8, v[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v12, v14, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v2, v9 -; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v6 -; GISEL-NEXT: v_xor_b32_e32 v12, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v6 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GISEL-NEXT: v_mul_hi_u32 v7, v8, v6 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v6 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_add_i32_e32 
v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v2 -; GISEL-NEXT: v_mul_hi_u32 v7, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v10, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v12, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v11, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v10, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v7, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v13, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v10, v[6:7] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v12, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5 -; GISEL-NEXT: 
v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v6 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v9 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v9 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc +; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; CHECK-NEXT: v_lshrrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CHECK-NEXT: v_ashr_i64 v[0:1], v[0:1], 12 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %result = sdiv i64 %num, 4096 + ret i64 %result +} + +define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { +; GISEL-LABEL: v_sdiv_v2i64_pow2k_denom: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; GISEL-NEXT: v_lshrrev_b32_e32 v4, 20, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_lshrrev_b32_e32 v5, 20, v5 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_ashr_i64 v[0:1], v[0:1], 12 +; GISEL-NEXT: v_ashr_i64 v[2:3], v[2:3], 12 ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_pow2k_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; CGP-NEXT: v_mov_b32_e32 v6, 0xfffff000 -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: 
v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v7, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v9, v7 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0 -; CGP-NEXT: v_mov_b32_e32 v7, v5 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8] -; CGP-NEXT: v_mul_hi_u32 v12, v9, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v8, v[10:11] -; CGP-NEXT: v_mul_lo_u32 v10, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v11, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v8, v13 -; CGP-NEXT: v_mul_lo_u32 v7, v9, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v8, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v9, v13 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc -; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; CGP-NEXT: v_xor_b32_e32 v15, v0, v7 -; CGP-NEXT: v_mul_lo_u32 v0, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v4, v16, v14 -; CGP-NEXT: v_xor_b32_e32 v18, v1, v7 -; CGP-NEXT: v_mul_hi_u32 v1, v16, v13 -; CGP-NEXT: 
v_mul_hi_u32 v13, v17, v13 +; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; CGP-NEXT: v_lshrrev_b32_e32 v4, 20, v4 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v1, v17, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; CGP-NEXT: v_mul_hi_u32 v4, v16, v14 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v18, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 -; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v18, v0 -; CGP-NEXT: v_mov_b32_e32 v4, 0x1000 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v18, v1 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v17, v18, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v17, 
v[1:2] -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v15, v0 -; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v18, v13, vcc -; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v18, v13 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 -; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, 1, v16 -; CGP-NEXT: v_addc_u32_e32 v18, vcc, 0, v17, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v8, v[0:1] -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v19, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v15 -; CGP-NEXT: v_mul_lo_u32 v19, v8, v0 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v18, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v19 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v9, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; CGP-NEXT: v_mul_hi_u32 v10, v8, v0 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; CGP-NEXT: 
v_addc_u32_e32 v9, vcc, v9, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v17, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v8, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v9, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v8, v5 -; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v8, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v9, v5 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v8, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v9, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v8, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v7 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc -; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: 
v_mul_lo_u32 v7, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v8, v13, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 -; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 -; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; CGP-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5] -; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v7 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v8, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v6 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CGP-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, 
v3 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CGP-NEXT: v_lshrrev_b32_e32 v4, 20, v4 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CGP-NEXT: v_ashr_i64 v[0:1], v[0:1], 12 +; CGP-NEXT: v_ashr_i64 v[2:3], v[2:3], 12 ; CGP-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i64> %num, ret <2 x i64> %result @@ -3398,3 +2841,24 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { %result = sdiv <2 x i64> %num.mask, %den.mask ret <2 x i64> %result } + +define i64 @v_sdiv_i64_exact(i64 %num) { +; CHECK-LABEL: v_sdiv_i64_exact: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_ashr_i64 v[0:1], v[0:1], 12 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %result = sdiv exact i64 %num, 4096 + ret i64 %result +} + +define <2 x i64> @v_sdiv_v2i64_exact(<2 x i64> %num) { +; CHECK-LABEL: v_sdiv_v2i64_exact: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_ashr_i64 v[0:1], v[0:1], 12 +; CHECK-NEXT: v_ashr_i64 v[2:3], v[2:3], 10 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %result = sdiv exact <2 x i64> %num, + ret <2 x i64> %result +} diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index 2f3d5d9..cf99b5d 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -1,10 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s -; RUN: llc -O0 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0,GFX9-SDAG-O0 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -O0 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0 %s -; FIXME: GlobalISel 
missing the power-of-2 cases in legalization. https://github.com/llvm/llvm-project/issues/80671 -; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9,GFX9 %s -; xUN: llc -O0 -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0,GFX9-O0 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-G %s +; RUN: llc -O0 -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-G-O0 %s define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-LABEL: v_sdiv_i128_vv: @@ -1223,6 +1222,1158 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-G-LABEL: v_sdiv_i128_vv: +; GFX9-G: ; %bb.0: ; %_udiv-special-cases +; GFX9-G-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-G-NEXT: v_ashrrev_i32_e32 v16, 31, v3 +; GFX9-G-NEXT: v_xor_b32_e32 v0, v16, v0 +; GFX9-G-NEXT: v_xor_b32_e32 v1, v16, v1 +; GFX9-G-NEXT: v_sub_co_u32_e32 v10, vcc, v0, v16 +; GFX9-G-NEXT: v_xor_b32_e32 v2, v16, v2 +; GFX9-G-NEXT: v_subb_co_u32_e32 v11, vcc, v1, v16, vcc +; GFX9-G-NEXT: v_ashrrev_i32_e32 v17, 31, v7 +; GFX9-G-NEXT: v_xor_b32_e32 v3, v16, v3 +; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v2, v16, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v13, vcc, v3, v16, vcc +; GFX9-G-NEXT: v_xor_b32_e32 v0, v17, v4 +; GFX9-G-NEXT: v_xor_b32_e32 v1, v17, v5 +; GFX9-G-NEXT: v_sub_co_u32_e32 v18, vcc, v0, v17 +; GFX9-G-NEXT: v_xor_b32_e32 v2, v17, v6 +; GFX9-G-NEXT: v_subb_co_u32_e32 v19, vcc, v1, v17, vcc +; GFX9-G-NEXT: v_xor_b32_e32 v3, v17, v7 +; GFX9-G-NEXT: v_subb_co_u32_e32 v4, vcc, v2, v17, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v17, vcc +; GFX9-G-NEXT: v_or_b32_e32 v0, v18, v4 +; GFX9-G-NEXT: v_or_b32_e32 v1, v19, v5 +; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX9-G-NEXT: 
v_or_b32_e32 v0, v10, v12 +; GFX9-G-NEXT: v_or_b32_e32 v1, v11, v13 +; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v18 +; GFX9-G-NEXT: v_ffbh_u32_e32 v0, v19 +; GFX9-G-NEXT: v_add_u32_e32 v1, 32, v1 +; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v4 +; GFX9-G-NEXT: v_min_u32_e32 v0, v0, v1 +; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v5 +; GFX9-G-NEXT: v_add_u32_e32 v2, 32, v2 +; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[4:5] +; GFX9-G-NEXT: v_add_u32_e32 v0, 64, v0 +; GFX9-G-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v10 +; GFX9-G-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[6:7] +; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v11 +; GFX9-G-NEXT: v_add_u32_e32 v2, 32, v2 +; GFX9-G-NEXT: v_ffbh_u32_e32 v3, v12 +; GFX9-G-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v13 +; GFX9-G-NEXT: v_add_u32_e32 v3, 32, v3 +; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[12:13] +; GFX9-G-NEXT: v_add_u32_e32 v1, 64, v1 +; GFX9-G-NEXT: v_min_u32_e32 v2, v2, v3 +; GFX9-G-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[6:7] +; GFX9-G-NEXT: v_sub_co_u32_e64 v0, s[6:7], v0, v1 +; GFX9-G-NEXT: v_subb_co_u32_e64 v1, s[6:7], 0, 0, s[6:7] +; GFX9-G-NEXT: v_mov_b32_e32 v6, 0x7f +; GFX9-G-NEXT: v_subb_co_u32_e64 v2, s[6:7], 0, 0, s[6:7] +; GFX9-G-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-G-NEXT: v_subb_co_u32_e64 v3, s[6:7], 0, 0, s[6:7] +; GFX9-G-NEXT: v_cmp_gt_u64_e64 s[6:7], v[0:1], v[6:7] +; GFX9-G-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-G-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] +; GFX9-G-NEXT: v_cmp_lt_u64_e64 s[6:7], 0, v[2:3] +; GFX9-G-NEXT: v_or_b32_e32 v15, v1, v3 +; GFX9-G-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[6:7] +; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3] +; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-G-NEXT: v_cndmask_b32_e64 v6, v7, v6, s[6:7] +; GFX9-G-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; GFX9-G-NEXT: v_or_b32_e32 v20, v7, v6 +; GFX9-G-NEXT: v_xor_b32_e32 v6, 0x7f, v0 +; GFX9-G-NEXT: v_or_b32_e32 v14, v6, v2 +; GFX9-G-NEXT: v_and_b32_e32 v6, 
1, v20 +; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-G-NEXT: v_cndmask_b32_e64 v6, v10, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v7, v11, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v12, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v9, v13, 0, vcc +; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] +; GFX9-G-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GFX9-G-NEXT: v_or_b32_e32 v14, v20, v14 +; GFX9-G-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-G-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GFX9-G-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GFX9-G-NEXT: s_cbranch_execz .LBB0_6 +; GFX9-G-NEXT: ; %bb.1: ; %udiv-bb1 +; GFX9-G-NEXT: v_add_co_u32_e32 v20, vcc, 1, v0 +; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v1, vcc +; GFX9-G-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v2, vcc +; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v3, vcc +; GFX9-G-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GFX9-G-NEXT: v_sub_co_u32_e32 v8, vcc, 0x7f, v0 +; GFX9-G-NEXT: v_sub_u32_e32 v0, 64, v8 +; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11] +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v8, v[12:13] +; GFX9-G-NEXT: v_subrev_u32_e32 v9, 64, v8 +; GFX9-G-NEXT: v_lshlrev_b64 v[6:7], v8, v[10:11] +; GFX9-G-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX9-G-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], v9, v[10:11] +; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8 +; GFX9-G-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 +; GFX9-G-NEXT: v_cndmask_b32_e32 v8, v0, v12, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v9, v1, v13, vcc +; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-G-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-G-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-G-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-G-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-G-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GFX9-G-NEXT: s_xor_b64 s[12:13], exec, 
s[8:9] +; GFX9-G-NEXT: s_cbranch_execz .LBB0_5 +; GFX9-G-NEXT: ; %bb.2: ; %udiv-preheader +; GFX9-G-NEXT: v_sub_u32_e32 v2, 64, v20 +; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v20, v[10:11] +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v2, v[12:13] +; GFX9-G-NEXT: v_subrev_u32_e32 v24, 64, v20 +; GFX9-G-NEXT: v_lshrrev_b64 v[14:15], v20, v[12:13] +; GFX9-G-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX9-G-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v24, v[12:13] +; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v20 +; GFX9-G-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v15, 0, v15, vcc +; GFX9-G-NEXT: v_add_co_u32_e32 v24, vcc, -1, v18 +; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-G-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v20 +; GFX9-G-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v19, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v12, v0, v10, s[4:5] +; GFX9-G-NEXT: v_cndmask_b32_e64 v13, v1, v11, s[4:5] +; GFX9-G-NEXT: v_addc_co_u32_e32 v26, vcc, -1, v4, vcc +; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-G-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-G-NEXT: v_addc_co_u32_e32 v27, vcc, -1, v5, vcc +; GFX9-G-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-G-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-G-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-G-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-G-NEXT: .LBB0_3: ; %udiv-do-while +; GFX9-G-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[6:7] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v10, 31, v7 +; GFX9-G-NEXT: v_or_b32_e32 v6, v0, v2 +; GFX9-G-NEXT: v_or_b32_e32 v7, v1, v3 +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[12:13] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v12, 31, v9 +; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 1, v[14:15] +; GFX9-G-NEXT: v_or_b32_e32 v2, v2, v12 +; GFX9-G-NEXT: v_lshrrev_b32_e32 v14, 31, v13 +; GFX9-G-NEXT: v_sub_co_u32_e32 v12, vcc, v24, v2 +; GFX9-G-NEXT: v_or_b32_e32 v0, v0, v14 +; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, 
v25, v3, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v26, v0, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v27, v1, vcc +; GFX9-G-NEXT: v_ashrrev_i32_e32 v28, 31, v12 +; GFX9-G-NEXT: v_and_b32_e32 v12, v28, v18 +; GFX9-G-NEXT: v_sub_co_u32_e32 v12, vcc, v2, v12 +; GFX9-G-NEXT: v_and_b32_e32 v2, v28, v19 +; GFX9-G-NEXT: v_subb_co_u32_e32 v13, vcc, v3, v2, vcc +; GFX9-G-NEXT: v_and_b32_e32 v2, v28, v4 +; GFX9-G-NEXT: v_subb_co_u32_e32 v14, vcc, v0, v2, vcc +; GFX9-G-NEXT: v_and_b32_e32 v0, v28, v5 +; GFX9-G-NEXT: v_subb_co_u32_e32 v15, vcc, v1, v0, vcc +; GFX9-G-NEXT: v_add_co_u32_e32 v20, vcc, -1, v20 +; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v21, vcc +; GFX9-G-NEXT: v_addc_co_u32_e32 v22, vcc, -1, v22, vcc +; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc +; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-G-NEXT: v_or_b32_e32 v0, v20, v22 +; GFX9-G-NEXT: v_or_b32_e32 v1, v21, v23 +; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX9-G-NEXT: v_and_b32_e32 v10, 1, v28 +; GFX9-G-NEXT: v_mov_b32_e32 v0, v10 +; GFX9-G-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-G-NEXT: v_mov_b32_e32 v1, v11 +; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-G-NEXT: s_cbranch_execnz .LBB0_3 +; GFX9-G-NEXT: ; %bb.4: ; %Flow +; GFX9-G-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-G-NEXT: .LBB0_5: ; %Flow2 +; GFX9-G-NEXT: s_or_b64 exec, exec, s[12:13] +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[6:7] +; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v4, 31, v7 +; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v4 +; GFX9-G-NEXT: v_or_b32_e32 v6, v0, v2 +; GFX9-G-NEXT: v_or_b32_e32 v7, v1, v3 +; GFX9-G-NEXT: .LBB0_6: ; %Flow3 +; GFX9-G-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-G-NEXT: v_xor_b32_e32 v3, v17, v16 +; GFX9-G-NEXT: v_xor_b32_e32 v0, v6, v3 +; GFX9-G-NEXT: v_xor_b32_e32 v1, v7, v3 +; GFX9-G-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3 +; GFX9-G-NEXT: v_xor_b32_e32 v2, v8, v3 +; GFX9-G-NEXT: 
v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-G-NEXT: v_xor_b32_e32 v4, v9, v3 +; GFX9-G-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v3, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX9-G-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-G-O0-LABEL: v_sdiv_i128_vv: +; GFX9-G-O0: ; %bb.0: ; %_udiv-special-cases +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-G-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v7 +; 
GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v1 +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-G-O0-NEXT: s_mov_b64 s[12:13], 0x7f +; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr4_vgpr5 killed $exec +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr1 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v12, v3, v8 +; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr1 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec +; GFX9-G-O0-NEXT: s_mov_b32 s6, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v10, v1, v2 +; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr13_vgpr14 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v16 +; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr1 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v11, v3, v8 +; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr1 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec +; GFX9-G-O0-NEXT: s_mov_b32 s6, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6 +; 
GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v9, v1, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v1, v12, v1 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v4, v12, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v3, v10, v3 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v2, v10, v2 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v1, s[6:7], v1, v12 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v7, s[6:7], v4, v12, s[6:7] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v6, s[6:7], v3, v10, s[6:7] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v5, s[6:7], v2, v10, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v16 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v5, v11, v5 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v8, v11, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v14 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v7, v9, v7 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v6, v9, v6 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v5, s[6:7], v5, v11 +; GFX9-G-O0-NEXT: 
v_subb_co_u32_e64 v15, s[6:7], v8, v11, s[6:7] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v14, s[6:7], v7, v9, s[6:7] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v13, s[6:7], v6, v9, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v13 +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_xor_b32_e64 v13, v11, v12 +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_xor_b32_e64 v11, v11, v12 +; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_xor_b32_e64 v11, v9, v10 +; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_xor_b32_e64 v9, v9, v10 +; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v9, v12 +; GFX9-G-O0-NEXT: v_or_b32_e64 v11, v10, v11 +; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-G-O0-NEXT: 
v_cmp_eq_u64_e64 s[6:7], v[9:10], v[11:12] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v9, v12 +; GFX9-G-O0-NEXT: v_or_b32_e64 v11, v10, v11 +; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], v[11:12] +; GFX9-G-O0-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], v[5:6] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v11 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 32 +; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v6, v7 +; GFX9-G-O0-NEXT: v_min_u32_e64 v5, v5, v6 +; GFX9-G-O0-NEXT: s_mov_b32 s10, 64 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v5, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 32 +; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v7, v8 +; GFX9-G-O0-NEXT: v_min_u32_e64 v5, v5, v7 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[8:9] +; GFX9-G-O0-NEXT: s_mov_b32 s16, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 
v10, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], v[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v12 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 32 +; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v7, v8 +; GFX9-G-O0-NEXT: v_min_u32_e64 v6, v6, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v6, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v8, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, 32 +; GFX9-G-O0-NEXT: v_add_u32_e64 v8, v8, v9 +; GFX9-G-O0-NEXT: v_min_u32_e64 v6, v6, v8 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[8:9] +; GFX9-G-O0-NEXT: s_mov_b32 s15, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s11, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s14, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s10, 0 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v6, s[8:9], v5, v6 +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s16 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s16 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v7, s[8:9], v5, v7, s[8:9] +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v9, s[8:9], v5, v8, s[8:9] +; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s10 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v5, v8, s[8:9] +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v7 +; 
GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s4 +; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[10:11], v[12:13], v[14:15] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[12:13], v[14:15] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s13 +; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[12:13], v[10:11], v[12:13] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v10, v5, v10, s[12:13] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[10:11] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v10, v5, v10, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[6:7] +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v10 +; GFX9-G-O0-NEXT: s_mov_b32 s7, 0x7f +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v6, v6, s7 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v7, v7, s6 +; GFX9-G-O0-NEXT: v_or_b32_e64 v6, v6, v9 +; GFX9-G-O0-NEXT: v_or_b32_e64 v8, v7, v8 +; GFX9-G-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[6:7], v[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v3 +; GFX9-G-O0-NEXT: v_and_b32_e32 v1, 1, v5 +; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v1, v1, 
v4, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-G-O0-NEXT: v_and_b32_e32 v3, 1, v5 +; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr1_vgpr2 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-G-O0-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5 +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], -1 +; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], exec +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s4, 0 +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s5, 1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: 
s_mov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-G-O0-NEXT: s_branch .LBB0_8 +; GFX9-G-O0-NEXT: .LBB0_1: ; %Flow +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v0, 2 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v0, 3 +; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: ; %bb.2: ; %Flow +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4) +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 
4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB0_5 +; GFX9-G-O0-NEXT: .LBB0_3: ; %Flow2 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v4, 0 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v4, 1 +; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB0_9 +; GFX9-G-O0-NEXT: .LBB0_4: ; %udiv-loop-exit +; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[10:11], v0, v[2:3] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v0, v[4:5] +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr2 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: s_mov_b32 s4, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v6, v2, v3 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v11 +; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v7 +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v1, v5 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-G-O0-NEXT: v_or3_b32 v4, v4, v6, v7 +; GFX9-G-O0-NEXT: v_or3_b32 v2, v2, v3, v5 +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 +; 
GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB0_3 +; GFX9-G-O0-NEXT: .LBB0_5: ; %Flow1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v8, 4 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v8, 5 +; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, 
s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB0_4 +; GFX9-G-O0-NEXT: .LBB0_6: ; %udiv-do-while +; GFX9-G-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s6, v16, 6 +; GFX9-G-O0-NEXT: v_readlane_b32 s7, v16, 7 +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:256 ; 4-byte Folded 
Reload +; GFX9-G-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[21:22], v2, v[0:1] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[3:4] +; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr0 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec +; GFX9-G-O0-NEXT: s_mov_b32 s9, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v3, v0, v1 +; GFX9-G-O0-NEXT: s_mov_b32 s9, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 
v0, v5 +; GFX9-G-O0-NEXT: v_or_b32_e64 v7, v2, v3 +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v0, v1 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr12_vgpr13 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v15 +; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr0 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec +; GFX9-G-O0-NEXT: s_mov_b32 s9, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v3, v0, v1 +; GFX9-G-O0-NEXT: s_mov_b32 s9, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v21 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v22 +; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v2, v3 +; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v0, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[23:24], v0, v[2:3] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v0, v[12:13] +; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr2 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: s_mov_b32 s8, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v14, v2, v3 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v29, v31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v30, v32 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v33 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v34 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v29 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v30 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v23 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v24 +; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v15 +; GFX9-G-O0-NEXT: 
v_or_b32_e64 v13, v1, v13 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v21 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v22 +; GFX9-G-O0-NEXT: v_or3_b32 v12, v12, v14, v15 +; GFX9-G-O0-NEXT: v_or3_b32 v2, v2, v3, v13 +; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v11, s[8:9], v11, v4 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v10, v9, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v8, v7, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v6, v5, s[8:9] +; GFX9-G-O0-NEXT: s_mov_b32 s8, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v8, v6, v10 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v6, v6, v10 +; GFX9-G-O0-NEXT: s_mov_b32 s9, 1 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-G-O0-NEXT: v_and_b32_e64 v12, v8, s9 +; GFX9-G-O0-NEXT: v_and_b32_e64 v10, v8, s8 +; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s4 +; GFX9-G-O0-NEXT: ; kill: def $vgpr12_vgpr13 killed $vgpr12_vgpr13 def $vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v25 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v27 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v28 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v23 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, 
v24 +; GFX9-G-O0-NEXT: v_and_b32_e64 v11, v8, v11 +; GFX9-G-O0-NEXT: v_and_b32_e64 v10, v8, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v21 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v22 +; GFX9-G-O0-NEXT: v_and_b32_e64 v8, v6, v8 +; GFX9-G-O0-NEXT: v_and_b32_e64 v6, v6, v21 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v4, s[8:9], v4, v11 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v9, v10, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v9, s[8:9], v7, v8, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v5, v6, s[8:9] +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v19 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v20 +; GFX9-G-O0-NEXT: s_mov_b32 s8, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s12, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s11, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s10, -1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, s8 +; GFX9-G-O0-NEXT: v_add_co_u32_e64 v17, s[8:9], v11, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s12 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v18, s[8:9], v10, v11, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s11 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v20, s[8:9], v9, v10, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s10 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v19, s[8:9], v8, v9, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v20 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v19 +; GFX9-G-O0-NEXT: v_or_b32_e64 v17, v17, v20 +; GFX9-G-O0-NEXT: v_or_b32_e64 v19, v18, v19 +; GFX9-G-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v19 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[19:20] 
+; GFX9-G-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v0 +; GFX9-G-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v12 +; GFX9-G-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s6, 2 +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s7, 3 +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s6, 6 +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s7, 7 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: 
buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: s_cbranch_execnz .LBB0_6 +; GFX9-G-O0-NEXT: s_branch .LBB0_1 +; GFX9-G-O0-NEXT: .LBB0_7: ; %udiv-preheader +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 
; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b32 s4, 64 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v4, v13, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v5, v5, v13 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v13, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v13, v6 +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[6:7], v13, v[21:22] +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[26:27], v13, v[15:16] +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[24:25], v5, 
v[21:22] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v26 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v27 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v24 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v25 +; GFX9-G-O0-NEXT: v_or_b32_e64 v14, v14, v23 +; GFX9-G-O0-NEXT: v_or_b32_e64 v13, v5, v13 +; GFX9-G-O0-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[21:22], v4, v[21:22] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v21 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v22 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[4:5] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v16 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v13, v5, v13, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-G-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] +; GFX9-G-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-G-O0-NEXT: ; kill: def $vgpr4_vgpr5 killed $vgpr4_vgpr5 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v19 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v20 +; GFX9-G-O0-NEXT: s_mov_b32 s4, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s10, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s7, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s6, -1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-G-O0-NEXT: v_add_co_u32_e64 v16, s[4:5], v16, v17 +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:288 ; 
4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, s10 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v15, s[4:5], v15, v16, s[4:5] +; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v14, s[4:5], v14, v15, s[4:5] +; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v13, s[4:5], v13, v14, s[4:5] +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX9-G-O0-NEXT: v_writelane_b32 v12, s8, 6 +; GFX9-G-O0-NEXT: v_writelane_b32 v12, s9, 7 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s4 +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword 
v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB0_6 +; GFX9-G-O0-NEXT: .LBB0_8: ; %udiv-bb1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 1 +; GFX9-G-O0-NEXT: s_mov_b32 s10, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s9, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-G-O0-NEXT: 
s_waitcnt vmcnt(3) +; GFX9-G-O0-NEXT: v_add_co_u32_e64 v5, s[6:7], v2, v5 +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v6, s[6:7], v4, v6, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v8, s[6:7], v3, v4, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v1, v3, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v7 +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0x7f +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v4, s[6:7], v1, v2 +; GFX9-G-O0-NEXT: s_mov_b32 s7, 64 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v3, v4, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v9, v1, v4 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[8:9], v4, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, v1 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[1:2], v4, v[13:14] +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[18:19], v9, v[13:14] +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[16:17], v4, v[11:12] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v19 +; GFX9-G-O0-NEXT: 
v_mov_b32_e32 v15, v16 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v17 +; GFX9-G-O0-NEXT: v_or_b32_e64 v10, v10, v15 +; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v4, v9 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[13:14], v3, v[13:14] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[8:9] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[8:9] +; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v14 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[8:9] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v12 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr1_vgpr2 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX9-G-O0-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v8 +; GFX9-G-O0-NEXT: v_or_b32_e64 v7, v6, v7 +; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 
v6, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s4 +; GFX9-G-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[5:6], v[7:8] +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], exec +; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s6, 4 +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s7, 5 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: s_cbranch_execz .LBB0_5 +; GFX9-G-O0-NEXT: s_branch .LBB0_7 +; GFX9-G-O0-NEXT: .LBB0_9: ; %udiv-end +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, 
off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v0, v0, v8 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v1, v1, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v2, v2, v6 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v3, v3, v5 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v0, s[4:5], v0, v8 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v1, s[4:5], v1, v7, s[4:5] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v2, s[4:5], v2, v6, s[4:5] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v3, s[4:5], v3, v5, s[4:5] +; GFX9-G-O0-NEXT: ; kill: killed $vgpr4 +; GFX9-G-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 
exec, s[4:5] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: s_setpc_b64 s[30:31] %div = sdiv i128 %lhs, %rhs ret i128 %div } @@ -2306,6 +3457,1043 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-G-LABEL: v_udiv_i128_vv: +; GFX9-G: ; %bb.0: ; %_udiv-special-cases +; GFX9-G-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-G-NEXT: v_or_b32_e32 v8, v4, v6 +; GFX9-G-NEXT: v_or_b32_e32 v9, v5, v7 +; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX9-G-NEXT: v_or_b32_e32 v8, v0, v2 +; GFX9-G-NEXT: v_or_b32_e32 v9, v1, v3 +; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[8:9] +; GFX9-G-NEXT: v_ffbh_u32_e32 v9, v4 +; GFX9-G-NEXT: v_ffbh_u32_e32 v8, v5 +; GFX9-G-NEXT: v_add_u32_e32 v9, 32, v9 +; GFX9-G-NEXT: v_ffbh_u32_e32 v10, v6 +; GFX9-G-NEXT: v_min_u32_e32 v8, v8, v9 +; GFX9-G-NEXT: v_ffbh_u32_e32 v9, v7 +; GFX9-G-NEXT: v_add_u32_e32 v10, 32, v10 +; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[6:7] +; GFX9-G-NEXT: v_add_u32_e32 v8, 64, v8 +; GFX9-G-NEXT: v_min_u32_e32 v9, v9, v10 +; GFX9-G-NEXT: v_ffbh_u32_e32 v10, v0 +; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[6:7] +; GFX9-G-NEXT: v_ffbh_u32_e32 v9, v1 +; GFX9-G-NEXT: v_add_u32_e32 v10, 32, v10 +; GFX9-G-NEXT: v_ffbh_u32_e32 v11, v2 +; GFX9-G-NEXT: v_min_u32_e32 v9, v9, v10 +; GFX9-G-NEXT: v_ffbh_u32_e32 v10, v3 +; GFX9-G-NEXT: v_add_u32_e32 v11, 32, v11 +; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3] +; GFX9-G-NEXT: v_add_u32_e32 v9, 64, v9 +; GFX9-G-NEXT: v_min_u32_e32 v10, v10, v11 +; GFX9-G-NEXT: v_cndmask_b32_e64 v9, v10, v9, s[6:7] +; GFX9-G-NEXT: v_sub_co_u32_e64 v12, s[6:7], v8, v9 +; GFX9-G-NEXT: v_subb_co_u32_e64 v13, s[6:7], 0, 0, s[6:7] +; GFX9-G-NEXT: v_mov_b32_e32 v8, 0x7f +; GFX9-G-NEXT: v_subb_co_u32_e64 v14, s[6:7], 0, 0, s[6:7] +; GFX9-G-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-G-NEXT: v_subb_co_u32_e64 v15, s[6:7], 0, 0, s[6:7] +; GFX9-G-NEXT: v_cmp_gt_u64_e64 
s[6:7], v[12:13], v[8:9] +; GFX9-G-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-G-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[6:7] +; GFX9-G-NEXT: v_cmp_lt_u64_e64 s[6:7], 0, v[14:15] +; GFX9-G-NEXT: v_or_b32_e32 v17, v13, v15 +; GFX9-G-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[6:7] +; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15] +; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[6:7] +; GFX9-G-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GFX9-G-NEXT: v_or_b32_e32 v18, v9, v8 +; GFX9-G-NEXT: v_xor_b32_e32 v8, 0x7f, v12 +; GFX9-G-NEXT: v_or_b32_e32 v16, v8, v14 +; GFX9-G-NEXT: v_and_b32_e32 v8, 1, v18 +; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-G-NEXT: v_cndmask_b32_e64 v10, v0, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v11, v1, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v9, v3, 0, vcc +; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; GFX9-G-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GFX9-G-NEXT: v_or_b32_e32 v16, v18, v16 +; GFX9-G-NEXT: v_and_b32_e32 v16, 1, v16 +; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-G-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GFX9-G-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GFX9-G-NEXT: s_cbranch_execz .LBB1_6 +; GFX9-G-NEXT: ; %bb.1: ; %udiv-bb1 +; GFX9-G-NEXT: v_add_co_u32_e32 v18, vcc, 1, v12 +; GFX9-G-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v13, vcc +; GFX9-G-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v14, vcc +; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v15, vcc +; GFX9-G-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GFX9-G-NEXT: v_sub_co_u32_e32 v16, vcc, 0x7f, v12 +; GFX9-G-NEXT: v_sub_u32_e32 v8, 64, v16 +; GFX9-G-NEXT: v_lshrrev_b64 v[8:9], v8, v[0:1] +; GFX9-G-NEXT: v_lshlrev_b64 v[10:11], v16, v[2:3] +; GFX9-G-NEXT: v_subrev_u32_e32 v14, 64, v16 +; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], v16, v[0:1] +; GFX9-G-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX9-G-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], v14, v[0:1] +; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, 
v16 +; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-G-NEXT: v_cndmask_b32_e32 v14, 0, v12, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v15, 0, v13, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GFX9-G-NEXT: v_mov_b32_e32 v13, s11 +; GFX9-G-NEXT: v_cndmask_b32_e32 v8, v8, v2, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v9, v9, v3, vcc +; GFX9-G-NEXT: v_mov_b32_e32 v11, s9 +; GFX9-G-NEXT: v_mov_b32_e32 v10, s8 +; GFX9-G-NEXT: v_mov_b32_e32 v12, s10 +; GFX9-G-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GFX9-G-NEXT: s_xor_b64 s[12:13], exec, s[8:9] +; GFX9-G-NEXT: s_cbranch_execz .LBB1_5 +; GFX9-G-NEXT: ; %bb.2: ; %udiv-preheader +; GFX9-G-NEXT: v_sub_u32_e32 v12, 64, v18 +; GFX9-G-NEXT: v_subrev_u32_e32 v22, 64, v18 +; GFX9-G-NEXT: v_lshrrev_b64 v[10:11], v18, v[0:1] +; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], v12, v[2:3] +; GFX9-G-NEXT: v_lshrrev_b64 v[16:17], v18, v[2:3] +; GFX9-G-NEXT: v_lshrrev_b64 v[2:3], v22, v[2:3] +; GFX9-G-NEXT: v_or_b32_e32 v10, v10, v12 +; GFX9-G-NEXT: v_or_b32_e32 v11, v11, v13 +; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 +; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-G-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc +; GFX9-G-NEXT: v_add_co_u32_e32 v22, vcc, -1, v4 +; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v5, vcc +; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-G-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 +; GFX9-G-NEXT: v_addc_co_u32_e32 v24, vcc, -1, v6, vcc +; GFX9-G-NEXT: v_mov_b32_e32 v13, s11 +; GFX9-G-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] +; GFX9-G-NEXT: v_cndmask_b32_e64 v3, v3, v1, s[4:5] +; GFX9-G-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v7, vcc +; GFX9-G-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-G-NEXT: v_mov_b32_e32 v11, s9 +; GFX9-G-NEXT: v_mov_b32_e32 v10, s8 +; GFX9-G-NEXT: v_mov_b32_e32 v12, 
s10 +; GFX9-G-NEXT: .LBB1_3: ; %udiv-do-while +; GFX9-G-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[14:15] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v0, 31, v15 +; GFX9-G-NEXT: v_or_b32_e32 v14, v10, v12 +; GFX9-G-NEXT: v_or_b32_e32 v15, v11, v13 +; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[16:17] +; GFX9-G-NEXT: v_lshlrev_b64 v[10:11], 1, v[2:3] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 31, v3 +; GFX9-G-NEXT: v_or_b32_e32 v12, v12, v2 +; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 31, v9 +; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-G-NEXT: v_or_b32_e32 v2, v10, v2 +; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v0 +; GFX9-G-NEXT: v_sub_co_u32_e32 v0, vcc, v22, v2 +; GFX9-G-NEXT: v_subb_co_u32_e32 v0, vcc, v23, v11, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v0, vcc, v24, v12, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v0, vcc, v25, v13, vcc +; GFX9-G-NEXT: v_add_co_u32_e64 v18, s[4:5], -1, v18 +; GFX9-G-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; GFX9-G-NEXT: v_addc_co_u32_e64 v19, s[4:5], -1, v19, s[4:5] +; GFX9-G-NEXT: v_and_b32_e32 v10, v3, v4 +; GFX9-G-NEXT: v_addc_co_u32_e64 v20, s[4:5], -1, v20, s[4:5] +; GFX9-G-NEXT: v_and_b32_e32 v16, v3, v5 +; GFX9-G-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v10 +; GFX9-G-NEXT: v_addc_co_u32_e64 v21, s[4:5], -1, v21, s[4:5] +; GFX9-G-NEXT: v_and_b32_e32 v0, 1, v3 +; GFX9-G-NEXT: v_and_b32_e32 v17, v3, v6 +; GFX9-G-NEXT: v_and_b32_e32 v26, v3, v7 +; GFX9-G-NEXT: v_subb_co_u32_e32 v3, vcc, v11, v16, vcc +; GFX9-G-NEXT: v_or_b32_e32 v10, v18, v20 +; GFX9-G-NEXT: v_or_b32_e32 v11, v19, v21 +; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] +; GFX9-G-NEXT: v_subb_co_u32_e32 v16, vcc, v12, v17, vcc +; GFX9-G-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-G-NEXT: v_subb_co_u32_e32 v17, vcc, v13, v26, vcc +; GFX9-G-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GFX9-G-NEXT: v_mov_b32_e32 v10, v0 +; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-G-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-G-NEXT: ; %bb.4: ; %Flow +; GFX9-G-NEXT: s_or_b64 exec, 
exec, s[8:9] +; GFX9-G-NEXT: .LBB1_5: ; %Flow2 +; GFX9-G-NEXT: s_or_b64 exec, exec, s[12:13] +; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 1, v[14:15] +; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 31, v15 +; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v2 +; GFX9-G-NEXT: v_or_b32_e32 v10, v10, v0 +; GFX9-G-NEXT: v_or_b32_e32 v11, v11, v1 +; GFX9-G-NEXT: .LBB1_6: ; %Flow3 +; GFX9-G-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-G-NEXT: v_mov_b32_e32 v0, v10 +; GFX9-G-NEXT: v_mov_b32_e32 v1, v11 +; GFX9-G-NEXT: v_mov_b32_e32 v2, v8 +; GFX9-G-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-G-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-G-O0-LABEL: v_udiv_i128_vv: +; GFX9-G-O0: ; %bb.0: ; %_udiv-special-cases +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-G-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v9, off, 
s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v5 +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v9 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v14 +; 
GFX9-G-O0-NEXT: v_or_b32_e64 v9, v9, v12 +; GFX9-G-O0-NEXT: v_or_b32_e64 v11, v10, v11 +; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[6:7], v[9:10], v[11:12] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v9, v12 +; GFX9-G-O0-NEXT: v_or_b32_e64 v11, v10, v11 +; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], v[11:12] +; GFX9-G-O0-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], v[5:6] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v11 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 32 +; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v6, v7 +; GFX9-G-O0-NEXT: v_min_u32_e64 v5, v5, v6 +; GFX9-G-O0-NEXT: s_mov_b32 s10, 64 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v5, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-G-O0-NEXT: 
v_mov_b32_e32 v8, 32 +; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v7, v8 +; GFX9-G-O0-NEXT: v_min_u32_e64 v5, v5, v7 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[8:9] +; GFX9-G-O0-NEXT: s_mov_b32 s14, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], v[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v12 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 32 +; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v7, v8 +; GFX9-G-O0-NEXT: v_min_u32_e64 v6, v6, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v6, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v8, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, 32 +; GFX9-G-O0-NEXT: v_add_u32_e64 v8, v8, v9 +; GFX9-G-O0-NEXT: v_min_u32_e64 v6, v6, v8 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[8:9] +; GFX9-G-O0-NEXT: s_mov_b32 s13, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s11, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s12, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s10, 0 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v6, s[8:9], v5, v6 +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s14 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v7, s[8:9], v5, v7, s[8:9] +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v9, s[8:9], v5, v8, s[8:9] +; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; 
GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s10 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v5, v8, s[8:9] +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[12:13], 0x7f +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s4 +; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[10:11], v[12:13], v[14:15] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[12:13], v[14:15] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s13 +; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[12:13], v[10:11], v[12:13] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v10, v5, v10, s[12:13] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[10:11] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v10, v5, v10, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[6:7] +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v10 +; GFX9-G-O0-NEXT: s_mov_b32 s7, 0x7f +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v6, v6, s7 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v7, v7, s6 +; GFX9-G-O0-NEXT: v_or_b32_e64 v6, v6, v9 +; GFX9-G-O0-NEXT: v_or_b32_e64 v8, v7, v8 +; GFX9-G-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[6:7], v[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v1 +; 
GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v3 +; GFX9-G-O0-NEXT: v_and_b32_e32 v1, 1, v5 +; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-G-O0-NEXT: v_and_b32_e32 v3, 1, v5 +; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr1_vgpr2 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-G-O0-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5 +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], -1 +; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; 
GFX9-G-O0-NEXT: s_mov_b64 s[4:5], exec +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s4, 0 +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s5, 1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: s_cbranch_execz .LBB1_3 +; GFX9-G-O0-NEXT: s_branch .LBB1_8 +; GFX9-G-O0-NEXT: .LBB1_1: ; %Flow +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v0, 2 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v0, 3 +; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: ; %bb.2: ; %Flow +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4) +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; 
GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB1_5 +; GFX9-G-O0-NEXT: .LBB1_3: ; %Flow2 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v4, 0 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v4, 1 +; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB1_9 +; GFX9-G-O0-NEXT: .LBB1_4: ; %udiv-loop-exit +; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; 
GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[10:11], v0, v[2:3] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v0, v[4:5] +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr2 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: s_mov_b32 s4, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v6, v2, v3 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v11 +; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v7 +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v1, v5 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-G-O0-NEXT: 
v_mov_b32_e32 v5, v9 +; GFX9-G-O0-NEXT: v_or3_b32 v4, v4, v6, v7 +; GFX9-G-O0-NEXT: v_or3_b32 v2, v2, v3, v5 +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB1_3 +; GFX9-G-O0-NEXT: .LBB1_5: ; %Flow1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v8, 4 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v8, 5 +; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt 
vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB1_4 +; GFX9-G-O0-NEXT: .LBB1_6: ; %udiv-do-while +; GFX9-G-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s6, v16, 6 +; GFX9-G-O0-NEXT: v_readlane_b32 s7, v16, 7 +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded 
Reload +; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[21:22], v2, v[0:1] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[3:4] +; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr0 
killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec +; GFX9-G-O0-NEXT: s_mov_b32 s9, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v3, v0, v1 +; GFX9-G-O0-NEXT: s_mov_b32 s9, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-G-O0-NEXT: v_or_b32_e64 v7, v2, v3 +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v0, v1 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr12_vgpr13 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v15 +; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr0 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec +; GFX9-G-O0-NEXT: s_mov_b32 s9, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v3, v0, v1 +; GFX9-G-O0-NEXT: s_mov_b32 s9, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v21 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v22 +; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v2, v3 +; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v0, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[23:24], v0, v[2:3] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v0, v[12:13] +; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr2 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: s_mov_b32 s8, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v14, v2, v3 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-G-O0-NEXT: v_mov_b32_e32 
v29, v31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v30, v32 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v33 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v34 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v29 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v30 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v23 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v24 +; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v15 +; GFX9-G-O0-NEXT: v_or_b32_e64 v13, v1, v13 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v21 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v22 +; GFX9-G-O0-NEXT: v_or3_b32 v12, v12, v14, v15 +; GFX9-G-O0-NEXT: v_or3_b32 v2, v2, v3, v13 +; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v11, s[8:9], v11, v4 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v10, v9, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v8, v7, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v6, v5, s[8:9] +; GFX9-G-O0-NEXT: s_mov_b32 s8, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v8, v6, v10 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v6, v6, v10 +; GFX9-G-O0-NEXT: s_mov_b32 s9, 1 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-G-O0-NEXT: v_and_b32_e64 v12, v8, s9 +; GFX9-G-O0-NEXT: v_and_b32_e64 v10, v8, s8 +; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s4 +; GFX9-G-O0-NEXT: ; kill: def $vgpr12_vgpr13 killed $vgpr12_vgpr13 def 
$vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v25 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v27 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v28 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v23 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v24 +; GFX9-G-O0-NEXT: v_and_b32_e64 v11, v8, v11 +; GFX9-G-O0-NEXT: v_and_b32_e64 v10, v8, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v21 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v22 +; GFX9-G-O0-NEXT: v_and_b32_e64 v8, v6, v8 +; GFX9-G-O0-NEXT: v_and_b32_e64 v6, v6, v21 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v4, s[8:9], v4, v11 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v9, v10, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v9, s[8:9], v7, v8, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v5, v6, s[8:9] +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v19 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v20 +; GFX9-G-O0-NEXT: s_mov_b32 s8, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s12, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s11, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s10, -1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, s8 +; GFX9-G-O0-NEXT: v_add_co_u32_e64 v17, s[8:9], v11, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s12 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v18, s[8:9], v10, v11, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s11 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v20, s[8:9], v9, v10, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s10 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v19, s[8:9], v8, v9, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v20 +; GFX9-G-O0-NEXT: v_mov_b32_e32 
v11, v19 +; GFX9-G-O0-NEXT: v_or_b32_e64 v17, v17, v20 +; GFX9-G-O0-NEXT: v_or_b32_e64 v19, v18, v19 +; GFX9-G-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v19 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[19:20] +; GFX9-G-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v0 +; GFX9-G-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v12 +; GFX9-G-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s6, 2 +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s7, 3 +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s6, 6 +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s7, 7 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: 
s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: s_cbranch_execnz .LBB1_6 +; GFX9-G-O0-NEXT: s_branch .LBB1_1 +; GFX9-G-O0-NEXT: .LBB1_7: ; %udiv-preheader +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 
4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b32 s4, 64 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v4, v13, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v5, v5, v13 +; 
GFX9-G-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v13, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v13, v6 +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[6:7], v13, v[21:22] +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[26:27], v13, v[15:16] +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[24:25], v5, v[21:22] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v26 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v27 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v24 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v25 +; GFX9-G-O0-NEXT: v_or_b32_e64 v14, v14, v23 +; GFX9-G-O0-NEXT: v_or_b32_e64 v13, v5, v13 +; GFX9-G-O0-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[21:22], v4, v[21:22] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v21 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v22 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[4:5] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v16 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v13, v5, v13, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-G-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] +; GFX9-G-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-G-O0-NEXT: ; kill: def $vgpr4_vgpr5 killed $vgpr4_vgpr5 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v18 +; GFX9-G-O0-NEXT: 
v_mov_b32_e32 v14, v19 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v20 +; GFX9-G-O0-NEXT: s_mov_b32 s4, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s10, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s7, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s6, -1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-G-O0-NEXT: v_add_co_u32_e64 v16, s[4:5], v16, v17 +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, s10 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v15, s[4:5], v15, v16, s[4:5] +; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v14, s[4:5], v14, v15, s[4:5] +; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v13, s[4:5], v13, v14, s[4:5] +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX9-G-O0-NEXT: v_writelane_b32 v12, s8, 6 +; GFX9-G-O0-NEXT: v_writelane_b32 v12, s9, 7 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s4 +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 
offset:216 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB1_6 +; GFX9-G-O0-NEXT: .LBB1_8: ; %udiv-bb1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword 
v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 1 +; GFX9-G-O0-NEXT: s_mov_b32 s10, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s9, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) +; GFX9-G-O0-NEXT: v_add_co_u32_e64 v5, s[6:7], v2, v5 +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v6, s[6:7], v4, v6, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v8, s[6:7], v3, v4, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v1, v3, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v7 +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0x7f +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v4, s[6:7], v1, v2 +; GFX9-G-O0-NEXT: s_mov_b32 s7, 64 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v3, v4, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v9, v1, v4 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-G-O0-NEXT: 
v_cmp_lt_u32_e64 s[8:9], v4, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, v1 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[1:2], v4, v[13:14] +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[18:19], v9, v[13:14] +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[16:17], v4, v[11:12] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v19 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v17 +; GFX9-G-O0-NEXT: v_or_b32_e64 v10, v10, v15 +; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v4, v9 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[13:14], v3, v[13:14] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[8:9] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[8:9] +; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v14 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[8:9] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v12 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr1_vgpr2 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte 
Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX9-G-O0-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v8 +; GFX9-G-O0-NEXT: v_or_b32_e64 v7, v6, v7 +; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s4 +; GFX9-G-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[5:6], v[7:8] +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], exec +; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s6, 4 +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s7, 5 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: s_cbranch_execz 
.LBB1_5 +; GFX9-G-O0-NEXT: s_branch .LBB1_7 +; GFX9-G-O0-NEXT: .LBB1_9: ; %udiv-end +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v8 +; GFX9-G-O0-NEXT: ; kill: killed $vgpr4 +; GFX9-G-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: s_setpc_b64 s[30:31] %div = udiv i128 %lhs, %rhs ret i128 %div } @@ -2388,6 +4576,66 @@ define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) { ; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4] ; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-G-LABEL: v_sdiv_i128_v_pow2k: +; GFX9-G: ; %bb.0: +; GFX9-G-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-G-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX9-G-NEXT: v_mov_b32_e32 
v5, v4 +; GFX9-G-NEXT: v_lshrrev_b64 v[4:5], 31, v[4:5] +; GFX9-G-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-G-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v5, vcc +; GFX9-G-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc +; GFX9-G-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc +; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v3, 1, v4 +; GFX9-G-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX9-G-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-G-NEXT: v_ashrrev_i32_e32 v2, 1, v2 +; GFX9-G-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-G-O0-LABEL: v_sdiv_i128_v_pow2k: +; GFX9-G-O0: ; %bb.0: +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v0, v0, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[6:7], v0, v[5:6] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s5, 0 +; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[6:7], v4, v5 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v1, s[6:7], v1, v0, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v2, v0, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v4, s[6:7], v3, v0, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-G-O0-NEXT: s_mov_b32 s5, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v0, v0, v1 +; GFX9-G-O0-NEXT: s_mov_b32 s5, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v2, v[5:6] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6 +; 
GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v3 +; GFX9-G-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v3, v2, v4 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v2, v2, v4 +; GFX9-G-O0-NEXT: s_setpc_b64 s[30:31] %div = sdiv i128 %lhs, 8589934592 ret i128 %div } @@ -2434,10 +4682,42 @@ define i128 @v_udiv_i128_v_pow2k(i128 %lhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-G-LABEL: v_udiv_i128_v_pow2k: +; GFX9-G: ; %bb.0: +; GFX9-G-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-G-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 31, v[2:3] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 1, v4 +; GFX9-G-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 1, v3 +; GFX9-G-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-G-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-G-O0-LABEL: v_udiv_i128_v_pow2k: +; GFX9-G-O0: ; %bb.0: +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v0, v0, v1 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v2, v[4:5] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6 +; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v4 +; GFX9-G-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v2, v2, v3 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 0 +; 
GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-G-O0-NEXT: s_setpc_b64 s[30:31] %div = udiv i128 %lhs, 8589934592 ret i128 %div } - -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX9-SDAG: {{.*}} -; GFX9-SDAG-O0: {{.*}} -- cgit v1.1 From 3a106e5b2cd9f4073b2961b991ebaeee96786309 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Fri, 29 Mar 2024 15:59:50 -0400 Subject: [GlobalISel] Fold G_ICMP if possible (#86357) This patch tries to fold `G_ICMP` if possible. --- .../legalize-atomic-cmpxchg-with-success.mir | 15 +++++--- .../CodeGen/AMDGPU/GlobalISel/legalize-saddo.mir | 15 +++++--- .../CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir | 45 +++++++++++++--------- .../CodeGen/AMDGPU/GlobalISel/legalize-ssubo.mir | 15 +++++--- .../CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir | 45 +++++++++++++--------- 5 files changed, 84 insertions(+), 51 deletions(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomic-cmpxchg-with-success.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomic-cmpxchg-with-success.mir index e288d9d..eafd1e1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomic-cmpxchg-with-success.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomic-cmpxchg-with-success.mir @@ -16,7 +16,8 @@ body: | ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY1]](s32) ; CHECK-NEXT: [[AMDGPU_ATOMIC_CMPXCHG:%[0-9]+]]:_(s32) = G_AMDGPU_ATOMIC_CMPXCHG [[COPY]](p1), [[BUILD_VECTOR]] :: (load store syncscope("agent-one-as") monotonic monotonic (s32), addrspace 1) ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AMDGPU_ATOMIC_CMPXCHG]](s32), [[COPY1]] - ; CHECK-NEXT: S_ENDPGM 0, implicit [[AMDGPU_ATOMIC_CMPXCHG]](s32), implicit [[ICMP]](s1) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[AMDGPU_ATOMIC_CMPXCHG]](s32) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY3]](s32), implicit [[ICMP]](s1) %0:_(p1) = 
COPY $vgpr0_vgpr1 %1:_(s32) = COPY $vgpr2 %2:_(s32) = COPY $vgpr3 @@ -40,7 +41,8 @@ body: | ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY1]](s32) ; CHECK-NEXT: [[AMDGPU_ATOMIC_CMPXCHG:%[0-9]+]]:_(s32) = G_AMDGPU_ATOMIC_CMPXCHG [[COPY]](p0), [[BUILD_VECTOR]] :: (load store syncscope("agent-one-as") monotonic monotonic (s32)) ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AMDGPU_ATOMIC_CMPXCHG]](s32), [[COPY1]] - ; CHECK-NEXT: S_ENDPGM 0, implicit [[AMDGPU_ATOMIC_CMPXCHG]](s32), implicit [[ICMP]](s1) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[AMDGPU_ATOMIC_CMPXCHG]](s32) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY3]](s32), implicit [[ICMP]](s1) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(s32) = COPY $vgpr2 %2:_(s32) = COPY $vgpr3 @@ -63,7 +65,8 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 ; CHECK-NEXT: [[ATOMIC_CMPXCHG:%[0-9]+]]:_(s32) = G_ATOMIC_CMPXCHG [[COPY]](p3), [[COPY1]], [[COPY2]] :: (load store syncscope("agent-one-as") monotonic monotonic (s32), addrspace 3) ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ATOMIC_CMPXCHG]](s32), [[COPY1]] - ; CHECK-NEXT: S_ENDPGM 0, implicit [[ATOMIC_CMPXCHG]](s32), implicit [[ICMP]](s1) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[ATOMIC_CMPXCHG]](s32) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY3]](s32), implicit [[ICMP]](s1) %0:_(p3) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = COPY $vgpr2 @@ -87,7 +90,8 @@ body: | ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[COPY2]](s64), [[COPY1]](s64) ; CHECK-NEXT: [[AMDGPU_ATOMIC_CMPXCHG:%[0-9]+]]:_(s64) = G_AMDGPU_ATOMIC_CMPXCHG [[COPY]](p1), [[BUILD_VECTOR]] :: (load store syncscope("agent-one-as") monotonic monotonic (s64), addrspace 1) ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AMDGPU_ATOMIC_CMPXCHG]](s64), [[COPY1]] - ; CHECK-NEXT: S_ENDPGM 0, implicit [[AMDGPU_ATOMIC_CMPXCHG]](s64), implicit [[ICMP]](s1) + ; CHECK-NEXT: 
[[COPY3:%[0-9]+]]:_(s64) = COPY [[AMDGPU_ATOMIC_CMPXCHG]](s64) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY3]](s64), implicit [[ICMP]](s1) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 %2:_(s64) = COPY $vgpr4_vgpr5 @@ -110,7 +114,8 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $vgpr3_vgpr4 ; CHECK-NEXT: [[ATOMIC_CMPXCHG:%[0-9]+]]:_(s64) = G_ATOMIC_CMPXCHG [[COPY]](p3), [[COPY1]], [[COPY2]] :: (load store syncscope("agent-one-as") monotonic monotonic (s64), addrspace 3) ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ATOMIC_CMPXCHG]](s64), [[COPY1]] - ; CHECK-NEXT: S_ENDPGM 0, implicit [[ATOMIC_CMPXCHG]](s64), implicit [[ICMP]](s1) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[ATOMIC_CMPXCHG]](s64) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY3]](s64), implicit [[ICMP]](s1) %0:_(p3) = COPY $vgpr0 %1:_(s64) = COPY $vgpr1_vgpr2 %2:_(s64) = COPY $vgpr3_vgpr4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddo.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddo.mir index dba20e1..eb86a98 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddo.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddo.mir @@ -86,8 +86,9 @@ body: | ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[ADD]](s32), [[COPY]] ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY1]](s32), [[C]] ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[XOR]](s1) - ; CHECK-NEXT: $vgpr0 = COPY [[ADD]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[ZEXT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 @@ -117,8 +118,9 @@ body: | ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]] ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY1]](s64), [[C]] ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; 
CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[XOR]](s1) - ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](s64) ; CHECK-NEXT: $vgpr2 = COPY [[ZEXT]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 @@ -172,11 +174,12 @@ body: | ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP1]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR]](s1) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY [[BITCAST2]](<2 x s16>) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C3]] ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C3]] ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND2]](s32), [[AND3]](s32) - ; CHECK-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) + ; CHECK-NEXT: $vgpr0 = COPY [[COPY3]](<2 x s16>) ; CHECK-NEXT: $vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 @@ -360,13 +363,14 @@ body: | ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1) ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR2]](s1) ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR3]](s1) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(<4 x s16>) = COPY [[CONCAT_VECTORS]](<4 x s16>) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C3]] ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C3]] ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C3]] ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C3]] ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[AND4]](s32), [[AND5]](s32), [[AND6]](s32), [[AND7]](s32) - ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[COPY5]](<4 
x s16>) ; CHECK-NEXT: $vgpr2_vgpr3_vgpr4_vgpr5 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr1_vgpr2 @@ -403,11 +407,12 @@ body: | ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP1]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR]](s1) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s32>) = COPY [[BUILD_VECTOR]](<2 x s32>) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C1]] ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C1]] ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND]](s32), [[AND1]](s32) - ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](<2 x s32>) ; CHECK-NEXT: $vgpr2_vgpr3 = COPY [[BUILD_VECTOR1]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir index 93d0071..80b3166 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir @@ -955,15 +955,16 @@ body: | ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]] ; GFX6-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY1]](s64), [[C]] ; GFX6-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX6-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX6-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX6-NEXT: [[UV6:%[0-9]+]]:_(s32), 
[[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX6-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]] ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO3]] ; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64) ; ; GFX8-LABEL: name: saddsat_s64 @@ -980,15 +981,16 @@ body: | ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]] ; GFX8-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY1]](s64), [[C]] ; GFX8-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX8-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX8-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX8-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]] ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO3]] ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64) ; ; GFX9-LABEL: name: saddsat_s64 @@ -1005,15 +1007,16 @@ body: | ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]] ; 
GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY1]](s64), [[C]] ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX9-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]] ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO3]] ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 @@ -1043,15 +1046,16 @@ body: | ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[UV]] ; GFX6-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV2]](s64), [[C]] ; GFX6-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX6-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX6-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX6-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX6-NEXT: 
[[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]] ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO3]] ; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX6-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX6-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX6-NEXT: [[UADDO4:%[0-9]+]]:_(s32), [[UADDO5:%[0-9]+]]:_(s1) = G_UADDO [[UV12]], [[UV14]] @@ -1060,13 +1064,14 @@ body: | ; GFX6-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[UV1]] ; GFX6-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV3]](s64), [[C]] ; GFX6-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP2]] - ; GFX6-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[MV2]], [[C1]](s32) + ; GFX6-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[MV2]](s64) + ; GFX6-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C1]](s32) ; GFX6-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR1]](s64) ; GFX6-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX6-NEXT: [[UADDO6:%[0-9]+]]:_(s32), [[UADDO7:%[0-9]+]]:_(s1) = G_UADDO [[UV16]], [[UV18]] ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UV17]], [[UV19]], [[UADDO7]] ; GFX6-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO6]](s32), [[UADDE6]](s32) - ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[MV2]] + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[COPY3]] ; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64) ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) 
; @@ -1086,15 +1091,16 @@ body: | ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[UV]] ; GFX8-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV2]](s64), [[C]] ; GFX8-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX8-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX8-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX8-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]] ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO3]] ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX8-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX8-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX8-NEXT: [[UADDO4:%[0-9]+]]:_(s32), [[UADDO5:%[0-9]+]]:_(s1) = G_UADDO [[UV12]], [[UV14]] @@ -1103,13 +1109,14 @@ body: | ; GFX8-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[UV1]] ; GFX8-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV3]](s64), [[C]] ; GFX8-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP2]] - ; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[MV2]], [[C1]](s32) + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[MV2]](s64) + ; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C1]](s32) ; GFX8-NEXT: [[UV16:%[0-9]+]]:_(s32), 
[[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR1]](s64) ; GFX8-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX8-NEXT: [[UADDO6:%[0-9]+]]:_(s32), [[UADDO7:%[0-9]+]]:_(s1) = G_UADDO [[UV16]], [[UV18]] ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UV17]], [[UV19]], [[UADDO7]] ; GFX8-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO6]](s32), [[UADDE6]](s32) - ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[MV2]] + ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[COPY3]] ; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64) ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; @@ -1129,15 +1136,16 @@ body: | ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[UV]] ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV2]](s64), [[C]] ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX9-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]] ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO3]] ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] 
; GFX9-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX9-NEXT: [[UADDO4:%[0-9]+]]:_(s32), [[UADDO5:%[0-9]+]]:_(s1) = G_UADDO [[UV12]], [[UV14]] @@ -1146,13 +1154,14 @@ body: | ; GFX9-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[UV1]] ; GFX9-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV3]](s64), [[C]] ; GFX9-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP2]] - ; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[MV2]], [[C1]](s32) + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[MV2]](s64) + ; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C1]](s32) ; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR1]](s64) ; GFX9-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX9-NEXT: [[UADDO6:%[0-9]+]]:_(s32), [[UADDO7:%[0-9]+]]:_(s1) = G_UADDO [[UV16]], [[UV18]] ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UV17]], [[UV19]], [[UADDO7]] ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO6]](s32), [[UADDE6]](s32) - ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[MV2]] + ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[COPY3]] ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubo.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubo.mir index 57b1ab9..220450c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubo.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubo.mir @@ -86,8 +86,9 @@ body: | ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SUB]](s32), [[COPY]] ; 
CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY1]](s32), [[C]] ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SUB]](s32) ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[XOR]](s1) - ; CHECK-NEXT: $vgpr0 = COPY [[SUB]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[ZEXT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 @@ -117,8 +118,9 @@ body: | ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]] ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY1]](s64), [[C]] ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[XOR]](s1) - ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](s64) ; CHECK-NEXT: $vgpr2 = COPY [[ZEXT]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 @@ -172,11 +174,12 @@ body: | ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP1]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR]](s1) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY [[BITCAST2]](<2 x s16>) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C3]] ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C3]] ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND2]](s32), [[AND3]](s32) - ; CHECK-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) + ; CHECK-NEXT: $vgpr0 = COPY [[COPY3]](<2 x s16>) ; CHECK-NEXT: $vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 @@ -360,13 +363,14 @@ body: | ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1) ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR2]](s1) ; 
CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR3]](s1) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(<4 x s16>) = COPY [[CONCAT_VECTORS]](<4 x s16>) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C3]] ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C3]] ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C3]] ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C3]] ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[AND4]](s32), [[AND5]](s32), [[AND6]](s32), [[AND7]](s32) - ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[COPY5]](<4 x s16>) ; CHECK-NEXT: $vgpr2_vgpr3_vgpr4_vgpr5 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr1_vgpr2 @@ -403,11 +407,12 @@ body: | ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP1]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR]](s1) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s32>) = COPY [[BUILD_VECTOR]](<2 x s32>) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C1]] ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C1]] ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND]](s32), [[AND1]](s32) - ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](<2 x s32>) ; CHECK-NEXT: $vgpr2_vgpr3 = COPY [[BUILD_VECTOR1]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir index 33a8cda..49fb6e9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir @@ 
-955,15 +955,16 @@ body: | ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]] ; GFX6-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY1]](s64), [[C]] ; GFX6-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX6-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX6-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX6-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX6-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]] ; GFX6-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO1]] ; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) - ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64) ; ; GFX8-LABEL: name: ssubsat_s64 @@ -980,15 +981,16 @@ body: | ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]] ; GFX8-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY1]](s64), [[C]] ; GFX8-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX8-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX8-NEXT: 
[[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX8-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]] ; GFX8-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO1]] ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) - ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64) ; ; GFX9-LABEL: name: ssubsat_s64 @@ -1005,15 +1007,16 @@ body: | ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]] ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY1]](s64), [[C]] ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX9-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]] ; GFX9-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO1]] ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 @@ -1043,15 +1046,16 @@ body: | ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = 
G_ICMP intpred(slt), [[MV]](s64), [[UV]] ; GFX6-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV2]](s64), [[C]] ; GFX6-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX6-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX6-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX6-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX6-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]] ; GFX6-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO1]] ; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) - ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX6-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX6-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV12]], [[UV14]] @@ -1060,13 +1064,14 @@ body: | ; GFX6-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[UV1]] ; GFX6-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV3]](s64), [[C]] ; GFX6-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP2]] - ; GFX6-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[MV2]], [[C1]](s32) + ; GFX6-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[MV2]](s64) + ; GFX6-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C1]](s32) ; GFX6-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR1]](s64) ; GFX6-NEXT: 
[[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX6-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV16]], [[UV18]] ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV17]], [[UV19]], [[UADDO3]] ; GFX6-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[MV2]] + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[COPY3]] ; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64) ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; @@ -1086,15 +1091,16 @@ body: | ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[UV]] ; GFX8-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV2]](s64), [[C]] ; GFX8-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX8-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX8-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX8-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]] ; GFX8-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO1]] ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) - ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX8-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = 
G_UNMERGE_VALUES [[UV1]](s64) ; GFX8-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV12]], [[UV14]] @@ -1103,13 +1109,14 @@ body: | ; GFX8-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[UV1]] ; GFX8-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV3]](s64), [[C]] ; GFX8-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP2]] - ; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[MV2]], [[C1]](s32) + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[MV2]](s64) + ; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C1]](s32) ; GFX8-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR1]](s64) ; GFX8-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX8-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV16]], [[UV18]] ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV17]], [[UV19]], [[UADDO3]] ; GFX8-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[MV2]] + ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[COPY3]] ; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64) ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; @@ -1129,15 +1136,16 @@ body: | ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[UV]] ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV2]](s64), [[C]] ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], 
[[C1]](s32) ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX9-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]] ; GFX9-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO1]] ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX9-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV12]], [[UV14]] @@ -1146,13 +1154,14 @@ body: | ; GFX9-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[UV1]] ; GFX9-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV3]](s64), [[C]] ; GFX9-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP2]] - ; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[MV2]], [[C1]](s32) + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[MV2]](s64) + ; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C1]](s32) ; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR1]](s64) ; GFX9-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX9-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV16]], [[UV18]] ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV17]], [[UV19]], [[UADDO3]] ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT 
[[XOR1]](s1), [[MV3]], [[MV2]] + ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[COPY3]] ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 -- cgit v1.1 From 95258419f6fe2e0922c2c0916fd176b9f7361555 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Sat, 30 Mar 2024 08:01:18 +0000 Subject: [AMDGPU] Use AMDGPU::isIntrinsicAlwaysUniform in isSDNodeAlwaysUniform (#87085) This is mostly just a simplification, but tests show a slight codegen improvement in code using the deprecated amdgcn.icmp/fcmp intrinsics. --- llvm/test/CodeGen/AMDGPU/wave32.ll | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 82816b4..901e88a 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -2479,8 +2479,7 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; GFX1032-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX1032-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1032-NEXT: v_mul_lo_u32 v2, s1, v1 -; GFX1032-NEXT: s_ff1_i32_b32 s1, 0x80000000 -; GFX1032-NEXT: s_add_i32 s1, s1, 32 +; GFX1032-NEXT: s_brev_b32 s1, 1 ; GFX1032-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1032-NEXT: v_mul_hi_u32 v1, v0, v1 @@ -2494,8 +2493,7 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; GFX1032-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_lshr_b32 s0, vcc_lo, 1 -; GFX1032-NEXT: s_ff1_i32_b32 s0, s0 -; GFX1032-NEXT: s_min_u32 s0, s0, s1 +; GFX1032-NEXT: s_ff1_i32_b64 s0, s[0:1] ; GFX1032-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1032-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -2529,10 +2527,7 @@ define 
amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_lshr_b64 s[0:1], vcc, 1 ; GFX1064-NEXT: s_bitset1_b32 s1, 31 -; GFX1064-NEXT: s_ff1_i32_b32 s0, s0 -; GFX1064-NEXT: s_ff1_i32_b32 s1, s1 -; GFX1064-NEXT: s_add_i32 s1, s1, 32 -; GFX1064-NEXT: s_min_u32 s0, s0, s1 +; GFX1064-NEXT: s_ff1_i32_b64 s0, s[0:1] ; GFX1064-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1] @@ -2576,9 +2571,8 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) { ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_div_scale_f32 v1, s1, s0, s0, v0 ; GFX1032-NEXT: v_div_scale_f32 v4, vcc_lo, v0, s0, v0 -; GFX1032-NEXT: s_ff1_i32_b32 s1, 0x80000000 +; GFX1032-NEXT: s_brev_b32 s1, 1 ; GFX1032-NEXT: v_rcp_f32_e32 v2, v1 -; GFX1032-NEXT: s_add_i32 s1, s1, 32 ; GFX1032-NEXT: v_fma_f32 v3, -v1, v2, 1.0 ; GFX1032-NEXT: v_fmac_f32_e32 v2, v3, v2 ; GFX1032-NEXT: v_mul_f32_e32 v3, v4, v2 @@ -2592,8 +2586,7 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) { ; GFX1032-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_lshr_b32 s0, vcc_lo, 1 ; GFX1032-NEXT: v_cmp_nlg_f32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_ff1_i32_b32 s0, s0 -; GFX1032-NEXT: s_min_u32 s0, s0, s1 +; GFX1032-NEXT: s_ff1_i32_b64 s0, s[0:1] ; GFX1032-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1032-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -2609,15 +2602,15 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) { ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], s2, s2, v0 -; GFX1064-NEXT: v_div_scale_f32 v4, vcc, v0, s2, v0 ; GFX1064-NEXT: v_rcp_f32_e32 v2, v1 ; GFX1064-NEXT: v_fma_f32 v3, -v1, v2, 1.0 ; GFX1064-NEXT: v_fmac_f32_e32 v2, v3, v2 -; GFX1064-NEXT: v_mul_f32_e32 v3, v4, v2 -; GFX1064-NEXT: v_fma_f32 v5, -v1, v3, v4 -; GFX1064-NEXT: v_fmac_f32_e32 v3, v5, v2 -; GFX1064-NEXT: v_fma_f32 v1, -v1, v3, v4 -; 
GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v3 +; GFX1064-NEXT: v_div_scale_f32 v3, vcc, v0, s2, v0 +; GFX1064-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX1064-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX1064-NEXT: v_fmac_f32_e32 v4, v5, v2 +; GFX1064-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX1064-NEXT: v_div_fixup_f32 v1, v1, s2, v0 ; GFX1064-NEXT: v_trunc_f32_e32 v1, v1 ; GFX1064-NEXT: v_fma_f32 v0, -v1, s2, v0 @@ -2625,10 +2618,7 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) { ; GFX1064-NEXT: s_lshr_b64 s[0:1], vcc, 1 ; GFX1064-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_bitset1_b32 s1, 31 -; GFX1064-NEXT: s_ff1_i32_b32 s0, s0 -; GFX1064-NEXT: s_ff1_i32_b32 s1, s1 -; GFX1064-NEXT: s_add_i32 s1, s1, 32 -; GFX1064-NEXT: s_min_u32 s0, s0, s1 +; GFX1064-NEXT: s_ff1_i32_b64 s0, s[0:1] ; GFX1064-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -- cgit v1.1 From 0234d90d818204f3a575de744a8df8448a7adeca Mon Sep 17 00:00:00 2001 From: Austin Kerbow Date: Sun, 31 Mar 2024 10:46:05 -0700 Subject: [AMDGPU] Extend MFMA padding option to gfx90a+ (#86768) It was shown experimentally that this may have some benefit on newer HW. 
--- .../CodeGen/AMDGPU/neighboring-mfma-padding.mir | 504 +++++++++++++++++++++ 1 file changed, 504 insertions(+) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir b/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir index 3de258b..bf2cf6a 100644 --- a/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir +++ b/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir @@ -5,6 +5,14 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx908 -amdgpu-mfma-padding-ratio=75 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=gfx908-PAD75 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx908 -amdgpu-mfma-padding-ratio=100 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=gfx908-PAD100 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=gfx90a-DEFAULT %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-padding-ratio=50 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=gfx90a-PAD50 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-padding-ratio=100 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=gfx90a-PAD100 %s + +# RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=gfx940-DEFAULT %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx940 -amdgpu-mfma-padding-ratio=50 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=gfx940-PAD50 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx940 -amdgpu-mfma-padding-ratio=100 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=gfx940-PAD100 %s + --- name: mfma_padding_2_pass body: | @@ -31,6 +39,35 @@ body: | ; gfx908-PAD100: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec ; 
gfx908-PAD100-NEXT: S_NOP 1 ; gfx908-PAD100-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-DEFAULT-LABEL: name: mfma_padding_2_pass + ; gfx90a-DEFAULT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD50-LABEL: name: mfma_padding_2_pass + ; gfx90a-PAD50: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD50-NEXT: S_NOP 0 + ; gfx90a-PAD50-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD100-LABEL: name: mfma_padding_2_pass + ; gfx90a-PAD100: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD100-NEXT: S_NOP 1 + ; gfx90a-PAD100-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-DEFAULT-LABEL: name: mfma_padding_2_pass + ; gfx940-DEFAULT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-DEFAULT-NEXT: S_NOP 1 + ; gfx940-DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD50-LABEL: name: mfma_padding_2_pass + ; gfx940-PAD50: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD50-NEXT: S_NOP 1 + ; gfx940-PAD50-NEXT: 
$agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD100-LABEL: name: mfma_padding_2_pass + ; gfx940-PAD100: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD100-NEXT: S_NOP 1 + ; gfx940-PAD100-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec ... @@ -64,6 +101,40 @@ body: | ; gfx908-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD100-NEXT: S_NOP 0 ; gfx908-PAD100-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-DEFAULT-LABEL: name: mfma_padding_2_pass_1_intervening_valu + ; gfx90a-DEFAULT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD50-LABEL: name: mfma_padding_2_pass_1_intervening_valu + ; gfx90a-PAD50: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD100-LABEL: 
name: mfma_padding_2_pass_1_intervening_valu + ; gfx90a-PAD100: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: S_NOP 0 + ; gfx90a-PAD100-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-DEFAULT-LABEL: name: mfma_padding_2_pass_1_intervening_valu + ; gfx940-DEFAULT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: S_NOP 0 + ; gfx940-DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD50-LABEL: name: mfma_padding_2_pass_1_intervening_valu + ; gfx940-PAD50: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: S_NOP 0 + ; gfx940-PAD50-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD100-LABEL: name: mfma_padding_2_pass_1_intervening_valu + ; gfx940-PAD100: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: S_NOP 0 + ; gfx940-PAD100-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, 
implicit $mode, implicit $exec $vgpr2 = V_MOV_B32_e32 1, implicit $exec $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec @@ -100,6 +171,41 @@ body: | ; gfx908-PAD100-NEXT: DBG_VALUE ; gfx908-PAD100-NEXT: S_NOP 1 ; gfx908-PAD100-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-DEFAULT-LABEL: name: mfma_padding_2_pass_dbg + ; gfx90a-DEFAULT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-DEFAULT-NEXT: DBG_VALUE + ; gfx90a-DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD50-LABEL: name: mfma_padding_2_pass_dbg + ; gfx90a-PAD50: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD50-NEXT: DBG_VALUE + ; gfx90a-PAD50-NEXT: S_NOP 0 + ; gfx90a-PAD50-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD100-LABEL: name: mfma_padding_2_pass_dbg + ; gfx90a-PAD100: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD100-NEXT: DBG_VALUE + ; gfx90a-PAD100-NEXT: S_NOP 1 + ; gfx90a-PAD100-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-DEFAULT-LABEL: name: mfma_padding_2_pass_dbg + ; gfx940-DEFAULT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-DEFAULT-NEXT: DBG_VALUE + ; gfx940-DEFAULT-NEXT: S_NOP 
1 + ; gfx940-DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD50-LABEL: name: mfma_padding_2_pass_dbg + ; gfx940-PAD50: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD50-NEXT: DBG_VALUE + ; gfx940-PAD50-NEXT: S_NOP 1 + ; gfx940-PAD50-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD100-LABEL: name: mfma_padding_2_pass_dbg + ; gfx940-PAD100: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD100-NEXT: DBG_VALUE + ; gfx940-PAD100-NEXT: S_NOP 1 + ; gfx940-PAD100-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec DBG_VALUE $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec @@ -132,6 +238,34 @@ body: | ; gfx908-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD100-NEXT: S_NOP 7 ; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, 
implicit $exec + ; + ; gfx90a-DEFAULT-LABEL: name: mfma_padding_8_pass + ; gfx90a-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD50-LABEL: name: mfma_padding_8_pass + ; gfx90a-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD50-NEXT: S_NOP 3 + ; gfx90a-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD100-LABEL: name: mfma_padding_8_pass + ; gfx90a-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD100-NEXT: S_NOP 7 + ; gfx90a-PAD100-NEXT: early-clobber 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-DEFAULT-LABEL: name: mfma_padding_8_pass + ; gfx940-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD50-LABEL: name: mfma_padding_8_pass + ; gfx940-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD50-NEXT: S_NOP 3 + ; gfx940-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD100-LABEL: name: mfma_padding_8_pass + ; gfx940-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD100-NEXT: S_NOP 7 + ; gfx940-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ... 
@@ -172,6 +306,46 @@ body: | ; gfx908-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD100-NEXT: S_NOP 5 ; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-DEFAULT-LABEL: name: mfma_padding_8_pass_2_intervening_valu + ; gfx90a-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD50-LABEL: name: mfma_padding_8_pass_2_intervening_valu + ; gfx90a-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: S_NOP 1 + ; gfx90a-PAD50-NEXT: early-clobber 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD100-LABEL: name: mfma_padding_8_pass_2_intervening_valu + ; gfx90a-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: S_NOP 5 + ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-DEFAULT-LABEL: name: mfma_padding_8_pass_2_intervening_valu + ; gfx940-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD50-LABEL: name: mfma_padding_8_pass_2_intervening_valu + ; gfx940-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: S_NOP 1 + ; gfx940-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD100-LABEL: name: mfma_padding_8_pass_2_intervening_valu + ; gfx940-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: S_NOP 5 + ; gfx940-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $vgpr2 = V_MOV_B32_e32 1, implicit $exec $vgpr3 = V_MOV_B32_e32 1, implicit $exec @@ -207,6 +381,36 @@ body: | ; gfx908-PAD100-NEXT: S_NOP 7 ; gfx908-PAD100-NEXT: S_NOP 7 ; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass + ; gfx90a-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD50-LABEL: name: mfma_padding_16_pass + ; gfx90a-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD50-NEXT: S_NOP 7 + ; gfx90a-PAD50-NEXT: early-clobber 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD100-LABEL: name: mfma_padding_16_pass + ; gfx90a-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD100-NEXT: S_NOP 7 + ; gfx90a-PAD100-NEXT: S_NOP 7 + ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-DEFAULT-LABEL: name: mfma_padding_16_pass + ; gfx940-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD50-LABEL: name: mfma_padding_16_pass + ; gfx940-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 
$vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD50-NEXT: S_NOP 7 + ; gfx940-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD100-LABEL: name: mfma_padding_16_pass + ; gfx940-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD100-NEXT: S_NOP 7 + ; gfx940-PAD100-NEXT: S_NOP 7 + ; gfx940-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ... 
@@ -258,6 +462,60 @@ body: | ; gfx908-PAD100-NEXT: S_NOP 7 ; gfx908-PAD100-NEXT: S_NOP 3 ; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass_4_intervening_valu + ; gfx90a-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD50-LABEL: name: mfma_padding_16_pass_4_intervening_valu + ; gfx90a-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec 
+ ; gfx90a-PAD50-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: S_NOP 3 + ; gfx90a-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD100-LABEL: name: mfma_padding_16_pass_4_intervening_valu + ; gfx90a-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: S_NOP 7 + ; gfx90a-PAD100-NEXT: S_NOP 3 + ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-DEFAULT-LABEL: name: mfma_padding_16_pass_4_intervening_valu + ; gfx940-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, 
implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD50-LABEL: name: mfma_padding_16_pass_4_intervening_valu + ; gfx940-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: S_NOP 3 + ; gfx940-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD100-LABEL: name: mfma_padding_16_pass_4_intervening_valu + ; gfx940-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 
1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: S_NOP 7 + ; gfx940-PAD100-NEXT: S_NOP 3 + ; gfx940-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $vgpr2 = V_MOV_B32_e32 1, implicit $exec $vgpr3 = V_MOV_B32_e32 1, implicit $exec @@ -369,6 +627,126 @@ body: | ; gfx908-PAD100-NEXT: $vgpr16 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD100-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass_16_intervening_valu + ; gfx90a-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec 
+ ; gfx90a-DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr7 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr8 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr10 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr11 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr12 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr13 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr14 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr15 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr16 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD50-LABEL: name: mfma_padding_16_pass_16_intervening_valu + ; gfx90a-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr7 = V_MOV_B32_e32 1, implicit $exec + ; 
gfx90a-PAD50-NEXT: $vgpr8 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr10 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr11 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr12 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr13 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr14 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr15 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr16 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD100-LABEL: name: mfma_padding_16_pass_16_intervening_valu + ; gfx90a-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr7 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr8 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr10 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr11 = 
V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr12 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr13 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr14 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr15 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr16 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-DEFAULT-LABEL: name: mfma_padding_16_pass_16_intervening_valu + ; gfx940-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr7 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr8 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr10 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr11 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr12 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr13 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr14 = 
V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr15 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr16 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD50-LABEL: name: mfma_padding_16_pass_16_intervening_valu + ; gfx940-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr7 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr8 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr10 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr11 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr12 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr13 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr14 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr15 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr16 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec + ; 
gfx940-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD100-LABEL: name: mfma_padding_16_pass_16_intervening_valu + ; gfx940-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr7 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr8 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr10 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr11 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr12 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr13 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr14 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr15 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr16 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $vgpr2 = V_MOV_B32_e32 1, implicit $exec $vgpr3 = V_MOV_B32_e32 1, implicit $exec @@ -414,6 +792,30 @@ body: | ; gfx908-PAD100-LABEL: name: mfma_padding_16_pass_occ_1 ; gfx908-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass_occ_1 + ; gfx90a-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD50-LABEL: 
name: mfma_padding_16_pass_occ_1 + ; gfx90a-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD100-LABEL: name: mfma_padding_16_pass_occ_1 + ; gfx90a-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-DEFAULT-LABEL: name: mfma_padding_16_pass_occ_1 + ; gfx940-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD50-LABEL: name: mfma_padding_16_pass_occ_1 + ; gfx940-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD100-LABEL: name: mfma_padding_16_pass_occ_1 + ; gfx940-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 
$vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ... @@ -506,6 +908,108 @@ body: | ; gfx908-PAD100-NEXT: S_NOP 7 ; gfx908-PAD100-NEXT: S_NOP 5 ; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass_2_preds + ; gfx90a-DEFAULT: bb.0: + ; gfx90a-DEFAULT-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; gfx90a-DEFAULT-NEXT: {{ $}} + ; gfx90a-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-DEFAULT-NEXT: S_CBRANCH_VCCZ %bb.2, implicit undef $vcc + ; gfx90a-DEFAULT-NEXT: {{ $}} + ; gfx90a-DEFAULT-NEXT: bb.1: + ; gfx90a-DEFAULT-NEXT: successors: %bb.2(0x80000000) + ; gfx90a-DEFAULT-NEXT: {{ $}} + ; gfx90a-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: {{ $}} + ; gfx90a-DEFAULT-NEXT: bb.2: + ; gfx90a-DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD50-LABEL: name: mfma_padding_16_pass_2_preds + ; gfx90a-PAD50: bb.0: + ; gfx90a-PAD50-NEXT: successors: %bb.2(0x40000000), 
%bb.1(0x40000000) + ; gfx90a-PAD50-NEXT: {{ $}} + ; gfx90a-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD50-NEXT: S_CBRANCH_VCCZ %bb.2, implicit undef $vcc + ; gfx90a-PAD50-NEXT: {{ $}} + ; gfx90a-PAD50-NEXT: bb.1: + ; gfx90a-PAD50-NEXT: successors: %bb.2(0x80000000) + ; gfx90a-PAD50-NEXT: {{ $}} + ; gfx90a-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: {{ $}} + ; gfx90a-PAD50-NEXT: bb.2: + ; gfx90a-PAD50-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: S_NOP 5 + ; gfx90a-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD100-LABEL: name: mfma_padding_16_pass_2_preds + ; gfx90a-PAD100: bb.0: + ; gfx90a-PAD100-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; gfx90a-PAD100-NEXT: {{ $}} + ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD100-NEXT: S_CBRANCH_VCCZ %bb.2, implicit undef $vcc + ; gfx90a-PAD100-NEXT: {{ $}} + ; gfx90a-PAD100-NEXT: bb.1: + ; gfx90a-PAD100-NEXT: successors: %bb.2(0x80000000) + ; gfx90a-PAD100-NEXT: {{ $}} + ; gfx90a-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: {{ $}} + ; gfx90a-PAD100-NEXT: bb.2: + ; 
gfx90a-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: S_NOP 7 + ; gfx90a-PAD100-NEXT: S_NOP 5 + ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-DEFAULT-LABEL: name: mfma_padding_16_pass_2_preds + ; gfx940-DEFAULT: bb.0: + ; gfx940-DEFAULT-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; gfx940-DEFAULT-NEXT: {{ $}} + ; gfx940-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-DEFAULT-NEXT: S_CBRANCH_VCCZ %bb.2, implicit undef $vcc + ; gfx940-DEFAULT-NEXT: {{ $}} + ; gfx940-DEFAULT-NEXT: bb.1: + ; gfx940-DEFAULT-NEXT: successors: %bb.2(0x80000000) + ; gfx940-DEFAULT-NEXT: {{ $}} + ; gfx940-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: {{ $}} + ; gfx940-DEFAULT-NEXT: bb.2: + ; gfx940-DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD50-LABEL: name: mfma_padding_16_pass_2_preds + ; gfx940-PAD50: bb.0: + ; gfx940-PAD50-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; gfx940-PAD50-NEXT: {{ $}} + ; gfx940-PAD50-NEXT: early-clobber 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD50-NEXT: S_CBRANCH_VCCZ %bb.2, implicit undef $vcc + ; gfx940-PAD50-NEXT: {{ $}} + ; gfx940-PAD50-NEXT: bb.1: + ; gfx940-PAD50-NEXT: successors: %bb.2(0x80000000) + ; gfx940-PAD50-NEXT: {{ $}} + ; gfx940-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: {{ $}} + ; gfx940-PAD50-NEXT: bb.2: + ; gfx940-PAD50-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: S_NOP 5 + ; gfx940-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD100-LABEL: name: mfma_padding_16_pass_2_preds + ; gfx940-PAD100: bb.0: + ; gfx940-PAD100-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; gfx940-PAD100-NEXT: {{ $}} + ; gfx940-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD100-NEXT: S_CBRANCH_VCCZ %bb.2, implicit undef $vcc + ; gfx940-PAD100-NEXT: {{ $}} + ; gfx940-PAD100-NEXT: bb.1: + ; gfx940-PAD100-NEXT: successors: %bb.2(0x80000000) + ; gfx940-PAD100-NEXT: {{ $}} + ; gfx940-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: {{ $}} + ; gfx940-PAD100-NEXT: bb.2: + ; gfx940-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: S_NOP 
7 + ; gfx940-PAD100-NEXT: S_NOP 5 + ; gfx940-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec bb.0: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec S_CBRANCH_VCCZ %bb.2, implicit undef $vcc -- cgit v1.1 From b5b34dbb27359139ef1eb2ca22e8c5a954e34e50 Mon Sep 17 00:00:00 2001 From: Austin Kerbow Date: Sun, 31 Mar 2024 11:03:03 -0700 Subject: [AMDGPU] Use directive for kernarg preload header padding (#86004) --- llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll | 11 +- llvm/test/CodeGen/AMDGPU/preload-kernargs.ll | 9050 +------------------- 2 files changed, 316 insertions(+), 8745 deletions(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll index a70488a..a030f86 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll @@ -1,17 +1,20 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,HSA %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN,HSA %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN,NON-HSA %s +; RUN: llc 
-mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,HSA,ASM %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN,HSA,OBJ %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN,NON-HSA,OBJ %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=obj | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN,HSA,OBJ %s ; GCN: preload_kernarg_header ; HSA: s_trap 2 ; NON-HSA: s_endpgm -; GCN-COUNT-63: s_nop 0 +; ASM: .fill 63, 4, 0xbf800000 ; s_nop 0 +; OBJ-COUNT-63: s_nop 0 define amdgpu_kernel void @preload_kernarg_header(ptr %arg) { store ptr %arg, ptr %arg ret void } ; GCN: non_kernel_function +; GCN-NOT: s_trap 2 ; GCN-NOT: s_nop 0 ; GCN: flat_store define void @non_kernel_function(ptr %arg) { diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll index d20c3a4..f0e709b 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll @@ -24,70 +24,8 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) { ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: ptr1_i8: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: 
s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -98,70 +36,8 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) { ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: ptr1_i8: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: 
s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xff ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 @@ -170,70 +46,8 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) { ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: ptr1_i8: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: 
s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xff ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 @@ -242,70 +56,8 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) { ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: ptr1_i8: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 
0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xff ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 @@ -325,70 +77,8 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) { ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: ptr1_i8: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -399,70 +89,8 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) { ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: ptr1_i8: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_and_b32 s0, s8, 0xff ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 @@ -471,70 +99,8 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) { ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: ptr1_i8: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: s_and_b32 s0, s8, 0xff ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 @@ -543,70 +109,8 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) { ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: ptr1_i8: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_and_b32 s0, s8, 0xff ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 @@ -631,70 +135,8 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: ptr1_i8_zext_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: 
s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -705,70 +147,8 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: ptr1_i8_zext_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; 
GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_mov_b32 s0, 0xffff ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 @@ -778,70 +158,8 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: ptr1_i8_zext_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: 
s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: s_mov_b32 s0, 0xffff ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s4 @@ -851,70 +169,8 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: ptr1_i8_zext_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; 
GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_mov_b32 s0, 0xffff ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 @@ -935,70 +191,8 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: ptr1_i8_zext_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -1009,70 +203,8 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: ptr1_i8_zext_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; 
GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_mov_b32 s0, 0xffff ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 @@ -1082,70 +214,8 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: ptr1_i8_zext_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: s_mov_b32 s0, 0xffff ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s8 @@ -1155,70 +225,8 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: ptr1_i8_zext_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; 
GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_mov_b32 s0, 0xffff ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 @@ -1244,70 +252,8 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: ptr1_i16_preload_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: 
s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -1318,70 +264,8 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0 ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: ptr1_i16_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 
-; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xffff ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 @@ -1390,70 +274,8 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: ptr1_i16_preload_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: 
s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xffff ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 @@ -1462,70 +284,8 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0 ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: ptr1_i16_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; 
GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xffff ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 @@ -1545,70 +305,8 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0 ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: ptr1_i16_preload_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -1619,70 +317,8 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0 ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: ptr1_i16_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 
-; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_and_b32 s0, s8, 0xffff ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 @@ -1691,70 +327,8 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0 ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: ptr1_i16_preload_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: s_and_b32 s0, s8, 0xffff ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 @@ -1763,70 +337,8 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0 ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: ptr1_i16_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; 
GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_and_b32 s0, s8, 0xffff ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 @@ -1850,70 +362,8 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: ptr1_i32_preload_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: 
s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -1923,70 +373,8 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0 ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: ptr1_i32_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 
-; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 @@ -1994,70 +382,8 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: ptr1_i32_preload_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: 
s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s4 @@ -2065,70 +391,8 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0 ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: ptr1_i32_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; 
GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 @@ -2146,70 +410,8 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0 ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: ptr1_i32_preload_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -2219,70 +421,8 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0 ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: ptr1_i32_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 
-; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 @@ -2290,70 +430,8 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0 ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: ptr1_i32_preload_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s8 @@ -2361,70 +439,8 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0 ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: ptr1_i32_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; 
GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 @@ -2449,70 +465,8 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: 
s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dword s3, s[0:1], 0x10 ; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 @@ -2524,70 +478,8 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; 
GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dword s0, s[0:1], 0x10 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 @@ -2598,70 +490,8 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: 
s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: s_add_i32 s0, s2, s6 ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 @@ -2670,70 +500,8 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; 
GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_add_i32 s0, s2, s6 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 @@ -2754,70 +522,8 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x10 ; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2829,70 +535,8 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; 
GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 @@ -2903,70 +547,8 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: s_add_i32 s0, s6, s10 ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 @@ -2975,70 +557,8 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; 
GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_add_i32 s0, s6, s10 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 @@ -3065,70 +585,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 % ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: 
s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -3141,70 +599,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 % ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: 
s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dword s0, s[0:1], 0x8 ; GFX940-PRELOAD-2-NEXT: s_and_b32 s1, s4, 0xffff @@ -3217,70 +613,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 % ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: 
s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 16 ; GFX940-PRELOAD-4-NEXT: s_and_b32 s1, s4, 0xffff @@ -3291,70 +625,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 % ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 
0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16 ; GFX940-PRELOAD-8-NEXT: s_and_b32 s1, s4, 0xffff @@ -3378,70 +650,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 % ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -3454,70 +664,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 % ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX90a-PRELOAD-2-NEXT: s_and_b32 s1, s8, 0xffff @@ -3530,70 +678,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 % ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 16 ; GFX90a-PRELOAD-4-NEXT: s_and_b32 s1, s8, 0xffff @@ -3604,70 +690,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 % ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 
0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 ; GFX90a-PRELOAD-8-NEXT: s_and_b32 s1, s8, 0xffff @@ -3695,70 +719,8 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: 
s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -3768,70 +730,8 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 
0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8 ; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -3841,70 +741,8 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: 
s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 8 ; GFX940-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -3914,70 +752,8 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 
0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8 ; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -3997,70 +773,8 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -4070,70 +784,8 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 
0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 ; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -4143,70 +795,8 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 8 ; GFX90a-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -4216,70 +806,8 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 
0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 ; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -4308,70 +836,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: byref_preload_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: 
s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -4385,70 +851,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: byref_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: 
s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 @@ -4462,70 +866,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: byref_preload_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: 
s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100 ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 @@ -4539,70 +881,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: byref_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: 
s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 @@ -4630,70 +910,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: byref_preload_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -4707,70 +925,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: byref_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 @@ -4784,70 +940,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: byref_preload_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 @@ -4861,70 +955,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: byref_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 @@ -4964,70 +996,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: v8i32_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: 
s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0 @@ -5046,70 +1016,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v8i32_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; 
GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 @@ -5128,70 +1036,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: v8i32_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: 
s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0 @@ -5210,70 +1056,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: v8i32_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; 
GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 @@ -5311,70 +1095,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: v8i32_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0 @@ -5393,79 +1115,17 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: v8i32_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 
-; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s12 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s13 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s14 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s12 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s13 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s14 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s15 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 ; GFX90a-PRELOAD-2-NEXT: s_nop 0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 @@ -5475,70 +1135,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: v8i32_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; 
GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0 @@ -5557,70 +1155,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: v8i32_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 @@ -5654,70 +1190,8 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: v3i16_preload_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: 
s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -5729,70 +1203,8 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v3i16_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: 
s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 @@ -5802,70 +1214,8 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: v3i16_preload_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; 
GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5 @@ -5875,70 +1225,8 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: v3i16_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: 
s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 @@ -5959,70 +1247,8 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: v3i16_preload_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; 
GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -6034,70 +1260,8 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: v3i16_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 @@ -6107,70 +1271,8 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: v3i16_preload_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; 
GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9 @@ -6180,70 +1282,8 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: v3i16_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 @@ -6269,70 +1309,8 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: v3i32_preload_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; 
GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 @@ -6344,70 +1322,8 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v3i32_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: 
s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7 @@ -6417,70 +1333,8 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: v3i32_preload_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; 
GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s6 ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s7 @@ -6490,70 +1344,8 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: v3i32_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: 
s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 @@ -6575,70 +1367,8 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: v3i32_preload_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; 
GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 @@ -6650,70 +1380,8 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: v3i32_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 @@ -6723,70 +1391,8 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: v3i32_preload_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; 
GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s10 ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s11 @@ -6796,70 +1402,8 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: v3i32_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 @@ -6885,70 +1429,8 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: v3f32_preload_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; 
GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 @@ -6960,70 +1442,8 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v3f32_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: 
s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6 @@ -7033,70 +1453,8 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: v3f32_preload_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; 
GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0 ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s6 @@ -7106,70 +1464,8 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: v3f32_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: 
s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 @@ -7191,70 +1487,8 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: v3f32_preload_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; 
GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 @@ -7266,70 +1500,8 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: v3f32_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 @@ -7339,70 +1511,8 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: v3f32_preload_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; 
GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0 ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s10 @@ -7412,70 +1522,8 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: v3f32_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 @@ -7500,70 +1548,8 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: v5i8_preload_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; 
GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -7575,70 +1561,8 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v5i8_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: 
s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8 ; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -7655,70 +1579,8 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: v5i8_preload_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; 
GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 8 ; GFX940-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -7735,70 +1597,8 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: v5i8_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: 
s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8 ; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -7826,70 +1626,8 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: v5i8_preload_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; 
GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -7901,70 +1639,8 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: v5i8_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 ; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -7981,70 +1657,8 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: v5i8_preload_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; 
GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 8 ; GFX90a-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -8061,70 +1675,8 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: v5i8_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 ; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -8167,70 +1719,8 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: v5f64_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; 
GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60 ; GFX940-PRELOAD-1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 @@ -8252,70 +1742,8 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v5f64_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: 
s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60 ; GFX940-PRELOAD-2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 @@ -8337,70 +1765,8 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: v5f64_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; 
GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60 ; GFX940-PRELOAD-4-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 @@ -8422,70 +1788,8 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: v5f64_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: 
s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60 ; GFX940-PRELOAD-8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 @@ -8529,70 +1833,8 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: v5f64_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; 
GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 ; GFX90a-PRELOAD-1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 @@ -8614,70 +1856,8 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: v5f64_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 @@ -8699,70 +1879,8 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: v5f64_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; 
GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 ; GFX90a-PRELOAD-4-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 @@ -8784,82 +1902,20 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: v5f64_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s12 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s13 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s14 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s12 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s13 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s14 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s15 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 ; GFX90a-PRELOAD-8-NEXT: s_nop 0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 @@ -8882,70 +1938,8 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: v8i8_preload_arg: -; 
GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; 
GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 @@ -8955,70 +1949,8 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v8i8_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; 
GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 8 ; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -9042,70 +1974,8 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: v8i8_preload_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: 
s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s5, 8 ; GFX940-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -9129,86 +1999,24 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: v8i8_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 
-; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 8 -; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 24 -; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 16 -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8 -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24 -; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v2, 8, s0 -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 8 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 24 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 16 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v2, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX940-PRELOAD-8-NEXT: s_nop 0 ; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 @@ -9225,70 +2033,8 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: v8i8_preload_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 @@ -9298,70 +2044,8 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: v8i8_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 8 ; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -9384,70 +2068,8 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: v8i8_preload_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s0, s9, 8 ; GFX90a-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -9470,70 +2092,8 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: v8i8_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; 
GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 8 ; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -9570,70 +2130,8 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: i64_kernel_preload_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: 
s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 @@ -9643,70 +2141,8 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: i64_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; 
GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[4:5] @@ -9714,70 +2150,8 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: i64_kernel_preload_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: 
s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-PRELOAD-4-NEXT: v_mov_b64_e32 v[0:1], s[4:5] @@ -9785,70 +2159,8 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: i64_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: 
s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[4:5] @@ -9866,70 +2178,8 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: i64_kernel_preload_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 @@ -9939,70 +2189,8 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: i64_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; 
GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 ; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] @@ -10010,70 +2198,8 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: i64_kernel_preload_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 ; GFX90a-PRELOAD-4-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] @@ -10081,70 +2207,8 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: i64_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; 
GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 ; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] @@ -10166,70 +2230,8 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: f64_kernel_preload_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: 
s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 @@ -10239,70 +2241,8 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: f64_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; 
GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[4:5] @@ -10310,70 +2250,8 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: f64_kernel_preload_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: 
s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-PRELOAD-4-NEXT: v_mov_b64_e32 v[0:1], s[4:5] @@ -10381,70 +2259,8 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: f64_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: 
s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[4:5] @@ -10462,70 +2278,8 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: f64_kernel_preload_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 @@ -10535,70 +2289,8 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: f64_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; 
GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 ; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] @@ -10606,70 +2298,8 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: f64_kernel_preload_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: 
s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 ; GFX90a-PRELOAD-4-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] @@ -10677,70 +2307,8 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: f64_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; 
GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 ; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] -- cgit v1.1 From 216b5e96664f72fdb63b6bbd6c422185c67ef818 Mon Sep 17 00:00:00 2001 From: "Ruiling, Song" Date: Mon, 1 Apr 2024 09:48:37 +0800 Subject: [AMDGPU] Expose RTZ version of f16 interpolation for gfx11+ (#86614) --- .../AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll | 30 ++++++++++++++++++++++ .../CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll | 30 ++++++++++++++++++++++ 2 files changed, 60 insertions(+) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll index 623360f..de46037 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll @@ -147,6 +147,34 @@ main_body: ret half %res } +define amdgpu_ps half @v_interp_rtz_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 { +; GCN-LABEL: v_interp_rtz_f16: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_mov_b32 s3, exec_lo +; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo +; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: lds_param_load v1, attr0.x wait_vdst:15 +; GCN-NEXT: s_mov_b32 exec_lo, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0 +; GCN-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7 +; GCN-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GCN-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 +; GCN-NEXT: v_add_f16_e32 v0, v3, v0 +; GCN-NEXT: ; return to shader part epilog +main_body: + %p0 
= call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0) + %l_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 0) + %l_p1 = call half @llvm.amdgcn.interp.p2.rtz.f16(float %p0, float %j, float %l_p0, i1 0) + %h_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 1) + %h_p1 = call half @llvm.amdgcn.interp.p2.rtz.f16(float %p0, float %j, float %h_p0, i1 1) + %res = fadd half %l_p1, %h_p1 + ret half %res +} + define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #0 { ; GCN-LABEL: v_interp_f16_imm_params: ; GCN: ; %bb.0: ; %main_body @@ -172,6 +200,8 @@ declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #0 declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #0 declare float @llvm.amdgcn.interp.inreg.p10.f16(float, float, float, i1) #0 declare half @llvm.amdgcn.interp.inreg.p2.f16(float, float, float, i1) #0 +declare float @llvm.amdgcn.interp.p10.rtz.f16(float, float, float, i1) #0 +declare half @llvm.amdgcn.interp.p2.rtz.f16(float, float, float, i1) #0 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 declare void @llvm.amdgcn.exp.f16(i32, i32, float, float, float, float, i1, i1) #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll index 429528e..e3dd036 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll @@ -147,6 +147,34 @@ main_body: ret half %res } +define amdgpu_ps half @v_interp_rtz_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 { +; GCN-LABEL: v_interp_rtz_f16: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_mov_b32 s3, exec_lo +; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo +; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: lds_param_load v1, attr0.x wait_vdst:15 +; GCN-NEXT: s_mov_b32 exec_lo, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0 +; GCN-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7 +; GCN-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GCN-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 +; GCN-NEXT: v_add_f16_e32 v0, v3, v0 +; GCN-NEXT: ; return to shader part epilog +main_body: + %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0) + %l_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 0) + %l_p1 = call half @llvm.amdgcn.interp.p2.rtz.f16(float %p0, float %j, float %l_p0, i1 0) + %h_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 1) + %h_p1 = call half @llvm.amdgcn.interp.p2.rtz.f16(float %p0, float %j, float %h_p0, i1 1) + %res = fadd half %l_p1, %h_p1 + ret half %res +} + define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #0 { ; GCN-LABEL: v_interp_f16_imm_params: ; GCN: ; %bb.0: ; %main_body @@ -172,6 +200,8 @@ declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #0 declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #0 declare float @llvm.amdgcn.interp.inreg.p10.f16(float, float, float, i1) #0 declare half @llvm.amdgcn.interp.inreg.p2.f16(float, float, float, i1) #0 +declare float @llvm.amdgcn.interp.p10.rtz.f16(float, float, float, i1) #0 +declare half @llvm.amdgcn.interp.p2.rtz.f16(float, float, float, i1) #0 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 declare void @llvm.amdgcn.exp.f16(i32, i32, float, float, float, float, i1, i1) #0 -- cgit v1.1 From 20f56e1f8e51d672425ec0c8f2ec243b131e8296 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Sun, 31 Mar 2024 22:19:33 -0700 Subject: [CodeGen] Add default lowering for 
llvm.allow.{runtime,ubsan}.check() (#86049) RFC: https://discourse.llvm.org/t/rfc-add-llvm-experimental-hot-intrinsic-or-llvm-hot/77641 --- llvm/test/CodeGen/AMDGPU/allow-check.ll | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/allow-check.ll (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/allow-check.ll b/llvm/test/CodeGen/AMDGPU/allow-check.ll new file mode 100644 index 0000000..d4f5621 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/allow-check.ll @@ -0,0 +1,30 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -global-isel=0 -fast-isel=0 | FileCheck %s +; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -global-isel=1 -fast-isel=0 | FileCheck %s +; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -global-isel=0 -fast-isel=1 | FileCheck %s + +define i1 @test_runtime() local_unnamed_addr { +; CHECK-LABEL: test_runtime: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + %allow = call i1 @llvm.allow.runtime.check(metadata !"test_check") + ret i1 %allow +} + +declare i1 @llvm.allow.runtime.check(metadata) nounwind + +define i1 @test_ubsan() local_unnamed_addr { +; CHECK-LABEL: test_ubsan: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + %allow = call i1 @llvm.allow.ubsan.check(i8 7) + ret i1 %allow +} + +declare i1 @llvm.allow.ubsan.check(i8) nounwind -- cgit v1.1 From 421557974a3e0f469e6f4c3caecbf8aba69bb5bf Mon Sep 17 00:00:00 2001 From: Sameer Sahasrabuddhe Date: Mon, 1 Apr 2024 10:51:13 +0530 Subject: [AMDGPU] Use glue for convergence tokens at call-like operations (#86766) The earlier implementation on AMDGPU used explicit token operands at SI_CALL and SI_CALL_ISEL. 
This is now replaced with CONVERGENCECTRL_GLUE operands, with the following effects: - The treatment of tokens at call-like operations is now consistent with the treatment at intrinsics. - Support for tail calls using implicit tokens at SI_TCRETURN "just works". - The extra parameter at call-like instructions is eliminated, thus restoring those instructions and their handling to the original state. The new glue node is placed after the existing glue node for the outgoing call parameters, which seems to not interfere with selection of the call-like nodes. --- llvm/test/CodeGen/AMDGPU/convergence-tokens.ll | 12 +--- .../test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll | 18 ----- .../AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll | 1 - .../CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll | 18 ++--- .../AMDGPU/no-source-locations-in-prologue.ll | 1 - .../CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll | 15 ++--- llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll | 78 ++++++++++------------ llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll | 26 ++++---- llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll | 1 - .../CodeGen/AMDGPU/whole-wave-register-spill.ll | 1 - llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 2 - llvm/test/CodeGen/AMDGPU/wwm-reserved.ll | 4 -- 12 files changed, 64 insertions(+), 113 deletions(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll b/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll index 6beccce..1c8725f 100644 --- a/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll +++ b/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll @@ -4,7 +4,7 @@ ; CHECK-LABEL: name: basic_call ; CHECK: [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ENTRY -; ISEL: {{.*}} SI_CALL_ISEL {{.*}}, @foo, [[TOKEN]], csr_amdgpu, {{.*}} +; ISEL: {{.*}} SI_CALL_ISEL {{.*}}, @foo, csr_amdgpu, {{.*}}, implicit [[TOKEN]] ; DEADMI: {{.*}} SI_CALL {{.*}}, @foo, csr_amdgpu, {{.*}}, implicit [[TOKEN]] ; GISEL: {{.*}} G_SI_CALL {{.*}}, @foo, csr_amdgpu, {{.*}}, 
implicit [[TOKEN]] define i32 @basic_call(i32 %src) #0 { @@ -92,15 +92,9 @@ define i32 @nested(i32 %src) #0 { ret i32 %sum } -; COM: FIXME: Tokens on tail-call have not been implemented for SelectionDAG -; COM: yet; the corresponding checks have been commented out. -; ; CHECK-LABEL: name: tail_call_void_func_void -; GISEL: [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ENTRY -; COM: CHECK: [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ENTRY -; COM: ISEL: {{.*}} SI_CALL_ISEL {{.*}}, @external_void_func_void, [[TOKEN]], csr_amdgpu, {{.*}} -; COM: DEADMI: {{.*}} SI_CALL {{.*}}, @external_void_func_void, csr_amdgpu, {{.*}}, implicit [[TOKEN]] -; GISEL: {{.*}} SI_TCRETURN {{.*}}, @external_void_func_void, 0, csr_amdgpu, implicit [[TOKEN]] +; CHECK: [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ENTRY +; CHECK: {{.*}} SI_TCRETURN {{.*}}, @external_void_func_void, 0, csr_amdgpu, {{.*}}implicit [[TOKEN]] define void @tail_call_void_func_void() #0 { %t1 = call token @llvm.experimental.convergence.entry() tail call void @external_void_func_void() [ "convergencectrl"(token %t1) ] diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll index e015095a..ab160ff 100644 --- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll @@ -92,7 +92,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc(<4 x i32> inreg %a, <4 x i32> %b ; DAGISEL-GFX11-NEXT: $vgpr5 = COPY [[COPY2]] ; DAGISEL-GFX11-NEXT: $vgpr6 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr7 = COPY [[COPY]] - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; 
DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -122,7 +121,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc(<4 x i32> inreg %a, <4 x i32> %b ; DAGISEL-GFX10-NEXT: $vgpr5 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr6 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr7 = COPY [[COPY]] - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -234,7 +232,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_ptr(ptr inreg %a, ptr %b, ptr ad ; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]] ; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY]] - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -272,7 +269,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_ptr(ptr inreg %a, ptr %b, ptr ad ; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY]] - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit 
$vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -404,7 +400,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_struct( {ptr, i32, <4 x i32>} in ; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY2]] ; DAGISEL-GFX11-NEXT: $vgpr12 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr13 = COPY [[COPY]] - ; DAGISEL-GFX11-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -454,7 +449,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_struct( {ptr, i32, <4 x i32>} in ; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr12 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr13 = COPY [[COPY]] - ; DAGISEL-GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -506,7 +500,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_float(float inreg %a, float 
%b) ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) ; DAGISEL-GFX11-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr1 = COPY [[COPY]] - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -524,7 +517,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_float(float inreg %a, float %b) ; DAGISEL-GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr1 = COPY [[COPY]] - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -576,7 +568,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_half(half inreg %a, half %b) { ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) ; DAGISEL-GFX11-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr1 = COPY [[COPY]] - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -594,7 +585,6 @@ 
define amdgpu_cs_chain void @amdgpu_cs_chain_cc_half(half inreg %a, half %b) { ; DAGISEL-GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr1 = COPY [[COPY]] - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -646,7 +636,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a, bfloat % ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) ; DAGISEL-GFX11-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr1 = COPY [[COPY]] - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -664,7 +653,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a, bfloat % ; DAGISEL-GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr1 = COPY [[COPY]] - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -716,7 +704,6 @@ define 
amdgpu_cs_chain void @amdgpu_cs_chain_cc_i16(i16 inreg %a, i16 %b) { ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) ; DAGISEL-GFX11-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr1 = COPY [[COPY]] - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -734,7 +721,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_i16(i16 inreg %a, i16 %b) { ; DAGISEL-GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr1 = COPY [[COPY]] - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -870,7 +856,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_v16i16(<16 x i16> inreg %a, <16 ; DAGISEL-GFX11-NEXT: $vgpr13 = COPY [[COPY2]] ; DAGISEL-GFX11-NEXT: $vgpr14 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr15 = COPY [[COPY]] - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 
; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -916,7 +901,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_v16i16(<16 x i16> inreg %a, <16 ; DAGISEL-GFX10-NEXT: $vgpr13 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr14 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr15 = COPY [[COPY]] - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -2480,7 +2464,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_many_regs(<36 x i32> inreg %a, <128 ; DAGISEL-GFX11-NEXT: $vgpr29 = COPY [[COPY134]] ; DAGISEL-GFX11-NEXT: $vgpr30 = COPY [[COPY133]] ; DAGISEL-GFX11-NEXT: $vgpr31 = COPY [[COPY132]] - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $vgpr31 ; 
DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 528, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -2827,7 +2810,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_many_regs(<36 x i32> inreg %a, <128 ; DAGISEL-GFX10-NEXT: $vgpr29 = COPY [[COPY134]] ; DAGISEL-GFX10-NEXT: $vgpr30 = COPY [[COPY133]] ; DAGISEL-GFX10-NEXT: $vgpr31 = COPY [[COPY132]] - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $vgpr31 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 528, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll index 1acbb09..fbf2ee1 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll @@ -60,7 +60,6 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 { ; CHECK-NEXT: ; implicit-def: $sgpr15 ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] -; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 diff --git 
a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll index cbdc7bb..69971bc 100644 --- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll @@ -27,7 +27,7 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 { ; CHECK-LABEL: csr_vgpr_spill_fp_callee: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s24, s33 +; CHECK-NEXT: s_mov_b32 s18, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill @@ -43,7 +43,6 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 { ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] -; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; clobber csr v40 @@ -55,7 +54,7 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 { ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s24 +; CHECK-NEXT: s_mov_b32 s33, s18 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] bb: @@ -88,7 +87,6 @@ define amdgpu_kernel void @kernel_call() { ; CHECK-NEXT: ; implicit-def: $sgpr15 ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] -; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: s_endpgm bb: @@ -148,7 +146,6 @@ define amdgpu_kernel void @kernel_tailcall() { ; CHECK-NEXT: ; implicit-def: $sgpr15 ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] -; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_swappc_b64 s[30:31], 
s[16:17] ; CHECK-NEXT: s_endpgm bb: @@ -173,7 +170,7 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 { ; CHECK-LABEL: caller_save_vgpr_spill_fp_tail_call: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s24, s33 +; CHECK-NEXT: s_mov_b32 s18, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill @@ -188,7 +185,6 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 { ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] -; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_readlane_b32 s31, v1, 1 ; CHECK-NEXT: v_readlane_b32 s30, v1, 0 @@ -196,7 +192,7 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 { ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s24 +; CHECK-NEXT: s_mov_b32 s33, s18 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -208,7 +204,7 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 { ; CHECK-LABEL: caller_save_vgpr_spill_fp: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s25, s33 +; CHECK-NEXT: s_mov_b32 s19, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -223,7 +219,6 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 { ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] -; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_readlane_b32 s31, v2, 1 ; CHECK-NEXT: 
v_readlane_b32 s30, v2, 0 @@ -231,7 +226,7 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 { ; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s25 +; CHECK-NEXT: s_mov_b32 s33, s19 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -263,7 +258,6 @@ define protected amdgpu_kernel void @kernel() { ; CHECK-NEXT: ; implicit-def: $sgpr15 ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] -; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll index 34e67d0..9999cb9 100644 --- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll +++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll @@ -32,7 +32,6 @@ define hidden void @_ZL3barv() #0 !dbg !1644 { ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] -; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: .Ltmp1: diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll index 764f494..f523b4a 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll @@ -16,7 +16,7 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 { ; GCN-LABEL: spill_sgpr_with_no_lower_vgpr_available: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s24, s33 +; GCN-NEXT: s_mov_b32 s18, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v255, off, 
s[0:3], s33 offset:448 ; 4-byte Folded Spill @@ -150,7 +150,6 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 { ; GCN-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-NEXT: s_mov_b64 s[0:1], s[20:21] ; GCN-NEXT: s_mov_b64 s[2:3], s[22:23] -; GCN-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_readlane_b32 s31, v255, 1 @@ -270,7 +269,7 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 { ; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: s_mov_b32 s33, s24 +; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, align 4, addrspace(5) @@ -311,7 +310,7 @@ define void @spill_to_lowest_available_vgpr() #0 { ; GCN-LABEL: spill_to_lowest_available_vgpr: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s24, s33 +; GCN-NEXT: s_mov_b32 s18, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill @@ -444,7 +443,6 @@ define void @spill_to_lowest_available_vgpr() #0 { ; GCN-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-NEXT: s_mov_b64 s[0:1], s[20:21] ; GCN-NEXT: s_mov_b64 s[2:3], s[22:23] -; GCN-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_readlane_b32 s31, v254, 1 @@ -563,7 +561,7 @@ define void @spill_to_lowest_available_vgpr() #0 { ; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: s_mov_b32 s33, s24 +; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, align 4, addrspace(5) @@ -1530,7 
+1528,7 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 { ; GCN-LABEL: spill_sgpr_no_free_vgpr_ipra: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s24, s33 +; GCN-NEXT: s_mov_b32 s18, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_i32 s32, s32, 0x7400 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill @@ -1668,7 +1666,6 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 { ; GCN-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-NEXT: s_mov_b64 s[0:1], s[20:21] ; GCN-NEXT: s_mov_b64 s[2:3], s[22:23] -; GCN-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: s_mov_b64 exec, 1 @@ -1801,7 +1798,7 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 { ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: s_mov_b32 s33, s24 +; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] call void @child_function_ipra() diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll index b8bc01e..c6a5990 100644 --- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll +++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll @@ -916,13 +916,13 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-O0-LABEL: kernel_stacksave_stackrestore_call_with_stack_objects: ; WAVE32-O0: ; %bb.0: ; WAVE32-O0-NEXT: s_mov_b32 s32, 0x1200 -; WAVE32-O0-NEXT: s_getpc_b64 s[24:25] -; WAVE32-O0-NEXT: s_mov_b32 s24, s0 -; WAVE32-O0-NEXT: s_load_dwordx4 s[24:27], s[24:25], 0x0 +; WAVE32-O0-NEXT: s_getpc_b64 s[20:21] +; WAVE32-O0-NEXT: s_mov_b32 s20, s0 +; WAVE32-O0-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0 ; WAVE32-O0-NEXT: s_waitcnt lgkmcnt(0) -; 
WAVE32-O0-NEXT: s_bitset0_b32 s27, 21 -; WAVE32-O0-NEXT: s_add_u32 s24, s24, s9 -; WAVE32-O0-NEXT: s_addc_u32 s25, s25, 0 +; WAVE32-O0-NEXT: s_bitset0_b32 s23, 21 +; WAVE32-O0-NEXT: s_add_u32 s20, s20, s9 +; WAVE32-O0-NEXT: s_addc_u32 s21, s21, 0 ; WAVE32-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; WAVE32-O0-NEXT: s_mov_b32 s14, s8 ; WAVE32-O0-NEXT: s_mov_b32 s13, s7 @@ -934,17 +934,17 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-O0-NEXT: v_writelane_b32 v3, s0, 0 ; WAVE32-O0-NEXT: s_lshr_b32 s0, s0, 5 ; WAVE32-O0-NEXT: v_writelane_b32 v3, s0, 1 -; WAVE32-O0-NEXT: s_or_saveexec_b32 s20, -1 -; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:128 ; 4-byte Folded Spill -; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s20 +; WAVE32-O0-NEXT: s_or_saveexec_b32 s19, -1 +; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:128 ; 4-byte Folded Spill +; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s19 ; WAVE32-O0-NEXT: v_mov_b32_e32 v3, 42 -; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[24:27], 0 +; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], 0 ; WAVE32-O0-NEXT: s_waitcnt_vscnt null, 0x0 -; WAVE32-O0-NEXT: s_mov_b64 s[0:1], s[24:25] -; WAVE32-O0-NEXT: s_mov_b64 s[2:3], s[26:27] +; WAVE32-O0-NEXT: s_mov_b64 s[0:1], s[20:21] +; WAVE32-O0-NEXT: s_mov_b64 s[2:3], s[22:23] ; WAVE32-O0-NEXT: s_mov_b32 s6, s32 ; WAVE32-O0-NEXT: v_mov_b32_e32 v3, 17 -; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[24:27], s6 offset:4 +; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], s6 offset:4 ; WAVE32-O0-NEXT: s_mov_b32 s6, stack_passed_argument@abs32@hi ; WAVE32-O0-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo ; WAVE32-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17 @@ -1018,11 +1018,10 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-O0-NEXT: v_mov_b32_e32 v29, s18 ; WAVE32-O0-NEXT: ; implicit-def: $sgpr18 ; WAVE32-O0-NEXT: v_mov_b32_e32 v30, s18 
-; WAVE32-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; WAVE32-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; WAVE32-O0-NEXT: s_or_saveexec_b32 s20, -1 -; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:128 ; 4-byte Folded Reload -; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s20 +; WAVE32-O0-NEXT: s_or_saveexec_b32 s19, -1 +; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:128 ; 4-byte Folded Reload +; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s19 ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE32-O0-NEXT: v_readlane_b32 s1, v0, 1 ; WAVE32-O0-NEXT: v_readlane_b32 s0, v0, 0 @@ -1137,7 +1136,6 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE64-O0-NEXT: v_mov_b32_e32 v29, s18 ; WAVE64-O0-NEXT: ; implicit-def: $sgpr18 ; WAVE64-O0-NEXT: v_mov_b32_e32 v30, s18 -; WAVE64-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; WAVE64-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; WAVE64-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:128 ; 4-byte Folded Reload @@ -1155,13 +1153,13 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-WWM-PREALLOC-LABEL: kernel_stacksave_stackrestore_call_with_stack_objects: ; WAVE32-WWM-PREALLOC: ; %bb.0: ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s32, 0x1200 -; WAVE32-WWM-PREALLOC-NEXT: s_getpc_b64 s[24:25] -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s24, s0 -; WAVE32-WWM-PREALLOC-NEXT: s_load_dwordx4 s[24:27], s[24:25], 0x0 +; WAVE32-WWM-PREALLOC-NEXT: s_getpc_b64 s[20:21] +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s20, s0 +; WAVE32-WWM-PREALLOC-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0 ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt lgkmcnt(0) -; WAVE32-WWM-PREALLOC-NEXT: s_bitset0_b32 s27, 21 -; WAVE32-WWM-PREALLOC-NEXT: s_add_u32 s24, s24, s9 -; WAVE32-WWM-PREALLOC-NEXT: s_addc_u32 s25, s25, 0 +; WAVE32-WWM-PREALLOC-NEXT: s_bitset0_b32 s23, 21 +; WAVE32-WWM-PREALLOC-NEXT: s_add_u32 s20, s20, s9 +; WAVE32-WWM-PREALLOC-NEXT: s_addc_u32 
s21, s21, 0 ; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $vgpr32 : SGPR spill to VGPR lane ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s14, s8 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s13, s7 @@ -1174,13 +1172,13 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-WWM-PREALLOC-NEXT: s_lshr_b32 s0, s0, 5 ; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v32, s0, 1 ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v3, 42 -; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[24:27], 0 +; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[20:23], 0 ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt_vscnt null, 0x0 -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[0:1], s[24:25] -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[2:3], s[26:27] +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[0:1], s[20:21] +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[2:3], s[22:23] ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s6, s32 ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v3, 17 -; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[24:27], s6 offset:4 +; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[20:23], s6 offset:4 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s6, stack_passed_argument@abs32@hi ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo ; WAVE32-WWM-PREALLOC-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17 @@ -1254,7 +1252,6 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v29, s18 ; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $sgpr18 ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v30, s18 -; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $sgpr18_sgpr19 ; WAVE32-WWM-PREALLOC-NEXT: s_swappc_b64 s[30:31], s[16:17] ; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s1, v32, 1 ; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s0, v32, 0 @@ -1347,7 +1344,7 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-O0-LABEL: func_stacksave_stackrestore_call_with_stack_objects: ; WAVE32-O0: 
; %bb.0: ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; WAVE32-O0-NEXT: s_mov_b32 s26, s33 +; WAVE32-O0-NEXT: s_mov_b32 s25, s33 ; WAVE32-O0-NEXT: s_mov_b32 s33, s32 ; WAVE32-O0-NEXT: s_xor_saveexec_b32 s16, -1 ; WAVE32-O0-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill @@ -1361,9 +1358,9 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-O0-NEXT: v_writelane_b32 v0, s16, 0 ; WAVE32-O0-NEXT: s_lshr_b32 s16, s16, 5 ; WAVE32-O0-NEXT: v_writelane_b32 v0, s16, 1 -; WAVE32-O0-NEXT: s_or_saveexec_b32 s25, -1 +; WAVE32-O0-NEXT: s_or_saveexec_b32 s24, -1 ; WAVE32-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill -; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s25 +; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s24 ; WAVE32-O0-NEXT: v_mov_b32_e32 v0, 42 ; WAVE32-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; WAVE32-O0-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1440,11 +1437,10 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-O0-NEXT: v_mov_b32_e32 v29, s18 ; WAVE32-O0-NEXT: ; implicit-def: $sgpr18 ; WAVE32-O0-NEXT: v_mov_b32_e32 v30, s18 -; WAVE32-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; WAVE32-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; WAVE32-O0-NEXT: s_or_saveexec_b32 s25, -1 +; WAVE32-O0-NEXT: s_or_saveexec_b32 s24, -1 ; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload -; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s25 +; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s24 ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE32-O0-NEXT: v_readlane_b32 s5, v0, 1 ; WAVE32-O0-NEXT: v_readlane_b32 s4, v0, 0 @@ -1460,14 +1456,14 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload ; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s4 ; WAVE32-O0-NEXT: s_add_i32 s32, s32, 0xffffee00 -; WAVE32-O0-NEXT: s_mov_b32 s33, s26 +; WAVE32-O0-NEXT: s_mov_b32 s33, s25 
; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE32-O0-NEXT: s_setpc_b64 s[30:31] ; ; WAVE64-O0-LABEL: func_stacksave_stackrestore_call_with_stack_objects: ; WAVE64-O0: ; %bb.0: ; WAVE64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; WAVE64-O0-NEXT: s_mov_b32 s28, s33 +; WAVE64-O0-NEXT: s_mov_b32 s19, s33 ; WAVE64-O0-NEXT: s_mov_b32 s33, s32 ; WAVE64-O0-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; WAVE64-O0-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill @@ -1560,7 +1556,6 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE64-O0-NEXT: v_mov_b32_e32 v29, s18 ; WAVE64-O0-NEXT: ; implicit-def: $sgpr18 ; WAVE64-O0-NEXT: v_mov_b32_e32 v30, s18 -; WAVE64-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; WAVE64-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; WAVE64-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 ; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload @@ -1580,14 +1575,14 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload ; WAVE64-O0-NEXT: s_mov_b64 exec, s[4:5] ; WAVE64-O0-NEXT: s_add_i32 s32, s32, 0xffffdc00 -; WAVE64-O0-NEXT: s_mov_b32 s33, s28 +; WAVE64-O0-NEXT: s_mov_b32 s33, s19 ; WAVE64-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE64-O0-NEXT: s_setpc_b64 s[30:31] ; ; WAVE32-WWM-PREALLOC-LABEL: func_stacksave_stackrestore_call_with_stack_objects: ; WAVE32-WWM-PREALLOC: ; %bb.0: ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s25, s33 +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s24, s33 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s33, s32 ; WAVE32-WWM-PREALLOC-NEXT: s_xor_saveexec_b32 s16, -1 ; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill @@ -1677,7 +1672,6 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v29, s18 ; 
WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $sgpr18 ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v30, s18 -; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $sgpr18_sgpr19 ; WAVE32-WWM-PREALLOC-NEXT: s_swappc_b64 s[30:31], s[16:17] ; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s5, v32, 1 ; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s4, v32, 0 @@ -1693,7 +1687,7 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 exec_lo, s4 ; WAVE32-WWM-PREALLOC-NEXT: s_add_i32 s32, s32, 0xffffee00 -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s33, s25 +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s33, s24 ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt vmcnt(0) ; WAVE32-WWM-PREALLOC-NEXT: s_setpc_b64 s[30:31] %alloca = alloca [32 x i32], addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll index d2364a6..bfc249e 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -233,10 +233,10 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: bb.1.Flow: ; SI-NEXT: successors: %bb.2(0x40000000), %bb.10(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %49:vgpr_32, %bb.0, %4, %bb.9 - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %51:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %53:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %55:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %47:vgpr_32, %bb.0, %4, %bb.9 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %49:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %51:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %53:vgpr_32, %bb.9 ; 
SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.2 ; SI-NEXT: {{ $}} @@ -249,8 +249,8 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: bb.3: ; SI-NEXT: successors: %bb.4(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %57:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 - ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %59:vgpr_32, %bb.4, [[PHI1]], %bb.2 + ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %55:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 + ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %57:vgpr_32, %bb.4, [[PHI1]], %bb.2 ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -286,8 +286,8 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: bb.7: ; SI-NEXT: successors: %bb.8(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI6:%[0-9]+]]:vreg_64 = PHI undef %61:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 - ; SI-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %63:vgpr_32, %bb.8, [[COPY4]], %bb.6 + ; SI-NEXT: [[PHI6:%[0-9]+]]:vreg_64 = PHI undef %59:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 + ; SI-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %61:vgpr_32, %bb.8, [[COPY4]], %bb.6 ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1 @@ -356,9 
+356,9 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: bb.1.Flow: ; SI-NEXT: successors: %bb.2(0x40000000), %bb.10(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %50:vgpr_32, %bb.0, %4, %bb.9 - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %52:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %54:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %48:vgpr_32, %bb.0, %4, %bb.9 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %50:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %52:vgpr_32, %bb.9 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.2 ; SI-NEXT: {{ $}} @@ -371,7 +371,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: bb.3: ; SI-NEXT: successors: %bb.4(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI3:%[0-9]+]]:vreg_64 = PHI undef %56:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 + ; SI-NEXT: [[PHI3:%[0-9]+]]:vreg_64 = PHI undef %54:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -407,7 +407,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: bb.7: ; SI-NEXT: successors: %bb.8(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %58:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 + ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %56:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 ; SI-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll index 37f207f..4939d526 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll @@ -47,7 +47,6 @@ define protected amdgpu_kernel void @kern(ptr %addr) !llvm.amdgcn.lds.kernel.id ; CHECK-NEXT: s_mov_b32 s15, 42 ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] -; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll index 3a33194..7eabe98 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll @@ -101,7 +101,6 @@ define void @test() #0 { ; GCN-O0-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-O0-NEXT: s_mov_b64 s[0:1], s[20:21] ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[22:23] -; GCN-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-O0-NEXT: s_or_saveexec_b64 s[28:29], -1 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index 11f6a29..e79cb66 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -406,7 +406,6 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[44:45] ; GFX9-O0-NEXT: 
s_mov_b64 s[2:3], s[46:47] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-O0-NEXT: ; implicit-def: $sgpr44_sgpr45 ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[42:43] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 @@ -633,7 +632,6 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index e5cebc1..def51f2 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -413,7 +413,6 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr15 ; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 -; GFX9-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload @@ -657,7 +656,6 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 @@ -1285,7 +1283,6 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: ; implicit-def: $sgpr15 ; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 -; GFX9-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], 
s[16:17] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload @@ -1529,7 +1526,6 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -- cgit v1.1 From cd6434f9ec9af8d7508dc53806440297a24292cf Mon Sep 17 00:00:00 2001 From: Bevin Hansson <59652494+bevin-hansson@users.noreply.github.com> Date: Tue, 2 Apr 2024 16:37:36 +0200 Subject: [ExpandLargeDivRem] Scalarize vector types. (#86959) expand-large-divrem cannot handle vector types. If overly large vector element types survive into isel, they will likely be scalarized there, but since isel cannot handle scalar integer types of that size, it will assert. Handle vector types in expand-large-divrem by scalarizing them and then expanding the scalar type operation. For large vectors, this results in a *massive* code expansion, but it's better than asserting. 
--- llvm/test/CodeGen/AMDGPU/div_v2i128.ll | 3233 +++++++++++++++++++++++++++++++- 1 file changed, 3228 insertions(+), 5 deletions(-) (limited to 'llvm/test/CodeGen/AMDGPU') diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll index 46e2632..16a03ba 100644 --- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -1,25 +1,3248 @@ -; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s 2>&1 | FileCheck -check-prefix=SDAG-ERR %s -; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s - -; SDAG-ERR: LLVM ERROR: unsupported libcall legalization -; GISEL-ERR: LLVM ERROR: unable to legalize instruction: %{{[0-9]+}}:_(s128) = G_SDIV %{{[0-9]+}}:_, %{{[0-9]+}}:_ (in function: v_sdiv_v2i128_vv) +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s | FileCheck -check-prefix=SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s | FileCheck -check-prefix=GISEL %s define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { +; SDAG-LABEL: v_sdiv_v2i128_vv: +; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_ashrrev_i32_e32 v24, 31, v3 +; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v11 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f +; SDAG-NEXT: v_mov_b32_e32 v26, v24 +; SDAG-NEXT: v_mov_b32_e32 v27, v25 +; SDAG-NEXT: v_xor_b32_e32 v17, v24, v3 +; SDAG-NEXT: v_xor_b32_e32 v18, v24, v2 +; SDAG-NEXT: v_xor_b32_e32 v1, v24, v1 +; SDAG-NEXT: v_xor_b32_e32 v0, v24, v0 +; SDAG-NEXT: v_xor_b32_e32 v19, v25, v11 +; SDAG-NEXT: v_xor_b32_e32 v20, v25, v10 +; SDAG-NEXT: v_xor_b32_e32 v9, v25, v9 +; SDAG-NEXT: 
v_xor_b32_e32 v8, v25, v8 +; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v0, v24 +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v1, v24, vcc +; SDAG-NEXT: v_ffbh_u32_e32 v0, v2 +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v18, v24, vcc +; SDAG-NEXT: v_add_i32_e64 v1, s[4:5], 32, v0 +; SDAG-NEXT: v_ffbh_u32_e32 v18, v3 +; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v17, v24, vcc +; SDAG-NEXT: v_or_b32_e32 v0, v2, v10 +; SDAG-NEXT: v_ffbh_u32_e32 v17, v10 +; SDAG-NEXT: v_min_u32_e32 v18, v1, v18 +; SDAG-NEXT: v_sub_i32_e32 v28, vcc, v8, v25 +; SDAG-NEXT: v_or_b32_e32 v1, v3, v11 +; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], 32, v17 +; SDAG-NEXT: v_ffbh_u32_e32 v17, v11 +; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 64, v18 +; SDAG-NEXT: v_addc_u32_e64 v21, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v9, v25, vcc +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; SDAG-NEXT: v_ffbh_u32_e32 v1, v28 +; SDAG-NEXT: v_min_u32_e32 v8, v8, v17 +; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[10:11] +; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, s[6:7] +; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v20, v25, vcc +; SDAG-NEXT: v_add_i32_e64 v9, s[8:9], 32, v1 +; SDAG-NEXT: v_ffbh_u32_e32 v20, v29 +; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v8, s[6:7] +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v19, v25, vcc +; SDAG-NEXT: v_or_b32_e32 v8, v28, v0 +; SDAG-NEXT: v_ffbh_u32_e32 v19, v0 +; SDAG-NEXT: v_min_u32_e32 v20, v9, v20 +; SDAG-NEXT: v_or_b32_e32 v9, v29, v1 +; SDAG-NEXT: v_add_i32_e32 v19, vcc, 32, v19 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v1 +; SDAG-NEXT: v_add_i32_e32 v20, vcc, 64, v20 +; SDAG-NEXT: v_addc_u32_e64 v22, s[6:7], 0, 0, vcc +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; SDAG-NEXT: v_min_u32_e32 v8, v19, v21 +; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v22, 0, s[6:7] +; SDAG-NEXT: s_or_b64 s[8:9], vcc, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v20, v8, s[6:7] +; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v8, v18 +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v9, v17, vcc +; SDAG-NEXT: 
v_xor_b32_e32 v17, 0x7f, v8 +; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v16, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[8:9] +; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v16, vcc +; SDAG-NEXT: v_or_b32_e32 v16, v17, v18 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v17, v9, v19 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] +; SDAG-NEXT: v_and_b32_e32 v16, 1, v20 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 +; SDAG-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v11, 0, s[4:5] +; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v10, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v21, v3, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v2, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB0_6 +; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 +; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v8 +; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v8 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v9, vcc +; SDAG-NEXT: v_lshl_b64 v[20:21], v[2:3], v20 +; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v18, vcc +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v19, vcc +; SDAG-NEXT: v_or_b32_e32 v18, v30, v32 +; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v8 +; SDAG-NEXT: v_or_b32_e32 v19, v31, v33 +; SDAG-NEXT: v_lshl_b64 v[8:9], v[10:11], v34 +; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v34 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[2:3], v34 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_lshr_b64 v[18:19], v[2:3], v35 +; SDAG-NEXT: v_or_b32_e32 v9, v9, v19 +; SDAG-NEXT: v_or_b32_e32 v8, v8, v18 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34 +; SDAG-NEXT: v_cndmask_b32_e64 v9, 
v21, v9, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v20, v8, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v34 +; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v18, 0 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB0_5 +; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 +; SDAG-NEXT: v_lshr_b64 v[16:17], v[2:3], v30 +; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v30 +; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v30 +; SDAG-NEXT: v_lshr_b64 v[37:38], v[10:11], v30 +; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v28 +; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_mov_b32_e32 v23, 0 +; SDAG-NEXT: v_mov_b32_e32 v18, 0 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_lshl_b64 v[48:49], v[10:11], v35 +; SDAG-NEXT: v_lshr_b64 v[10:11], v[10:11], v36 +; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v29, vcc +; SDAG-NEXT: v_or_b32_e32 v17, v17, v49 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v48 +; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v0, vcc +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v11, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v10, v16, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v38, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v37, s[4:5] +; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v1, vcc +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 +; SDAG-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc +; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: .LBB0_3: ; %udiv-do-while3 +; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v3 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; SDAG-NEXT: 
v_lshrrev_b32_e32 v38, 31, v9 +; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v21 +; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v16 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v38 +; SDAG-NEXT: v_or_b32_e32 v8, v8, v39 +; SDAG-NEXT: v_or_b32_e32 v9, v19, v9 +; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v34, v2 +; SDAG-NEXT: v_or_b32_e32 v8, v18, v8 +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v35, v3, vcc +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v36, v10, vcc +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v37, v11, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v38, 31, v16 +; SDAG-NEXT: v_and_b32_e32 v39, v38, v28 +; SDAG-NEXT: v_and_b32_e32 v48, v38, v29 +; SDAG-NEXT: v_and_b32_e32 v49, v38, v0 +; SDAG-NEXT: v_and_b32_e32 v16, 1, v38 +; SDAG-NEXT: v_and_b32_e32 v38, v38, v1 +; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v39 +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v48, vcc +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v10, v49, vcc +; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v38, vcc +; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30 +; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc +; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc +; SDAG-NEXT: v_or_b32_e32 v38, v30, v32 +; SDAG-NEXT: v_or_b32_e32 v39, v31, v33 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[38:39] +; SDAG-NEXT: v_or_b32_e32 v21, v23, v21 +; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: v_or_b32_e32 v20, v22, v20 +; SDAG-NEXT: v_mov_b32_e32 v23, v17 +; SDAG-NEXT: v_mov_b32_e32 v22, v16 +; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_cbranch_execnz .LBB0_3 +; SDAG-NEXT: ; %bb.4: ; %Flow13 +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: .LBB0_5: ; %Flow14 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_lshl_b64 v[0:1], v[8:9], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v21 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v8 +; SDAG-NEXT: v_or_b32_e32 v20, v19, 
v1 +; SDAG-NEXT: v_or_b32_e32 v21, v17, v3 +; SDAG-NEXT: v_or_b32_e32 v17, v18, v0 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v2 +; SDAG-NEXT: .LBB0_6: ; %Flow16 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: v_ashrrev_i32_e32 v18, 31, v7 +; SDAG-NEXT: v_ashrrev_i32_e32 v19, 31, v15 +; SDAG-NEXT: v_mov_b32_e32 v9, 0 +; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f +; SDAG-NEXT: v_mov_b32_e32 v22, v18 +; SDAG-NEXT: v_mov_b32_e32 v23, v19 +; SDAG-NEXT: v_xor_b32_e32 v0, v18, v7 +; SDAG-NEXT: v_xor_b32_e32 v1, v18, v6 +; SDAG-NEXT: v_xor_b32_e32 v3, v18, v5 +; SDAG-NEXT: v_xor_b32_e32 v2, v18, v4 +; SDAG-NEXT: v_xor_b32_e32 v6, v19, v15 +; SDAG-NEXT: v_xor_b32_e32 v7, v19, v14 +; SDAG-NEXT: v_xor_b32_e32 v8, v19, v13 +; SDAG-NEXT: v_xor_b32_e32 v10, v19, v12 +; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v18 +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v18, vcc +; SDAG-NEXT: v_ffbh_u32_e32 v5, v2 +; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v1, v18, vcc +; SDAG-NEXT: v_add_i32_e64 v1, s[4:5], 32, v5 +; SDAG-NEXT: v_ffbh_u32_e32 v11, v3 +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v0, v18, vcc +; SDAG-NEXT: v_or_b32_e32 v0, v2, v4 +; SDAG-NEXT: v_ffbh_u32_e32 v12, v4 +; SDAG-NEXT: v_min_u32_e32 v11, v1, v11 +; SDAG-NEXT: v_sub_i32_e32 v28, vcc, v10, v19 +; SDAG-NEXT: v_or_b32_e32 v1, v3, v5 +; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], 32, v12 +; SDAG-NEXT: v_ffbh_u32_e32 v12, v5 +; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], 64, v11 +; SDAG-NEXT: v_addc_u32_e64 v13, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v8, v19, vcc +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; SDAG-NEXT: v_ffbh_u32_e32 v1, v28 +; SDAG-NEXT: v_min_u32_e32 v8, v10, v12 +; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v13, 0, s[6:7] +; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v7, v19, vcc +; SDAG-NEXT: v_add_i32_e64 v7, s[8:9], 32, v1 +; SDAG-NEXT: v_ffbh_u32_e32 v12, v29 +; SDAG-NEXT: v_cndmask_b32_e64 v8, v11, v8, s[6:7] +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v6, v19, vcc +; 
SDAG-NEXT: v_or_b32_e32 v6, v28, v0 +; SDAG-NEXT: v_ffbh_u32_e32 v11, v0 +; SDAG-NEXT: v_min_u32_e32 v12, v7, v12 +; SDAG-NEXT: v_or_b32_e32 v7, v29, v1 +; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v11 +; SDAG-NEXT: v_ffbh_u32_e32 v13, v1 +; SDAG-NEXT: v_add_i32_e32 v12, vcc, 64, v12 +; SDAG-NEXT: v_addc_u32_e64 v14, s[6:7], 0, 0, vcc +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; SDAG-NEXT: v_min_u32_e32 v6, v11, v13 +; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v7, v14, 0, s[6:7] +; SDAG-NEXT: s_or_b64 s[8:9], vcc, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v6, v12, v6, s[6:7] +; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v10, vcc +; SDAG-NEXT: v_xor_b32_e32 v10, 0x7f, v6 +; SDAG-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v9, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; SDAG-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc +; SDAG-NEXT: v_or_b32_e32 v10, v10, v8 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v11, v7, v9 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; SDAG-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; SDAG-NEXT: v_and_b32_e32 v10, 1, v12 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10 +; SDAG-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v13, v5, 0, s[4:5] +; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; SDAG-NEXT: v_cndmask_b32_e64 v11, v4, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v14, v3, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB0_12 +; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 +; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v6 +; SDAG-NEXT: v_sub_i32_e64 v12, s[4:5], 63, v6 +; SDAG-NEXT: v_mov_b32_e32 v10, 0 +; SDAG-NEXT: v_mov_b32_e32 v11, 0 +; 
SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v7, vcc +; SDAG-NEXT: v_lshl_b64 v[12:13], v[2:3], v12 +; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v8, vcc +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v9, vcc +; SDAG-NEXT: v_or_b32_e32 v7, v30, v32 +; SDAG-NEXT: v_sub_i32_e32 v9, vcc, 0x7f, v6 +; SDAG-NEXT: v_or_b32_e32 v8, v31, v33 +; SDAG-NEXT: v_lshl_b64 v[14:15], v[4:5], v9 +; SDAG-NEXT: v_sub_i32_e32 v6, vcc, 64, v9 +; SDAG-NEXT: v_lshl_b64 v[34:35], v[2:3], v9 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] +; SDAG-NEXT: v_lshr_b64 v[6:7], v[2:3], v6 +; SDAG-NEXT: v_or_b32_e32 v7, v15, v7 +; SDAG-NEXT: v_or_b32_e32 v6, v14, v6 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v9 +; SDAG-NEXT: v_cndmask_b32_e64 v8, v13, v7, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v35, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v34, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9 +; SDAG-NEXT: v_cndmask_b32_e64 v9, v8, v5, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v12, v4, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v12, 0 +; SDAG-NEXT: v_mov_b32_e32 v13, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB0_11 +; SDAG-NEXT: ; %bb.8: ; %udiv-preheader +; SDAG-NEXT: v_lshr_b64 v[10:11], v[2:3], v30 +; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v30 +; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v30 +; SDAG-NEXT: v_lshr_b64 v[37:38], v[4:5], v30 +; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v28 +; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: v_mov_b32_e32 v14, 0 +; SDAG-NEXT: v_mov_b32_e32 v15, 0 +; SDAG-NEXT: v_mov_b32_e32 v12, 0 +; SDAG-NEXT: v_mov_b32_e32 v13, 0 +; SDAG-NEXT: v_lshl_b64 v[48:49], v[4:5], v35 +; SDAG-NEXT: v_lshr_b64 v[4:5], v[4:5], v36 +; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v29, vcc +; SDAG-NEXT: v_or_b32_e32 v11, v11, v49 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v48 +; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v0, vcc +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 
+; SDAG-NEXT: v_cndmask_b32_e64 v11, v5, v11, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v4, v10, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v38, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v37, s[4:5] +; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v1, vcc +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 +; SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; SDAG-NEXT: v_mov_b32_e32 v11, 0 +; SDAG-NEXT: .LBB0_9: ; %udiv-do-while +; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v10, 31, v3 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v9 +; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v7 +; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; SDAG-NEXT: v_or_b32_e32 v4, v4, v10 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v38 +; SDAG-NEXT: v_or_b32_e32 v8, v8, v39 +; SDAG-NEXT: v_or_b32_e32 v9, v13, v9 +; SDAG-NEXT: v_or_b32_e32 v7, v15, v7 +; SDAG-NEXT: v_or_b32_e32 v8, v12, v8 +; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v34, v2 +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v35, v3, vcc +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v36, v4, vcc +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v37, v5, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v15, 31, v10 +; SDAG-NEXT: v_and_b32_e32 v10, 1, v15 +; SDAG-NEXT: v_and_b32_e32 v38, v15, v1 +; SDAG-NEXT: v_and_b32_e32 v39, v15, v0 +; SDAG-NEXT: v_and_b32_e32 v48, v15, v29 +; SDAG-NEXT: v_and_b32_e32 v15, v15, v28 +; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v15 +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v48, vcc +; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v4, v39, vcc +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v38, vcc +; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30 +; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc +; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc +; SDAG-NEXT: v_or_b32_e32 v39, v31, v33 +; SDAG-NEXT: v_or_b32_e32 v38, v30, v32 
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[38:39] +; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: v_or_b32_e32 v6, v14, v6 +; SDAG-NEXT: v_mov_b32_e32 v15, v11 +; SDAG-NEXT: v_mov_b32_e32 v14, v10 +; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_cbranch_execnz .LBB0_9 +; SDAG-NEXT: ; %bb.10: ; %Flow +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: .LBB0_11: ; %Flow11 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_lshl_b64 v[0:1], v[8:9], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v7 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[6:7], 1 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v4 +; SDAG-NEXT: v_or_b32_e32 v13, v13, v1 +; SDAG-NEXT: v_or_b32_e32 v14, v11, v3 +; SDAG-NEXT: v_or_b32_e32 v11, v12, v0 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v2 +; SDAG-NEXT: .LBB0_12: ; %Flow12 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: v_xor_b32_e32 v3, v27, v26 +; SDAG-NEXT: v_xor_b32_e32 v2, v25, v24 +; SDAG-NEXT: v_xor_b32_e32 v7, v23, v22 +; SDAG-NEXT: v_xor_b32_e32 v6, v19, v18 +; SDAG-NEXT: v_xor_b32_e32 v4, v20, v3 +; SDAG-NEXT: v_xor_b32_e32 v5, v17, v2 +; SDAG-NEXT: v_xor_b32_e32 v1, v21, v3 +; SDAG-NEXT: v_xor_b32_e32 v0, v16, v2 +; SDAG-NEXT: v_xor_b32_e32 v8, v13, v7 +; SDAG-NEXT: v_xor_b32_e32 v9, v11, v6 +; SDAG-NEXT: v_xor_b32_e32 v11, v14, v7 +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v5, v2, vcc +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc +; SDAG-NEXT: v_xor_b32_e32 v4, v10, v6 +; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v11, v7, vcc +; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v9, v6, vcc +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v8, v7, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sdiv_v2i128_vv: +; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b64 s[8:9], 0 +; GISEL-NEXT: v_ashrrev_i32_e32 v24, 31, v3 +; 
GISEL-NEXT: v_ashrrev_i32_e32 v25, 31, v11 +; GISEL-NEXT: v_mov_b32_e32 v20, 0x7f +; GISEL-NEXT: v_mov_b32_e32 v21, 0 +; GISEL-NEXT: v_xor_b32_e32 v0, v24, v0 +; GISEL-NEXT: v_xor_b32_e32 v1, v24, v1 +; GISEL-NEXT: v_xor_b32_e32 v2, v24, v2 +; GISEL-NEXT: v_xor_b32_e32 v3, v24, v3 +; GISEL-NEXT: v_xor_b32_e32 v8, v25, v8 +; GISEL-NEXT: v_xor_b32_e32 v9, v25, v9 +; GISEL-NEXT: v_xor_b32_e32 v10, v25, v10 +; GISEL-NEXT: v_xor_b32_e32 v11, v25, v11 +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v0, v24 +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v1, v24, vcc +; GISEL-NEXT: v_sub_i32_e64 v26, s[4:5], v8, v25 +; GISEL-NEXT: v_subb_u32_e64 v27, s[4:5], v9, v25, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v2, v24, vcc +; GISEL-NEXT: v_subb_u32_e32 v19, vcc, v3, v24, vcc +; GISEL-NEXT: v_subb_u32_e64 v10, vcc, v10, v25, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v11, v25, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v8, v27 +; GISEL-NEXT: v_ffbh_u32_e32 v9, v26 +; GISEL-NEXT: v_ffbh_u32_e32 v22, v17 +; GISEL-NEXT: v_ffbh_u32_e32 v23, v16 +; GISEL-NEXT: v_or_b32_e32 v0, v26, v10 +; GISEL-NEXT: v_or_b32_e32 v1, v27, v11 +; GISEL-NEXT: v_or_b32_e32 v2, v16, v18 +; GISEL-NEXT: v_or_b32_e32 v3, v17, v19 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, 32, v9 +; GISEL-NEXT: v_ffbh_u32_e32 v28, v11 +; GISEL-NEXT: v_ffbh_u32_e32 v29, v10 +; GISEL-NEXT: v_add_i32_e32 v23, vcc, 32, v23 +; GISEL-NEXT: v_ffbh_u32_e32 v30, v19 +; GISEL-NEXT: v_ffbh_u32_e32 v31, v18 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] +; GISEL-NEXT: v_min_u32_e32 v0, v8, v9 +; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v29 +; GISEL-NEXT: v_min_u32_e32 v2, v22, v23 +; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v31 +; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 64, v0 +; GISEL-NEXT: v_min_u32_e32 v1, v28, v1 +; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 64, v2 +; GISEL-NEXT: v_min_u32_e32 v3, v30, v3 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] +; 
GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v2, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[20:21] +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; GISEL-NEXT: v_xor_b32_e32 v8, 0x7f, v0 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v8, v8, v2 +; GISEL-NEXT: v_or_b32_e32 v9, v1, v3 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v9, v22, v20 +; GISEL-NEXT: v_and_b32_e32 v20, 1, v9 +; GISEL-NEXT: v_or_b32_e32 v8, v9, v8 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v20, v16, 0, vcc +; GISEL-NEXT: v_and_b32_e32 v22, 1, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v21, v17, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, v18, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v9, v19, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB0_6 +; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 +; GISEL-NEXT: v_add_i32_e32 v28, vcc, 1, v0 +; GISEL-NEXT: v_addc_u32_e64 v29, s[4:5], 0, v1, vcc +; GISEL-NEXT: v_sub_i32_e32 v32, vcc, 0x7f, v0 +; GISEL-NEXT: v_addc_u32_e64 v30, vcc, 0, v2, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v31, vcc, 0, v3, vcc +; GISEL-NEXT: v_subrev_i32_e64 v20, s[4:5], 64, v32 +; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], 64, v32 +; GISEL-NEXT: v_lshl_b64 v[0:1], v[16:17], v32 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[18:19], v32 +; GISEL-NEXT: s_xor_b64 
s[4:5], vcc, -1 +; GISEL-NEXT: v_lshr_b64 v[8:9], v[16:17], v8 +; GISEL-NEXT: v_lshl_b64 v[22:23], v[16:17], v20 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v32 +; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc +; GISEL-NEXT: v_or_b32_e32 v0, v8, v2 +; GISEL-NEXT: v_or_b32_e32 v1, v9, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v0, v18, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v9, v1, v19, vcc +; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] +; GISEL-NEXT: v_mov_b32_e32 v0, s8 +; GISEL-NEXT: v_mov_b32_e32 v1, s9 +; GISEL-NEXT: v_mov_b32_e32 v2, s10 +; GISEL-NEXT: v_mov_b32_e32 v3, s11 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB0_5 +; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 +; GISEL-NEXT: v_subrev_i32_e32 v34, vcc, 64, v28 +; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v28 +; GISEL-NEXT: v_lshr_b64 v[0:1], v[18:19], v28 +; GISEL-NEXT: v_lshr_b64 v[2:3], v[16:17], v28 +; GISEL-NEXT: v_add_i32_e32 v32, vcc, -1, v26 +; GISEL-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v28 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v28 +; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v27, vcc +; GISEL-NEXT: v_lshl_b64 v[22:23], v[18:19], v22 +; GISEL-NEXT: v_lshr_b64 v[36:37], v[18:19], v34 +; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, v0, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, v1, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v34, vcc, -1, v10, vcc +; GISEL-NEXT: v_or_b32_e32 v0, v2, v22 +; GISEL-NEXT: v_or_b32_e32 v1, v3, v23 +; GISEL-NEXT: v_addc_u32_e32 v35, vcc, -1, v11, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v0, v36, v0, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v1, v37, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v22, v0, v16, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v23, v1, v17, s[6:7] +; GISEL-NEXT: 
v_mov_b32_e32 v17, 0 +; GISEL-NEXT: v_mov_b32_e32 v0, s8 +; GISEL-NEXT: v_mov_b32_e32 v1, s9 +; GISEL-NEXT: v_mov_b32_e32 v2, s10 +; GISEL-NEXT: v_mov_b32_e32 v3, s11 +; GISEL-NEXT: .LBB0_3: ; %udiv-do-while3 +; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-NEXT: v_lshrrev_b32_e32 v16, 31, v21 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 +; GISEL-NEXT: v_lshl_b64 v[36:37], v[22:23], 1 +; GISEL-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v23 +; GISEL-NEXT: v_lshrrev_b32_e32 v23, 31, v9 +; GISEL-NEXT: v_add_i32_e32 v28, vcc, -1, v28 +; GISEL-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc +; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GISEL-NEXT: v_or_b32_e32 v20, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v21, v1, v3 +; GISEL-NEXT: v_or_b32_e32 v2, v18, v22 +; GISEL-NEXT: v_or_b32_e32 v3, v36, v23 +; GISEL-NEXT: v_addc_u32_e32 v30, vcc, -1, v30, vcc +; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc +; GISEL-NEXT: v_or_b32_e32 v8, v8, v16 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v32, v3 +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v33, v37, vcc +; GISEL-NEXT: v_or_b32_e32 v0, v28, v30 +; GISEL-NEXT: v_or_b32_e32 v1, v29, v31 +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v34, v2, vcc +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v35, v19, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v16 +; GISEL-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GISEL-NEXT: v_and_b32_e32 v1, v0, v26 +; GISEL-NEXT: v_and_b32_e32 v18, v0, v27 +; GISEL-NEXT: v_and_b32_e32 v16, 1, v0 +; GISEL-NEXT: v_and_b32_e32 v36, v0, v10 +; GISEL-NEXT: v_and_b32_e32 v0, v0, v11 +; GISEL-NEXT: v_sub_i32_e32 v22, vcc, v3, v1 +; GISEL-NEXT: v_subb_u32_e32 v23, vcc, v37, v18, vcc +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v2, v36, vcc +; GISEL-NEXT: v_subb_u32_e32 v19, vcc, v19, v0, vcc +; GISEL-NEXT: v_mov_b32_e32 v0, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GISEL-NEXT: s_cbranch_execnz .LBB0_3 +; 
GISEL-NEXT: ; %bb.4: ; %Flow13 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: .LBB0_5: ; %Flow14 +; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] +; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 +; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v10, 31, v21 +; GISEL-NEXT: v_or_b32_e32 v8, v8, v10 +; GISEL-NEXT: v_or_b32_e32 v20, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v21, v1, v3 +; GISEL-NEXT: .LBB0_6: ; %Flow16 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: s_mov_b64 s[8:9], 0 +; GISEL-NEXT: v_ashrrev_i32_e32 v18, 31, v7 +; GISEL-NEXT: v_ashrrev_i32_e32 v19, 31, v15 +; GISEL-NEXT: v_mov_b32_e32 v10, 0x7f +; GISEL-NEXT: v_mov_b32_e32 v11, 0 +; GISEL-NEXT: v_xor_b32_e32 v0, v18, v4 +; GISEL-NEXT: v_xor_b32_e32 v1, v18, v5 +; GISEL-NEXT: v_xor_b32_e32 v2, v18, v6 +; GISEL-NEXT: v_xor_b32_e32 v3, v18, v7 +; GISEL-NEXT: v_xor_b32_e32 v4, v19, v12 +; GISEL-NEXT: v_xor_b32_e32 v5, v19, v13 +; GISEL-NEXT: v_xor_b32_e32 v14, v19, v14 +; GISEL-NEXT: v_xor_b32_e32 v15, v19, v15 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v0, v18 +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v1, v18, vcc +; GISEL-NEXT: v_sub_i32_e64 v22, s[4:5], v4, v19 +; GISEL-NEXT: v_subb_u32_e64 v23, s[4:5], v5, v19, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v12, vcc, v2, v18, vcc +; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v3, v18, vcc +; GISEL-NEXT: v_subb_u32_e64 v4, vcc, v14, v19, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v15, v19, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v14, v23 +; GISEL-NEXT: v_ffbh_u32_e32 v15, v22 +; GISEL-NEXT: v_ffbh_u32_e32 v16, v7 +; GISEL-NEXT: v_ffbh_u32_e32 v17, v6 +; GISEL-NEXT: v_or_b32_e32 v0, v22, v4 +; GISEL-NEXT: v_or_b32_e32 v1, v23, v5 +; GISEL-NEXT: v_or_b32_e32 v2, v6, v12 +; GISEL-NEXT: v_or_b32_e32 v3, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, 32, v15 +; GISEL-NEXT: v_ffbh_u32_e32 v26, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v27, v4 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, 32, v17 +; GISEL-NEXT: v_ffbh_u32_e32 v28, v13 +; GISEL-NEXT: v_ffbh_u32_e32 
v29, v12 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] +; GISEL-NEXT: v_min_u32_e32 v0, v14, v15 +; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v27 +; GISEL-NEXT: v_min_u32_e32 v2, v16, v17 +; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v29 +; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 64, v0 +; GISEL-NEXT: v_min_u32_e32 v1, v26, v1 +; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 64, v2 +; GISEL-NEXT: v_min_u32_e32 v3, v28, v3 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v2, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[10:11] +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_xor_b32_e32 v10, 0x7f, v0 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v10, v10, v2 +; GISEL-NEXT: v_or_b32_e32 v11, v1, v3 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v11, v14, v15 +; GISEL-NEXT: v_and_b32_e32 v14, 1, v11 +; GISEL-NEXT: v_or_b32_e32 v10, v11, v10 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v6, 0, vcc +; GISEL-NEXT: v_and_b32_e32 v16, 1, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v15, v7, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v10, v12, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v11, v13, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: 
s_and_saveexec_b64 s[12:13], s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB0_12 +; GISEL-NEXT: ; %bb.7: ; %udiv-bb1 +; GISEL-NEXT: v_add_i32_e32 v26, vcc, 1, v0 +; GISEL-NEXT: v_addc_u32_e64 v27, s[4:5], 0, v1, vcc +; GISEL-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v0 +; GISEL-NEXT: v_addc_u32_e64 v28, vcc, 0, v2, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v29, vcc, 0, v3, vcc +; GISEL-NEXT: v_subrev_i32_e64 v14, s[4:5], 64, v30 +; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 64, v30 +; GISEL-NEXT: v_lshl_b64 v[0:1], v[6:7], v30 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[12:13], v30 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: v_lshr_b64 v[10:11], v[6:7], v10 +; GISEL-NEXT: v_lshl_b64 v[16:17], v[6:7], v14 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30 +; GISEL-NEXT: v_cndmask_b32_e32 v14, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v15, 0, v1, vcc +; GISEL-NEXT: v_or_b32_e32 v0, v10, v2 +; GISEL-NEXT: v_or_b32_e32 v1, v11, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v0, v12, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, v1, v13, vcc +; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] +; GISEL-NEXT: v_mov_b32_e32 v0, s8 +; GISEL-NEXT: v_mov_b32_e32 v1, s9 +; GISEL-NEXT: v_mov_b32_e32 v2, s10 +; GISEL-NEXT: v_mov_b32_e32 v3, s11 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB0_11 +; GISEL-NEXT: ; %bb.8: ; %udiv-preheader +; GISEL-NEXT: v_subrev_i32_e32 v32, vcc, 64, v26 +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 64, v26 +; GISEL-NEXT: v_lshr_b64 v[0:1], v[12:13], v26 +; GISEL-NEXT: v_lshr_b64 v[2:3], v[6:7], v26 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v22 +; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v23, vcc +; GISEL-NEXT: v_lshl_b64 v[16:17], v[12:13], v16 +; GISEL-NEXT: v_lshr_b64 v[12:13], v[12:13], v32 +; GISEL-NEXT: v_addc_u32_e32 v32, vcc, 
-1, v4, vcc +; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v5, vcc +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: v_or_b32_e32 v2, v2, v16 +; GISEL-NEXT: v_or_b32_e32 v3, v3, v17 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v2, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, v3, v7, vcc +; GISEL-NEXT: v_mov_b32_e32 v7, 0 +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s7 +; GISEL-NEXT: .LBB0_9: ; %udiv-do-while +; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[12:13], 1 +; GISEL-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v6, 31, v13 +; GISEL-NEXT: v_lshrrev_b32_e32 v34, 31, v11 +; GISEL-NEXT: v_lshl_b64 v[12:13], v[14:15], 1 +; GISEL-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v14, 31, v15 +; GISEL-NEXT: v_add_i32_e32 v26, vcc, -1, v26 +; GISEL-NEXT: v_addc_u32_e32 v27, vcc, -1, v27, vcc +; GISEL-NEXT: v_or_b32_e32 v16, v16, v6 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v34 +; GISEL-NEXT: v_or_b32_e32 v10, v10, v14 +; GISEL-NEXT: v_or_b32_e32 v14, v0, v12 +; GISEL-NEXT: v_or_b32_e32 v15, v1, v13 +; GISEL-NEXT: v_addc_u32_e32 v28, vcc, -1, v28, vcc +; GISEL-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v30, v2 +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v31, v3, vcc +; GISEL-NEXT: v_or_b32_e32 v0, v26, v28 +; GISEL-NEXT: v_or_b32_e32 v1, v27, v29 +; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v32, v16, vcc +; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v33, v17, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v6 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] +; GISEL-NEXT: v_and_b32_e32 v6, 1, v0 +; GISEL-NEXT: v_and_b32_e32 v12, v0, v22 +; GISEL-NEXT: v_and_b32_e32 v13, v0, v23 +; GISEL-NEXT: v_and_b32_e32 v34, v0, v4 +; GISEL-NEXT: v_and_b32_e32 v35, v0, v5 +; GISEL-NEXT: v_mov_b32_e32 v0, v6 +; GISEL-NEXT: v_mov_b32_e32 v1, v7 +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v2, v12 +; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v3, v13, vcc +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v16, v34, vcc +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v17, v35, vcc +; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_cbranch_execnz .LBB0_9 +; GISEL-NEXT: ; %bb.10: ; %Flow +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB0_11: ; %Flow11 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: v_lshl_b64 v[2:3], v[14:15], 1 +; GISEL-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v4, 31, v15 +; GISEL-NEXT: v_or_b32_e32 v10, v10, v4 +; GISEL-NEXT: v_or_b32_e32 v14, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v15, v1, v3 +; GISEL-NEXT: .LBB0_12: ; %Flow12 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: v_xor_b32_e32 v3, v25, v24 +; GISEL-NEXT: v_xor_b32_e32 v7, v19, v18 +; GISEL-NEXT: v_xor_b32_e32 v0, v20, v3 +; GISEL-NEXT: v_xor_b32_e32 v1, v21, v3 +; GISEL-NEXT: v_xor_b32_e32 v2, v8, v3 +; GISEL-NEXT: v_xor_b32_e32 v6, v9, v3 +; GISEL-NEXT: v_xor_b32_e32 v4, v14, v7 +; GISEL-NEXT: v_xor_b32_e32 v5, v15, v7 +; GISEL-NEXT: v_xor_b32_e32 v8, v10, v7 +; GISEL-NEXT: v_xor_b32_e32 v9, v11, v7 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v7 +; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v7, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v3, vcc +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc +; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v8, v7, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v9, v7, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %shl = sdiv <2 x i128> %lhs, %rhs ret <2 x i128> %shl } define <2 x i128> 
@v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { +; SDAG-LABEL: v_udiv_v2i128_vv: +; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_or_b32_e32 v17, v9, v11 +; SDAG-NEXT: v_or_b32_e32 v16, v8, v10 +; SDAG-NEXT: v_or_b32_e32 v19, v1, v3 +; SDAG-NEXT: v_or_b32_e32 v18, v0, v2 +; SDAG-NEXT: v_ffbh_u32_e32 v20, v10 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v11 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v8 +; SDAG-NEXT: v_ffbh_u32_e32 v23, v9 +; SDAG-NEXT: v_ffbh_u32_e32 v24, v2 +; SDAG-NEXT: v_ffbh_u32_e32 v25, v3 +; SDAG-NEXT: v_ffbh_u32_e32 v26, v0 +; SDAG-NEXT: v_ffbh_u32_e32 v27, v1 +; SDAG-NEXT: v_mov_b32_e32 v28, 0 +; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] +; SDAG-NEXT: v_add_i32_e64 v16, s[6:7], 32, v20 +; SDAG-NEXT: v_add_i32_e64 v17, s[6:7], 32, v22 +; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 32, v24 +; SDAG-NEXT: v_add_i32_e64 v19, s[6:7], 32, v26 +; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] +; SDAG-NEXT: v_min_u32_e32 v16, v16, v21 +; SDAG-NEXT: v_min_u32_e32 v17, v17, v23 +; SDAG-NEXT: v_min_u32_e32 v18, v18, v25 +; SDAG-NEXT: v_min_u32_e32 v19, v19, v27 +; SDAG-NEXT: v_add_i32_e32 v17, vcc, 64, v17 +; SDAG-NEXT: v_addc_u32_e64 v20, s[4:5], 0, 0, vcc +; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v19 +; SDAG-NEXT: v_addc_u32_e64 v21, s[4:5], 0, 0, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v20, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc +; SDAG-NEXT: v_sub_i32_e32 v23, vcc, v16, v18 +; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v20, v17, vcc +; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v23 +; SDAG-NEXT: v_subbrev_u32_e32 v25, vcc, 0, v28, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[23:24] +; SDAG-NEXT: 
v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; SDAG-NEXT: v_subbrev_u32_e32 v26, vcc, 0, v28, vcc +; SDAG-NEXT: v_or_b32_e32 v16, v16, v25 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[25:26] +; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v17, v24, v26 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[25:26] +; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] +; SDAG-NEXT: v_and_b32_e32 v16, 1, v18 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 +; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v3, 0, s[4:5] +; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v2, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v1, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v19, v0, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc +; SDAG-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB1_6 +; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 +; SDAG-NEXT: v_add_i32_e32 v18, vcc, 1, v23 +; SDAG-NEXT: v_sub_i32_e64 v16, s[4:5], 63, v23 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_addc_u32_e32 v27, vcc, 0, v24, vcc +; SDAG-NEXT: v_lshl_b64 v[16:17], v[0:1], v16 +; SDAG-NEXT: v_addc_u32_e32 v28, vcc, 0, v25, vcc +; SDAG-NEXT: v_addc_u32_e32 v29, vcc, 0, v26, vcc +; SDAG-NEXT: v_or_b32_e32 v19, v18, v28 +; SDAG-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v23 +; SDAG-NEXT: v_or_b32_e32 v20, v27, v29 +; SDAG-NEXT: v_lshl_b64 v[23:24], v[2:3], v30 +; SDAG-NEXT: v_sub_i32_e32 v31, vcc, 64, v30 +; SDAG-NEXT: v_lshl_b64 v[25:26], v[0:1], v30 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[19:20] +; SDAG-NEXT: v_lshr_b64 v[19:20], v[0:1], v31 +; SDAG-NEXT: v_or_b32_e32 v20, v24, v20 +; SDAG-NEXT: v_or_b32_e32 v19, v23, v19 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v20, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v19, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v24, 0, v26, s[4:5] +; SDAG-NEXT: 
v_cndmask_b32_e64 v23, 0, v25, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v30 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v2, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB1_5 +; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 +; SDAG-NEXT: v_lshr_b64 v[21:22], v[0:1], v18 +; SDAG-NEXT: v_sub_i32_e32 v31, vcc, 64, v18 +; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v18 +; SDAG-NEXT: v_lshr_b64 v[32:33], v[2:3], v18 +; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v8 +; SDAG-NEXT: s_mov_b64 s[12:13], 0 +; SDAG-NEXT: v_mov_b32_e32 v25, 0 +; SDAG-NEXT: v_mov_b32_e32 v26, 0 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v18 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v18 +; SDAG-NEXT: v_lshl_b64 v[34:35], v[2:3], v31 +; SDAG-NEXT: v_lshr_b64 v[36:37], v[2:3], v36 +; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v9, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v33, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v32, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v22, v22, v35 +; SDAG-NEXT: v_or_b32_e32 v21, v21, v34 +; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v10, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v22, v37, v22, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v21, v36, v21, s[4:5] +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v11, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, v22, v1, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v0, v21, v0, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: .LBB1_3: ; %udiv-do-while3 +; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-NEXT: v_lshrrev_b32_e32 v21, 31, v24 +; SDAG-NEXT: v_lshl_b64 v[23:24], v[23:24], 1 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v34, 31, v1 +; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v35, 31, v17 +; SDAG-NEXT: v_lshl_b64 
v[16:17], v[16:17], 1 +; SDAG-NEXT: v_or_b32_e32 v24, v26, v24 +; SDAG-NEXT: v_or_b32_e32 v23, v25, v23 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v34 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v35 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v21 +; SDAG-NEXT: v_sub_i32_e32 v21, vcc, v30, v0 +; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v31, v1, vcc +; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v32, v2, vcc +; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v33, v3, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v21, 31, v21 +; SDAG-NEXT: v_and_b32_e32 v25, v21, v8 +; SDAG-NEXT: v_and_b32_e32 v26, v21, v9 +; SDAG-NEXT: v_and_b32_e32 v34, v21, v10 +; SDAG-NEXT: v_and_b32_e32 v35, v21, v11 +; SDAG-NEXT: v_and_b32_e32 v21, 1, v21 +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v25 +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v26, vcc +; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v34, vcc +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v35, vcc +; SDAG-NEXT: v_add_i32_e32 v18, vcc, -1, v18 +; SDAG-NEXT: v_addc_u32_e32 v27, vcc, -1, v27, vcc +; SDAG-NEXT: v_addc_u32_e32 v28, vcc, -1, v28, vcc +; SDAG-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc +; SDAG-NEXT: v_or_b32_e32 v25, v18, v28 +; SDAG-NEXT: v_or_b32_e32 v26, v27, v29 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[25:26] +; SDAG-NEXT: v_or_b32_e32 v17, v20, v17 +; SDAG-NEXT: s_or_b64 s[12:13], vcc, s[12:13] +; SDAG-NEXT: v_or_b32_e32 v16, v19, v16 +; SDAG-NEXT: v_mov_b32_e32 v26, v22 +; SDAG-NEXT: v_mov_b32_e32 v25, v21 +; SDAG-NEXT: s_andn2_b64 exec, exec, s[12:13] +; SDAG-NEXT: s_cbranch_execnz .LBB1_3 +; SDAG-NEXT: ; %bb.4: ; %Flow13 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: .LBB1_5: ; %Flow14 +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: v_lshl_b64 v[0:1], v[16:17], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v24 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[23:24], 1 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v8 +; SDAG-NEXT: v_or_b32_e32 v16, v20, v1 +; SDAG-NEXT: v_or_b32_e32 v18, v22, v3 +; SDAG-NEXT: v_or_b32_e32 v17, v19, v0 +; SDAG-NEXT: v_or_b32_e32 v19, v21, v2 +; SDAG-NEXT: 
.LBB1_6: ; %Flow16 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_or_b32_e32 v1, v13, v15 +; SDAG-NEXT: v_or_b32_e32 v0, v12, v14 +; SDAG-NEXT: v_or_b32_e32 v3, v5, v7 +; SDAG-NEXT: v_or_b32_e32 v2, v4, v6 +; SDAG-NEXT: v_ffbh_u32_e32 v8, v14 +; SDAG-NEXT: v_ffbh_u32_e32 v9, v15 +; SDAG-NEXT: v_ffbh_u32_e32 v10, v12 +; SDAG-NEXT: v_ffbh_u32_e32 v11, v13 +; SDAG-NEXT: v_ffbh_u32_e32 v20, v6 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v7 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v4 +; SDAG-NEXT: v_ffbh_u32_e32 v23, v5 +; SDAG-NEXT: v_mov_b32_e32 v24, 0 +; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] +; SDAG-NEXT: v_add_i32_e64 v0, s[6:7], 32, v8 +; SDAG-NEXT: v_add_i32_e64 v1, s[6:7], 32, v10 +; SDAG-NEXT: v_add_i32_e64 v2, s[6:7], 32, v20 +; SDAG-NEXT: v_add_i32_e64 v3, s[6:7], 32, v22 +; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] +; SDAG-NEXT: v_min_u32_e32 v0, v0, v9 +; SDAG-NEXT: v_min_u32_e32 v1, v1, v11 +; SDAG-NEXT: v_min_u32_e32 v2, v2, v21 +; SDAG-NEXT: v_min_u32_e32 v3, v3, v23 +; SDAG-NEXT: v_add_i32_e32 v1, vcc, 64, v1 +; SDAG-NEXT: v_addc_u32_e64 v8, s[4:5], 0, 0, vcc +; SDAG-NEXT: v_add_i32_e32 v3, vcc, 64, v3 +; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], 0, 0, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v8, v1, vcc +; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v0 +; SDAG-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v24, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; SDAG-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v24, vcc +; SDAG-NEXT: v_or_b32_e32 v8, v8, v2 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: 
v_cndmask_b32_e64 v11, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v9, v1, v3 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; SDAG-NEXT: v_and_b32_e32 v8, 1, v10 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8 +; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v7, 0, s[4:5] +; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; SDAG-NEXT: v_cndmask_b32_e64 v9, v6, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v5, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, v4, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB1_12 +; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 +; SDAG-NEXT: v_add_i32_e32 v8, vcc, 1, v0 +; SDAG-NEXT: v_sub_i32_e64 v9, s[4:5], 63, v0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc +; SDAG-NEXT: v_lshl_b64 v[9:10], v[4:5], v9 +; SDAG-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc +; SDAG-NEXT: v_addc_u32_e32 v25, vcc, 0, v3, vcc +; SDAG-NEXT: v_or_b32_e32 v1, v8, v24 +; SDAG-NEXT: v_sub_i32_e32 v3, vcc, 0x7f, v0 +; SDAG-NEXT: v_or_b32_e32 v2, v11, v25 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[6:7], v3 +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, 64, v3 +; SDAG-NEXT: v_lshl_b64 v[26:27], v[4:5], v3 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[1:2] +; SDAG-NEXT: v_lshr_b64 v[0:1], v[4:5], v0 +; SDAG-NEXT: v_or_b32_e32 v1, v23, v1 +; SDAG-NEXT: v_or_b32_e32 v0, v22, v0 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v3 +; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v27, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v26, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 +; SDAG-NEXT: v_cndmask_b32_e64 v3, v2, v7, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v9, v6, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v9, 0 +; SDAG-NEXT: v_mov_b32_e32 
v10, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB1_11 +; SDAG-NEXT: ; %bb.8: ; %udiv-preheader +; SDAG-NEXT: v_lshr_b64 v[20:21], v[4:5], v8 +; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v8 +; SDAG-NEXT: v_subrev_i32_e32 v28, vcc, 64, v8 +; SDAG-NEXT: v_lshr_b64 v[29:30], v[6:7], v8 +; SDAG-NEXT: v_add_i32_e32 v26, vcc, -1, v12 +; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_mov_b32_e32 v23, 0 +; SDAG-NEXT: v_mov_b32_e32 v9, 0 +; SDAG-NEXT: v_mov_b32_e32 v10, 0 +; SDAG-NEXT: v_lshl_b64 v[31:32], v[6:7], v27 +; SDAG-NEXT: v_lshr_b64 v[6:7], v[6:7], v28 +; SDAG-NEXT: v_addc_u32_e32 v27, vcc, -1, v13, vcc +; SDAG-NEXT: v_or_b32_e32 v21, v21, v32 +; SDAG-NEXT: v_or_b32_e32 v20, v20, v31 +; SDAG-NEXT: v_addc_u32_e32 v28, vcc, -1, v14, vcc +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v8 +; SDAG-NEXT: v_cndmask_b32_e64 v21, v7, v21, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v6, v20, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v30, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v29, s[4:5] +; SDAG-NEXT: v_addc_u32_e32 v29, vcc, -1, v15, vcc +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 +; SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc +; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: .LBB1_9: ; %udiv-do-while +; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v5 +; SDAG-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v30, 31, v3 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v31, 31, v1 +; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; SDAG-NEXT: v_or_b32_e32 v6, v6, v20 +; SDAG-NEXT: v_or_b32_e32 v4, v4, v30 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v31 +; SDAG-NEXT: v_or_b32_e32 v3, v10, v3 +; SDAG-NEXT: v_or_b32_e32 v1, v23, v1 +; SDAG-NEXT: v_or_b32_e32 v2, v9, v2 +; SDAG-NEXT: v_sub_i32_e32 
v20, vcc, v26, v4 +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v27, v5, vcc +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v28, v6, vcc +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v29, v7, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v23, 31, v20 +; SDAG-NEXT: v_and_b32_e32 v20, 1, v23 +; SDAG-NEXT: v_and_b32_e32 v30, v23, v15 +; SDAG-NEXT: v_and_b32_e32 v31, v23, v14 +; SDAG-NEXT: v_and_b32_e32 v32, v23, v13 +; SDAG-NEXT: v_and_b32_e32 v23, v23, v12 +; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v23 +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v32, vcc +; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v31, vcc +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v30, vcc +; SDAG-NEXT: v_add_i32_e32 v8, vcc, -1, v8 +; SDAG-NEXT: v_addc_u32_e32 v11, vcc, -1, v11, vcc +; SDAG-NEXT: v_addc_u32_e32 v24, vcc, -1, v24, vcc +; SDAG-NEXT: v_addc_u32_e32 v25, vcc, -1, v25, vcc +; SDAG-NEXT: v_or_b32_e32 v31, v11, v25 +; SDAG-NEXT: v_or_b32_e32 v30, v8, v24 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[30:31] +; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: v_or_b32_e32 v0, v22, v0 +; SDAG-NEXT: v_mov_b32_e32 v23, v21 +; SDAG-NEXT: v_mov_b32_e32 v22, v20 +; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_cbranch_execnz .LBB1_9 +; SDAG-NEXT: ; %bb.10: ; %Flow +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: .LBB1_11: ; %Flow11 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v1 +; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v4 +; SDAG-NEXT: v_or_b32_e32 v8, v10, v3 +; SDAG-NEXT: v_or_b32_e32 v10, v21, v1 +; SDAG-NEXT: v_or_b32_e32 v9, v9, v2 +; SDAG-NEXT: v_or_b32_e32 v11, v20, v0 +; SDAG-NEXT: .LBB1_12: ; %Flow12 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v0, v19 +; SDAG-NEXT: v_mov_b32_e32 v1, v18 +; SDAG-NEXT: v_mov_b32_e32 v2, v17 +; SDAG-NEXT: v_mov_b32_e32 v3, v16 +; SDAG-NEXT: v_mov_b32_e32 v4, v11 +; SDAG-NEXT: v_mov_b32_e32 v5, v10 +; SDAG-NEXT: v_mov_b32_e32 v6, 
v9 +; SDAG-NEXT: v_mov_b32_e32 v7, v8 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_udiv_v2i128_vv: +; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v16, v2 +; GISEL-NEXT: v_mov_b32_e32 v17, v3 +; GISEL-NEXT: s_mov_b64 s[8:9], 0 +; GISEL-NEXT: v_or_b32_e32 v2, v8, v10 +; GISEL-NEXT: v_or_b32_e32 v3, v9, v11 +; GISEL-NEXT: v_or_b32_e32 v18, v0, v16 +; GISEL-NEXT: v_or_b32_e32 v19, v1, v17 +; GISEL-NEXT: v_ffbh_u32_e32 v20, v9 +; GISEL-NEXT: v_ffbh_u32_e32 v21, v8 +; GISEL-NEXT: v_ffbh_u32_e32 v22, v11 +; GISEL-NEXT: v_ffbh_u32_e32 v23, v10 +; GISEL-NEXT: v_ffbh_u32_e32 v26, v1 +; GISEL-NEXT: v_ffbh_u32_e32 v27, v0 +; GISEL-NEXT: v_ffbh_u32_e32 v28, v17 +; GISEL-NEXT: v_ffbh_u32_e32 v29, v16 +; GISEL-NEXT: v_mov_b32_e32 v24, 0x7f +; GISEL-NEXT: v_mov_b32_e32 v25, 0 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] +; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 32, v21 +; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v23 +; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], 32, v27 +; GISEL-NEXT: v_add_i32_e64 v19, s[6:7], 32, v29 +; GISEL-NEXT: v_min_u32_e32 v2, v20, v2 +; GISEL-NEXT: v_min_u32_e32 v3, v22, v3 +; GISEL-NEXT: v_min_u32_e32 v18, v26, v18 +; GISEL-NEXT: v_min_u32_e32 v19, v28, v19 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v26, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v2, vcc, 64, v2 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, 64, v18 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; GISEL-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v18, vcc +; GISEL-NEXT: v_sub_i32_e32 v20, vcc, v2, v3 +; GISEL-NEXT: v_subb_u32_e64 v21, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v22, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v23, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[20:21], v[24:25] +; 
GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, 0x7f, v20 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[22:23] +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v2, v2, v22 +; GISEL-NEXT: v_or_b32_e32 v3, v21, v23 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[22:23] +; GISEL-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v3, v26, v18 +; GISEL-NEXT: v_and_b32_e32 v18, 1, v3 +; GISEL-NEXT: v_or_b32_e32 v2, v3, v2 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, v0, 0, vcc +; GISEL-NEXT: v_and_b32_e32 v24, 1, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v19, v1, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, v16, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v17, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB1_6 +; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 +; GISEL-NEXT: v_add_i32_e32 v26, vcc, 1, v20 +; GISEL-NEXT: v_addc_u32_e64 v27, s[4:5], 0, v21, vcc +; GISEL-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v20 +; GISEL-NEXT: v_addc_u32_e64 v28, vcc, 0, v22, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v29, vcc, 0, v23, vcc +; GISEL-NEXT: v_subrev_i32_e64 v22, s[4:5], 64, v30 +; GISEL-NEXT: v_sub_i32_e64 v20, s[4:5], 64, v30 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[0:1], v30 +; GISEL-NEXT: v_lshl_b64 v[18:19], v[16:17], v30 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: v_lshr_b64 v[20:21], v[0:1], v20 +; GISEL-NEXT: v_lshl_b64 v[24:25], v[0:1], v22 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30 +; GISEL-NEXT: v_cndmask_b32_e32 v22, 0, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v23, 0, v3, vcc +; GISEL-NEXT: v_or_b32_e32 v2, v20, v18 +; GISEL-NEXT: v_or_b32_e32 v3, v21, v19 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v24, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v25, v3, vcc +; 
GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc +; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] +; GISEL-NEXT: v_mov_b32_e32 v21, s11 +; GISEL-NEXT: v_mov_b32_e32 v20, s10 +; GISEL-NEXT: v_mov_b32_e32 v19, s9 +; GISEL-NEXT: v_mov_b32_e32 v18, s8 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB1_5 +; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 +; GISEL-NEXT: v_subrev_i32_e32 v32, vcc, 64, v26 +; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 64, v26 +; GISEL-NEXT: v_lshr_b64 v[18:19], v[16:17], v26 +; GISEL-NEXT: v_lshr_b64 v[20:21], v[0:1], v26 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v8 +; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v9, vcc +; GISEL-NEXT: v_lshl_b64 v[24:25], v[16:17], v24 +; GISEL-NEXT: v_lshr_b64 v[16:17], v[16:17], v32 +; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v10, vcc +; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v11, vcc +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: v_or_b32_e32 v20, v20, v24 +; GISEL-NEXT: v_or_b32_e32 v21, v21, v25 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26 +; GISEL-NEXT: v_cndmask_b32_e32 v20, v16, v20, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v21, v17, v21, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v18, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v19, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 +; GISEL-NEXT: v_cndmask_b32_e32 v24, v20, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v25, v21, v1, vcc +; GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-NEXT: v_mov_b32_e32 v21, s7 +; GISEL-NEXT: v_mov_b32_e32 v20, s6 +; GISEL-NEXT: v_mov_b32_e32 v19, s5 +; GISEL-NEXT: v_mov_b32_e32 v18, s4 +; GISEL-NEXT: .LBB1_3: ; %udiv-do-while3 +; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-NEXT: v_lshrrev_b32_e32 v34, 31, v23 +; GISEL-NEXT: v_lshl_b64 v[20:21], v[22:23], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v25 +; GISEL-NEXT: 
v_lshl_b64 v[24:25], v[24:25], 1 +; GISEL-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v35, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v26, vcc, -1, v26 +; GISEL-NEXT: v_addc_u32_e32 v27, vcc, -1, v27, vcc +; GISEL-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; GISEL-NEXT: v_or_b32_e32 v22, v18, v20 +; GISEL-NEXT: v_or_b32_e32 v23, v19, v21 +; GISEL-NEXT: v_or_b32_e32 v16, v16, v0 +; GISEL-NEXT: v_or_b32_e32 v20, v24, v35 +; GISEL-NEXT: v_addc_u32_e32 v28, vcc, -1, v28, vcc +; GISEL-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v30, v20 +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v31, v25, vcc +; GISEL-NEXT: v_or_b32_e32 v18, v26, v28 +; GISEL-NEXT: v_or_b32_e32 v19, v27, v29 +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v32, v16, vcc +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v33, v17, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] +; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v18, v0, v8 +; GISEL-NEXT: v_and_b32_e32 v19, v0, v9 +; GISEL-NEXT: v_and_b32_e32 v21, v0, v10 +; GISEL-NEXT: v_and_b32_e32 v35, v0, v11 +; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; GISEL-NEXT: v_sub_i32_e32 v24, vcc, v20, v18 +; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v25, v19, vcc +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v16, v21, vcc +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v17, v35, vcc +; GISEL-NEXT: v_or_b32_e32 v2, v2, v34 +; GISEL-NEXT: v_mov_b32_e32 v19, v1 +; GISEL-NEXT: v_mov_b32_e32 v18, v0 +; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_cbranch_execnz .LBB1_3 +; GISEL-NEXT: ; %bb.4: ; %Flow13 +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB1_5: ; %Flow14 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: v_lshl_b64 v[0:1], v[22:23], 1 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v8, 31, v23 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v8 +; GISEL-NEXT: v_or_b32_e32 v18, v18, v0 +; GISEL-NEXT: v_or_b32_e32 v19, v19, v1 +; 
GISEL-NEXT: .LBB1_6: ; %Flow16 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: s_mov_b64 s[8:9], 0 +; GISEL-NEXT: v_or_b32_e32 v0, v12, v14 +; GISEL-NEXT: v_or_b32_e32 v1, v13, v15 +; GISEL-NEXT: v_or_b32_e32 v8, v4, v6 +; GISEL-NEXT: v_or_b32_e32 v9, v5, v7 +; GISEL-NEXT: v_ffbh_u32_e32 v16, v13 +; GISEL-NEXT: v_ffbh_u32_e32 v17, v12 +; GISEL-NEXT: v_ffbh_u32_e32 v20, v15 +; GISEL-NEXT: v_ffbh_u32_e32 v21, v14 +; GISEL-NEXT: v_ffbh_u32_e32 v22, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v23, v4 +; GISEL-NEXT: v_ffbh_u32_e32 v24, v7 +; GISEL-NEXT: v_ffbh_u32_e32 v25, v6 +; GISEL-NEXT: v_mov_b32_e32 v10, 0x7f +; GISEL-NEXT: v_mov_b32_e32 v11, 0 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[8:9] +; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 32, v17 +; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v21 +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], 32, v23 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], 32, v25 +; GISEL-NEXT: v_min_u32_e32 v0, v16, v0 +; GISEL-NEXT: v_min_u32_e32 v1, v20, v1 +; GISEL-NEXT: v_min_u32_e32 v8, v22, v8 +; GISEL-NEXT: v_min_u32_e32 v9, v24, v9 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v0, vcc, 64, v0 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 64, v8 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[10:11] +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_xor_b32_e32 v8, 0x7f, v0 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[16:17] +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v8, v8, v16 +; GISEL-NEXT: 
v_or_b32_e32 v9, v1, v17 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; GISEL-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v9, v20, v10 +; GISEL-NEXT: v_and_b32_e32 v10, 1, v9 +; GISEL-NEXT: v_or_b32_e32 v8, v9, v8 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, v4, 0, vcc +; GISEL-NEXT: v_and_b32_e32 v20, 1, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v5, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, v6, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v9, v7, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB1_12 +; GISEL-NEXT: ; %bb.7: ; %udiv-bb1 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0 +; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, v1, vcc +; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v0 +; GISEL-NEXT: v_addc_u32_e64 v24, vcc, 0, v16, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v25, vcc, 0, v17, vcc +; GISEL-NEXT: v_subrev_i32_e64 v9, s[4:5], 64, v26 +; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 64, v26 +; GISEL-NEXT: v_lshl_b64 v[0:1], v[4:5], v26 +; GISEL-NEXT: v_lshl_b64 v[16:17], v[6:7], v26 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: v_lshr_b64 v[20:21], v[4:5], v10 +; GISEL-NEXT: v_lshl_b64 v[22:23], v[4:5], v9 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26 +; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v1, vcc +; GISEL-NEXT: v_or_b32_e32 v0, v20, v16 +; GISEL-NEXT: v_or_b32_e32 v1, v21, v17 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] +; GISEL-NEXT: v_mov_b32_e32 v23, s11 +; GISEL-NEXT: v_mov_b32_e32 v22, s10 
+; GISEL-NEXT: v_mov_b32_e32 v21, s9 +; GISEL-NEXT: v_mov_b32_e32 v20, s8 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB1_11 +; GISEL-NEXT: ; %bb.8: ; %udiv-preheader +; GISEL-NEXT: v_subrev_i32_e32 v28, vcc, 64, v8 +; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v8 +; GISEL-NEXT: v_lshr_b64 v[16:17], v[6:7], v8 +; GISEL-NEXT: v_lshr_b64 v[20:21], v[4:5], v8 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_add_i32_e32 v26, vcc, -1, v12 +; GISEL-NEXT: v_addc_u32_e32 v27, vcc, -1, v13, vcc +; GISEL-NEXT: v_lshl_b64 v[22:23], v[6:7], v22 +; GISEL-NEXT: v_lshr_b64 v[6:7], v[6:7], v28 +; GISEL-NEXT: v_addc_u32_e32 v28, vcc, -1, v14, vcc +; GISEL-NEXT: v_addc_u32_e32 v29, vcc, -1, v15, vcc +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: v_or_b32_e32 v20, v20, v22 +; GISEL-NEXT: v_or_b32_e32 v21, v21, v23 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v20, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v21, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v5, vcc +; GISEL-NEXT: v_mov_b32_e32 v5, 0 +; GISEL-NEXT: v_mov_b32_e32 v23, s7 +; GISEL-NEXT: v_mov_b32_e32 v22, s6 +; GISEL-NEXT: v_mov_b32_e32 v21, s5 +; GISEL-NEXT: v_mov_b32_e32 v20, s4 +; GISEL-NEXT: .LBB1_9: ; %udiv-do-while +; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-NEXT: v_lshl_b64 v[22:23], v[6:7], 1 +; GISEL-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v4, 31, v7 +; GISEL-NEXT: v_lshrrev_b32_e32 v30, 31, v1 +; GISEL-NEXT: v_lshl_b64 v[6:7], v[9:10], 1 +; GISEL-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v9, 31, v10 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, -1, v8 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, -1, v11, vcc +; GISEL-NEXT: 
v_or_b32_e32 v16, v16, v4 +; GISEL-NEXT: v_or_b32_e32 v22, v22, v30 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v9 +; GISEL-NEXT: v_or_b32_e32 v9, v20, v6 +; GISEL-NEXT: v_or_b32_e32 v10, v21, v7 +; GISEL-NEXT: v_addc_u32_e32 v24, vcc, -1, v24, vcc +; GISEL-NEXT: v_addc_u32_e32 v25, vcc, -1, v25, vcc +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v26, v22 +; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v27, v23, vcc +; GISEL-NEXT: v_or_b32_e32 v6, v8, v24 +; GISEL-NEXT: v_or_b32_e32 v7, v11, v25 +; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v28, v16, vcc +; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v29, v17, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v4 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v4, 1, v6 +; GISEL-NEXT: v_and_b32_e32 v7, v6, v12 +; GISEL-NEXT: v_and_b32_e32 v30, v6, v13 +; GISEL-NEXT: v_and_b32_e32 v31, v6, v14 +; GISEL-NEXT: v_and_b32_e32 v32, v6, v15 +; GISEL-NEXT: v_mov_b32_e32 v21, v5 +; GISEL-NEXT: v_mov_b32_e32 v20, v4 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v22, v7 +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v23, v30, vcc +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v16, v31, vcc +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v17, v32, vcc +; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_cbranch_execnz .LBB1_9 +; GISEL-NEXT: ; %bb.10: ; %Flow +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB1_11: ; %Flow11 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: v_lshl_b64 v[4:5], v[9:10], 1 +; GISEL-NEXT: v_lshl_b64 v[8:9], v[0:1], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v10 +; GISEL-NEXT: v_or_b32_e32 v8, v8, v0 +; GISEL-NEXT: v_or_b32_e32 v10, v20, v4 +; GISEL-NEXT: v_or_b32_e32 v11, v21, v5 +; GISEL-NEXT: .LBB1_12: ; %Flow12 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: v_mov_b32_e32 v0, v18 +; GISEL-NEXT: v_mov_b32_e32 v1, v19 +; GISEL-NEXT: v_mov_b32_e32 v4, v10 +; GISEL-NEXT: v_mov_b32_e32 v5, v11 +; GISEL-NEXT: v_mov_b32_e32 v6, v8 +; GISEL-NEXT: v_mov_b32_e32 v7, v9 
+; GISEL-NEXT: s_setpc_b64 s[30:31] %shl = udiv <2 x i128> %lhs, %rhs ret <2 x i128> %shl } define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { +; SDAG-LABEL: v_srem_v2i128_vv: +; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SDAG-NEXT: v_ashrrev_i32_e32 v28, 31, v3 +; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v11 +; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f +; SDAG-NEXT: v_mov_b32_e32 v29, v28 +; SDAG-NEXT: v_xor_b32_e32 v18, v3, v28 +; SDAG-NEXT: v_xor_b32_e32 v19, v2, v28 +; SDAG-NEXT: v_xor_b32_e32 v1, v1, v28 +; SDAG-NEXT: v_xor_b32_e32 v0, v0, v28 +; SDAG-NEXT: v_xor_b32_e32 v11, v11, v16 +; SDAG-NEXT: v_xor_b32_e32 v10, v10, v16 +; SDAG-NEXT: v_xor_b32_e32 v20, v9, v16 +; SDAG-NEXT: v_xor_b32_e32 v9, v8, v16 +; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v0, v28 +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v1, v28, vcc +; SDAG-NEXT: v_ffbh_u32_e32 v1, v2 +; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v19, v28, vcc +; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v1 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v3 +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v18, v28, vcc +; SDAG-NEXT: v_or_b32_e32 v8, v2, v0 +; SDAG-NEXT: v_ffbh_u32_e32 v18, v0 +; SDAG-NEXT: v_min_u32_e32 v19, v19, v21 +; SDAG-NEXT: v_sub_i32_e32 v31, vcc, v9, v16 +; SDAG-NEXT: v_or_b32_e32 v9, v3, v1 +; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 32, v18 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v1 +; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 64, v19 +; SDAG-NEXT: v_addc_u32_e64 v22, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v30, vcc, v20, v16, vcc +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[8:9] +; SDAG-NEXT: v_ffbh_u32_e32 v9, v31 +; SDAG-NEXT: v_min_u32_e32 v18, v18, v21 +; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v22, 0, s[6:7] +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v10, v16, vcc +; SDAG-NEXT: v_add_i32_e64 v21, 
s[8:9], 32, v9 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v30 +; SDAG-NEXT: v_cndmask_b32_e64 v18, v19, v18, s[6:7] +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v11, v16, vcc +; SDAG-NEXT: v_or_b32_e32 v10, v31, v8 +; SDAG-NEXT: v_ffbh_u32_e32 v16, v8 +; SDAG-NEXT: v_min_u32_e32 v19, v21, v22 +; SDAG-NEXT: v_or_b32_e32 v11, v30, v9 +; SDAG-NEXT: v_add_i32_e32 v16, vcc, 32, v16 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v9 +; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v19 +; SDAG-NEXT: v_addc_u32_e64 v22, s[6:7], 0, 0, vcc +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; SDAG-NEXT: v_min_u32_e32 v10, v16, v21 +; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[8:9] +; SDAG-NEXT: v_cndmask_b32_e64 v11, v22, 0, s[6:7] +; SDAG-NEXT: s_or_b64 s[8:9], vcc, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v19, v10, s[6:7] +; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v10, v18 +; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v20, vcc +; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v10 +; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11] +; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v17, vcc +; SDAG-NEXT: v_or_b32_e32 v16, v16, v18 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v17, v11, v19 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] +; SDAG-NEXT: v_and_b32_e32 v16, 1, v20 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 +; SDAG-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v35, v1, 0, s[4:5] +; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; SDAG-NEXT: v_cndmask_b32_e64 v32, v0, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v27, v3, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v33, v2, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB2_6 +; SDAG-NEXT: ; 
%bb.1: ; %udiv-bb15 +; SDAG-NEXT: v_add_i32_e32 v32, vcc, 1, v10 +; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v10 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc +; SDAG-NEXT: v_lshl_b64 v[20:21], v[2:3], v20 +; SDAG-NEXT: v_addc_u32_e32 v34, vcc, 0, v18, vcc +; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v19, vcc +; SDAG-NEXT: v_or_b32_e32 v18, v32, v34 +; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v10 +; SDAG-NEXT: v_or_b32_e32 v19, v33, v35 +; SDAG-NEXT: v_lshl_b64 v[10:11], v[0:1], v24 +; SDAG-NEXT: v_sub_i32_e32 v25, vcc, 64, v24 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[2:3], v24 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_lshr_b64 v[18:19], v[2:3], v25 +; SDAG-NEXT: v_or_b32_e32 v11, v11, v19 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v18 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 +; SDAG-NEXT: v_cndmask_b32_e64 v11, v21, v11, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v20, v10, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 +; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v10, v0, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v18, 0 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB2_5 +; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 +; SDAG-NEXT: v_lshr_b64 v[16:17], v[2:3], v32 +; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 64, v32 +; SDAG-NEXT: v_subrev_i32_e32 v37, vcc, 64, v32 +; SDAG-NEXT: v_lshr_b64 v[24:25], v[0:1], v32 +; SDAG-NEXT: v_add_i32_e32 v36, vcc, -1, v31 +; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_mov_b32_e32 v23, 0 +; SDAG-NEXT: v_mov_b32_e32 v18, 0 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_lshl_b64 v[26:27], v[0:1], v26 +; SDAG-NEXT: v_lshr_b64 v[48:49], v[0:1], v37 +; 
SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v30, vcc +; SDAG-NEXT: v_or_b32_e32 v17, v17, v27 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v26 +; SDAG-NEXT: v_addc_u32_e32 v38, vcc, -1, v8, vcc +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v32 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v49, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v48, v16, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v27, 0, v25, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v26, 0, v24, s[4:5] +; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v9, vcc +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 +; SDAG-NEXT: v_cndmask_b32_e32 v25, v17, v3, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v24, v16, v2, vcc +; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: .LBB2_3: ; %udiv-do-while3 +; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v25 +; SDAG-NEXT: v_lshl_b64 v[24:25], v[24:25], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v48, 31, v11 +; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v49, 31, v21 +; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 +; SDAG-NEXT: v_or_b32_e32 v26, v26, v16 +; SDAG-NEXT: v_or_b32_e32 v24, v24, v48 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v49 +; SDAG-NEXT: v_or_b32_e32 v11, v19, v11 +; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v36, v24 +; SDAG-NEXT: v_or_b32_e32 v10, v18, v10 +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v37, v25, vcc +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v38, v26, vcc +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v39, v27, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v16 +; SDAG-NEXT: v_and_b32_e32 v48, v16, v31 +; SDAG-NEXT: v_and_b32_e32 v49, v16, v30 +; SDAG-NEXT: v_and_b32_e32 v50, v16, v8 +; SDAG-NEXT: v_and_b32_e32 v51, v16, v9 +; SDAG-NEXT: v_and_b32_e32 v16, 1, v16 +; SDAG-NEXT: v_sub_i32_e32 v24, vcc, v24, v48 +; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v25, v49, vcc +; SDAG-NEXT: v_subb_u32_e32 v26, vcc, v26, v50, vcc +; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v51, vcc +; SDAG-NEXT: v_add_i32_e32 v32, vcc, -1, v32 
+; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc +; SDAG-NEXT: v_addc_u32_e32 v34, vcc, -1, v34, vcc +; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v35, vcc +; SDAG-NEXT: v_or_b32_e32 v48, v32, v34 +; SDAG-NEXT: v_or_b32_e32 v49, v33, v35 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[48:49] +; SDAG-NEXT: v_or_b32_e32 v21, v23, v21 +; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: v_or_b32_e32 v20, v22, v20 +; SDAG-NEXT: v_mov_b32_e32 v23, v17 +; SDAG-NEXT: v_mov_b32_e32 v22, v16 +; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_cbranch_execnz .LBB2_3 +; SDAG-NEXT: ; %bb.4: ; %Flow13 +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: .LBB2_5: ; %Flow14 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v22, 31, v21 +; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v22 +; SDAG-NEXT: v_or_b32_e32 v35, v19, v11 +; SDAG-NEXT: v_or_b32_e32 v27, v17, v21 +; SDAG-NEXT: v_or_b32_e32 v32, v18, v10 +; SDAG-NEXT: v_or_b32_e32 v33, v16, v20 +; SDAG-NEXT: .LBB2_6: ; %Flow16 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: v_ashrrev_i32_e32 v26, 31, v7 +; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f +; SDAG-NEXT: v_mov_b32_e32 v34, v26 +; SDAG-NEXT: v_xor_b32_e32 v10, v7, v26 +; SDAG-NEXT: v_xor_b32_e32 v11, v6, v26 +; SDAG-NEXT: v_xor_b32_e32 v5, v5, v26 +; SDAG-NEXT: v_xor_b32_e32 v4, v4, v26 +; SDAG-NEXT: v_xor_b32_e32 v15, v15, v16 +; SDAG-NEXT: v_xor_b32_e32 v14, v14, v16 +; SDAG-NEXT: v_xor_b32_e32 v13, v13, v16 +; SDAG-NEXT: v_xor_b32_e32 v12, v12, v16 +; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v4, v26 +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v5, v26, vcc +; SDAG-NEXT: v_ffbh_u32_e32 v5, v6 +; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v11, v26, vcc +; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], 32, v5 +; SDAG-NEXT: v_ffbh_u32_e32 v18, v7 +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v10, v26, vcc +; 
SDAG-NEXT: v_or_b32_e32 v10, v6, v4 +; SDAG-NEXT: v_ffbh_u32_e32 v19, v4 +; SDAG-NEXT: v_min_u32_e32 v18, v11, v18 +; SDAG-NEXT: v_sub_i32_e32 v37, vcc, v12, v16 +; SDAG-NEXT: v_or_b32_e32 v11, v7, v5 +; SDAG-NEXT: v_add_i32_e64 v12, s[4:5], 32, v19 +; SDAG-NEXT: v_ffbh_u32_e32 v19, v5 +; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 64, v18 +; SDAG-NEXT: v_addc_u32_e64 v20, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v36, vcc, v13, v16, vcc +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] +; SDAG-NEXT: v_ffbh_u32_e32 v11, v37 +; SDAG-NEXT: v_min_u32_e32 v12, v12, v19 +; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v19, v20, 0, s[6:7] +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v14, v16, vcc +; SDAG-NEXT: v_add_i32_e64 v13, s[8:9], 32, v11 +; SDAG-NEXT: v_ffbh_u32_e32 v14, v36 +; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v12, s[6:7] +; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v15, v16, vcc +; SDAG-NEXT: v_or_b32_e32 v12, v37, v10 +; SDAG-NEXT: v_ffbh_u32_e32 v15, v10 +; SDAG-NEXT: v_min_u32_e32 v14, v13, v14 +; SDAG-NEXT: v_or_b32_e32 v13, v36, v11 +; SDAG-NEXT: v_add_i32_e32 v15, vcc, 32, v15 +; SDAG-NEXT: v_ffbh_u32_e32 v16, v11 +; SDAG-NEXT: v_add_i32_e32 v14, vcc, 64, v14 +; SDAG-NEXT: v_addc_u32_e64 v20, s[6:7], 0, 0, vcc +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13] +; SDAG-NEXT: v_min_u32_e32 v12, v15, v16 +; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[10:11] +; SDAG-NEXT: v_cndmask_b32_e64 v13, v20, 0, s[6:7] +; SDAG-NEXT: s_or_b64 s[8:9], vcc, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v12, v14, v12, s[6:7] +; SDAG-NEXT: v_sub_i32_e32 v12, vcc, v12, v18 +; SDAG-NEXT: v_subb_u32_e32 v13, vcc, v13, v19, vcc +; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v12 +; SDAG-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v17, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[12:13] +; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; SDAG-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v17, vcc +; SDAG-NEXT: v_or_b32_e32 v16, v16, v14 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, 
v[14:15] +; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v17, v13, v15 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] +; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] +; SDAG-NEXT: v_and_b32_e32 v16, 1, v18 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 +; SDAG-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v19, v5, 0, s[4:5] +; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; SDAG-NEXT: v_cndmask_b32_e64 v18, v4, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v17, v7, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v6, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB2_12 +; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 +; SDAG-NEXT: v_add_i32_e32 v38, vcc, 1, v12 +; SDAG-NEXT: v_sub_i32_e64 v18, s[4:5], 63, v12 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: v_addc_u32_e32 v39, vcc, 0, v13, vcc +; SDAG-NEXT: v_lshl_b64 v[18:19], v[6:7], v18 +; SDAG-NEXT: v_addc_u32_e32 v48, vcc, 0, v14, vcc +; SDAG-NEXT: v_addc_u32_e32 v49, vcc, 0, v15, vcc +; SDAG-NEXT: v_or_b32_e32 v13, v38, v48 +; SDAG-NEXT: v_sub_i32_e32 v15, vcc, 0x7f, v12 +; SDAG-NEXT: v_or_b32_e32 v14, v39, v49 +; SDAG-NEXT: v_lshl_b64 v[20:21], v[4:5], v15 +; SDAG-NEXT: v_sub_i32_e32 v12, vcc, 64, v15 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[6:7], v15 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[13:14] +; SDAG-NEXT: v_lshr_b64 v[12:13], v[6:7], v12 +; SDAG-NEXT: v_or_b32_e32 v13, v21, v13 +; SDAG-NEXT: v_or_b32_e32 v12, v20, v12 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v15 +; SDAG-NEXT: v_cndmask_b32_e64 v14, v19, v13, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v12, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v23, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v22, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; SDAG-NEXT: v_cndmask_b32_e64 v15, v14, v5, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v14, v18, 
v4, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v18, 0 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB2_11 +; SDAG-NEXT: ; %bb.8: ; %udiv-preheader +; SDAG-NEXT: v_lshr_b64 v[16:17], v[6:7], v38 +; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 64, v38 +; SDAG-NEXT: v_subrev_i32_e32 v51, vcc, 64, v38 +; SDAG-NEXT: v_lshr_b64 v[22:23], v[4:5], v38 +; SDAG-NEXT: v_add_i32_e32 v50, vcc, -1, v37 +; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_mov_b32_e32 v18, 0 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_lshl_b64 v[24:25], v[4:5], v24 +; SDAG-NEXT: v_lshr_b64 v[53:54], v[4:5], v51 +; SDAG-NEXT: v_addc_u32_e32 v51, vcc, -1, v36, vcc +; SDAG-NEXT: v_or_b32_e32 v17, v17, v25 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v24 +; SDAG-NEXT: v_addc_u32_e32 v52, vcc, -1, v10, vcc +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v38 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v54, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v53, v16, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v25, 0, v23, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v24, 0, v22, s[4:5] +; SDAG-NEXT: v_addc_u32_e32 v53, vcc, -1, v11, vcc +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v38 +; SDAG-NEXT: v_cndmask_b32_e32 v23, v17, v7, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v22, v16, v6, vcc +; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: .LBB2_9: ; %udiv-do-while +; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-NEXT: v_lshl_b64 v[24:25], v[24:25], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v23 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v54, 31, v15 +; SDAG-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v55, 31, v13 +; SDAG-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 +; SDAG-NEXT: v_or_b32_e32 v24, v24, v16 +; SDAG-NEXT: v_or_b32_e32 v22, v22, v54 +; SDAG-NEXT: v_or_b32_e32 v14, v14, v55 +; SDAG-NEXT: v_or_b32_e32 v15, 
v19, v15 +; SDAG-NEXT: v_or_b32_e32 v13, v21, v13 +; SDAG-NEXT: v_or_b32_e32 v14, v18, v14 +; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v50, v22 +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v51, v23, vcc +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v52, v24, vcc +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v53, v25, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v21, 31, v16 +; SDAG-NEXT: v_and_b32_e32 v16, 1, v21 +; SDAG-NEXT: v_and_b32_e32 v54, v21, v11 +; SDAG-NEXT: v_and_b32_e32 v55, v21, v10 +; SDAG-NEXT: v_and_b32_e32 v40, v21, v36 +; SDAG-NEXT: v_and_b32_e32 v21, v21, v37 +; SDAG-NEXT: v_sub_i32_e32 v22, vcc, v22, v21 +; SDAG-NEXT: v_subb_u32_e32 v23, vcc, v23, v40, vcc +; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v24, v55, vcc +; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v25, v54, vcc +; SDAG-NEXT: v_add_i32_e32 v38, vcc, -1, v38 +; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v39, vcc +; SDAG-NEXT: v_addc_u32_e32 v48, vcc, -1, v48, vcc +; SDAG-NEXT: v_addc_u32_e32 v49, vcc, -1, v49, vcc +; SDAG-NEXT: v_or_b32_e32 v55, v39, v49 +; SDAG-NEXT: v_or_b32_e32 v54, v38, v48 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[54:55] +; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: v_or_b32_e32 v12, v20, v12 +; SDAG-NEXT: v_mov_b32_e32 v21, v17 +; SDAG-NEXT: v_mov_b32_e32 v20, v16 +; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_cbranch_execnz .LBB2_9 +; SDAG-NEXT: ; %bb.10: ; %Flow +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: .LBB2_11: ; %Flow11 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v13 +; SDAG-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 +; SDAG-NEXT: v_or_b32_e32 v14, v14, v20 +; SDAG-NEXT: v_or_b32_e32 v19, v19, v15 +; SDAG-NEXT: v_or_b32_e32 v17, v17, v13 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v14 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v12 +; SDAG-NEXT: .LBB2_12: ; %Flow12 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: v_mul_lo_u32 v14, v33, v9 +; SDAG-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v33, 
v8, 0 +; SDAG-NEXT: v_mul_lo_u32 v24, v27, v8 +; SDAG-NEXT: v_mul_lo_u32 v25, v35, v31 +; SDAG-NEXT: v_mul_lo_u32 v35, v32, v30 +; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v33, 0 +; SDAG-NEXT: v_mov_b32_e32 v15, 0 +; SDAG-NEXT: v_mul_lo_u32 v38, v16, v11 +; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v10, 0 +; SDAG-NEXT: v_mul_lo_u32 v39, v17, v10 +; SDAG-NEXT: v_mul_lo_u32 v19, v19, v37 +; SDAG-NEXT: v_mul_lo_u32 v48, v18, v36 +; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v37, v16, 0 +; SDAG-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; SDAG-NEXT: v_mov_b32_e32 v14, v9 +; SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v30, v33, v[14:15] +; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 +; SDAG-NEXT: v_add_i32_e64 v14, s[4:5], v21, v38 +; SDAG-NEXT: v_add_i32_e64 v13, s[4:5], v13, v24 +; SDAG-NEXT: v_mov_b32_e32 v24, v23 +; SDAG-NEXT: v_mov_b32_e32 v23, v15 +; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v27, v[22:23] +; SDAG-NEXT: v_xor_b32_e32 v33, v2, v28 +; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v14, v39 +; SDAG-NEXT: v_mov_b32_e32 v14, v11 +; SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v36, v16, v[14:15] +; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v32, v31, v[12:13] +; SDAG-NEXT: v_mov_b32_e32 v2, v9 +; SDAG-NEXT: v_add_i32_e64 v13, s[4:5], v24, v2 +; SDAG-NEXT: v_addc_u32_e64 v14, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v2, v8 +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v3, v2, vcc +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v18, v37, v[20:21] +; SDAG-NEXT: v_mov_b32_e32 v18, v23 +; SDAG-NEXT: v_mov_b32_e32 v23, v15 +; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v37, v17, v[22:23] +; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], v25, v12 +; SDAG-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v30, v27, v[13:14] +; SDAG-NEXT: v_xor_b32_e32 v16, v16, v29 +; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v19, v3 +; SDAG-NEXT: v_add_i32_e64 v14, s[4:5], v18, v9 +; SDAG-NEXT: v_addc_u32_e64 v15, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v18, v8 +; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 
v35, v20 +; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v48, v3 +; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v36, v17, v[14:15] +; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 +; SDAG-NEXT: v_addc_u32_e64 v12, s[4:5], v13, v19, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v0, v11, vcc +; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v8, v2 +; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v3, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc +; SDAG-NEXT: v_xor_b32_e32 v2, v0, v28 +; SDAG-NEXT: v_xor_b32_e32 v3, v1, v29 +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v33, v28 +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v16, v29, vcc +; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v28, vcc +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v29, vcc +; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v6, v10 +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v18, vcc +; SDAG-NEXT: v_xor_b32_e32 v6, v6, v26 +; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v4, v8, vcc +; SDAG-NEXT: v_xor_b32_e32 v7, v7, v34 +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v9, vcc +; SDAG-NEXT: v_xor_b32_e32 v8, v4, v26 +; SDAG-NEXT: v_xor_b32_e32 v9, v5, v34 +; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v6, v26 +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v7, v34, vcc +; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v8, v26, vcc +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v9, v34, vcc +; SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_srem_v2i128_vv: +; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b64 s[8:9], 0 +; GISEL-NEXT: v_ashrrev_i32_e32 v28, 31, v3 +; GISEL-NEXT: v_ashrrev_i32_e32 v20, 31, v11 +; GISEL-NEXT: v_mov_b32_e32 v18, 0x7f +; GISEL-NEXT: v_mov_b32_e32 v19, 0 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v28 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v28 +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v28 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v28 +; GISEL-NEXT: v_xor_b32_e32 v8, v8, v20 +; GISEL-NEXT: 
v_xor_b32_e32 v9, v9, v20 +; GISEL-NEXT: v_xor_b32_e32 v10, v10, v20 +; GISEL-NEXT: v_xor_b32_e32 v11, v11, v20 +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v0, v28 +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v1, v28, vcc +; GISEL-NEXT: v_sub_i32_e64 v30, s[4:5], v8, v20 +; GISEL-NEXT: v_subb_u32_e64 v29, s[4:5], v9, v20, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v2, v28, vcc +; GISEL-NEXT: v_subb_u32_e32 v9, vcc, v3, v28, vcc +; GISEL-NEXT: v_subb_u32_e64 v10, vcc, v10, v20, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v11, v20, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v20, v29 +; GISEL-NEXT: v_ffbh_u32_e32 v21, v30 +; GISEL-NEXT: v_ffbh_u32_e32 v22, v17 +; GISEL-NEXT: v_ffbh_u32_e32 v23, v16 +; GISEL-NEXT: v_or_b32_e32 v0, v30, v10 +; GISEL-NEXT: v_or_b32_e32 v1, v29, v11 +; GISEL-NEXT: v_or_b32_e32 v2, v16, v8 +; GISEL-NEXT: v_or_b32_e32 v3, v17, v9 +; GISEL-NEXT: v_add_i32_e32 v21, vcc, 32, v21 +; GISEL-NEXT: v_ffbh_u32_e32 v24, v11 +; GISEL-NEXT: v_ffbh_u32_e32 v25, v10 +; GISEL-NEXT: v_add_i32_e32 v23, vcc, 32, v23 +; GISEL-NEXT: v_ffbh_u32_e32 v26, v9 +; GISEL-NEXT: v_ffbh_u32_e32 v27, v8 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] +; GISEL-NEXT: v_min_u32_e32 v0, v20, v21 +; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v25 +; GISEL-NEXT: v_min_u32_e32 v2, v22, v23 +; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v27 +; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 64, v0 +; GISEL-NEXT: v_min_u32_e32 v1, v24, v1 +; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 64, v2 +; GISEL-NEXT: v_min_u32_e32 v3, v26, v3 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v2, s[4:5], 0, 0, s[4:5] 
+; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[18:19] +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, 0x7f, v0 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v18, v18, v2 +; GISEL-NEXT: v_or_b32_e32 v19, v1, v3 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_cndmask_b32_e32 v21, v22, v21, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v19, v20, v21 +; GISEL-NEXT: v_and_b32_e32 v20, 1, v19 +; GISEL-NEXT: v_or_b32_e32 v18, v19, v18 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v31, v16, 0, vcc +; GISEL-NEXT: v_and_b32_e32 v20, 1, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v32, v17, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v18, v8, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v19, v9, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 +; GISEL-NEXT: v_add_i32_e32 v31, vcc, 1, v0 +; GISEL-NEXT: v_addc_u32_e64 v32, s[4:5], 0, v1, vcc +; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v0 +; GISEL-NEXT: v_addc_u32_e64 v33, vcc, 0, v2, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v34, vcc, 0, v3, vcc +; GISEL-NEXT: v_subrev_i32_e64 v20, s[4:5], 64, v24 +; GISEL-NEXT: v_sub_i32_e64 v18, s[4:5], 64, v24 +; GISEL-NEXT: v_lshl_b64 v[0:1], v[16:17], v24 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[8:9], v24 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: v_lshr_b64 v[18:19], v[16:17], v18 +; GISEL-NEXT: v_lshl_b64 v[22:23], v[16:17], v20 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 +; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc +; GISEL-NEXT: v_or_b32_e32 v0, v18, v2 +; GISEL-NEXT: v_or_b32_e32 v1, 
v19, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 +; GISEL-NEXT: v_cndmask_b32_e32 v18, v0, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v19, v1, v9, vcc +; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] +; GISEL-NEXT: v_mov_b32_e32 v0, s8 +; GISEL-NEXT: v_mov_b32_e32 v1, s9 +; GISEL-NEXT: v_mov_b32_e32 v2, s10 +; GISEL-NEXT: v_mov_b32_e32 v3, s11 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 +; GISEL-NEXT: v_subrev_i32_e32 v24, vcc, 64, v31 +; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v31 +; GISEL-NEXT: v_lshr_b64 v[0:1], v[8:9], v31 +; GISEL-NEXT: v_lshr_b64 v[2:3], v[16:17], v31 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_add_i32_e32 v35, vcc, -1, v30 +; GISEL-NEXT: v_addc_u32_e32 v36, vcc, -1, v29, vcc +; GISEL-NEXT: v_lshl_b64 v[22:23], v[8:9], v22 +; GISEL-NEXT: v_lshr_b64 v[24:25], v[8:9], v24 +; GISEL-NEXT: v_addc_u32_e32 v37, vcc, -1, v10, vcc +; GISEL-NEXT: v_addc_u32_e32 v38, vcc, -1, v11, vcc +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: v_or_b32_e32 v2, v2, v22 +; GISEL-NEXT: v_or_b32_e32 v3, v3, v23 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v31 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v24, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v25, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v27, 0, v1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 +; GISEL-NEXT: v_cndmask_b32_e32 v24, v2, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v25, v3, v17, vcc +; GISEL-NEXT: v_mov_b32_e32 v23, 0 +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s7 +; GISEL-NEXT: .LBB2_3: ; %udiv-do-while3 +; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 +; GISEL-NEXT: 
v_lshrrev_b32_e32 v22, 31, v21 +; GISEL-NEXT: v_lshl_b64 v[48:49], v[24:25], 1 +; GISEL-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v24, 31, v25 +; GISEL-NEXT: v_lshrrev_b32_e32 v25, 31, v19 +; GISEL-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 +; GISEL-NEXT: v_add_i32_e32 v31, vcc, -1, v31 +; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc +; GISEL-NEXT: v_or_b32_e32 v20, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v21, v1, v3 +; GISEL-NEXT: v_or_b32_e32 v2, v26, v24 +; GISEL-NEXT: v_or_b32_e32 v3, v48, v25 +; GISEL-NEXT: v_or_b32_e32 v18, v18, v22 +; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc +; GISEL-NEXT: v_addc_u32_e32 v34, vcc, -1, v34, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v35, v3 +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v36, v49, vcc +; GISEL-NEXT: v_or_b32_e32 v0, v31, v33 +; GISEL-NEXT: v_or_b32_e32 v1, v32, v34 +; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v37, v2, vcc +; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v38, v27, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v22 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v22, 1, v0 +; GISEL-NEXT: v_and_b32_e32 v1, v0, v30 +; GISEL-NEXT: v_and_b32_e32 v25, v0, v29 +; GISEL-NEXT: v_and_b32_e32 v26, v0, v10 +; GISEL-NEXT: v_and_b32_e32 v0, v0, v11 +; GISEL-NEXT: v_sub_i32_e32 v24, vcc, v3, v1 +; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v49, v25, vcc +; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v2, v26, vcc +; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v0, vcc +; GISEL-NEXT: v_mov_b32_e32 v0, v22 +; GISEL-NEXT: v_mov_b32_e32 v1, v23 +; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_cbranch_execnz .LBB2_3 +; GISEL-NEXT: ; %bb.4: ; %Flow13 +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB2_5: ; %Flow14 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 +; GISEL-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v20, 31, v21 +; GISEL-NEXT: v_or_b32_e32 v18, 
v18, v20 +; GISEL-NEXT: v_or_b32_e32 v31, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v32, v1, v3 +; GISEL-NEXT: .LBB2_6: ; %Flow16 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: s_mov_b64 s[8:9], 0 +; GISEL-NEXT: v_ashrrev_i32_e32 v33, 31, v7 +; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v15 +; GISEL-NEXT: v_mov_b32_e32 v2, 0x7f +; GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GISEL-NEXT: v_xor_b32_e32 v1, v4, v33 +; GISEL-NEXT: v_xor_b32_e32 v4, v5, v33 +; GISEL-NEXT: v_xor_b32_e32 v5, v6, v33 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v33 +; GISEL-NEXT: v_xor_b32_e32 v6, v12, v0 +; GISEL-NEXT: v_xor_b32_e32 v20, v13, v0 +; GISEL-NEXT: v_xor_b32_e32 v14, v14, v0 +; GISEL-NEXT: v_xor_b32_e32 v15, v15, v0 +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v1, v33 +; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v4, v33, vcc +; GISEL-NEXT: v_sub_i32_e64 v35, s[4:5], v6, v0 +; GISEL-NEXT: v_subb_u32_e64 v34, s[4:5], v20, v0, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v5, v33, vcc +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v33, vcc +; GISEL-NEXT: v_subb_u32_e64 v4, vcc, v14, v0, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v15, v0, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v20, v34 +; GISEL-NEXT: v_ffbh_u32_e32 v21, v35 +; GISEL-NEXT: v_ffbh_u32_e32 v22, v13 +; GISEL-NEXT: v_ffbh_u32_e32 v23, v12 +; GISEL-NEXT: v_or_b32_e32 v0, v35, v4 +; GISEL-NEXT: v_or_b32_e32 v1, v34, v5 +; GISEL-NEXT: v_or_b32_e32 v14, v12, v6 +; GISEL-NEXT: v_or_b32_e32 v15, v13, v7 +; GISEL-NEXT: v_add_i32_e32 v21, vcc, 32, v21 +; GISEL-NEXT: v_ffbh_u32_e32 v24, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v25, v4 +; GISEL-NEXT: v_add_i32_e32 v23, vcc, 32, v23 +; GISEL-NEXT: v_ffbh_u32_e32 v26, v7 +; GISEL-NEXT: v_ffbh_u32_e32 v27, v6 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[14:15] +; GISEL-NEXT: v_min_u32_e32 v0, v20, v21 +; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v25 +; GISEL-NEXT: v_min_u32_e32 v14, v22, v23 +; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], 32, v27 +; GISEL-NEXT: v_add_i32_e64 v0, 
s[6:7], 64, v0 +; GISEL-NEXT: v_min_u32_e32 v1, v24, v1 +; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], 64, v14 +; GISEL-NEXT: v_min_u32_e32 v15, v26, v15 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v14, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, 0x7f, v0 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[14:15] +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v2, v2, v14 +; GISEL-NEXT: v_or_b32_e32 v3, v1, v15 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] +; GISEL-NEXT: v_cndmask_b32_e32 v21, v22, v21, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v3, v20, v21 +; GISEL-NEXT: v_and_b32_e32 v20, 1, v3 +; GISEL-NEXT: v_or_b32_e32 v2, v3, v2 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v20, v12, 0, vcc +; GISEL-NEXT: v_and_b32_e32 v22, 1, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v21, v13, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v7, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB2_12 +; GISEL-NEXT: ; %bb.7: ; %udiv-bb1 +; GISEL-NEXT: v_add_i32_e32 v36, vcc, 1, v0 +; GISEL-NEXT: v_addc_u32_e64 v37, s[4:5], 0, v1, vcc +; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v0 +; GISEL-NEXT: v_addc_u32_e64 v38, vcc, 0, v14, s[4:5] +; GISEL-NEXT: 
v_addc_u32_e32 v39, vcc, 0, v15, vcc +; GISEL-NEXT: v_subrev_i32_e64 v20, s[4:5], 64, v24 +; GISEL-NEXT: v_sub_i32_e64 v14, s[4:5], 64, v24 +; GISEL-NEXT: v_lshl_b64 v[0:1], v[12:13], v24 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[6:7], v24 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: v_lshr_b64 v[14:15], v[12:13], v14 +; GISEL-NEXT: v_lshl_b64 v[22:23], v[12:13], v20 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 +; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc +; GISEL-NEXT: v_or_b32_e32 v0, v14, v2 +; GISEL-NEXT: v_or_b32_e32 v1, v15, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v0, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v15, v1, v7, vcc +; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] +; GISEL-NEXT: v_mov_b32_e32 v0, s8 +; GISEL-NEXT: v_mov_b32_e32 v1, s9 +; GISEL-NEXT: v_mov_b32_e32 v2, s10 +; GISEL-NEXT: v_mov_b32_e32 v3, s11 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB2_11 +; GISEL-NEXT: ; %bb.8: ; %udiv-preheader +; GISEL-NEXT: v_subrev_i32_e32 v24, vcc, 64, v36 +; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v36 +; GISEL-NEXT: v_lshr_b64 v[0:1], v[6:7], v36 +; GISEL-NEXT: v_lshr_b64 v[2:3], v[12:13], v36 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_add_i32_e32 v48, vcc, -1, v35 +; GISEL-NEXT: v_addc_u32_e32 v49, vcc, -1, v34, vcc +; GISEL-NEXT: v_lshl_b64 v[22:23], v[6:7], v22 +; GISEL-NEXT: v_lshr_b64 v[24:25], v[6:7], v24 +; GISEL-NEXT: v_addc_u32_e32 v50, vcc, -1, v4, vcc +; GISEL-NEXT: v_addc_u32_e32 v51, vcc, -1, v5, vcc +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: v_or_b32_e32 v2, v2, v22 +; GISEL-NEXT: v_or_b32_e32 v3, v3, v23 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v36 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v24, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v25, v3, vcc 
+; GISEL-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v27, 0, v1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v36 +; GISEL-NEXT: v_cndmask_b32_e32 v24, v2, v12, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v25, v3, v13, vcc +; GISEL-NEXT: v_mov_b32_e32 v23, 0 +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s7 +; GISEL-NEXT: .LBB2_9: ; %udiv-do-while +; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v21 +; GISEL-NEXT: v_lshl_b64 v[52:53], v[24:25], 1 +; GISEL-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v24, 31, v25 +; GISEL-NEXT: v_lshrrev_b32_e32 v25, 31, v15 +; GISEL-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 +; GISEL-NEXT: v_add_i32_e32 v36, vcc, -1, v36 +; GISEL-NEXT: v_addc_u32_e32 v37, vcc, -1, v37, vcc +; GISEL-NEXT: v_or_b32_e32 v20, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v21, v1, v3 +; GISEL-NEXT: v_or_b32_e32 v2, v26, v24 +; GISEL-NEXT: v_or_b32_e32 v3, v52, v25 +; GISEL-NEXT: v_or_b32_e32 v14, v14, v22 +; GISEL-NEXT: v_addc_u32_e32 v38, vcc, -1, v38, vcc +; GISEL-NEXT: v_addc_u32_e32 v39, vcc, -1, v39, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v48, v3 +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v49, v53, vcc +; GISEL-NEXT: v_or_b32_e32 v0, v36, v38 +; GISEL-NEXT: v_or_b32_e32 v1, v37, v39 +; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v50, v2, vcc +; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v51, v27, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v22 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v22, 1, v0 +; GISEL-NEXT: v_and_b32_e32 v1, v0, v35 +; GISEL-NEXT: v_and_b32_e32 v25, v0, v34 +; GISEL-NEXT: v_and_b32_e32 v26, v0, v4 +; GISEL-NEXT: v_and_b32_e32 v52, v0, v5 +; GISEL-NEXT: v_sub_i32_e32 v24, vcc, v3, v1 +; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v53, v25, vcc +; GISEL-NEXT: 
v_mov_b32_e32 v0, v22 +; GISEL-NEXT: v_mov_b32_e32 v1, v23 +; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v2, v26, vcc +; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v52, vcc +; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_cbranch_execnz .LBB2_9 +; GISEL-NEXT: ; %bb.10: ; %Flow +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB2_11: ; %Flow11 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: v_lshl_b64 v[22:23], v[20:21], 1 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[14:15], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v14, 31, v21 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v14 +; GISEL-NEXT: v_or_b32_e32 v20, v0, v22 +; GISEL-NEXT: v_or_b32_e32 v21, v1, v23 +; GISEL-NEXT: .LBB2_12: ; %Flow12 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v30, v31, 0 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v30, v18, 0 +; GISEL-NEXT: v_mul_lo_u32 v24, v30, v19 +; GISEL-NEXT: v_mul_lo_u32 v25, v29, v18 +; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v35, v20, 0 +; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v2, 0 +; GISEL-NEXT: v_mul_lo_u32 v26, v35, v3 +; GISEL-NEXT: v_mul_lo_u32 v27, v34, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v29, v32, v[14:15] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v34, v21, v[22:23] +; GISEL-NEXT: v_mov_b32_e32 v22, v19 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v31, v[2:3] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v4, v20, v[14:15] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v30, v32, v[1:2] +; GISEL-NEXT: v_mov_b32_e32 v23, v14 +; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v21, v[22:23] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v29, v31, v[1:2] +; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], v3, v24, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v34, v20, v[22:23] +; GISEL-NEXT: v_addc_u32_e64 v14, s[6:7], v15, v26, s[6:7] +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v25, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v17, v1, 
vcc +; GISEL-NEXT: v_xor_b32_e32 v15, v0, v28 +; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v14, v27, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v18 +; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v13, v22, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v16, v12, v33 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v10, v32, v[3:4] +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v28 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v4, v21, v[0:1] +; GISEL-NEXT: v_xor_b32_e32 v14, v14, v33 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v11, v31, v[12:13] +; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v15, v28 +; GISEL-NEXT: v_subb_u32_e64 v1, s[6:7], v1, v28, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v5, v20, v[3:4] +; GISEL-NEXT: v_sub_i32_e64 v4, s[8:9], v16, v33 +; GISEL-NEXT: v_subb_u32_e64 v5, s[8:9], v14, v33, s[8:9] +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v8, v2, vcc +; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v9, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v28 +; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v23, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v33 +; GISEL-NEXT: v_xor_b32_e32 v7, v8, v28 +; GISEL-NEXT: v_xor_b32_e32 v8, v3, v33 +; GISEL-NEXT: v_subb_u32_e64 v2, vcc, v2, v28, s[6:7] +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v7, v28, vcc +; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v33, s[8:9] +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v8, v33, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %shl = srem <2 x i128> %lhs, %rhs ret <2 x i128> %shl } define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { +; SDAG-LABEL: v_urem_v2i128_vv: +; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_or_b32_e32 v17, v9, v11 +; SDAG-NEXT: v_or_b32_e32 v16, v8, v10 +; SDAG-NEXT: v_or_b32_e32 v19, v1, v3 +; SDAG-NEXT: v_or_b32_e32 v18, v0, v2 +; SDAG-NEXT: v_ffbh_u32_e32 v20, v10 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v11 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v8 +; SDAG-NEXT: 
v_ffbh_u32_e32 v23, v9 +; SDAG-NEXT: v_ffbh_u32_e32 v24, v2 +; SDAG-NEXT: v_ffbh_u32_e32 v25, v3 +; SDAG-NEXT: v_ffbh_u32_e32 v26, v0 +; SDAG-NEXT: v_ffbh_u32_e32 v27, v1 +; SDAG-NEXT: v_mov_b32_e32 v28, 0 +; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] +; SDAG-NEXT: v_add_i32_e64 v16, s[6:7], 32, v20 +; SDAG-NEXT: v_add_i32_e64 v17, s[6:7], 32, v22 +; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 32, v24 +; SDAG-NEXT: v_add_i32_e64 v19, s[6:7], 32, v26 +; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] +; SDAG-NEXT: v_min_u32_e32 v16, v16, v21 +; SDAG-NEXT: v_min_u32_e32 v17, v17, v23 +; SDAG-NEXT: v_min_u32_e32 v18, v18, v25 +; SDAG-NEXT: v_min_u32_e32 v19, v19, v27 +; SDAG-NEXT: v_add_i32_e32 v17, vcc, 64, v17 +; SDAG-NEXT: v_addc_u32_e64 v20, s[4:5], 0, 0, vcc +; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v19 +; SDAG-NEXT: v_addc_u32_e64 v21, s[4:5], 0, 0, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v20, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc +; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v18 +; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v20, v17, vcc +; SDAG-NEXT: v_xor_b32_e32 v18, 0x7f, v16 +; SDAG-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v28, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17] +; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] +; SDAG-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v28, vcc +; SDAG-NEXT: v_or_b32_e32 v18, v18, v20 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] +; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v19, v17, v21 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] +; SDAG-NEXT: v_cndmask_b32_e32 v22, v23, v22, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_and_b32_e32 v18, 1, v22 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 
1, v18 +; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v33, v3, 0, s[4:5] +; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; SDAG-NEXT: v_cndmask_b32_e64 v31, v2, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v30, v1, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v32, v0, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB3_6 +; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 +; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v16 +; SDAG-NEXT: v_sub_i32_e64 v22, s[4:5], 63, v16 +; SDAG-NEXT: v_mov_b32_e32 v18, 0 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v17, vcc +; SDAG-NEXT: v_lshl_b64 v[22:23], v[0:1], v22 +; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v20, vcc +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v21, vcc +; SDAG-NEXT: v_or_b32_e32 v20, v30, v32 +; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v16 +; SDAG-NEXT: v_or_b32_e32 v21, v31, v33 +; SDAG-NEXT: v_lshl_b64 v[16:17], v[2:3], v26 +; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v26 +; SDAG-NEXT: v_lshl_b64 v[24:25], v[0:1], v26 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] +; SDAG-NEXT: v_lshr_b64 v[20:21], v[0:1], v27 +; SDAG-NEXT: v_or_b32_e32 v17, v17, v21 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v20 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v23, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v22, v16, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, v25, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, v24, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v2, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB3_5 +; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 +; SDAG-NEXT: v_lshr_b64 v[18:19], v[0:1], v30 +; SDAG-NEXT: 
v_sub_i32_e32 v28, vcc, 64, v30 +; SDAG-NEXT: v_subrev_i32_e32 v35, vcc, 64, v30 +; SDAG-NEXT: v_lshr_b64 v[26:27], v[2:3], v30 +; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v8 +; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: v_mov_b32_e32 v24, 0 +; SDAG-NEXT: v_mov_b32_e32 v25, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_lshl_b64 v[28:29], v[2:3], v28 +; SDAG-NEXT: v_lshr_b64 v[37:38], v[2:3], v35 +; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v9, vcc +; SDAG-NEXT: v_or_b32_e32 v19, v19, v29 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v28 +; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v10, vcc +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 +; SDAG-NEXT: v_cndmask_b32_e64 v19, v38, v19, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v37, v18, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v29, 0, v27, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v28, 0, v26, s[4:5] +; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v11, vcc +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 +; SDAG-NEXT: v_cndmask_b32_e32 v27, v19, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v26, v18, v0, vcc +; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: .LBB3_3: ; %udiv-do-while3 +; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-NEXT: v_lshl_b64 v[28:29], v[28:29], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v18, 31, v27 +; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v17 +; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v23 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 +; SDAG-NEXT: v_or_b32_e32 v28, v28, v18 +; SDAG-NEXT: v_or_b32_e32 v26, v26, v38 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v39 +; SDAG-NEXT: v_or_b32_e32 v17, v21, v17 +; SDAG-NEXT: v_sub_i32_e32 v18, vcc, v34, v26 +; SDAG-NEXT: v_or_b32_e32 v16, v20, v16 +; SDAG-NEXT: v_subb_u32_e32 v18, vcc, v35, v27, vcc +; SDAG-NEXT: v_subb_u32_e32 v18, vcc, v36, v28, vcc +; SDAG-NEXT: v_subb_u32_e32 v18, vcc, v37, v29, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v38, 31, v18 +; 
SDAG-NEXT: v_and_b32_e32 v39, v38, v8 +; SDAG-NEXT: v_and_b32_e32 v48, v38, v9 +; SDAG-NEXT: v_and_b32_e32 v49, v38, v10 +; SDAG-NEXT: v_and_b32_e32 v18, 1, v38 +; SDAG-NEXT: v_and_b32_e32 v38, v38, v11 +; SDAG-NEXT: v_sub_i32_e32 v26, vcc, v26, v39 +; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v48, vcc +; SDAG-NEXT: v_subb_u32_e32 v28, vcc, v28, v49, vcc +; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v29, v38, vcc +; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30 +; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc +; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc +; SDAG-NEXT: v_or_b32_e32 v38, v30, v32 +; SDAG-NEXT: v_or_b32_e32 v39, v31, v33 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[38:39] +; SDAG-NEXT: v_or_b32_e32 v23, v25, v23 +; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: v_or_b32_e32 v22, v24, v22 +; SDAG-NEXT: v_mov_b32_e32 v25, v19 +; SDAG-NEXT: v_mov_b32_e32 v24, v18 +; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_cbranch_execnz .LBB3_3 +; SDAG-NEXT: ; %bb.4: ; %Flow13 +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: .LBB3_5: ; %Flow14 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v23 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v24 +; SDAG-NEXT: v_or_b32_e32 v33, v21, v17 +; SDAG-NEXT: v_or_b32_e32 v30, v19, v23 +; SDAG-NEXT: v_or_b32_e32 v31, v20, v16 +; SDAG-NEXT: v_or_b32_e32 v32, v18, v22 +; SDAG-NEXT: .LBB3_6: ; %Flow16 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: v_or_b32_e32 v17, v13, v15 +; SDAG-NEXT: v_or_b32_e32 v16, v12, v14 +; SDAG-NEXT: v_or_b32_e32 v19, v5, v7 +; SDAG-NEXT: v_or_b32_e32 v18, v4, v6 +; SDAG-NEXT: v_ffbh_u32_e32 v20, v14 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v15 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v12 +; SDAG-NEXT: v_ffbh_u32_e32 v23, v13 +; SDAG-NEXT: v_ffbh_u32_e32 v24, v6 +; SDAG-NEXT: v_ffbh_u32_e32 v25, v7 +; 
SDAG-NEXT: v_ffbh_u32_e32 v26, v4 +; SDAG-NEXT: v_ffbh_u32_e32 v27, v5 +; SDAG-NEXT: v_mov_b32_e32 v28, 0 +; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] +; SDAG-NEXT: v_add_i32_e64 v16, s[6:7], 32, v20 +; SDAG-NEXT: v_add_i32_e64 v17, s[6:7], 32, v22 +; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 32, v24 +; SDAG-NEXT: v_add_i32_e64 v19, s[6:7], 32, v26 +; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] +; SDAG-NEXT: v_min_u32_e32 v16, v16, v21 +; SDAG-NEXT: v_min_u32_e32 v17, v17, v23 +; SDAG-NEXT: v_min_u32_e32 v18, v18, v25 +; SDAG-NEXT: v_min_u32_e32 v19, v19, v27 +; SDAG-NEXT: v_add_i32_e32 v17, vcc, 64, v17 +; SDAG-NEXT: v_addc_u32_e64 v20, s[4:5], 0, 0, vcc +; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v19 +; SDAG-NEXT: v_addc_u32_e64 v21, s[4:5], 0, 0, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v20, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc +; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v18 +; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v20, v17, vcc +; SDAG-NEXT: v_xor_b32_e32 v20, 0x7f, v16 +; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v28, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17] +; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] +; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v28, vcc +; SDAG-NEXT: v_or_b32_e32 v20, v20, v18 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v21, v17, v19 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_cndmask_b32_e32 v22, v23, v22, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] +; SDAG-NEXT: v_and_b32_e32 v20, 1, v22 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v20 +; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v23, v7, 0, 
s[4:5] +; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; SDAG-NEXT: v_cndmask_b32_e64 v22, v6, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v21, v5, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v4, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB3_12 +; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 +; SDAG-NEXT: v_add_i32_e32 v34, vcc, 1, v16 +; SDAG-NEXT: v_sub_i32_e64 v22, s[4:5], 63, v16 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v17, vcc +; SDAG-NEXT: v_lshl_b64 v[22:23], v[4:5], v22 +; SDAG-NEXT: v_addc_u32_e32 v36, vcc, 0, v18, vcc +; SDAG-NEXT: v_addc_u32_e32 v37, vcc, 0, v19, vcc +; SDAG-NEXT: v_or_b32_e32 v17, v34, v36 +; SDAG-NEXT: v_sub_i32_e32 v19, vcc, 0x7f, v16 +; SDAG-NEXT: v_or_b32_e32 v18, v35, v37 +; SDAG-NEXT: v_lshl_b64 v[24:25], v[6:7], v19 +; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 64, v19 +; SDAG-NEXT: v_lshl_b64 v[26:27], v[4:5], v19 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[17:18] +; SDAG-NEXT: v_lshr_b64 v[16:17], v[4:5], v16 +; SDAG-NEXT: v_or_b32_e32 v17, v25, v17 +; SDAG-NEXT: v_or_b32_e32 v16, v24, v16 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v19 +; SDAG-NEXT: v_cndmask_b32_e64 v18, v23, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v22, v22, v16, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v17, 0, v27, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, v26, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; SDAG-NEXT: v_cndmask_b32_e64 v19, v18, v7, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v22, v6, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_mov_b32_e32 v23, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB3_11 +; SDAG-NEXT: ; %bb.8: ; %udiv-preheader +; SDAG-NEXT: v_lshr_b64 v[20:21], v[4:5], v34 +; SDAG-NEXT: v_sub_i32_e32 v28, vcc, 64, v34 +; SDAG-NEXT: v_subrev_i32_e32 v39, vcc, 64, v34 +; SDAG-NEXT: v_lshr_b64 
v[26:27], v[6:7], v34 +; SDAG-NEXT: v_add_i32_e32 v38, vcc, -1, v12 +; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: v_mov_b32_e32 v24, 0 +; SDAG-NEXT: v_mov_b32_e32 v25, 0 +; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_mov_b32_e32 v23, 0 +; SDAG-NEXT: v_lshl_b64 v[28:29], v[6:7], v28 +; SDAG-NEXT: v_lshr_b64 v[49:50], v[6:7], v39 +; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v13, vcc +; SDAG-NEXT: v_or_b32_e32 v21, v21, v29 +; SDAG-NEXT: v_or_b32_e32 v20, v20, v28 +; SDAG-NEXT: v_addc_u32_e32 v48, vcc, -1, v14, vcc +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34 +; SDAG-NEXT: v_cndmask_b32_e64 v21, v50, v21, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v49, v20, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v29, 0, v27, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v28, 0, v26, s[4:5] +; SDAG-NEXT: v_addc_u32_e32 v49, vcc, -1, v15, vcc +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34 +; SDAG-NEXT: v_cndmask_b32_e32 v27, v21, v5, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v26, v20, v4, vcc +; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: .LBB3_9: ; %udiv-do-while +; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-NEXT: v_lshl_b64 v[28:29], v[28:29], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v27 +; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v50, 31, v19 +; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v51, 31, v17 +; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; SDAG-NEXT: v_or_b32_e32 v28, v28, v20 +; SDAG-NEXT: v_or_b32_e32 v26, v26, v50 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v51 +; SDAG-NEXT: v_or_b32_e32 v19, v23, v19 +; SDAG-NEXT: v_or_b32_e32 v17, v25, v17 +; SDAG-NEXT: v_or_b32_e32 v18, v22, v18 +; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v38, v26 +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v39, v27, vcc +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v48, v28, vcc +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v49, v29, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v20 +; SDAG-NEXT: v_and_b32_e32 v20, 1, v25 +; SDAG-NEXT: v_and_b32_e32 v50, 
v25, v15 +; SDAG-NEXT: v_and_b32_e32 v51, v25, v14 +; SDAG-NEXT: v_and_b32_e32 v52, v25, v13 +; SDAG-NEXT: v_and_b32_e32 v25, v25, v12 +; SDAG-NEXT: v_sub_i32_e32 v26, vcc, v26, v25 +; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v52, vcc +; SDAG-NEXT: v_subb_u32_e32 v28, vcc, v28, v51, vcc +; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v29, v50, vcc +; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v34 +; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v35, vcc +; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v36, vcc +; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v37, vcc +; SDAG-NEXT: v_or_b32_e32 v51, v35, v37 +; SDAG-NEXT: v_or_b32_e32 v50, v34, v36 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[50:51] +; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: v_or_b32_e32 v16, v24, v16 +; SDAG-NEXT: v_mov_b32_e32 v25, v21 +; SDAG-NEXT: v_mov_b32_e32 v24, v20 +; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_cbranch_execnz .LBB3_9 +; SDAG-NEXT: ; %bb.10: ; %Flow +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: .LBB3_11: ; %Flow11 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v17 +; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v24 +; SDAG-NEXT: v_or_b32_e32 v23, v23, v19 +; SDAG-NEXT: v_or_b32_e32 v21, v21, v17 +; SDAG-NEXT: v_or_b32_e32 v22, v22, v18 +; SDAG-NEXT: v_or_b32_e32 v20, v20, v16 +; SDAG-NEXT: .LBB3_12: ; %Flow12 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: v_mul_lo_u32 v18, v32, v11 +; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v32, v10, 0 +; SDAG-NEXT: v_mul_lo_u32 v28, v30, v10 +; SDAG-NEXT: v_mul_lo_u32 v29, v33, v8 +; SDAG-NEXT: v_mul_lo_u32 v33, v31, v9 +; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v32, 0 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_mul_lo_u32 v34, v20, v15 +; SDAG-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v20, v14, 0 +; SDAG-NEXT: v_mul_lo_u32 v35, v21, v14 +; SDAG-NEXT: v_mul_lo_u32 v23, v23, v12 +; SDAG-NEXT: 
v_mul_lo_u32 v36, v22, v13 +; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v20, 0 +; SDAG-NEXT: v_add_i32_e32 v17, vcc, v17, v18 +; SDAG-NEXT: v_mov_b32_e32 v18, v11 +; SDAG-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v9, v32, v[18:19] +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 +; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], v25, v34 +; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v17, v28 +; SDAG-NEXT: v_mov_b32_e32 v28, v27 +; SDAG-NEXT: v_mov_b32_e32 v27, v19 +; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v30, v[26:27] +; SDAG-NEXT: v_add_i32_e64 v25, s[4:5], v18, v35 +; SDAG-NEXT: v_mov_b32_e32 v18, v15 +; SDAG-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v13, v20, v[18:19] +; SDAG-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v31, v8, v[16:17] +; SDAG-NEXT: v_mov_b32_e32 v8, v11 +; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v28, v8 +; SDAG-NEXT: v_addc_u32_e64 v18, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v8, v10 +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v22, v12, v[24:25] +; SDAG-NEXT: v_mov_b32_e32 v22, v27 +; SDAG-NEXT: v_mov_b32_e32 v27, v19 +; SDAG-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v12, v21, v[26:27] +; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], v29, v16 +; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v30, v[17:18] +; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v23, v11 +; SDAG-NEXT: v_mov_b32_e32 v11, v20 +; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v22, v11 +; SDAG-NEXT: v_addc_u32_e64 v12, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], v33, v16 +; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v36, v17 +; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v13, v21, v[11:12] +; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v8, v15 +; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v16, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v8, vcc +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc +; SDAG-NEXT: v_add_i32_e32 v8, vcc, v11, v10 +; SDAG-NEXT: v_addc_u32_e32 v9, vcc, v12, v17, vcc +; SDAG-NEXT: v_mov_b32_e32 v10, v19 +; SDAG-NEXT: v_sub_i32_e32 
v4, vcc, v4, v14 +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v10, vcc +; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v8, vcc +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_urem_v2i128_vv: +; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b64 s[8:9], 0 +; GISEL-NEXT: v_or_b32_e32 v16, v8, v10 +; GISEL-NEXT: v_or_b32_e32 v17, v9, v11 +; GISEL-NEXT: v_or_b32_e32 v18, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v19, v1, v3 +; GISEL-NEXT: v_ffbh_u32_e32 v22, v9 +; GISEL-NEXT: v_ffbh_u32_e32 v23, v8 +; GISEL-NEXT: v_ffbh_u32_e32 v24, v11 +; GISEL-NEXT: v_ffbh_u32_e32 v25, v10 +; GISEL-NEXT: v_ffbh_u32_e32 v26, v1 +; GISEL-NEXT: v_ffbh_u32_e32 v27, v0 +; GISEL-NEXT: v_ffbh_u32_e32 v28, v3 +; GISEL-NEXT: v_ffbh_u32_e32 v29, v2 +; GISEL-NEXT: v_mov_b32_e32 v20, 0x7f +; GISEL-NEXT: v_mov_b32_e32 v21, 0 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] +; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], 32, v23 +; GISEL-NEXT: v_add_i32_e64 v17, s[6:7], 32, v25 +; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], 32, v27 +; GISEL-NEXT: v_add_i32_e64 v19, s[6:7], 32, v29 +; GISEL-NEXT: v_min_u32_e32 v16, v22, v16 +; GISEL-NEXT: v_min_u32_e32 v17, v24, v17 +; GISEL-NEXT: v_min_u32_e32 v18, v26, v18 +; GISEL-NEXT: v_min_u32_e32 v19, v28, v19 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v16, vcc, 64, v16 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, 64, v18 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; GISEL-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_cndmask_b32_e32 v17, v19, v18, vcc +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v16, v17 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v18, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v19, s[4:5], 0, 
0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[16:17], v[20:21] +; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc +; GISEL-NEXT: v_xor_b32_e32 v20, 0x7f, v16 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[18:19] +; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v20, v20, v18 +; GISEL-NEXT: v_or_b32_e32 v21, v17, v19 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] +; GISEL-NEXT: v_cndmask_b32_e32 v23, v24, v23, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v21, v22, v23 +; GISEL-NEXT: v_and_b32_e32 v22, 1, v21 +; GISEL-NEXT: v_or_b32_e32 v20, v21, v20 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GISEL-NEXT: v_cndmask_b32_e64 v32, v0, 0, vcc +; GISEL-NEXT: v_and_b32_e32 v22, 1, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v33, v1, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v20, v2, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v21, v3, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB3_6 +; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 +; GISEL-NEXT: v_add_i32_e32 v30, vcc, 1, v16 +; GISEL-NEXT: v_addc_u32_e64 v31, s[4:5], 0, v17, vcc +; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v16 +; GISEL-NEXT: v_addc_u32_e64 v32, vcc, 0, v18, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v33, vcc, 0, v19, vcc +; GISEL-NEXT: v_subrev_i32_e64 v22, s[4:5], 64, v26 +; GISEL-NEXT: v_sub_i32_e64 v20, s[4:5], 64, v26 +; GISEL-NEXT: v_lshl_b64 v[16:17], v[0:1], v26 +; GISEL-NEXT: v_lshl_b64 v[18:19], v[2:3], v26 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: v_lshr_b64 v[20:21], v[0:1], v20 +; GISEL-NEXT: v_lshl_b64 v[24:25], v[0:1], v22 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26 +; GISEL-NEXT: v_cndmask_b32_e32 v22, 0, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v23, 0, v17, vcc +; GISEL-NEXT: v_or_b32_e32 v16, v20, v18 +; GISEL-NEXT: v_or_b32_e32 v17, v21, v19 +; GISEL-NEXT: 
v_cndmask_b32_e32 v16, v24, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v17, v25, v17, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 +; GISEL-NEXT: v_cndmask_b32_e32 v20, v16, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v21, v17, v3, vcc +; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] +; GISEL-NEXT: v_mov_b32_e32 v19, s11 +; GISEL-NEXT: v_mov_b32_e32 v18, s10 +; GISEL-NEXT: v_mov_b32_e32 v17, s9 +; GISEL-NEXT: v_mov_b32_e32 v16, s8 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB3_5 +; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 +; GISEL-NEXT: v_subrev_i32_e32 v26, vcc, 64, v30 +; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 64, v30 +; GISEL-NEXT: v_lshr_b64 v[16:17], v[2:3], v30 +; GISEL-NEXT: v_lshr_b64 v[18:19], v[0:1], v30 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_add_i32_e32 v34, vcc, -1, v8 +; GISEL-NEXT: v_addc_u32_e32 v35, vcc, -1, v9, vcc +; GISEL-NEXT: v_lshl_b64 v[24:25], v[2:3], v24 +; GISEL-NEXT: v_lshr_b64 v[26:27], v[2:3], v26 +; GISEL-NEXT: v_addc_u32_e32 v36, vcc, -1, v10, vcc +; GISEL-NEXT: v_addc_u32_e32 v37, vcc, -1, v11, vcc +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: v_or_b32_e32 v18, v18, v24 +; GISEL-NEXT: v_or_b32_e32 v19, v19, v25 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30 +; GISEL-NEXT: v_cndmask_b32_e32 v18, v26, v18, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v19, v27, v19, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v28, 0, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v29, 0, v17, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 +; GISEL-NEXT: v_cndmask_b32_e32 v26, v18, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v27, v19, v1, vcc +; GISEL-NEXT: v_mov_b32_e32 v25, 0 +; GISEL-NEXT: v_mov_b32_e32 v19, s7 +; GISEL-NEXT: v_mov_b32_e32 v18, s6 +; GISEL-NEXT: v_mov_b32_e32 v17, s5 +; GISEL-NEXT: v_mov_b32_e32 v16, s4 +; GISEL-NEXT: .LBB3_3: ; %udiv-do-while3 +; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-NEXT: v_lshl_b64 v[18:19], v[22:23], 1 +; GISEL-NEXT: 
v_lshrrev_b32_e32 v24, 31, v23 +; GISEL-NEXT: v_lshl_b64 v[38:39], v[26:27], 1 +; GISEL-NEXT: v_lshl_b64 v[28:29], v[28:29], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v26, 31, v27 +; GISEL-NEXT: v_lshrrev_b32_e32 v27, 31, v21 +; GISEL-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 +; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v30 +; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc +; GISEL-NEXT: v_or_b32_e32 v22, v16, v18 +; GISEL-NEXT: v_or_b32_e32 v23, v17, v19 +; GISEL-NEXT: v_or_b32_e32 v18, v28, v26 +; GISEL-NEXT: v_or_b32_e32 v19, v38, v27 +; GISEL-NEXT: v_or_b32_e32 v20, v20, v24 +; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc +; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v34, v19 +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v35, v39, vcc +; GISEL-NEXT: v_or_b32_e32 v16, v30, v32 +; GISEL-NEXT: v_or_b32_e32 v17, v31, v33 +; GISEL-NEXT: v_subb_u32_e32 v24, vcc, v36, v18, vcc +; GISEL-NEXT: v_subb_u32_e32 v24, vcc, v37, v29, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; GISEL-NEXT: v_ashrrev_i32_e32 v16, 31, v24 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v24, 1, v16 +; GISEL-NEXT: v_and_b32_e32 v17, v16, v8 +; GISEL-NEXT: v_and_b32_e32 v27, v16, v9 +; GISEL-NEXT: v_and_b32_e32 v28, v16, v10 +; GISEL-NEXT: v_and_b32_e32 v16, v16, v11 +; GISEL-NEXT: v_sub_i32_e32 v26, vcc, v19, v17 +; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v39, v27, vcc +; GISEL-NEXT: v_subb_u32_e32 v28, vcc, v18, v28, vcc +; GISEL-NEXT: v_subb_u32_e32 v29, vcc, v29, v16, vcc +; GISEL-NEXT: v_mov_b32_e32 v16, v24 +; GISEL-NEXT: v_mov_b32_e32 v17, v25 +; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_cbranch_execnz .LBB3_3 +; GISEL-NEXT: ; %bb.4: ; %Flow13 +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB3_5: ; %Flow14 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: v_lshl_b64 v[18:19], v[22:23], 1 +; GISEL-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v23 +; 
GISEL-NEXT: v_or_b32_e32 v20, v20, v22 +; GISEL-NEXT: v_or_b32_e32 v32, v16, v18 +; GISEL-NEXT: v_or_b32_e32 v33, v17, v19 +; GISEL-NEXT: .LBB3_6: ; %Flow16 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: s_mov_b64 s[8:9], 0 +; GISEL-NEXT: v_or_b32_e32 v16, v12, v14 +; GISEL-NEXT: v_or_b32_e32 v17, v13, v15 +; GISEL-NEXT: v_or_b32_e32 v18, v4, v6 +; GISEL-NEXT: v_or_b32_e32 v19, v5, v7 +; GISEL-NEXT: v_ffbh_u32_e32 v22, v13 +; GISEL-NEXT: v_ffbh_u32_e32 v23, v12 +; GISEL-NEXT: v_ffbh_u32_e32 v26, v15 +; GISEL-NEXT: v_ffbh_u32_e32 v27, v14 +; GISEL-NEXT: v_ffbh_u32_e32 v28, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v29, v4 +; GISEL-NEXT: v_ffbh_u32_e32 v30, v7 +; GISEL-NEXT: v_ffbh_u32_e32 v31, v6 +; GISEL-NEXT: v_mov_b32_e32 v24, 0x7f +; GISEL-NEXT: v_mov_b32_e32 v25, 0 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] +; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], 32, v23 +; GISEL-NEXT: v_add_i32_e64 v17, s[6:7], 32, v27 +; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], 32, v29 +; GISEL-NEXT: v_add_i32_e64 v19, s[6:7], 32, v31 +; GISEL-NEXT: v_min_u32_e32 v16, v22, v16 +; GISEL-NEXT: v_min_u32_e32 v17, v26, v17 +; GISEL-NEXT: v_min_u32_e32 v18, v28, v18 +; GISEL-NEXT: v_min_u32_e32 v19, v30, v19 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v26, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v16, vcc, 64, v16 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, 64, v18 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] +; GISEL-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v17, v19, v18, vcc +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v16, v17 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v22, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v23, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[16:17], v[24:25] +; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, 
0x7f, v16 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[22:23] +; GISEL-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v18, v18, v22 +; GISEL-NEXT: v_or_b32_e32 v19, v17, v23 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[22:23] +; GISEL-NEXT: v_cndmask_b32_e32 v24, v25, v24, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v19, v26, v24 +; GISEL-NEXT: v_and_b32_e32 v24, 1, v19 +; GISEL-NEXT: v_or_b32_e32 v18, v19, v18 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; GISEL-NEXT: v_cndmask_b32_e64 v24, v4, 0, vcc +; GISEL-NEXT: v_and_b32_e32 v26, 1, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v25, v5, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v18, v6, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v19, v7, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB3_12 +; GISEL-NEXT: ; %bb.7: ; %udiv-bb1 +; GISEL-NEXT: v_add_i32_e32 v34, vcc, 1, v16 +; GISEL-NEXT: v_addc_u32_e64 v35, s[4:5], 0, v17, vcc +; GISEL-NEXT: v_sub_i32_e32 v28, vcc, 0x7f, v16 +; GISEL-NEXT: v_addc_u32_e64 v36, vcc, 0, v22, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v37, vcc, 0, v23, vcc +; GISEL-NEXT: v_subrev_i32_e64 v24, s[4:5], 64, v28 +; GISEL-NEXT: v_sub_i32_e64 v22, s[4:5], 64, v28 +; GISEL-NEXT: v_lshl_b64 v[16:17], v[4:5], v28 +; GISEL-NEXT: v_lshl_b64 v[18:19], v[6:7], v28 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: v_lshr_b64 v[22:23], v[4:5], v22 +; GISEL-NEXT: v_lshl_b64 v[26:27], v[4:5], v24 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v28 +; GISEL-NEXT: v_cndmask_b32_e32 v24, 0, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v25, 0, v17, vcc +; GISEL-NEXT: v_or_b32_e32 v16, v22, v18 +; GISEL-NEXT: v_or_b32_e32 v17, v23, v19 +; GISEL-NEXT: v_cndmask_b32_e32 v16, v26, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v17, v27, v17, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v28 +; GISEL-NEXT: 
v_cndmask_b32_e32 v22, v16, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v23, v17, v7, vcc +; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] +; GISEL-NEXT: v_mov_b32_e32 v19, s11 +; GISEL-NEXT: v_mov_b32_e32 v18, s10 +; GISEL-NEXT: v_mov_b32_e32 v17, s9 +; GISEL-NEXT: v_mov_b32_e32 v16, s8 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB3_11 +; GISEL-NEXT: ; %bb.8: ; %udiv-preheader +; GISEL-NEXT: v_subrev_i32_e32 v28, vcc, 64, v34 +; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 64, v34 +; GISEL-NEXT: v_lshr_b64 v[16:17], v[6:7], v34 +; GISEL-NEXT: v_lshr_b64 v[18:19], v[4:5], v34 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_add_i32_e32 v38, vcc, -1, v12 +; GISEL-NEXT: v_addc_u32_e32 v39, vcc, -1, v13, vcc +; GISEL-NEXT: v_lshl_b64 v[26:27], v[6:7], v26 +; GISEL-NEXT: v_lshr_b64 v[28:29], v[6:7], v28 +; GISEL-NEXT: v_addc_u32_e32 v48, vcc, -1, v14, vcc +; GISEL-NEXT: v_addc_u32_e32 v49, vcc, -1, v15, vcc +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: v_or_b32_e32 v18, v18, v26 +; GISEL-NEXT: v_or_b32_e32 v19, v19, v27 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v34 +; GISEL-NEXT: v_cndmask_b32_e32 v18, v28, v18, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v19, v29, v19, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v30, 0, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v31, 0, v17, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34 +; GISEL-NEXT: v_cndmask_b32_e32 v28, v18, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v29, v19, v5, vcc +; GISEL-NEXT: v_mov_b32_e32 v27, 0 +; GISEL-NEXT: v_mov_b32_e32 v19, s7 +; GISEL-NEXT: v_mov_b32_e32 v18, s6 +; GISEL-NEXT: v_mov_b32_e32 v17, s5 +; GISEL-NEXT: v_mov_b32_e32 v16, s4 +; GISEL-NEXT: .LBB3_9: ; %udiv-do-while +; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-NEXT: v_lshl_b64 v[18:19], v[24:25], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v26, 31, v25 +; GISEL-NEXT: v_lshl_b64 v[50:51], v[28:29], 1 +; GISEL-NEXT: v_lshl_b64 v[30:31], v[30:31], 1 +; GISEL-NEXT: 
v_lshrrev_b32_e32 v28, 31, v29 +; GISEL-NEXT: v_lshrrev_b32_e32 v29, 31, v23 +; GISEL-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 +; GISEL-NEXT: v_add_i32_e32 v34, vcc, -1, v34 +; GISEL-NEXT: v_addc_u32_e32 v35, vcc, -1, v35, vcc +; GISEL-NEXT: v_or_b32_e32 v24, v16, v18 +; GISEL-NEXT: v_or_b32_e32 v25, v17, v19 +; GISEL-NEXT: v_or_b32_e32 v18, v30, v28 +; GISEL-NEXT: v_or_b32_e32 v19, v50, v29 +; GISEL-NEXT: v_or_b32_e32 v22, v22, v26 +; GISEL-NEXT: v_addc_u32_e32 v36, vcc, -1, v36, vcc +; GISEL-NEXT: v_addc_u32_e32 v37, vcc, -1, v37, vcc +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v38, v19 +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v39, v51, vcc +; GISEL-NEXT: v_or_b32_e32 v16, v34, v36 +; GISEL-NEXT: v_or_b32_e32 v17, v35, v37 +; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v48, v18, vcc +; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v49, v31, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; GISEL-NEXT: v_ashrrev_i32_e32 v16, 31, v26 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v26, 1, v16 +; GISEL-NEXT: v_and_b32_e32 v17, v16, v12 +; GISEL-NEXT: v_and_b32_e32 v29, v16, v13 +; GISEL-NEXT: v_and_b32_e32 v30, v16, v14 +; GISEL-NEXT: v_and_b32_e32 v50, v16, v15 +; GISEL-NEXT: v_sub_i32_e32 v28, vcc, v19, v17 +; GISEL-NEXT: v_subb_u32_e32 v29, vcc, v51, v29, vcc +; GISEL-NEXT: v_mov_b32_e32 v16, v26 +; GISEL-NEXT: v_mov_b32_e32 v17, v27 +; GISEL-NEXT: v_subb_u32_e32 v30, vcc, v18, v30, vcc +; GISEL-NEXT: v_subb_u32_e32 v31, vcc, v31, v50, vcc +; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_cbranch_execnz .LBB3_9 +; GISEL-NEXT: ; %bb.10: ; %Flow +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB3_11: ; %Flow11 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: v_lshl_b64 v[26:27], v[24:25], 1 +; GISEL-NEXT: v_lshl_b64 v[18:19], v[22:23], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v25 +; GISEL-NEXT: v_or_b32_e32 v18, v18, v22 +; GISEL-NEXT: v_or_b32_e32 v24, v16, v26 +; GISEL-NEXT: v_or_b32_e32 v25, v17, v27 +; GISEL-NEXT: 
.LBB3_12: ; %Flow12 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v32, 0 +; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v8, v20, 0 +; GISEL-NEXT: v_mul_lo_u32 v28, v8, v21 +; GISEL-NEXT: v_mul_lo_u32 v29, v9, v20 +; GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v12, v24, 0 +; GISEL-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v12, v18, 0 +; GISEL-NEXT: v_mul_lo_u32 v30, v12, v19 +; GISEL-NEXT: v_mul_lo_u32 v31, v13, v18 +; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v33, v[22:23] +; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v13, v25, v[26:27] +; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v10, v32, v[18:19] +; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v14, v24, v[22:23] +; GISEL-NEXT: v_mad_u64_u32 v[17:18], vcc, v8, v33, v[17:18] +; GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v12, v25, v[21:22] +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v9, v32, v[17:18] +; GISEL-NEXT: v_addc_u32_e64 v17, s[6:7], v19, v28, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v13, v24, v[21:22] +; GISEL-NEXT: v_addc_u32_e64 v18, s[6:7], v23, v30, s[6:7] +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v29, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v18, v31, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v20 +; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v12, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v10, v33, v[17:18] +; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v14, v25, v[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v11, v32, v[16:17] +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v15, v24, v[18:19] +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v9, vcc +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v13, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v11, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %shl = urem <2 x i128> %lhs, %rhs ret <2 x i128> %shl } -- cgit v1.1