diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll')
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll | 72 |
1 files changed, 31 insertions, 41 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index 672c93b..66c49ba 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -4259,65 +4259,57 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_fma_f32 v7, v8, v9, v7 ; GFX9-NEXT: v_fma_f32 v1, v8, v5, v1 -; GFX9-NEXT: v_fma_f32 v8, v12, v9, v11 ; GFX9-NEXT: v_fma_f32 v2, v12, v5, v2 ; GFX9-NEXT: v_bfe_u32 v5, v7, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v7 +; GFX9-NEXT: v_fma_f32 v8, v12, v9, v11 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v7 ; GFX9-NEXT: v_bfe_u32 v11, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v12, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v5, v5, v7, s2 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v1 ; GFX9-NEXT: v_bfe_u32 v13, v8, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v14, 0xff800000, v8 ; GFX9-NEXT: v_add3_u32 v11, v11, v1, s2 -; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v8 ; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v16, 0xff800000, v2 ; GFX9-NEXT: v_add3_u32 v13, v13, v8, s2 -; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v14 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v2 ; GFX9-NEXT: v_add3_u32 v15, v15, v2, s2 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v16 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v15, v16, vcc ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v15, v16, vcc ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_fma_f32 v1, v3, v10, v1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_fma_f32 v1, v3, v10, v1 ; GFX9-NEXT: v_fma_f32 v3, v3, v6, v5 +; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 ; GFX9-NEXT: v_fma_f32 v2, v4, v10, v2 ; GFX9-NEXT: v_fma_f32 v4, v4, v6, v7 -; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v1 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v8, 0xff800000, v3 ; GFX9-NEXT: v_add3_u32 v5, v5, v1, s2 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX9-NEXT: v_bfe_u32 v9, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v2 ; GFX9-NEXT: v_add3_u32 v7, v7, v3, s2 -; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v2 ; GFX9-NEXT: v_bfe_u32 v11, v4, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v12, 0xff800000, v4 ; GFX9-NEXT: v_add3_u32 v9, v9, v2, s2 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v4 ; GFX9-NEXT: v_add3_u32 v11, v11, v4, s2 -; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v11, v12, vcc @@ -4332,7 +4324,6 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0 -; GFX10-NEXT: s_mov_b32 s2, 0xff800000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x2 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] @@ -4355,20 +4346,20 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v9 ; GFX10-NEXT: v_fmac_f32_e32 v1, v12, v4 ; GFX10-NEXT: v_bfe_u32 v4, v7, 16, 1 -; GFX10-NEXT: v_and_or_b32 v8, v7, s2, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v7 ; GFX10-NEXT: v_bfe_u32 v9, v0, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX10-NEXT: v_and_or_b32 v12, v0, s2, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v0 ; GFX10-NEXT: v_add3_u32 v4, v4, v7, 0x7fff ; GFX10-NEXT: v_bfe_u32 v15, v1, 16, 1 ; GFX10-NEXT: v_add3_u32 v9, v9, v0, 0x7fff ; GFX10-NEXT: v_bfe_u32 v13, v11, 16, 1 -; GFX10-NEXT: v_and_or_b32 v16, v1, s2, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v15, v15, v1, 0x7fff ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX10-NEXT: v_and_or_b32 v14, v11, s2, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v11 ; GFX10-NEXT: v_add3_u32 v13, v13, v11, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 @@ -4382,7 +4373,7 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX10-NEXT: v_fmac_f32_e32 v0, v2, v10 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc_lo -; GFX10-NEXT: v_and_or_b32 v8, v4, s2, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX10-NEXT: v_fmac_f32_e32 v1, v3, v10 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 @@ -4390,14 +4381,14 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_bfe_u32 v9, v1, 16, 1 ; GFX10-NEXT: v_fmac_f32_e32 v7, v3, v5 -; GFX10-NEXT: v_and_or_b32 v3, v0, s2, 0x400000 -; GFX10-NEXT: v_and_or_b32 v10, v1, s2, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v1 ; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX10-NEXT: v_add3_u32 v9, v9, v1, 0x7fff ; GFX10-NEXT: v_bfe_u32 v11, v7, 16, 1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_and_or_b32 v12, v7, s2, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7 ; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX10-NEXT: v_add3_u32 v11, v11, v7, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo @@ -4416,7 +4407,6 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x10 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] @@ -4438,11 +4428,11 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_bfe_u32 v13, v11, 16, 1 -; GFX11-NEXT: v_and_or_b32 v14, v11, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11 ; GFX11-NEXT: v_bfe_u32 v15, v1, 16, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v9 -; GFX11-NEXT: v_and_or_b32 v16, v1, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v1 ; GFX11-NEXT: v_add3_u32 v13, v13, v11, 0x7fff ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: v_add3_u32 v15, v15, v1, 0x7fff @@ -4450,11 +4440,11 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_fmac_f32_e32 v0, v8, v4 ; GFX11-NEXT: v_bfe_u32 v4, v7, 16, 1 -; GFX11-NEXT: v_and_or_b32 v8, v7, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7 ; GFX11-NEXT: v_bfe_u32 v9, v0, 16, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add3_u32 v4, v4, v7, 0x7fff -; GFX11-NEXT: v_and_or_b32 v12, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v0 ; GFX11-NEXT: v_add3_u32 v9, v9, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo @@ -4466,7 +4456,7 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX11-NEXT: v_dual_fmac_f32 v4, v2, v5 :: v_dual_cndmask_b32 v1, v15, v16 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_or_b32 v8, v4, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -4480,14 +4470,14 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX11-NEXT: v_add3_u32 v9, v9, v1, 0x7fff ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1 -; GFX11-NEXT: v_and_or_b32 v12, v7, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7 ; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_fmac_f32_e32 v0, v2, v10 -; GFX11-NEXT: v_and_or_b32 v10, v1, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v1 ; GFX11-NEXT: v_add3_u32 v11, v11, v7, 0x7fff ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v3, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff |