aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll')
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll72
1 files changed, 31 insertions, 41 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index 672c93b..66c49ba 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -4259,65 +4259,57 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_fma_f32 v7, v8, v9, v7
; GFX9-NEXT: v_fma_f32 v1, v8, v5, v1
-; GFX9-NEXT: v_fma_f32 v8, v12, v9, v11
; GFX9-NEXT: v_fma_f32 v2, v12, v5, v2
; GFX9-NEXT: v_bfe_u32 v5, v7, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v7
+; GFX9-NEXT: v_fma_f32 v8, v12, v9, v11
+; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v7
; GFX9-NEXT: v_bfe_u32 v11, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v12, 0xff800000, v1
; GFX9-NEXT: v_add3_u32 v5, v5, v7, s2
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v1
; GFX9-NEXT: v_bfe_u32 v13, v8, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v14, 0xff800000, v8
; GFX9-NEXT: v_add3_u32 v11, v11, v1, s2
-; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v12
; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v8
; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v16, 0xff800000, v2
; GFX9-NEXT: v_add3_u32 v13, v13, v8, s2
-; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v14
; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v6
+; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v2
; GFX9-NEXT: v_add3_u32 v15, v15, v2, s2
-; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v16
; GFX9-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v15, v16, vcc
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v15, v16, vcc
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT: v_fma_f32 v1, v3, v10, v1
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT: v_fma_f32 v1, v3, v10, v1
; GFX9-NEXT: v_fma_f32 v3, v3, v6, v5
+; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
; GFX9-NEXT: v_fma_f32 v2, v4, v10, v2
; GFX9-NEXT: v_fma_f32 v4, v4, v6, v7
-; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v1
+; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v8, 0xff800000, v3
; GFX9-NEXT: v_add3_u32 v5, v5, v1, s2
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v3
; GFX9-NEXT: v_bfe_u32 v9, v2, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v2
; GFX9-NEXT: v_add3_u32 v7, v7, v3, s2
-; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v8
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v2
; GFX9-NEXT: v_bfe_u32 v11, v4, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v12, 0xff800000, v4
; GFX9-NEXT: v_add3_u32 v9, v9, v2, s2
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10
; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v4
; GFX9-NEXT: v_add3_u32 v11, v11, v4, s2
-; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v12
; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v11, v12, vcc
@@ -4332,7 +4324,6 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10
; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0
-; GFX10-NEXT: s_mov_b32 s2, 0xff800000
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1]
@@ -4355,20 +4346,20 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v9
; GFX10-NEXT: v_fmac_f32_e32 v1, v12, v4
; GFX10-NEXT: v_bfe_u32 v4, v7, 16, 1
-; GFX10-NEXT: v_and_or_b32 v8, v7, s2, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v7
; GFX10-NEXT: v_bfe_u32 v9, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX10-NEXT: v_and_or_b32 v12, v0, s2, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v4, v4, v7, 0x7fff
; GFX10-NEXT: v_bfe_u32 v15, v1, 16, 1
; GFX10-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
; GFX10-NEXT: v_bfe_u32 v13, v11, 16, 1
-; GFX10-NEXT: v_and_or_b32 v16, v1, s2, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v1
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v15, v15, v1, 0x7fff
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX10-NEXT: v_and_or_b32 v14, v11, s2, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v11
; GFX10-NEXT: v_add3_u32 v13, v13, v11, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
@@ -4382,7 +4373,7 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
; GFX10-NEXT: v_fmac_f32_e32 v0, v2, v10
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc_lo
-; GFX10-NEXT: v_and_or_b32 v8, v4, s2, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX10-NEXT: v_fmac_f32_e32 v1, v3, v10
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
@@ -4390,14 +4381,14 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX10-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX10-NEXT: v_fmac_f32_e32 v7, v3, v5
-; GFX10-NEXT: v_and_or_b32 v3, v0, s2, 0x400000
-; GFX10-NEXT: v_and_or_b32 v10, v1, s2, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v1
; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX10-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
; GFX10-NEXT: v_bfe_u32 v11, v7, 16, 1
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_and_or_b32 v12, v7, s2, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7
; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX10-NEXT: v_add3_u32 v11, v11, v7, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo
@@ -4416,7 +4407,6 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x10
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3]
@@ -4438,11 +4428,11 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_bfe_u32 v13, v11, 16, 1
-; GFX11-NEXT: v_and_or_b32 v14, v11, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11
; GFX11-NEXT: v_bfe_u32 v15, v1, 16, 1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v9
-; GFX11-NEXT: v_and_or_b32 v16, v1, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v1
; GFX11-NEXT: v_add3_u32 v13, v13, v11, 0x7fff
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: v_add3_u32 v15, v15, v1, 0x7fff
@@ -4450,11 +4440,11 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_fmac_f32_e32 v0, v8, v4
; GFX11-NEXT: v_bfe_u32 v4, v7, 16, 1
-; GFX11-NEXT: v_and_or_b32 v8, v7, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7
; GFX11-NEXT: v_bfe_u32 v9, v0, 16, 1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add3_u32 v4, v4, v7, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v12, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v0
; GFX11-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
@@ -4466,7 +4456,7 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
; GFX11-NEXT: v_dual_fmac_f32 v4, v2, v5 :: v_dual_cndmask_b32 v1, v15, v16
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_or_b32 v8, v4, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -4480,14 +4470,14 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
; GFX11-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1
-; GFX11-NEXT: v_and_or_b32 v12, v7, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_fmac_f32_e32 v0, v2, v10
-; GFX11-NEXT: v_and_or_b32 v10, v1, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v1
; GFX11-NEXT: v_add3_u32 v11, v11, v7, 0x7fff
; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v3, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff