; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s

; Code generation checks for `sub` on i1 operands, run for three amdgcn
; processors (gfx900, gfx1010 and gfx1100). Each kernel below stores the i1
; result of one subtraction to %out; the assertion blocks record the expected
; machine code per processor and are regenerated by the script named in the
; first line rather than edited by hand.

; Variable minus variable: both i1 operands come from volatile global loads.
; On all three processors the expected code XORs the two loaded bytes and
; masks the result down to bit 0 before the byte store.
define amdgpu_kernel void @sub_var_var_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
; GFX9-LABEL: sub_var_var_i1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v1, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_ubyte v2, v0, s[6:7] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v2
; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: sub_var_var_i1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_ubyte v2, v0, s[6:7] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_xor_b32_e32 v1, v1, v2
; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: sub_var_var_i1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_u8 v1, v0, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_u8 v2, v0, s[4:5] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
; GFX11-NEXT:    global_store_b8 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %a = load volatile i1, ptr addrspace(1) %in0
  %b = load volatile i1, ptr addrspace(1) %in1
  %sub = sub i1 %a, %b
  store i1 %sub, ptr addrspace(1) %out
  ret void
}

; Variable minus the constant 1: a single volatile i1 load is the left operand.
; The expected code compares bit 0 of the loaded byte against 1, inverts the
; resulting condition mask with an XOR against -1, and turns the mask back
; into a 0/1 byte with v_cndmask before the store.
define amdgpu_kernel void @sub_var_imm_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX9-LABEL: sub_var_imm_i1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v1, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
; GFX9-NEXT:    s_xor_b64 s[2:3], vcc, -1
; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: sub_var_imm_i1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX10-NEXT:    s_xor_b32 s2, vcc_lo, -1
; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: sub_var_imm_i1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_u8 v1, v0, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX11-NEXT:    s_xor_b32 s2, vcc_lo, -1
; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
; GFX11-NEXT:    global_store_b8 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %a = load volatile i1, ptr addrspace(1) %in
  %sub = sub i1 %a, 1
  store i1 %sub, ptr addrspace(1) %out
  ret void
}

; Subtraction whose i1 operand arrives through a phi after divergent control
; flow: work items with id.x < 16 take %if and load from %a, all others take
; %else and load from %b. The constant operand is written as -1 here; the
; expected code merges the two per-branch condition masks, restores exec, and
; then inverts the merged mask with an XOR against -1, the same inversion that
; @sub_var_imm_i1 expects for the constant 1.
define amdgpu_kernel void @sub_i1_cf(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
; GFX9-LABEL: sub_i1_cf:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
; GFX9-NEXT:    ; implicit-def: $sgpr4_sgpr5
; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], vcc
; GFX9-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
; GFX9-NEXT:    s_cbranch_execz .LBB2_2
; GFX9-NEXT:  ; %bb.1: ; %else
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v0, v0, s[8:9] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
; GFX9-NEXT:  .LBB2_2: ; %Flow
; GFX9-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
; GFX9-NEXT:    s_cbranch_execz .LBB2_4
; GFX9-NEXT:  ; %bb.3: ; %if
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_andn2_b64 s[2:3], s[4:5], exec
; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT:    s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT:    s_or_b64 s[4:5], s[2:3], s[4:5]
; GFX9-NEXT:  .LBB2_4: ; %endif
; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_xor_b64 s[2:3], s[4:5], -1
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: sub_i1_cf:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 15, v0
; GFX10-NEXT:    ; implicit-def: $sgpr4
; GFX10-NEXT:    s_and_saveexec_b32 s5, vcc_lo
; GFX10-NEXT:    s_xor_b32 s5, exec_lo, s5
; GFX10-NEXT:    s_cbranch_execz .LBB2_2
; GFX10-NEXT:  ; %bb.1: ; %else
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 1, v0
; GFX10-NEXT:  .LBB2_2: ; %Flow
; GFX10-NEXT:    s_andn2_saveexec_b32 s5, s5
; GFX10-NEXT:    s_cbranch_execz .LBB2_4
; GFX10-NEXT:  ; %bb.3: ; %if
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_andn2_b32 s2, s4, exec_lo
; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT:    s_and_b32 s3, vcc_lo, exec_lo
; GFX10-NEXT:    s_or_b32 s4, s2, s3
; GFX10-NEXT:  .LBB2_4: ; %endif
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_xor_b32 s2, s4, -1
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: sub_i1_cf:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_mov_b32 s7, exec_lo
; GFX11-NEXT:    ; implicit-def: $sgpr6
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_cmpx_lt_u32_e32 15, v0
; GFX11-NEXT:    s_xor_b32 s7, exec_lo, s7
; GFX11-NEXT:    s_cbranch_execz .LBB2_2
; GFX11-NEXT:  ; %bb.1: ; %else
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_u8 v0, v0, s[4:5] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_cmp_eq_u32_e64 s6, 1, v0
; GFX11-NEXT:  .LBB2_2: ; %Flow
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_and_not1_saveexec_b32 s4, s7
; GFX11-NEXT:    s_cbranch_execz .LBB2_4
; GFX11-NEXT:  ; %bb.3: ; %if
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_and_not1_b32 s2, s6, exec_lo
; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11-NEXT:    s_and_b32 s3, vcc_lo, exec_lo
; GFX11-NEXT:    s_or_b32 s6, s2, s3
; GFX11-NEXT:  .LBB2_4: ; %endif
; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX11-NEXT:    s_xor_b32 s2, s6, -1
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
; GFX11-NEXT:    global_store_b8 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
entry:
  ; Branch on the work-item id so the two arms run under different lane masks.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %d_cmp = icmp ult i32 %tid, 16
  br i1 %d_cmp, label %if, label %else

if:
  %0 = load volatile i1, ptr addrspace(1) %a
  br label %endif

else:
  %1 = load volatile i1, ptr addrspace(1) %b
  br label %endif

endif:
  ; The subtraction operates on whichever arm's loaded value reaches the phi.
  %2 = phi i1 [%0, %if], [%1, %else]
  %3 = sub i1 %2, -1
  store i1 %3, ptr addrspace(1) %out
  ret void
}

; Intrinsic used by @sub_i1_cf to read the work-item id in the x dimension.
declare i32 @llvm.amdgcn.workitem.id.x()