; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=OLD_RBS %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=NEW_RBS %s

; if the instruction is uniform and a SALU instruction is available, select the SALU instruction
define amdgpu_ps void @uniform_in_vgpr(float inreg %a, i32 inreg %b, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: uniform_in_vgpr:
; OLD_RBS:       ; %bb.0:
; OLD_RBS-NEXT:    v_cvt_u32_f32_e32 v2, s0
; OLD_RBS-NEXT:    v_add_nc_u32_e32 v2, s1, v2
; OLD_RBS-NEXT:    global_store_dword v[0:1], v2, off
; OLD_RBS-NEXT:    s_endpgm
;
; NEW_RBS-LABEL: uniform_in_vgpr:
; NEW_RBS:       ; %bb.0:
; NEW_RBS-NEXT:    v_cvt_u32_f32_e32 v2, s0
; NEW_RBS-NEXT:    v_readfirstlane_b32 s0, v2
; NEW_RBS-NEXT:    s_add_i32 s0, s0, s1
; NEW_RBS-NEXT:    v_mov_b32_e32 v2, s0
; NEW_RBS-NEXT:    global_store_dword v[0:1], v2, off
; NEW_RBS-NEXT:    s_endpgm
  %a.i32 = fptoui float %a to i32
  %res = add i32 %a.i32, %b
  store i32 %res, ptr addrspace(1) %ptr
  ret void
}
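
; A sketch of the generic MIR the new RBS path is expected to produce here,
; assuming rb-legalize models the vgpr-to-sgpr copy with G_AMDGPU_READANYLANE
; (selected as v_readfirstlane_b32); register names are illustrative:
;   %v:vgpr(s32)   = G_FPTOUI %a:sgpr(s32)
;   %s:sgpr(s32)   = G_AMDGPU_READANYLANE %v:vgpr(s32)
;   %res:sgpr(s32) = G_ADD %s:sgpr(s32), %b:sgpr(s32)
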
; copy sgpr to vgpr + readfirstlane vgpr to sgpr combine from rb-legalize
define amdgpu_ps void @back_to_back_uniform_in_vgpr(float inreg %a, float inreg %b, i32 inreg %c, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: back_to_back_uniform_in_vgpr:
; OLD_RBS:       ; %bb.0:
; OLD_RBS-NEXT:    v_add_f32_e64 v2, s0, s1
; OLD_RBS-NEXT:    v_cvt_u32_f32_e32 v2, v2
; OLD_RBS-NEXT:    v_add_nc_u32_e32 v2, s2, v2
; OLD_RBS-NEXT:    global_store_dword v[0:1], v2, off
; OLD_RBS-NEXT:    s_endpgm
;
; NEW_RBS-LABEL: back_to_back_uniform_in_vgpr:
; NEW_RBS:       ; %bb.0:
; NEW_RBS-NEXT:    v_add_f32_e64 v2, s0, s1
; NEW_RBS-NEXT:    v_cvt_u32_f32_e32 v2, v2
; NEW_RBS-NEXT:    v_readfirstlane_b32 s0, v2
; NEW_RBS-NEXT:    s_add_i32 s0, s0, s2
; NEW_RBS-NEXT:    v_mov_b32_e32 v2, s0
; NEW_RBS-NEXT:    global_store_dword v[0:1], v2, off
; NEW_RBS-NEXT:    s_endpgm
  %add = fadd float %a, %b
  %add.i32 = fptoui float %add to i32
  %res = add i32 %add.i32, %c
  store i32 %res, ptr addrspace(1) %ptr
  ret void
}

; fast rules for vector instructions
define amdgpu_cs void @buffer_load_uniform(<4 x i32> inreg %rsrc, i32 inreg %voffset, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: buffer_load_uniform:
; OLD_RBS:       ; %bb.0: ; %.entry
; OLD_RBS-NEXT:    v_mov_b32_e32 v2, s4
; OLD_RBS-NEXT:    buffer_load_dwordx4 v[2:5], v2, s[0:3], 0 offen
; OLD_RBS-NEXT:    s_waitcnt vmcnt(0)
; OLD_RBS-NEXT:    v_add_nc_u32_e32 v2, 1, v3
; OLD_RBS-NEXT:    global_store_dword v[0:1], v2, off
; OLD_RBS-NEXT:    s_endpgm
;
; NEW_RBS-LABEL: buffer_load_uniform:
; NEW_RBS:       ; %bb.0: ; %.entry
; NEW_RBS-NEXT:    v_mov_b32_e32 v2, s4
; NEW_RBS-NEXT:    buffer_load_dwordx4 v[2:5], v2, s[0:3], 0 offen
; NEW_RBS-NEXT:    s_waitcnt vmcnt(0)
; NEW_RBS-NEXT:    v_readfirstlane_b32 s0, v3
; NEW_RBS-NEXT:    s_add_i32 s0, s0, 1
; NEW_RBS-NEXT:    v_mov_b32_e32 v2, s0
; NEW_RBS-NEXT:    global_store_dword v[0:1], v2, off
; NEW_RBS-NEXT:    s_endpgm
.entry:
  %vec = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
  %el1 = extractelement <4 x i32> %vec, i64 1
  %res = add i32 %el1, 1
  store i32 %res, ptr addrspace(1) %ptr
  ret void
}

define amdgpu_cs void @buffer_load_divergent(<4 x i32> inreg %rsrc, i32 %voffset, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: buffer_load_divergent:
; OLD_RBS:       ; %bb.0: ; %.entry
; OLD_RBS-NEXT:    buffer_load_dwordx4 v[3:6], v0, s[0:3], 0 offen
; OLD_RBS-NEXT:    s_waitcnt vmcnt(0)
; OLD_RBS-NEXT:    v_add_nc_u32_e32 v0, 1, v4
; OLD_RBS-NEXT:    global_store_dword v[1:2], v0, off
; OLD_RBS-NEXT:    s_endpgm
;
; NEW_RBS-LABEL: buffer_load_divergent:
; NEW_RBS:       ; %bb.0: ; %.entry
; NEW_RBS-NEXT:    buffer_load_dwordx4 v[3:6], v0, s[0:3], 0 offen
; NEW_RBS-NEXT:    s_waitcnt vmcnt(0)
; NEW_RBS-NEXT:    v_add_nc_u32_e32 v0, 1, v4
; NEW_RBS-NEXT:    global_store_dword v[1:2], v0, off
; NEW_RBS-NEXT:    s_endpgm
.entry:
  %vec = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
  %el1 = extractelement <4 x i32> %vec, i64 1
  %res = add i32 %el1, 1
  store i32 %res, ptr addrspace(1) %ptr
  ret void
}

; lowering in rb-legalize (sgpr S64 is legal, vgpr has to be split to S32)
define amdgpu_ps void @vgpr_and_i64(i64 %a, i64 %b, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: vgpr_and_i64:
; OLD_RBS:       ; %bb.0:
; OLD_RBS-NEXT:    v_and_b32_e32 v0, v0, v2
; OLD_RBS-NEXT:    v_and_b32_e32 v1, v1, v3
; OLD_RBS-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
; OLD_RBS-NEXT:    s_endpgm
;
; NEW_RBS-LABEL: vgpr_and_i64:
; NEW_RBS:       ; %bb.0:
; NEW_RBS-NEXT:    v_and_b32_e32 v0, v0, v2
; NEW_RBS-NEXT:    v_and_b32_e32 v1, v1, v3
; NEW_RBS-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
; NEW_RBS-NEXT:    s_endpgm
  %res = and i64 %a, %b
  store i64 %res, ptr addrspace(1) %ptr
  ret void
}

; It is up to the user instruction to deal with potentially truncated bits in the register.
; Here G_ABS needs to sign-extend the S16 in the register to S32 and then do an S32 G_ABS.
define amdgpu_ps void @abs_sgpr_i16(i16 inreg %arg, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: abs_sgpr_i16:
; OLD_RBS:       ; %bb.0:
; OLD_RBS-NEXT:    s_sext_i32_i16 s0, s0
; OLD_RBS-NEXT:    s_abs_i32 s0, s0
; OLD_RBS-NEXT:    v_mov_b32_e32 v2, s0
; OLD_RBS-NEXT:    global_store_short v[0:1], v2, off
; OLD_RBS-NEXT:    s_endpgm
;
; NEW_RBS-LABEL: abs_sgpr_i16:
; NEW_RBS:       ; %bb.0:
; NEW_RBS-NEXT:    s_sext_i32_i16 s0, s0
; NEW_RBS-NEXT:    s_abs_i32 s0, s0
; NEW_RBS-NEXT:    v_mov_b32_e32 v2, s0
; NEW_RBS-NEXT:    global_store_short v[0:1], v2, off
; NEW_RBS-NEXT:    s_endpgm
  %res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
  store i16 %res, ptr addrspace(1) %ptr
  ret void
}

define amdgpu_ps void @uniform_i1_phi(ptr addrspace(1) %out, i32 inreg %tid, i32 inreg %cond) {
; OLD_RBS-LABEL: uniform_i1_phi:
; OLD_RBS:       ; %bb.0: ; %A
; OLD_RBS-NEXT:    s_cmp_ge_u32 s0, 6
; OLD_RBS-NEXT:    s_cselect_b32 s2, 1, 0
; OLD_RBS-NEXT:    s_cmp_lg_u32 s1, 0
; OLD_RBS-NEXT:    s_cbranch_scc1 .LBB6_2
; OLD_RBS-NEXT:  ; %bb.1: ; %B
; OLD_RBS-NEXT:    s_cmp_lt_u32 s0, 1
; OLD_RBS-NEXT:    s_cselect_b32 s2, 1, 0
; OLD_RBS-NEXT:  .LBB6_2: ; %exit
; OLD_RBS-NEXT:    s_bfe_i32 s0, s2, 0x10000
; OLD_RBS-NEXT:    s_add_i32 s0, s0, 2
; OLD_RBS-NEXT:    v_mov_b32_e32 v2, s0
; OLD_RBS-NEXT:    global_store_dword v[0:1], v2, off
; OLD_RBS-NEXT:    s_endpgm
;
; NEW_RBS-LABEL: uniform_i1_phi:
; NEW_RBS:       ; %bb.0: ; %A
; NEW_RBS-NEXT:    s_cmp_ge_u32 s0, 6
; NEW_RBS-NEXT:    s_cselect_b32 s2, 1, 0
; NEW_RBS-NEXT:    s_cmp_lg_u32 s1, 0
; NEW_RBS-NEXT:    s_cbranch_scc1 .LBB6_2
; NEW_RBS-NEXT:  ; %bb.1: ; %B
; NEW_RBS-NEXT:    s_cmp_lt_u32 s0, 1
; NEW_RBS-NEXT:    s_cselect_b32 s2, 1, 0
; NEW_RBS-NEXT:  .LBB6_2: ; %exit
; NEW_RBS-NEXT:    s_cmp_lg_u32 s2, 0
; NEW_RBS-NEXT:    s_cselect_b32 s0, -1, 0
; NEW_RBS-NEXT:    s_add_i32 s0, s0, 2
; NEW_RBS-NEXT:    v_mov_b32_e32 v2, s0
; NEW_RBS-NEXT:    global_store_dword v[0:1], v2, off
; NEW_RBS-NEXT:    s_endpgm
A:
  %val_A = icmp uge i32 %tid, 6
  %cmp = icmp eq i32 %cond, 0
  br i1 %cmp, label %B, label %exit

B:
  %val_B = icmp ult i32 %tid, 1
  br label %exit

exit:
  %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
  %sel = select i1 %phi, i32 1, i32 2
  store i32 %sel, ptr addrspace(1) %out
  ret void
}
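
; Both outputs compute the select above as sext(i1) + 2: the sext gives -1 or 0,
; so the stored value is 1 or 2. OLD_RBS sign-extends bit 0 of the 0/1 register
; with s_bfe_i32 (0x10000 encodes width=1, offset=0); NEW_RBS instead
; rematerializes the mask as -1/0 with s_cmp_lg_u32 + s_cselect_b32.
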
; this is in effect an i1 readfirstlane:
; a uniform i1 result of an instruction that is only available on the VALU
define amdgpu_ps void @vcc_to_scc(float inreg %a, i32 inreg %b, i32 inreg %c, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: vcc_to_scc:
; OLD_RBS:       ; %bb.0:
; OLD_RBS-NEXT:    v_mov_b32_e32 v2, s2
; OLD_RBS-NEXT:    v_cmp_eq_f32_e64 s0, s0, 0
; OLD_RBS-NEXT:    v_cndmask_b32_e64 v2, v2, s1, s0
; OLD_RBS-NEXT:    global_store_dword v[0:1], v2, off
; OLD_RBS-NEXT:    s_endpgm
;
; NEW_RBS-LABEL: vcc_to_scc:
; NEW_RBS:       ; %bb.0:
; NEW_RBS-NEXT:    v_cmp_eq_f32_e64 s0, s0, 0
; NEW_RBS-NEXT:    s_cmp_lg_u32 s0, 0
; NEW_RBS-NEXT:    s_cselect_b32 s0, 1, 0
; NEW_RBS-NEXT:    s_and_b32 s0, s0, 1
; NEW_RBS-NEXT:    s_cmp_lg_u32 s0, 0
; NEW_RBS-NEXT:    s_cselect_b32 s0, s1, s2
; NEW_RBS-NEXT:    v_mov_b32_e32 v2, s0
; NEW_RBS-NEXT:    global_store_dword v[0:1], v2, off
; NEW_RBS-NEXT:    s_endpgm
  %vcc_to_scc = fcmp oeq float %a, 0.0
  %select = select i1 %vcc_to_scc, i32 %b, i32 %c
  store i32 %select, ptr addrspace(1) %ptr
  ret void
}

; the combiner in rb-legalize recognizes the sgpr S1 to vcc copy
define amdgpu_ps void @scc_to_vcc(i32 inreg %a, i32 %b, i32 %c, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: scc_to_vcc:
; OLD_RBS:       ; %bb.0:
; OLD_RBS-NEXT:    s_cmp_eq_u32 s0, 0
; OLD_RBS-NEXT:    s_cselect_b32 s0, 1, 0
; OLD_RBS-NEXT:    s_and_b32 s0, 1, s0
; OLD_RBS-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
; OLD_RBS-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; OLD_RBS-NEXT:    global_store_dword v[2:3], v0, off
; OLD_RBS-NEXT:    s_endpgm
;
; NEW_RBS-LABEL: scc_to_vcc:
; NEW_RBS:       ; %bb.0:
; NEW_RBS-NEXT:    s_cmp_eq_u32 s0, 0
; NEW_RBS-NEXT:    s_cselect_b32 vcc_lo, exec_lo, 0
; NEW_RBS-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; NEW_RBS-NEXT:    global_store_dword v[2:3], v0, off
; NEW_RBS-NEXT:    s_endpgm
  %scc_to_vcc = icmp eq i32 %a, 0
  %select = select i1 %scc_to_vcc, i32 %b, i32 %c
  store i32 %select, ptr addrspace(1) %ptr
  ret void
}

; this is the only G_TRUNC that is not a no-op in GlobalISel for AMDGPU
define amdgpu_ps void @vgpr_to_vcc_trunc(i32 %a, i32 %b, i32 %c, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: vgpr_to_vcc_trunc:
; OLD_RBS:       ; %bb.0:
; OLD_RBS-NEXT:    v_and_b32_e32 v0, 1, v0
; OLD_RBS-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
; OLD_RBS-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; OLD_RBS-NEXT:    global_store_dword v[3:4], v0, off
; OLD_RBS-NEXT:    s_endpgm
;
; NEW_RBS-LABEL: vgpr_to_vcc_trunc:
; NEW_RBS:       ; %bb.0:
; NEW_RBS-NEXT:    v_and_b32_e32 v0, 1, v0
; NEW_RBS-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
; NEW_RBS-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; NEW_RBS-NEXT:    global_store_dword v[3:4], v0, off
; NEW_RBS-NEXT:    s_endpgm
  %vcc = trunc i32 %a to i1
  %select = select i1 %vcc, i32 %b, i32 %c
  store i32 %select, ptr addrspace(1) %ptr
  ret void
}

; an i1 input to zext and sext is something that survived the legalizer (not a trunc);
; lower it to a select
define amdgpu_ps void @zext(i32 inreg %a, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: zext:
; OLD_RBS:       ; %bb.0:
; OLD_RBS-NEXT:    s_cmp_eq_u32 s0, 10
; OLD_RBS-NEXT:    s_cselect_b32 s0, 1, 0
; OLD_RBS-NEXT:    v_mov_b32_e32 v2, s0
; OLD_RBS-NEXT:    global_store_dword v[0:1], v2, off
; OLD_RBS-NEXT:    s_endpgm
;
; NEW_RBS-LABEL: zext:
; NEW_RBS:       ; %bb.0:
; NEW_RBS-NEXT:    s_cmp_eq_u32 s0, 10
; NEW_RBS-NEXT:    s_cselect_b32 s0, 1, 0
; NEW_RBS-NEXT:    v_mov_b32_e32 v2, s0
; NEW_RBS-NEXT:    global_store_dword v[0:1], v2, off
; NEW_RBS-NEXT:    s_endpgm
  %bool = icmp eq i32 %a, 10
  %zext = zext i1 %bool to i32
  store i32 %zext, ptr addrspace(1) %ptr
  ret void
}
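
; Both casts lower the i1 through a select, roughly (illustrative IR):
;   %zext = select i1 %bool, i32 1, i32 0
;   %sext = select i1 %bool, i32 -1, i32 0
; which is why NEW_RBS emits the sext below as a single s_cselect_b32 of -1/0.
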
define amdgpu_ps void @sext(i32 inreg %a, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: sext:
; OLD_RBS:       ; %bb.0:
; OLD_RBS-NEXT:    s_cmp_eq_u32 s0, 10
; OLD_RBS-NEXT:    s_cselect_b32 s0, 1, 0
; OLD_RBS-NEXT:    s_bfe_i32 s0, s0, 0x10000
; OLD_RBS-NEXT:    v_mov_b32_e32 v2, s0
; OLD_RBS-NEXT:    global_store_dword v[0:1], v2, off
; OLD_RBS-NEXT:    s_endpgm
;
; NEW_RBS-LABEL: sext:
; NEW_RBS:       ; %bb.0:
; NEW_RBS-NEXT:    s_cmp_eq_u32 s0, 10
; NEW_RBS-NEXT:    s_cselect_b32 s0, -1, 0
; NEW_RBS-NEXT:    v_mov_b32_e32 v2, s0
; NEW_RBS-NEXT:    global_store_dword v[0:1], v2, off
; NEW_RBS-NEXT:    s_endpgm
  %bool = icmp eq i32 %a, 10
  %sext = sext i1 %bool to i32
  store i32 %sext, ptr addrspace(1) %ptr
  ret void
}

; divergent i1 bitwise, i1 vcc.
; inst is selected into s_and_b32 on wave32 or s_and_b64 on wave64.
define amdgpu_ps void @and_i1_vcc(i32 %a, i32 %b, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: and_i1_vcc:
; OLD_RBS:       ; %bb.0:
; OLD_RBS-NEXT:    v_cmp_le_u32_e32 vcc_lo, 10, v0
; OLD_RBS-NEXT:    v_cmp_le_u32_e64 s0, 20, v1
; OLD_RBS-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
; OLD_RBS-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; OLD_RBS-NEXT:    global_store_dword v[2:3], v0, off
; OLD_RBS-NEXT:    s_endpgm
;
; NEW_RBS-LABEL: and_i1_vcc:
; NEW_RBS:       ; %bb.0:
; NEW_RBS-NEXT:    v_cmp_le_u32_e32 vcc_lo, 10, v0
; NEW_RBS-NEXT:    v_cmp_le_u32_e64 s0, 20, v1
; NEW_RBS-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
; NEW_RBS-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; NEW_RBS-NEXT:    global_store_dword v[2:3], v0, off
; NEW_RBS-NEXT:    s_endpgm
  %cmp_a = icmp uge i32 %a, 10
  %cmp_b = icmp uge i32 %b, 20
  %cc = and i1 %cmp_a, %cmp_b
  %res = select i1 %cc, i32 %a, i32 %b
  store i32 %res, ptr addrspace(1) %ptr
  ret void
}

; uniform i1 bitwise, i32 sgpr. inst is selected into s_and_b32.
define amdgpu_ps void @and_i1_scc(i32 inreg %a, i32 inreg %b, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: and_i1_scc:
; OLD_RBS:       ; %bb.0:
; OLD_RBS-NEXT:    s_cmp_ge_u32 s0, 10
; OLD_RBS-NEXT:    s_cselect_b32 s2, 1, 0
; OLD_RBS-NEXT:    s_cmp_ge_u32 s1, 20
; OLD_RBS-NEXT:    s_cselect_b32 s3, 1, 0
; OLD_RBS-NEXT:    s_and_b32 s2, s2, s3
; OLD_RBS-NEXT:    s_cmp_lg_u32 s2, 0
; OLD_RBS-NEXT:    s_cselect_b32 s0, s0, s1
; OLD_RBS-NEXT:    v_mov_b32_e32 v2, s0
; OLD_RBS-NEXT:    global_store_dword v[0:1], v2, off
; OLD_RBS-NEXT:    s_endpgm
;
; NEW_RBS-LABEL: and_i1_scc:
; NEW_RBS:       ; %bb.0:
; NEW_RBS-NEXT:    s_cmp_ge_u32 s0, 10
; NEW_RBS-NEXT:    s_cselect_b32 s2, 1, 0
; NEW_RBS-NEXT:    s_cmp_ge_u32 s1, 20
; NEW_RBS-NEXT:    s_cselect_b32 s3, 1, 0
; NEW_RBS-NEXT:    s_and_b32 s2, s2, s3
; NEW_RBS-NEXT:    s_cmp_lg_u32 s2, 0
; NEW_RBS-NEXT:    s_cselect_b32 s0, s0, s1
; NEW_RBS-NEXT:    v_mov_b32_e32 v2, s0
; NEW_RBS-NEXT:    global_store_dword v[0:1], v2, off
; NEW_RBS-NEXT:    s_endpgm
  %cmp_a = icmp uge i32 %a, 10
  %cmp_b = icmp uge i32 %b, 20
  %cc = and i1 %cmp_a, %cmp_b
  %res = select i1 %cc, i32 %a, i32 %b
  store i32 %res, ptr addrspace(1) %ptr
  ret void
}

; old RBS selects an sgpr phi because it has sgpr inputs.
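; The phi below has uniform (constant) inputs but a divergent branch condition,
; so machine uniformity analysis marks it divergent: NEW_RBS gives it a vgpr and
; copies each incoming sgpr constant into that vgpr in its predecessor block.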
define amdgpu_ps void @divergent_phi_with_uniform_inputs(i32 %a, ptr addrspace(1) %out) {
; OLD_RBS-LABEL: divergent_phi_with_uniform_inputs:
; OLD_RBS:       ; %bb.0: ; %A
; OLD_RBS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; OLD_RBS-NEXT:    s_mov_b32 s0, 0
; OLD_RBS-NEXT:    s_and_saveexec_b32 s1, vcc_lo
; OLD_RBS-NEXT:  ; %bb.1: ; %B
; OLD_RBS-NEXT:    s_mov_b32 s0, 1
; OLD_RBS-NEXT:  ; %bb.2: ; %exit
; OLD_RBS-NEXT:    s_or_b32 exec_lo, exec_lo, s1
; OLD_RBS-NEXT:    v_mov_b32_e32 v0, s0
; OLD_RBS-NEXT:    global_store_dword v[1:2], v0, off
; OLD_RBS-NEXT:    s_endpgm
;
; NEW_RBS-LABEL: divergent_phi_with_uniform_inputs:
; NEW_RBS:       ; %bb.0: ; %A
; NEW_RBS-NEXT:    s_mov_b32 s0, 0
; NEW_RBS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; NEW_RBS-NEXT:    v_mov_b32_e32 v0, s0
; NEW_RBS-NEXT:    s_and_saveexec_b32 s0, vcc_lo
; NEW_RBS-NEXT:  ; %bb.1: ; %B
; NEW_RBS-NEXT:    s_mov_b32 s1, 1
; NEW_RBS-NEXT:    v_mov_b32_e32 v0, s1
; NEW_RBS-NEXT:  ; %bb.2: ; %exit
; NEW_RBS-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; NEW_RBS-NEXT:    global_store_dword v[1:2], v0, off
; NEW_RBS-NEXT:    s_endpgm
A:
  %cmp = icmp eq i32 %a, 0
  br i1 %cmp, label %B, label %exit

B:
  br label %exit

exit:
  %phi = phi i32 [ 0, %A ], [ 1, %B ]
  store i32 %phi, ptr addrspace(1) %out
  ret void
}

; old RBS assigned a vgpr to the uniform phi (because one input had an undetermined bank)
; and that propagated to the mul, which was not wrong.
; new RBS assigns a vgpr to the destination of the mul even though both inputs are sgpr.
; TODO: implement temporal divergence lowering
define amdgpu_ps void @divergent_because_of_temporal_divergent_use(float %val, ptr addrspace(1) %addr) {
; OLD_RBS-LABEL: divergent_because_of_temporal_divergent_use:
; OLD_RBS:       ; %bb.0: ; %entry
; OLD_RBS-NEXT:    s_mov_b32 s0, -1
; OLD_RBS-NEXT:    v_mov_b32_e32 v3, s0
; OLD_RBS-NEXT:    s_mov_b32 s0, 0
; OLD_RBS-NEXT:  .LBB15_1: ; %loop
; OLD_RBS-NEXT:    ; =>This Inner Loop Header: Depth=1
; OLD_RBS-NEXT:    v_add_nc_u32_e32 v3, 1, v3
; OLD_RBS-NEXT:    v_cvt_f32_u32_e32 v4, v3
; OLD_RBS-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v0
; OLD_RBS-NEXT:    s_or_b32 s0, vcc_lo, s0
; OLD_RBS-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
; OLD_RBS-NEXT:    s_cbranch_execnz .LBB15_1
; OLD_RBS-NEXT:  ; %bb.2: ; %exit
; OLD_RBS-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; OLD_RBS-NEXT:    v_mul_lo_u32 v0, v3, 10
; OLD_RBS-NEXT:    global_store_dword v[1:2], v0, off
; OLD_RBS-NEXT:    s_endpgm
;
; NEW_RBS-LABEL: divergent_because_of_temporal_divergent_use:
; NEW_RBS:       ; %bb.0: ; %entry
; NEW_RBS-NEXT:    s_mov_b32 s1, -1
; NEW_RBS-NEXT:    s_mov_b32 s0, 0
; NEW_RBS-NEXT:  .LBB15_1: ; %loop
; NEW_RBS-NEXT:    ; =>This Inner Loop Header: Depth=1
; NEW_RBS-NEXT:    s_add_i32 s1, s1, 1
; NEW_RBS-NEXT:    v_cvt_f32_u32_e32 v3, s1
; NEW_RBS-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v0
; NEW_RBS-NEXT:    v_mov_b32_e32 v3, s1
; NEW_RBS-NEXT:    s_or_b32 s0, vcc_lo, s0
; NEW_RBS-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
; NEW_RBS-NEXT:    s_cbranch_execnz .LBB15_1
; NEW_RBS-NEXT:  ; %bb.2: ; %exit
; NEW_RBS-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; NEW_RBS-NEXT:    v_mul_lo_u32 v0, v3, 10
; NEW_RBS-NEXT:    global_store_dword v[1:2], v0, off
; NEW_RBS-NEXT:    s_endpgm
entry:
  br label %loop

loop:
  %counter = phi i32 [ 0, %entry ], [ %counter.plus.1, %loop ]
  %f.counter = uitofp i32 %counter to float
  %cond = fcmp ogt float %f.counter, %val
  %counter.plus.1 = add i32 %counter, 1
  br i1 %cond, label %exit, label %loop

exit:
  %ceilx10 = mul i32 %counter, 10
  store i32 %ceilx10, ptr addrspace(1) %addr
  ret void
}

; Variables that handle the counter can be allocated to sgprs.
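; In the loop below, NEW_RBS keeps the counter in an sgpr (s_add_i32,
; s_cmpk_lt_u32), where OLD_RBS carried it in a vgpr (v_add_nc_u32_e32).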
define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b) {
; OLD_RBS-LABEL: loop_with_2breaks:
; OLD_RBS:       ; %bb.0: ; %entry
; OLD_RBS-NEXT:    s_mov_b32 s0, 0
; OLD_RBS-NEXT:    ; implicit-def: $sgpr1
; OLD_RBS-NEXT:    v_mov_b32_e32 v6, s0
; OLD_RBS-NEXT:    s_branch .LBB16_3
; OLD_RBS-NEXT:  .LBB16_1: ; %Flow3
; OLD_RBS-NEXT:    ; in Loop: Header=BB16_3 Depth=1
; OLD_RBS-NEXT:    s_waitcnt_depctr 0xffe3
; OLD_RBS-NEXT:    s_or_b32 exec_lo, exec_lo, s3
; OLD_RBS-NEXT:    s_andn2_b32 s1, s1, exec_lo
; OLD_RBS-NEXT:    s_and_b32 s3, exec_lo, s4
; OLD_RBS-NEXT:    s_or_b32 s1, s1, s3
; OLD_RBS-NEXT:  .LBB16_2: ; %Flow
; OLD_RBS-NEXT:    ; in Loop: Header=BB16_3 Depth=1
; OLD_RBS-NEXT:    s_or_b32 exec_lo, exec_lo, s2
; OLD_RBS-NEXT:    s_and_b32 s2, exec_lo, s1
; OLD_RBS-NEXT:    s_or_b32 s0, s2, s0
; OLD_RBS-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
; OLD_RBS-NEXT:    s_cbranch_execz .LBB16_6
; OLD_RBS-NEXT:  .LBB16_3: ; %A
; OLD_RBS-NEXT:    ; =>This Inner Loop Header: Depth=1
; OLD_RBS-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
; OLD_RBS-NEXT:    s_andn2_b32 s1, s1, exec_lo
; OLD_RBS-NEXT:    s_and_b32 s2, exec_lo, -1
; OLD_RBS-NEXT:    s_or_b32 s1, s1, s2
; OLD_RBS-NEXT:    v_lshlrev_b64 v[7:8], 2, v[6:7]
; OLD_RBS-NEXT:    v_add_co_u32 v9, vcc_lo, v2, v7
; OLD_RBS-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo
; OLD_RBS-NEXT:    global_load_dword v9, v[9:10], off
; OLD_RBS-NEXT:    s_waitcnt vmcnt(0)
; OLD_RBS-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v9
; OLD_RBS-NEXT:    s_and_saveexec_b32 s2, vcc_lo
; OLD_RBS-NEXT:    s_cbranch_execz .LBB16_2
; OLD_RBS-NEXT:  ; %bb.4: ; %B
; OLD_RBS-NEXT:    ; in Loop: Header=BB16_3 Depth=1
; OLD_RBS-NEXT:    v_add_co_u32 v9, vcc_lo, v4, v7
; OLD_RBS-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v5, v8, vcc_lo
; OLD_RBS-NEXT:    s_mov_b32 s4, -1
; OLD_RBS-NEXT:    global_load_dword v9, v[9:10], off
; OLD_RBS-NEXT:    s_waitcnt vmcnt(0)
; OLD_RBS-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v9
; OLD_RBS-NEXT:    s_and_saveexec_b32 s3, vcc_lo
; OLD_RBS-NEXT:    s_cbranch_execz .LBB16_1
; OLD_RBS-NEXT:  ; %bb.5: ; %loop.body
; OLD_RBS-NEXT:    ; in Loop: Header=BB16_3 Depth=1
; OLD_RBS-NEXT:    v_add_co_u32 v7, vcc_lo, v0, v7
; OLD_RBS-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
; OLD_RBS-NEXT:    v_add_nc_u32_e32 v10, 1, v6
; OLD_RBS-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x64, v6
; OLD_RBS-NEXT:    s_andn2_b32 s4, -1, exec_lo
; OLD_RBS-NEXT:    global_load_dword v9, v[7:8], off
; OLD_RBS-NEXT:    v_mov_b32_e32 v6, v10
; OLD_RBS-NEXT:    s_and_b32 s5, exec_lo, vcc_lo
; OLD_RBS-NEXT:    s_or_b32 s4, s4, s5
; OLD_RBS-NEXT:    s_waitcnt vmcnt(0)
; OLD_RBS-NEXT:    v_add_nc_u32_e32 v9, 1, v9
; OLD_RBS-NEXT:    global_store_dword v[7:8], v9, off
; OLD_RBS-NEXT:    s_branch .LBB16_1
; OLD_RBS-NEXT:  .LBB16_6: ; %exit
; OLD_RBS-NEXT:    s_endpgm
;
; NEW_RBS-LABEL: loop_with_2breaks:
; NEW_RBS:       ; %bb.0: ; %entry
; NEW_RBS-NEXT:    s_mov_b32 s4, 0
; NEW_RBS-NEXT:    s_mov_b32 s0, 0
; NEW_RBS-NEXT:    ; implicit-def: $sgpr5
; NEW_RBS-NEXT:    s_branch .LBB16_3
; NEW_RBS-NEXT:  .LBB16_1: ; %Flow3
; NEW_RBS-NEXT:    ; in Loop: Header=BB16_3 Depth=1
; NEW_RBS-NEXT:    s_waitcnt_depctr 0xffe3
; NEW_RBS-NEXT:    s_or_b32 exec_lo, exec_lo, s7
; NEW_RBS-NEXT:    s_andn2_b32 s2, s5, exec_lo
; NEW_RBS-NEXT:    s_and_b32 s3, exec_lo, s6
; NEW_RBS-NEXT:    s_or_b32 s5, s2, s3
; NEW_RBS-NEXT:  .LBB16_2: ; %Flow
; NEW_RBS-NEXT:    ; in Loop: Header=BB16_3 Depth=1
; NEW_RBS-NEXT:    s_or_b32 exec_lo, exec_lo, s1
; NEW_RBS-NEXT:    s_and_b32 s1, exec_lo, s5
; NEW_RBS-NEXT:    s_or_b32 s4, s1, s4
; NEW_RBS-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
; NEW_RBS-NEXT:    s_cbranch_execz .LBB16_6
; NEW_RBS-NEXT:  .LBB16_3: ; %A
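
; The Flow blocks above merge each break condition into the loop's accumulated
; lane mask with the usual andn2/and/or sequence; a sketch of the pattern
; (register names illustrative):
;   s_andn2_b32 sM, sM, exec_lo    ; clear bits of the currently active lanes
;   s_and_b32   sT, exec_lo, cond  ; active lanes that take the break
;   s_or_b32    sM, sM, sT         ; merge into the accumulated mask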
; NEW_RBS-NEXT:    ; =>This Inner Loop Header: Depth=1
; NEW_RBS-NEXT:    s_ashr_i32 s1, s0, 31
; NEW_RBS-NEXT:    s_lshl_b64 s[2:3], s[0:1], 2
; NEW_RBS-NEXT:    s_andn2_b32 s1, s5, exec_lo
; NEW_RBS-NEXT:    v_mov_b32_e32 v7, s3
; NEW_RBS-NEXT:    v_mov_b32_e32 v6, s2
; NEW_RBS-NEXT:    s_and_b32 s5, exec_lo, exec_lo
; NEW_RBS-NEXT:    s_or_b32 s5, s1, s5
; NEW_RBS-NEXT:    v_add_co_u32 v6, vcc_lo, v2, v6
; NEW_RBS-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v3, v7, vcc_lo
; NEW_RBS-NEXT:    global_load_dword v6, v[6:7], off
; NEW_RBS-NEXT:    s_waitcnt vmcnt(0)
; NEW_RBS-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
; NEW_RBS-NEXT:    s_and_saveexec_b32 s1, vcc_lo
; NEW_RBS-NEXT:    s_cbranch_execz .LBB16_2
; NEW_RBS-NEXT:  ; %bb.4: ; %B
; NEW_RBS-NEXT:    ; in Loop: Header=BB16_3 Depth=1
; NEW_RBS-NEXT:    v_mov_b32_e32 v7, s3
; NEW_RBS-NEXT:    v_mov_b32_e32 v6, s2
; NEW_RBS-NEXT:    s_mov_b32 s6, exec_lo
; NEW_RBS-NEXT:    v_add_co_u32 v6, vcc_lo, v4, v6
; NEW_RBS-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v5, v7, vcc_lo
; NEW_RBS-NEXT:    global_load_dword v6, v[6:7], off
; NEW_RBS-NEXT:    s_waitcnt vmcnt(0)
; NEW_RBS-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
; NEW_RBS-NEXT:    s_and_saveexec_b32 s7, vcc_lo
; NEW_RBS-NEXT:    s_cbranch_execz .LBB16_1
; NEW_RBS-NEXT:  ; %bb.5: ; %loop.body
; NEW_RBS-NEXT:    ; in Loop: Header=BB16_3 Depth=1
; NEW_RBS-NEXT:    v_mov_b32_e32 v7, s3
; NEW_RBS-NEXT:    v_mov_b32_e32 v6, s2
; NEW_RBS-NEXT:    s_add_i32 s2, s0, 1
; NEW_RBS-NEXT:    s_cmpk_lt_u32 s0, 0x64
; NEW_RBS-NEXT:    s_cselect_b32 s0, exec_lo, 0
; NEW_RBS-NEXT:    v_add_co_u32 v6, vcc_lo, v0, v6
; NEW_RBS-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v1, v7, vcc_lo
; NEW_RBS-NEXT:    s_andn2_b32 s3, s6, exec_lo
; NEW_RBS-NEXT:    s_and_b32 s0, exec_lo, s0
; NEW_RBS-NEXT:    s_or_b32 s6, s3, s0
; NEW_RBS-NEXT:    global_load_dword v8, v[6:7], off
; NEW_RBS-NEXT:    s_mov_b32 s0, s2
; NEW_RBS-NEXT:    s_waitcnt vmcnt(0)
; NEW_RBS-NEXT:    v_add_nc_u32_e32 v8, 1, v8
; NEW_RBS-NEXT:    global_store_dword v[6:7], v8, off
; NEW_RBS-NEXT:    s_branch .LBB16_1
; NEW_RBS-NEXT:  .LBB16_6: ; %exit
; NEW_RBS-NEXT:    s_endpgm
entry:
  br label %A

A:
  %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
  %a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
  %a.val = load i32, ptr addrspace(1) %a.plus.counter
  %a.cond = icmp eq i32 %a.val, 0
  br i1 %a.cond, label %exit, label %B

B:
  %b.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %b, i32 %counter
  %b.val = load i32, ptr addrspace(1) %b.plus.counter
  %b.cond = icmp eq i32 %b.val, 0
  br i1 %b.cond, label %exit, label %loop.body

loop.body:
  %x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
  %x.val = load i32, ptr addrspace(1) %x.plus.counter
  %x.val.plus.1 = add i32 %x.val, 1
  store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
  %counter.plus.1 = add i32 %counter, 1
  %x.cond = icmp ult i32 %counter, 100
  br i1 %x.cond, label %exit, label %A

exit:
  ret void
}

declare i16 @llvm.abs.i16(i16, i1)
declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32 immarg)