Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/frem.ll')
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/frem.ll | 1358
1 files changed, 1358 insertions, 0 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 35d178c..6f91222 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -17562,5 +17562,1363 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ret void
 }
+
+define amdgpu_kernel void @frem_v2f64_const_zero_num(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-LABEL: frem_v2f64_const_zero_num:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s2
+; SI-NEXT: s_mov_b32 s5, s3
+; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[0:1]
+; SI-NEXT: s_and_b64 s[2:3], vcc, exec
+; SI-NEXT: s_cselect_b32 s8, 0x7ff80000, 0
+; SI-NEXT: s_mov_b32 s2, s6
+; SI-NEXT: s_mov_b32 s3, s7
+; SI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[2:3]
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_cselect_b32 s4, 0x7ff80000, 0
+; SI-NEXT: v_mov_b32_e32 v0, 0
+; SI-NEXT: v_mov_b32_e32 v1, s8
+; SI-NEXT: v_mov_b32_e32 v3, s4
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; CI-LABEL: frem_v2f64_const_zero_num:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; CI-NEXT: s_mov_b32 s7, 0xf000
+; CI-NEXT: s_mov_b32 s6, -1
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_mov_b32 s4, s2
+; CI-NEXT: s_mov_b32 s5, s3
+; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[0:1]
+; CI-NEXT: v_mov_b32_e32 v0, 0
+; CI-NEXT: s_and_b64 s[2:3], vcc, exec
+; CI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[2:3]
+; CI-NEXT: s_cselect_b32 s8, 0x7ff80000, 0
+; CI-NEXT: s_mov_b32 s2, s6
+; CI-NEXT: s_mov_b32 s3, s7
+; CI-NEXT: v_mov_b32_e32 v1, s8
+; CI-NEXT: v_mov_b32_e32 v2, v0
+; CI-NEXT: s_and_b64 s[4:5], vcc, exec
+; CI-NEXT: s_cselect_b32 s4, 0x7ff80000, 0
+; CI-NEXT: v_mov_b32_e32 v3, s4
+; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: frem_v2f64_const_zero_num:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[2:3]
+; VI-NEXT: s_cselect_b32 s2, 0x7ff80000, 0
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v2, v0
+; VI-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-NEXT: s_cselect_b32 s0, 0x7ff80000, 0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: frem_v2f64_const_zero_num:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[3:4]
+; GFX9-NEXT: s_cselect_b32 s4, 0x7ff80000, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s2, 0x7ff80000, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: frem_v2f64_const_zero_num:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2]
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: s_and_b32 s2, vcc_lo, exec_lo
+; GFX10-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4]
+; GFX10-NEXT: s_cselect_b32 s2, 0x7ff80000, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-NEXT: s_and_b32 s3, vcc_lo, exec_lo
+; GFX10-NEXT: s_cselect_b32 s3, 0x7ff80000, 0
+; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: frem_v2f64_const_zero_num:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b128 v[1:4], v0, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2]
+; GFX11-NEXT: s_and_b32 s2, vcc_lo, exec_lo
+; GFX11-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4]
+; GFX11-NEXT: s_cselect_b32 s2, 0x7ff80000, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, v0
+; GFX11-NEXT: s_and_b32 s3, vcc_lo, exec_lo
+; GFX11-NEXT: s_cselect_b32 s3, 0x7ff80000, 0
+; GFX11-NEXT: v_mov_b32_e32 v3, s3
+; GFX11-NEXT: global_store_b128 v0, v[0:3], s[0:1]
+; GFX11-NEXT: s_endpgm
+;
+; GFX1150-LABEL: frem_v2f64_const_zero_num:
+; GFX1150: ; %bb.0:
+; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-NEXT: v_mov_b32_e32 v0, 0
+; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT: global_load_b128 v[1:4], v0, s[2:3]
+; GFX1150-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2]
+; GFX1150-NEXT: s_and_b32 s2, vcc_lo, exec_lo
+; GFX1150-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4]
+; GFX1150-NEXT: s_cselect_b32 s2, 0x7ff80000, 0
+; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, v0
+; GFX1150-NEXT: s_and_b32 s3, vcc_lo, exec_lo
+; GFX1150-NEXT: s_cselect_b32 s3, 0x7ff80000, 0
+; GFX1150-NEXT: v_mov_b32_e32 v3, s3
+; GFX1150-NEXT: global_store_b128 v0, v[0:3], s[0:1]
+; GFX1150-NEXT: s_endpgm
+;
+; GFX1200-LABEL: frem_v2f64_const_zero_num:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT: v_mov_b32_e32 v0, 0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: global_load_b128 v[1:4], v0, s[2:3]
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2]
+; GFX1200-NEXT: s_and_b32 s2, vcc_lo, exec_lo
+; GFX1200-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4]
+; GFX1200-NEXT: s_cselect_b32 s2, 0x7ff80000, 0
+; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, v0
+; GFX1200-NEXT: s_and_b32 s3, vcc_lo, exec_lo
+; GFX1200-NEXT: s_cselect_b32 s3, 0x7ff80000, 0
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: v_mov_b32_e32 v3, s3
+; GFX1200-NEXT: global_store_b128 v0, v[0:3], s[0:1]
+; GFX1200-NEXT: s_endpgm
+ %r0 = load <2 x double>, ptr addrspace(1) %in, align 16
+ %r1 = frem <2 x double> <double 0.0, double 0.0>, %r0
+ store <2 x double> %r1, ptr addrspace(1) %out, align 16
+ ret void
+}
+
+define amdgpu_kernel void @frem_v2f64_const_one_denum(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-LABEL: frem_v2f64_const_one_denum:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s2
+; SI-NEXT: s_mov_b32 s5, s3
+; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, 1.0
+; SI-NEXT: s_and_b64 vcc, exec, s[2:3]
+; SI-NEXT: s_cbranch_vccz .LBB15_2
+; SI-NEXT: ; %bb.1: ; %frem.else16
+; SI-NEXT: v_and_b32_e32 v4, 0x80000000, v1
+; SI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, 1.0
+; SI-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc
+; SI-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc
+; SI-NEXT: s_mov_b64 vcc, exec
+; SI-NEXT: s_cbranch_execz .LBB15_3
+; SI-NEXT: s_branch .LBB15_8
+; SI-NEXT: .LBB15_2:
+; SI-NEXT: ; implicit-def: $vgpr4_vgpr5
+; SI-NEXT: s_mov_b64 vcc, 0
+; SI-NEXT: .LBB15_3: ; %frem.compute15
+; SI-NEXT: s_brev_b32 s4, -2
+; SI-NEXT: v_and_b32_e32 v6, 0x7fffffff, v1
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_mov_b32 s3, 0x7ff00000
+; SI-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[2:3]
+; SI-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; SI-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
+; SI-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
+; SI-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1]
+; SI-NEXT: s_and_b64 s[2:3], vcc, exec
+; SI-NEXT: v_readfirstlane_b32 s2, v6
+; SI-NEXT: s_cselect_b32 s3, s2, 0
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_add_i32 s5, s3, -1
+; SI-NEXT: v_ldexp_f64 v[5:6], v[4:5], 26
+; SI-NEXT: s_cmp_lt_i32 s5, 27
+; SI-NEXT: s_cbranch_scc1 .LBB15_7
+; SI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
+; SI-NEXT: s_add_i32 s5, s3, 25
+; SI-NEXT: v_mov_b32_e32 v9, 0x43300000
+; SI-NEXT: v_mov_b32_e32 v4, 0
+; SI-NEXT: s_mov_b32 s3, 0x432fffff
+; SI-NEXT: .LBB15_5: ; %frem.loop_body23
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: v_mov_b32_e32 v8, v6
+; SI-NEXT: v_mov_b32_e32 v7, v5
+; SI-NEXT: v_bfi_b32 v5, s4, v9, v8
+; SI-NEXT: v_add_f64 v[10:11], v[7:8], v[4:5]
+; SI-NEXT: v_add_f64 v[5:6], v[10:11], -v[4:5]
+; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[7:8]|, s[2:3]
+; SI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; SI-NEXT: v_add_f64 v[5:6], v[7:8], -v[5:6]
+; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[5:6]
+; SI-NEXT: v_add_f64 v[10:11], v[5:6], 1.0
+; SI-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc
+; SI-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc
+; SI-NEXT: v_ldexp_f64 v[5:6], v[5:6], 26
+; SI-NEXT: s_sub_i32 s5, s5, 26
+; SI-NEXT: s_cmp_gt_i32 s5, 26
+; SI-NEXT: s_cbranch_scc1 .LBB15_5
+; SI-NEXT: ; %bb.6: ; %Flow50
+; SI-NEXT: v_mov_b32_e32 v5, v7
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: .LBB15_7: ; %frem.loop_exit24
+; SI-NEXT: s_sub_i32 s2, s5, 25
+; SI-NEXT: v_ldexp_f64 v[4:5], v[5:6], s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s3, 0x432fffff
+; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[4:5]|, s[2:3]
+; SI-NEXT: s_brev_b32 s2, -2
+; SI-NEXT: v_mov_b32_e32 v6, 0x43300000
+; SI-NEXT: v_bfi_b32 v7, s2, v6, v5
+; SI-NEXT: v_mov_b32_e32 v6, 0
+; SI-NEXT: v_add_f64 v[8:9], v[4:5], v[6:7]
+; SI-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7]
+; SI-NEXT: v_cndmask_b32_e32 v7, v7, v5, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v6, v4, vcc
+; SI-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7]
+; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; SI-NEXT: v_add_f64 v[6:7], v[4:5], 1.0
+; SI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; SI-NEXT: v_bfi_b32 v5, s2, v5, v1
+; SI-NEXT: .LBB15_8:
+; SI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, 1.0
+; SI-NEXT: s_and_b64 vcc, exec, s[2:3]
+; SI-NEXT: s_cbranch_vccz .LBB15_10
+; SI-NEXT: ; %bb.9: ; %frem.else
+; SI-NEXT: v_and_b32_e32 v6, 0x80000000, v3
+; SI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, 1.0
+; SI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc
+; SI-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
+; SI-NEXT: s_mov_b64 vcc, exec
+; SI-NEXT: s_cbranch_execz .LBB15_11
+; SI-NEXT: s_branch .LBB15_16
+; SI-NEXT: .LBB15_10:
+; SI-NEXT: ; implicit-def: $vgpr6_vgpr7
+; SI-NEXT: s_mov_b64 vcc, 0
+; SI-NEXT: .LBB15_11: ; %frem.compute
+; SI-NEXT: s_brev_b32 s4, -2
+; SI-NEXT: v_and_b32_e32 v8, 0x7fffffff, v3
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_mov_b32 s3, 0x7ff00000
+; SI-NEXT: v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[2:3]
+; SI-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]|
+; SI-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v2, v6, vcc
+; SI-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3]
+; SI-NEXT: s_and_b64 s[2:3], vcc, exec
+; SI-NEXT: v_readfirstlane_b32 s2, v8
+; SI-NEXT: s_cselect_b32 s3, s2, 0
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_add_i32 s5, s3, -1
+; SI-NEXT: v_ldexp_f64 v[7:8], v[6:7], 26
+; SI-NEXT: s_cmp_lt_i32 s5, 27
+; SI-NEXT: s_cbranch_scc1 .LBB15_15
+; SI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
+; SI-NEXT: s_add_i32 s5, s3, 25
+; SI-NEXT: v_mov_b32_e32 v11, 0x43300000
+; SI-NEXT: v_mov_b32_e32 v6, 0
+; SI-NEXT: s_mov_b32 s3, 0x432fffff
+; SI-NEXT: .LBB15_13: ; %frem.loop_body
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: v_mov_b32_e32 v10, v8
+; SI-NEXT: v_mov_b32_e32 v9, v7
+; SI-NEXT: v_bfi_b32 v7, s4, v11, v10
+; SI-NEXT: v_add_f64 v[12:13], v[9:10], v[6:7]
+; SI-NEXT: v_add_f64 v[7:8], v[12:13], -v[6:7]
+; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[9:10]|, s[2:3]
+; SI-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc
+; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; SI-NEXT: v_add_f64 v[7:8], v[9:10], -v[7:8]
+; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[7:8]
+; SI-NEXT: v_add_f64 v[12:13], v[7:8], 1.0
+; SI-NEXT: v_cndmask_b32_e32 v8, v8, v13, vcc
+; SI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc
+; SI-NEXT: v_ldexp_f64 v[7:8], v[7:8], 26
+; SI-NEXT: s_sub_i32 s5, s5, 26
+; SI-NEXT: s_cmp_gt_i32 s5, 26
+; SI-NEXT: s_cbranch_scc1 .LBB15_13
+; SI-NEXT: ; %bb.14: ; %Flow
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: v_mov_b32_e32 v8, v10
+; SI-NEXT: .LBB15_15: ; %frem.loop_exit
+; SI-NEXT: s_sub_i32 s2, s5, 25
+; SI-NEXT: v_ldexp_f64 v[6:7], v[7:8], s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s3, 0x432fffff
+; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[6:7]|, s[2:3]
+; SI-NEXT: s_brev_b32 s2, -2
+; SI-NEXT: v_mov_b32_e32 v8, 0x43300000
+; SI-NEXT: v_bfi_b32 v9, s2, v8, v7
+; SI-NEXT: v_mov_b32_e32 v8, 0
+; SI-NEXT: v_add_f64 v[10:11], v[6:7], v[8:9]
+; SI-NEXT: v_add_f64 v[8:9], v[10:11], -v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v9, v9, v7, vcc
+; SI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc
+; SI-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9]
+; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; SI-NEXT: v_add_f64 v[8:9], v[6:7], 1.0
+; SI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; SI-NEXT: v_bfi_b32 v7, s2, v7, v3
+; SI-NEXT: .LBB15_16: ; %Flow49
+; SI-NEXT: s_mov_b32 s4, 0
+; SI-NEXT: s_mov_b32 s5, 0x7ff00000
+; SI-NEXT: v_cmp_nge_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SI-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; SI-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_cmp_nge_f64_e64 vcc, |v[2:3]|, s[4:5]
+; SI-NEXT: v_cndmask_b32_e32 v3, v8, v7, vcc
+; SI-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; CI-LABEL: frem_v2f64_const_one_denum:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; CI-NEXT: s_mov_b32 s7, 0xf000
+; CI-NEXT: s_mov_b32 s6, -1
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_mov_b32 s4, s2
+; CI-NEXT: s_mov_b32 s5, s3
+; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, 1.0
+; CI-NEXT: s_and_b64 vcc, exec, s[2:3]
+; CI-NEXT: s_cbranch_vccz .LBB15_2
+; CI-NEXT: ; %bb.1: ; %frem.else16
+; CI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, 1.0
+; CI-NEXT: v_and_b32_e32 v4, 0x80000000, v1
+; CI-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc
+; CI-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc
+; CI-NEXT: s_cbranch_execz .LBB15_3
+; CI-NEXT: s_branch .LBB15_8
+; CI-NEXT: .LBB15_2:
+; CI-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CI-NEXT: .LBB15_3: ; %frem.compute15
+; CI-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; CI-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1]
+; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; CI-NEXT: v_add_i32_e32 v8, vcc, -1, v6
+; CI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v8
+; CI-NEXT: s_cbranch_vccnz .LBB15_7
+; CI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
+; CI-NEXT: v_add_i32_e32 v8, vcc, 25, v6
+; CI-NEXT: .LBB15_5: ; %frem.loop_body23
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v7, v5
+; CI-NEXT: v_mov_b32_e32 v6, v4
+; CI-NEXT: v_rndne_f64_e32 v[4:5], v[6:7]
+; CI-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5]
+; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; CI-NEXT: v_add_f64 v[9:10], v[4:5], 1.0
+; CI-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc
+; CI-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
+; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; CI-NEXT: v_subrev_i32_e32 v8, vcc, 26, v8
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v8
+; CI-NEXT: s_cbranch_vccnz .LBB15_5
+; CI-NEXT: ; %bb.6: ; %Flow50
+; CI-NEXT: v_mov_b32_e32 v4, v6
+; CI-NEXT: v_mov_b32_e32 v5, v7
+; CI-NEXT: .LBB15_7: ; %frem.loop_exit24
+; CI-NEXT: v_subrev_i32_e32 v6, vcc, 25, v8
+; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; CI-NEXT: s_brev_b32 s2, -2
+; CI-NEXT: v_rndne_f64_e32 v[6:7], v[4:5]
+; CI-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7]
+; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; CI-NEXT: v_add_f64 v[6:7], v[4:5], 1.0
+; CI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-NEXT: v_bfi_b32 v5, s2, v5, v1
+; CI-NEXT: .LBB15_8:
+; CI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, 1.0
+; CI-NEXT: s_and_b64 vcc, exec, s[2:3]
+; CI-NEXT: s_cbranch_vccz .LBB15_10
+; CI-NEXT: ; %bb.9: ; %frem.else
+; CI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, 1.0
+; CI-NEXT: v_and_b32_e32 v6, 0x80000000, v3
+; CI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc
+; CI-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
+; CI-NEXT: s_cbranch_execz .LBB15_11
+; CI-NEXT: s_branch .LBB15_16
+; CI-NEXT: .LBB15_10:
+; CI-NEXT: ; implicit-def: $vgpr6_vgpr7
+; CI-NEXT: .LBB15_11: ; %frem.compute
+; CI-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]|
+; CI-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3]
+; CI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; CI-NEXT: v_add_i32_e32 v10, vcc, -1, v8
+; CI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v10
+; CI-NEXT: s_cbranch_vccnz .LBB15_15
+; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
+; CI-NEXT: v_add_i32_e32 v10, vcc, 25, v8
+; CI-NEXT: .LBB15_13: ; %frem.loop_body
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v9, v7
+; CI-NEXT: v_mov_b32_e32 v8, v6
+; CI-NEXT: v_rndne_f64_e32 v[6:7], v[8:9]
+; CI-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7]
+; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; CI-NEXT: v_add_f64 v[11:12], v[6:7], 1.0
+; CI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc
+; CI-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc
+; CI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; CI-NEXT: v_subrev_i32_e32 v10, vcc, 26, v10
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v10
+; CI-NEXT: s_cbranch_vccnz .LBB15_13
+; CI-NEXT: ; %bb.14: ; %Flow
+; CI-NEXT: v_mov_b32_e32 v6, v8
+; CI-NEXT: v_mov_b32_e32 v7, v9
+; CI-NEXT: .LBB15_15: ; %frem.loop_exit
+; CI-NEXT: v_subrev_i32_e32 v8, vcc, 25, v10
+; CI-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; CI-NEXT: s_brev_b32 s2, -2
+; CI-NEXT: v_rndne_f64_e32 v[8:9], v[6:7]
+; CI-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9]
+; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; CI-NEXT: v_add_f64 v[8:9], v[6:7], 1.0
+; CI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; CI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; CI-NEXT: v_bfi_b32 v7, s2, v7, v3
+; CI-NEXT: .LBB15_16: ; %Flow49
+; CI-NEXT: s_mov_b32 s4, 0
+; CI-NEXT: s_mov_b32 s5, 0x7ff00000
+; CI-NEXT: v_cmp_nge_f64_e64 vcc, |v[0:1]|, s[4:5]
+; CI-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; CI-NEXT: s_mov_b32 s3, 0xf000
+; CI-NEXT: s_mov_b32 s2, -1
+; CI-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc
+; CI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
+; CI-NEXT: v_cmp_nge_f64_e64 vcc, |v[2:3]|, s[4:5]
+; CI-NEXT: v_cndmask_b32_e32 v3, v8, v7, vcc
+; CI-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc
+; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: frem_v2f64_const_one_denum:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, 1.0
+; VI-NEXT: s_and_b64 vcc, exec, s[2:3]
+; VI-NEXT: s_cbranch_vccz .LBB15_2
+; VI-NEXT: ; %bb.1: ; %frem.else16
+; VI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, 1.0
+; VI-NEXT: v_and_b32_e32 v4, 0x80000000, v1
+; VI-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc
+; VI-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc
+; VI-NEXT: s_cbranch_execz .LBB15_3
+; VI-NEXT: s_branch .LBB15_8
+; VI-NEXT: .LBB15_2:
+; VI-NEXT: ; implicit-def: $vgpr4_vgpr5
+; VI-NEXT: .LBB15_3: ; %frem.compute15
+; VI-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; VI-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1]
+; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; VI-NEXT: v_add_u32_e32 v8, vcc, -1, v6
+; VI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v8
+; VI-NEXT: s_cbranch_vccnz .LBB15_7
+; VI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
+; VI-NEXT: v_add_u32_e32 v8, vcc, 25, v6
+; VI-NEXT: .LBB15_5: ; %frem.loop_body23
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_rndne_f64_e32 v[4:5], v[6:7]
+; VI-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5]
+; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; VI-NEXT: v_add_f64 v[9:10], v[4:5], 1.0
+; VI-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
+; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; VI-NEXT: v_subrev_u32_e32 v8, vcc, 26, v8
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v8
+; VI-NEXT: s_cbranch_vccnz .LBB15_5
+; VI-NEXT: ; %bb.6: ; %Flow50
+; VI-NEXT: v_mov_b32_e32 v4, v6
+; VI-NEXT: v_mov_b32_e32 v5, v7
+; VI-NEXT: .LBB15_7: ; %frem.loop_exit24
+; VI-NEXT: v_subrev_u32_e32 v6, vcc, 25, v8
+; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; VI-NEXT: s_brev_b32 s2, -2
+; VI-NEXT: v_rndne_f64_e32 v[6:7], v[4:5]
+; VI-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7]
+; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; VI-NEXT: v_add_f64 v[6:7], v[4:5], 1.0
+; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; VI-NEXT: v_bfi_b32 v5, s2, v5, v1
+; VI-NEXT: .LBB15_8:
+; VI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, 1.0
+; VI-NEXT: s_and_b64 vcc, exec, s[2:3]
+; VI-NEXT: s_cbranch_vccz .LBB15_10
+; VI-NEXT: ; %bb.9: ; %frem.else
+; VI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, 1.0
+; VI-NEXT: v_and_b32_e32 v6, 0x80000000, v3
+; VI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc
+; VI-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
+; VI-NEXT: s_cbranch_execz .LBB15_11
+; VI-NEXT: s_branch .LBB15_16
+; VI-NEXT: .LBB15_10:
+; VI-NEXT: ; implicit-def: $vgpr6_vgpr7
+; VI-NEXT: .LBB15_11: ; %frem.compute
+; VI-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]|
+; VI-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3]
+; VI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; VI-NEXT: v_add_u32_e32 v10, vcc, -1, v8
+; VI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v10
+; VI-NEXT: s_cbranch_vccnz .LBB15_15
+; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
+; VI-NEXT: v_add_u32_e32 v10, vcc, 25, v8
+; VI-NEXT: .LBB15_13: ; %frem.loop_body
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v9, v7
+; VI-NEXT: v_mov_b32_e32 v8, v6
+; VI-NEXT: v_rndne_f64_e32 v[6:7], v[8:9]
+; VI-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7]
+; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; VI-NEXT: v_add_f64 v[11:12], v[6:7], 1.0
+; VI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc
+; VI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; VI-NEXT: v_subrev_u32_e32 v10, vcc, 26, v10
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v10
+; VI-NEXT: s_cbranch_vccnz .LBB15_13
+; VI-NEXT: ; %bb.14: ; %Flow
+; VI-NEXT: v_mov_b32_e32 v6, v8
+; VI-NEXT: v_mov_b32_e32 v7, v9
+; VI-NEXT: .LBB15_15: ; %frem.loop_exit
+; VI-NEXT: v_subrev_u32_e32 v8, vcc, 25, v10
+; VI-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-NEXT: s_brev_b32 s2, -2
+; VI-NEXT: v_rndne_f64_e32 v[8:9], v[6:7]
+; VI-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9]
+; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; VI-NEXT: v_add_f64 v[8:9], v[6:7], 1.0
+; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; VI-NEXT: v_bfi_b32 v7, s2, v7, v3
+; VI-NEXT: .LBB15_16: ; %Flow49
+; VI-NEXT: s_mov_b32 s2, 0
+; VI-NEXT: s_mov_b32 s3, 0x7ff00000
+; VI-NEXT: v_cmp_nge_f64_e64 vcc, |v[0:1]|, s[2:3]
+; VI-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; VI-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
+; VI-NEXT: v_cmp_nge_f64_e64 vcc, |v[2:3]|, s[2:3]
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_cndmask_b32_e32 v3, v8, v7, vcc
+; VI-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: frem_v2f64_const_one_denum:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, 1.0
+; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_vccz .LBB15_2
+; GFX9-NEXT: ; %bb.1: ; %frem.else16
+; GFX9-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, 1.0
+; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc
+; GFX9-NEXT: s_cbranch_execz .LBB15_3
+; GFX9-NEXT: s_branch .LBB15_8
+; GFX9-NEXT: .LBB15_2:
+; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX9-NEXT: .LBB15_3: ; %frem.compute15
+; GFX9-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1]
+; GFX9-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; GFX9-NEXT: v_add_u32_e32 v8, -1, v6
+; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 27, v8
+; GFX9-NEXT: s_cbranch_vccnz .LBB15_7
+; GFX9-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
+; GFX9-NEXT: v_add_u32_e32 v8, 25, v6
+; GFX9-NEXT: .LBB15_5: ; %frem.loop_body23
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_rndne_f64_e32 v[4:5], v[6:7]
+; GFX9-NEXT: v_subrev_u32_e32 v8, 26, v8
+; GFX9-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5]
+; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; GFX9-NEXT: v_add_f64 v[9:10], v[4:5], 1.0
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
+; GFX9-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 26, v8
+; GFX9-NEXT: s_cbranch_vccnz .LBB15_5
+; GFX9-NEXT: ; %bb.6: ; %Flow50
+; GFX9-NEXT: v_mov_b32_e32 v4, v6
+; GFX9-NEXT: v_mov_b32_e32 v5, v7
+; GFX9-NEXT: .LBB15_7: ; %frem.loop_exit24
+; GFX9-NEXT: v_subrev_u32_e32 v6, 25, v8
+; GFX9-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; GFX9-NEXT: s_brev_b32 s2, -2
+; GFX9-NEXT: v_rndne_f64_e32 v[6:7], v[4:5]
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7]
+; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; GFX9-NEXT: v_add_f64 v[6:7], v[4:5], 1.0
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX9-NEXT: v_bfi_b32 v5, s2, v5, v1
+; GFX9-NEXT: .LBB15_8:
+; GFX9-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, 1.0
+; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_vccz .LBB15_10
+; GFX9-NEXT: ; %bb.9: ; %frem.else
+; GFX9-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, 1.0
+; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
+; GFX9-NEXT: s_cbranch_execz .LBB15_11
+; GFX9-NEXT: s_branch .LBB15_16
+; GFX9-NEXT: .LBB15_10:
+; GFX9-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX9-NEXT: .LBB15_11: ; %frem.compute
+; GFX9-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]|
+; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3]
+; GFX9-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; GFX9-NEXT: v_add_u32_e32 v10, -1, v8
+; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 27, v10
+; GFX9-NEXT: s_cbranch_vccnz .LBB15_15
+; GFX9-NEXT: ; %bb.12: ; %frem.loop_body.preheader
+; GFX9-NEXT: v_add_u32_e32 v10, 25, v8
+; GFX9-NEXT: .LBB15_13: ; %frem.loop_body
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: v_mov_b32_e32 v9, v7
+; GFX9-NEXT: v_mov_b32_e32 v8, v6
+; GFX9-NEXT: v_rndne_f64_e32 v[6:7], v[8:9]
+; GFX9-NEXT: v_subrev_u32_e32 v10, 26, v10
+; GFX9-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7]
+; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; GFX9-NEXT: v_add_f64 v[11:12], v[6:7], 1.0
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc
+; GFX9-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 26, v10
+; GFX9-NEXT: s_cbranch_vccnz .LBB15_13
+; GFX9-NEXT: ; %bb.14: ; %Flow
+; GFX9-NEXT: v_mov_b32_e32 v6, v8
+; GFX9-NEXT: v_mov_b32_e32 v7, v9
+; GFX9-NEXT: .LBB15_15: ; %frem.loop_exit
+; GFX9-NEXT: v_subrev_u32_e32 v8, 25, v10
+; GFX9-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; GFX9-NEXT: s_brev_b32 s2, -2
+; GFX9-NEXT: v_rndne_f64_e32 v[8:9], v[6:7]
+; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9]
+; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; GFX9-NEXT: v_add_f64 v[8:9], v[6:7], 1.0
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; GFX9-NEXT: v_bfi_b32 v7, s2, v7, v3
+; GFX9-NEXT: .LBB15_16: ; %Flow49
+; GFX9-NEXT: s_mov_b32 s2, 0
+; GFX9-NEXT: s_mov_b32 s3, 0x7ff00000
+; GFX9-NEXT: v_cmp_nge_f64_e64 vcc, |v[0:1]|, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
+; GFX9-NEXT: v_cmp_nge_f64_e64 vcc, |v[2:3]|, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v7, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: frem_v2f64_const_one_denum:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, 1.0
+; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
+; GFX10-NEXT: s_cbranch_vccz .LBB15_2
+; GFX10-NEXT: ; %bb.1: ; %frem.else16
+; GFX10-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, 1.0
+; GFX10-NEXT: v_and_b32_e32 v4, 0x80000000, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc_lo
+; GFX10-NEXT: s_cbranch_execz .LBB15_3
+; GFX10-NEXT: s_branch .LBB15_8
+; GFX10-NEXT: .LBB15_2:
+; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX10-NEXT: .LBB15_3: ; %frem.compute15
+; GFX10-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1]
+; GFX10-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; GFX10-NEXT: v_add_nc_u32_e32 v8, -1, v6
+; GFX10-NEXT: v_readfirstlane_b32 s2, v6
+; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v8
+; GFX10-NEXT: s_cbranch_vccnz .LBB15_7
+; GFX10-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
+; GFX10-NEXT: s_add_i32 s2, s2, 25
+; GFX10-NEXT: .LBB15_5: ; %frem.loop_body23
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_mov_b32_e32 v7, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: s_sub_i32 s2, s2, 26
+; GFX10-NEXT: s_cmp_gt_i32 s2, 26
+; GFX10-NEXT: v_rndne_f64_e32 v[4:5], v[6:7]
+; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5]
+; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5]
+; GFX10-NEXT: v_add_f64 v[8:9], v[4:5], 1.0
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; GFX10-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; GFX10-NEXT: s_cbranch_scc1 .LBB15_5
+; GFX10-NEXT: ; %bb.6: ; %Flow50
+; GFX10-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-NEXT: v_mov_b32_e32 v8, s2
+; GFX10-NEXT: v_mov_b32_e32 v5, v7
+; GFX10-NEXT: .LBB15_7: ; %frem.loop_exit24
+; GFX10-NEXT: v_subrev_nc_u32_e32 v6, 25, v8
+; GFX10-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; GFX10-NEXT: v_rndne_f64_e32 v[6:7], v[4:5]
+; GFX10-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7]
+; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5]
+; GFX10-NEXT: v_add_f64 v[6:7], v[4:5], 1.0
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1
+; GFX10-NEXT: .LBB15_8:
+; GFX10-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, 1.0
+; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
+; GFX10-NEXT: s_cbranch_vccz .LBB15_10
+; GFX10-NEXT: ; %bb.9: ; %frem.else
+; GFX10-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, 1.0
+; GFX10-NEXT: v_and_b32_e32 v6, 0x80000000, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc_lo
+; GFX10-NEXT: s_cbranch_execz .LBB15_11
+; GFX10-NEXT: s_branch .LBB15_16
+; GFX10-NEXT: .LBB15_10:
+; GFX10-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX10-NEXT: .LBB15_11: ; %frem.compute
+; GFX10-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]|
+; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3]
+; GFX10-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; GFX10-NEXT: v_add_nc_u32_e32 v10, -1, v8
+; GFX10-NEXT: v_readfirstlane_b32 s2, v8
+; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v10
+; GFX10-NEXT: s_cbranch_vccnz .LBB15_15
+; GFX10-NEXT: ; %bb.12: ; %frem.loop_body.preheader
+; GFX10-NEXT: s_add_i32 s2, s2, 25
+; GFX10-NEXT: .LBB15_13: ; %frem.loop_body
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_mov_b32_e32 v9, v7
+; GFX10-NEXT: v_mov_b32_e32 v8, v6
+; GFX10-NEXT: s_sub_i32 s2, s2, 26
+; GFX10-NEXT: s_cmp_gt_i32 s2, 26
+; GFX10-NEXT: v_rndne_f64_e32 v[6:7], v[8:9]
+; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7]
+; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7]
+; GFX10-NEXT: v_add_f64 v[10:11], v[6:7], 1.0
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
+; GFX10-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; GFX10-NEXT: s_cbranch_scc1 .LBB15_13
+; GFX10-NEXT: ; %bb.14: ; %Flow
+; GFX10-NEXT: v_mov_b32_e32 v6, v8
+; GFX10-NEXT: v_mov_b32_e32 v10, s2
+; GFX10-NEXT: v_mov_b32_e32 v7, v9
+; GFX10-NEXT: .LBB15_15: ; %frem.loop_exit
+; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 25, v10
+; GFX10-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; GFX10-NEXT: v_rndne_f64_e32 v[8:9], v[6:7]
+; GFX10-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9]
+; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7]
+; GFX10-NEXT: v_add_f64 v[8:9], v[6:7], 1.0
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo
+; GFX10-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v3
+; GFX10-NEXT: .LBB15_16: ; %Flow49
+; GFX10-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]|
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v5, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[2:3]|
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7ff80000, v7, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc_lo
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: frem_v2f64_const_one_denum:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, 1.0
+; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2
+; GFX11-NEXT: s_cbranch_vccz .LBB15_2
+; GFX11-NEXT: ; %bb.1: ; %frem.else16
+; GFX11-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, 1.0
+; GFX11-NEXT: v_and_b32_e32 v4, 0x80000000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc_lo
+; GFX11-NEXT: s_cbranch_execz .LBB15_3
+; GFX11-NEXT: s_branch .LBB15_8
+; GFX11-NEXT: .LBB15_2:
+; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX11-NEXT: .LBB15_3: ; %frem.compute15
+; GFX11-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; GFX11-NEXT: v_add_nc_u32_e32 v8, -1, v6
+; GFX11-NEXT: v_readfirstlane_b32 s2, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v8
+; GFX11-NEXT: s_cbranch_vccnz .LBB15_7
+; GFX11-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
+; GFX11-NEXT: s_add_i32 s2, s2, 25
+; GFX11-NEXT: .LBB15_5: ; %frem.loop_body23
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT: s_sub_i32 s2, s2, 26
+; GFX11-NEXT: s_cmp_gt_i32 s2, 26
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_rndne_f64_e32 v[4:5], v[6:7]
+; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5]
+; GFX11-NEXT: v_add_f64 v[8:9], v[4:5], 1.0
+; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v9 :: v_dual_cndmask_b32 v4, v4, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; GFX11-NEXT: s_cbranch_scc1 .LBB15_5
+; GFX11-NEXT: ; %bb.6: ; %Flow50
+; GFX11-NEXT: v_mov_b32_e32 v4, v6
+; GFX11-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v5, v7
+; GFX11-NEXT: .LBB15_7: ; %frem.loop_exit24
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_subrev_nc_u32_e32 v6, 25, v8
+; GFX11-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_rndne_f64_e32 v[6:7], v[4:5]
+; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5]
+; GFX11-NEXT: v_add_f64 v[6:7], v[4:5], 1.0
+; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v5, v5, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1
+; GFX11-NEXT: .LBB15_8:
+; GFX11-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, 1.0
+; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2
+; GFX11-NEXT: s_cbranch_vccz .LBB15_10
+; GFX11-NEXT: ; %bb.9: ; %frem.else
+; GFX11-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, 1.0
+; GFX11-NEXT: v_and_b32_e32 v6, 0x80000000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc_lo
+; GFX11-NEXT: s_cbranch_execz .LBB15_11
+; GFX11-NEXT: s_branch .LBB15_16
+; GFX11-NEXT: .LBB15_10:
+; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX11-NEXT: .LBB15_11: ; %frem.compute
+; GFX11-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]|
+; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; GFX11-NEXT: v_add_nc_u32_e32 v10, -1, v8
+; GFX11-NEXT: v_readfirstlane_b32 s2, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v10
+; GFX11-NEXT: s_cbranch_vccnz .LBB15_15
+; GFX11-NEXT: ; %bb.12: ; %frem.loop_body.preheader
+; GFX11-NEXT: s_add_i32 s2, s2, 25
+; GFX11-NEXT: .LBB15_13: ; %frem.loop_body
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
+; GFX11-NEXT: s_sub_i32 s2, s2, 26
+; GFX11-NEXT: s_cmp_gt_i32 s2, 26
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_rndne_f64_e32 v[6:7], v[8:9]
+; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7]
+; GFX11-NEXT: v_add_f64 v[10:11], v[6:7], 1.0
+; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v11 :: v_dual_cndmask_b32 v6, v6, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; GFX11-NEXT: s_cbranch_scc1 .LBB15_13
+; GFX11-NEXT: ; %bb.14: ; %Flow
+; GFX11-NEXT: v_mov_b32_e32 v6, v8
+; GFX11-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v7, v9
+; GFX11-NEXT: .LBB15_15: ; %frem.loop_exit
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_subrev_nc_u32_e32 v8, 25, v10
+; GFX11-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_rndne_f64_e32 v[8:9], v[6:7]
+; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7]
+; GFX11-NEXT: v_add_f64 v[8:9], v[6:7], 1.0
+; GFX11-NEXT: v_dual_cndmask_b32 v6, v6, v8 :: v_dual_cndmask_b32 v7, v7, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v3
+; GFX11-NEXT: .LBB15_16: ; %Flow49
+; GFX11-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]|
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v5, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo
+; GFX11-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[2:3]|
+; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v3, 0x7ff80000, v7
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc_lo
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: s_endpgm
+;
+; GFX1150-LABEL: frem_v2f64_const_one_denum:
+; GFX1150: ; %bb.0:
+; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-NEXT: v_mov_b32_e32 v0, 0
+; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT: global_load_b128 v[0:3], v0, s[2:3]
+; GFX1150-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, 1.0
+; GFX1150-NEXT: s_and_b32 vcc_lo, exec_lo, s2
+; GFX1150-NEXT: s_cbranch_vccz .LBB15_2
+; GFX1150-NEXT: ; %bb.1: ; %frem.else16
+; GFX1150-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, 1.0
+; GFX1150-NEXT: v_and_b32_e32 v4, 0x80000000, v1
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc_lo
+; GFX1150-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc_lo
+; GFX1150-NEXT: s_cbranch_execz .LBB15_3
+; GFX1150-NEXT: s_branch .LBB15_8
+; GFX1150-NEXT: .LBB15_2:
+; GFX1150-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1150-NEXT: .LBB15_3: ; %frem.compute15
+; GFX1150-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1]
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; GFX1150-NEXT: v_add_nc_u32_e32 v8, -1, v6
+; GFX1150-NEXT: v_readfirstlane_b32 s2, v6
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v8
+; GFX1150-NEXT: s_cbranch_vccnz .LBB15_7
+; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
+; GFX1150-NEXT: s_add_i32 s2, s2, 25
+; GFX1150-NEXT: .LBB15_5: ; %frem.loop_body23
+; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX1150-NEXT: s_sub_i32 s2, s2, 26
+; GFX1150-NEXT: s_cmp_gt_i32 s2, 26
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_rndne_f64_e32 v[4:5], v[6:7]
+; GFX1150-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5]
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5]
+; GFX1150-NEXT: v_add_f64 v[8:9], v[4:5], 1.0
+; GFX1150-NEXT: v_dual_cndmask_b32 v5, v5, v9 :: v_dual_cndmask_b32 v4, v4, v8
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; GFX1150-NEXT: s_cbranch_scc1 .LBB15_5
+; GFX1150-NEXT: ; %bb.6: ; %Flow50
+; GFX1150-NEXT: v_mov_b32_e32 v4, v6
+; GFX1150-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v5, v7
+; GFX1150-NEXT: .LBB15_7: ; %frem.loop_exit24
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_subrev_nc_u32_e32 v6, 25, v8
+; GFX1150-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_rndne_f64_e32 v[6:7], v[4:5]
+; GFX1150-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7]
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5]
+; GFX1150-NEXT: v_add_f64 v[6:7], v[4:5], 1.0
+; GFX1150-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v5, v5, v7
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1
+; GFX1150-NEXT: .LBB15_8:
+; GFX1150-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, 1.0
+; GFX1150-NEXT: s_and_b32 vcc_lo, exec_lo, s2
+; GFX1150-NEXT: s_cbranch_vccz .LBB15_10
+; GFX1150-NEXT: ; %bb.9: ; %frem.else
+; GFX1150-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, 1.0
+; GFX1150-NEXT: v_and_b32_e32 v6, 0x80000000, v3
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc_lo
+; GFX1150-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc_lo
+; GFX1150-NEXT: s_cbranch_execz .LBB15_11
+; GFX1150-NEXT: s_branch .LBB15_16
+; GFX1150-NEXT: .LBB15_10:
+; GFX1150-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1150-NEXT: .LBB15_11: ; %frem.compute
+; GFX1150-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]|
+; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3]
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; GFX1150-NEXT: v_add_nc_u32_e32 v10, -1, v8
+; GFX1150-NEXT: v_readfirstlane_b32 s2, v8
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v10
+; GFX1150-NEXT: s_cbranch_vccnz .LBB15_15
+; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body.preheader
+; GFX1150-NEXT: s_add_i32 s2, s2, 25
+; GFX1150-NEXT: .LBB15_13: ; %frem.loop_body
+; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
+; GFX1150-NEXT: s_sub_i32 s2, s2, 26
+; GFX1150-NEXT: s_cmp_gt_i32 s2, 26
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_rndne_f64_e32 v[6:7], v[8:9]
+; GFX1150-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7]
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7]
+; GFX1150-NEXT: v_add_f64 v[10:11], v[6:7], 1.0
+; GFX1150-NEXT: v_dual_cndmask_b32 v7, v7, v11 :: v_dual_cndmask_b32 v6, v6, v10
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; GFX1150-NEXT: s_cbranch_scc1 .LBB15_13
+; GFX1150-NEXT: ; %bb.14: ; %Flow
+; GFX1150-NEXT: v_mov_b32_e32 v6, v8
+; GFX1150-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v7, v9
+; GFX1150-NEXT: .LBB15_15: ; %frem.loop_exit
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_subrev_nc_u32_e32 v8, 25, v10
+; GFX1150-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_rndne_f64_e32 v[8:9], v[6:7]
+; GFX1150-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9]
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7]
+; GFX1150-NEXT: v_add_f64 v[8:9], v[6:7], 1.0
+; GFX1150-NEXT: v_dual_cndmask_b32 v6, v6, v8 :: v_dual_cndmask_b32 v7, v7, v9
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v3
+; GFX1150-NEXT: .LBB15_16: ; %Flow49
+; GFX1150-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]|
+; GFX1150-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v5, vcc_lo
+; GFX1150-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo
+; GFX1150-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[2:3]|
+; GFX1150-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v3, 0x7ff80000, v7
+; GFX1150-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc_lo
+; GFX1150-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX1150-NEXT: s_endpgm
+;
+; GFX1200-LABEL: frem_v2f64_const_one_denum:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT: v_mov_b32_e32 v0, 0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: global_load_b128 v[0:3], v0, s[2:3]
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, 1.0
+; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s2
+; GFX1200-NEXT: s_cbranch_vccz .LBB15_2
+; GFX1200-NEXT: ; %bb.1: ; %frem.else16
+; GFX1200-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, 1.0
+; GFX1200-NEXT: v_and_b32_e32 v4, 0x80000000, v1
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1200-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc_lo
+; GFX1200-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc_lo
+; GFX1200-NEXT: s_cbranch_execz .LBB15_3
+; GFX1200-NEXT: s_branch .LBB15_8
+; GFX1200-NEXT: .LBB15_2:
+; GFX1200-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1200-NEXT: .LBB15_3: ; %frem.compute15
+; GFX1200-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; GFX1200-NEXT: v_add_nc_u32_e32 v8, -1, v6
+; GFX1200-NEXT: v_readfirstlane_b32 s2, v6
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v8
+; GFX1200-NEXT: s_cbranch_vccnz .LBB15_7
+; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
+; GFX1200-NEXT: s_add_co_i32 s2, s2, 25
+; GFX1200-NEXT: .LBB15_5: ; %frem.loop_body23
+; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1200-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX1200-NEXT: s_sub_co_i32 s2, s2, 26
+; GFX1200-NEXT: s_cmp_gt_i32 s2, 26
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_rndne_f64_e32 v[4:5], v[6:7]
+; GFX1200-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[4:5]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5]
+; GFX1200-NEXT: v_add_f64_e32 v[8:9], 1.0, v[4:5]
+; GFX1200-NEXT: s_wait_alu 0xfffd
+; GFX1200-NEXT: v_dual_cndmask_b32 v5, v5, v9 :: v_dual_cndmask_b32 v4, v4, v8
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; GFX1200-NEXT: s_cbranch_scc1 .LBB15_5
+; GFX1200-NEXT: ; %bb.6: ; %Flow50
+; GFX1200-NEXT: v_mov_b32_e32 v4, v6
+; GFX1200-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v5, v7
+; GFX1200-NEXT: .LBB15_7: ; %frem.loop_exit24
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_subrev_nc_u32_e32 v6, 25, v8
+; GFX1200-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_rndne_f64_e32 v[6:7], v[4:5]
+; GFX1200-NEXT: v_add_f64_e64 v[4:5], v[4:5], -v[6:7]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5]
+; GFX1200-NEXT: v_add_f64_e32 v[6:7], 1.0, v[4:5]
+; GFX1200-NEXT: s_wait_alu 0xfffd
+; GFX1200-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v5, v5, v7
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1
+; GFX1200-NEXT: .LBB15_8:
+; GFX1200-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, 1.0
+; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s2
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_cbranch_vccz .LBB15_10
+; GFX1200-NEXT: ; %bb.9: ; %frem.else
+; GFX1200-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, 1.0
+; GFX1200-NEXT: v_and_b32_e32 v6, 0x80000000, v3
+; GFX1200-NEXT: s_wait_alu 0xfffd
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1200-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc_lo
+; GFX1200-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc_lo
+; GFX1200-NEXT: s_cbranch_execz .LBB15_11
+; GFX1200-NEXT: s_branch .LBB15_16
+; GFX1200-NEXT: .LBB15_10:
+; GFX1200-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1200-NEXT: .LBB15_11: ; %frem.compute
+; GFX1200-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]|
+; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; GFX1200-NEXT: v_add_nc_u32_e32 v10, -1, v8
+; GFX1200-NEXT: v_readfirstlane_b32 s2, v8
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v10
+; GFX1200-NEXT: s_cbranch_vccnz .LBB15_15
+; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body.preheader
+; GFX1200-NEXT: s_add_co_i32 s2, s2, 25
+; GFX1200-NEXT: .LBB15_13: ; %frem.loop_body
+; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_sub_co_i32 s2, s2, 26
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_cmp_gt_i32 s2, 26
+; GFX1200-NEXT: v_rndne_f64_e32 v[6:7], v[8:9]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_add_f64_e64 v[6:7], v[8:9], -v[6:7]
+; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7]
+; GFX1200-NEXT: v_add_f64_e32 v[10:11], 1.0, v[6:7]
+; GFX1200-NEXT: s_wait_alu 0xfffd
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_dual_cndmask_b32 v7, v7, v11 :: v_dual_cndmask_b32 v6, v6, v10
+; GFX1200-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; GFX1200-NEXT: s_cbranch_scc1 .LBB15_13
+; GFX1200-NEXT: ; %bb.14: ; %Flow
+; GFX1200-NEXT: v_mov_b32_e32 v6, v8
+; GFX1200-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v7, v9
+; GFX1200-NEXT: .LBB15_15: ; %frem.loop_exit
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_subrev_nc_u32_e32 v8, 25, v10
+; GFX1200-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_rndne_f64_e32 v[8:9], v[6:7]
+; GFX1200-NEXT: v_add_f64_e64 v[6:7], v[6:7], -v[8:9]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7]
+; GFX1200-NEXT: v_add_f64_e32 v[8:9], 1.0, v[6:7]
+; GFX1200-NEXT: s_wait_alu 0xfffd
+; GFX1200-NEXT: v_dual_cndmask_b32 v6, v6, v8 :: v_dual_cndmask_b32 v7, v7, v9
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v3
+; GFX1200-NEXT: .LBB15_16: ; %Flow49
+; GFX1200-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]|
+; GFX1200-NEXT: s_wait_alu 0xfffd
+; GFX1200-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v5, vcc_lo
+; GFX1200-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo
+; GFX1200-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[2:3]|
+; GFX1200-NEXT: s_wait_alu 0xfffd
+; GFX1200-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v3, 0x7ff80000, v7
+; GFX1200-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc_lo
+; GFX1200-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX1200-NEXT: s_endpgm
+ %r0 = load <2 x double>, ptr addrspace(1) %in, align 16
+ %r1 = frem <2 x double> %r0, <double 1.0, double 1.0>
+ store <2 x double> %r1, ptr addrspace(1) %out, align 16
+ ret void
+}
+
+define amdgpu_kernel void @frem_v2f64_const(ptr addrspace(1) %out) #0 {
+; SI-LABEL: frem_v2f64_const:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, 0
+; SI-NEXT: v_mov_b32_e32 v1, 0x3ff00000
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: v_mov_b32_e32 v3, v0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; CI-LABEL: frem_v2f64_const:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; CI-NEXT: v_mov_b32_e32 v0, 0
+; CI-NEXT: s_mov_b32 s3, 0xf000
+; CI-NEXT: s_mov_b32 s2, -1
+; CI-NEXT: v_mov_b32_e32 v1, 0x3ff00000
+; CI-NEXT: v_mov_b32_e32 v2, v0
+; CI-NEXT: v_mov_b32_e32 v3, v0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: frem_v2f64_const:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: v_mov_b32_e32 v1, 0x3ff00000
+; VI-NEXT: v_mov_b32_e32 v2, v0
+; VI-NEXT: v_mov_b32_e32 v3, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: frem_v2f64_const:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff00000
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: frem_v2f64_const:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x3ff00000
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: frem_v2f64_const:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b128 v0, v[0:3], s[0:1]
+; GFX11-NEXT: s_endpgm
+;
+; GFX1150-LABEL: frem_v2f64_const:
+; GFX1150: ; %bb.0:
+; GFX1150-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1150-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT: v_mov_b32_e32 v2, v0
+; GFX1150-NEXT: v_mov_b32_e32 v3, v0
+; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT: global_store_b128 v0, v[0:3], s[0:1]
+; GFX1150-NEXT: s_endpgm
+;
+; GFX1200-LABEL: frem_v2f64_const:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1200-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: global_store_b128 v0, v[0:3], s[0:1]
+; GFX1200-NEXT: s_endpgm
+ %r0 = frem <2 x double> <double 1.0, double 1.0>, <double 2.0, double 1.0>
+ store <2 x double> %r0, ptr addrspace(1) %out, align 16
+ ret void
+}
+
+
 attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
 attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
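A note on the fully folded case: in @frem_v2f64_const both operands are constant, so no runtime frem code survives. Since frem(1.0, 2.0) = 1.0 and frem(1.0, 1.0) = 0.0, every backend simply materializes <double 1.0, double 0.0> and stores it; that is why v1 holds 0x3ff00000 (the high dword of double 1.0) while the other three dwords are zero. A minimal standalone sketch of this case follows; the RUN line is an assumption for illustration, since frem.ll's actual RUN lines are outside this hunk:

; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s  ; assumed invocation, not the file's real RUN line
define amdgpu_kernel void @frem_v2f64_const(ptr addrspace(1) %out) {
  ; Folds at compile time: frem(1.0, 2.0) = 1.0, frem(1.0, 1.0) = 0.0,
  ; so codegen reduces to a 128-bit store of the constant result.
  %r = frem <2 x double> <double 1.0, double 1.0>, <double 2.0, double 1.0>
  store <2 x double> %r, ptr addrspace(1) %out, align 16
  ret void
}
; CHECK-LABEL: frem_v2f64_const: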