diff options
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/udivrem24.ll | 1984 |
1 files changed, 1840 insertions, 144 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/udivrem24.ll b/llvm/test/CodeGen/AMDGPU/udivrem24.ll index 5477d62..1e5ec59 100644 --- a/llvm/test/CodeGen/AMDGPU/udivrem24.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem24.ll @@ -1,18 +1,103 @@ -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -; FUNC-LABEL: {{^}}udiv24_i8: -; SI: v_cvt_f32_ubyte -; SI-DAG: v_cvt_f32_ubyte -; SI-DAG: v_rcp_iflag_f32 -; SI: v_cvt_u32_f32 - -; EG: UINT_TO_FLT -; EG-DAG: UINT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_UINT define amdgpu_kernel void @udiv24_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; SI-LABEL: udiv24_i8: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 +; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_mul_f32_e32 v2, v0, v2 +; SI-NEXT: v_trunc_f32_e32 v2, v2 +; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 +; SI-NEXT: v_cvt_u32_f32_e32 v2, v2 +; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; SI-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: udiv24_i8: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1 +; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 +; VI-NEXT: v_mul_f32_e32 v2, v1, v2 +; VI-NEXT: v_trunc_f32_e32 v2, v2 +; VI-NEXT: v_cvt_u32_f32_e32 v3, v2 +; VI-NEXT: v_mad_f32 v1, -v2, v0, v1 +; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; EG-LABEL: udiv24_i8: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 23, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_8 T1.X, T0.X, 1, #1 +; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: UINT_TO_FLT * T0.Y, T1.X, +; EG-NEXT: RECIP_IEEE * T0.Z, PS, +; EG-NEXT: UINT_TO_FLT * T0.X, T0.X, +; EG-NEXT: MUL_IEEE * T0.W, PS, T0.Z, +; EG-NEXT: TRUNC * T0.W, PV.W, +; EG-NEXT: MULADD_IEEE T1.W, -PV.W, T0.Y, T0.X, +; EG-NEXT: TRUNC * T0.W, PV.W, +; EG-NEXT: SETGE * T1.W, |PV.W|, T0.Y, +; EG-NEXT: CNDE T1.W, PV.W, 0.0, literal.x, +; EG-NEXT: FLT_TO_UINT * T0.X, T0.W, +; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, PS, PV.W, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PS, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45) +; EG-NEXT: LSHL T0.X, PV.W, PS, +; EG-NEXT: LSHL * T0.W, literal.x, PS, +; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: MOV * T0.Z, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %den_ptr = getelementptr i8, ptr addrspace(1) %in, i8 1 %num = load i8, ptr addrspace(1) %in %den = load i8, ptr addrspace(1) %den_ptr @@ -21,17 +106,101 @@ define amdgpu_kernel void @udiv24_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ret void } -; FUNC-LABEL: {{^}}udiv24_i8_denorm_flush_in_out: -; SI: v_cvt_f32_ubyte -; SI-DAG: v_cvt_f32_ubyte -; SI-DAG: v_rcp_iflag_f32 -; SI: v_cvt_u32_f32 - -; EG: UINT_TO_FLT -; EG-DAG: UINT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_UINT define amdgpu_kernel void @udiv24_i8_denorm_flush_in_out(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: udiv24_i8_denorm_flush_in_out: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 +; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_mul_f32_e32 v2, v0, v2 +; SI-NEXT: v_trunc_f32_e32 v2, v2 +; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 +; SI-NEXT: v_cvt_u32_f32_e32 v2, v2 +; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; SI-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: udiv24_i8_denorm_flush_in_out: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1 +; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 +; VI-NEXT: v_mul_f32_e32 v2, v1, v2 +; VI-NEXT: v_trunc_f32_e32 v2, v2 +; VI-NEXT: v_cvt_u32_f32_e32 v3, v2 +; VI-NEXT: v_mad_f32 v1, -v2, v0, v1 +; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; EG-LABEL: udiv24_i8_denorm_flush_in_out: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 23, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_8 T1.X, T0.X, 1, #1 +; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: UINT_TO_FLT * T0.Y, T1.X, +; EG-NEXT: RECIP_IEEE * T0.Z, PS, +; EG-NEXT: UINT_TO_FLT * T0.X, T0.X, +; EG-NEXT: MUL_IEEE * T0.W, PS, T0.Z, +; EG-NEXT: TRUNC * T0.W, PV.W, +; EG-NEXT: MULADD_IEEE T1.W, -PV.W, T0.Y, T0.X, +; EG-NEXT: TRUNC * T0.W, PV.W, +; EG-NEXT: SETGE * T1.W, |PV.W|, T0.Y, +; EG-NEXT: CNDE T1.W, PV.W, 0.0, literal.x, +; EG-NEXT: FLT_TO_UINT * T0.X, T0.W, +; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, PS, PV.W, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PS, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45) +; EG-NEXT: LSHL T0.X, PV.W, PS, +; EG-NEXT: LSHL * T0.W, literal.x, PS, +; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: MOV * T0.Z, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %den_ptr = getelementptr i8, ptr addrspace(1) %in, i8 1 %num = load i8, ptr addrspace(1) %in %den = load i8, ptr addrspace(1) %den_ptr @@ -40,17 +209,101 @@ define amdgpu_kernel void @udiv24_i8_denorm_flush_in_out(ptr addrspace(1) %out, ret void } -; FUNC-LABEL: {{^}}udiv24_i8_denorm_flush_in: -; SI: v_cvt_f32_ubyte -; SI-DAG: v_cvt_f32_ubyte -; SI-DAG: v_rcp_iflag_f32 -; SI: v_cvt_u32_f32 - -; EG: UINT_TO_FLT -; EG-DAG: UINT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_UINT define amdgpu_kernel void @udiv24_i8_denorm_flush_in(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; SI-LABEL: udiv24_i8_denorm_flush_in: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 +; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_mul_f32_e32 v2, v0, v2 +; SI-NEXT: v_trunc_f32_e32 v2, v2 +; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 +; SI-NEXT: v_cvt_u32_f32_e32 v2, v2 +; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; SI-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: udiv24_i8_denorm_flush_in: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1 +; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 +; VI-NEXT: v_mul_f32_e32 v2, v1, v2 +; VI-NEXT: v_trunc_f32_e32 v2, v2 +; VI-NEXT: v_cvt_u32_f32_e32 v3, v2 +; VI-NEXT: v_mad_f32 v1, -v2, v0, v1 +; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; EG-LABEL: udiv24_i8_denorm_flush_in: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 23, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_8 T1.X, T0.X, 1, #1 +; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: UINT_TO_FLT * T0.Y, T1.X, +; EG-NEXT: RECIP_IEEE * T0.Z, PS, +; EG-NEXT: UINT_TO_FLT * T0.X, T0.X, +; EG-NEXT: MUL_IEEE * T0.W, PS, T0.Z, +; EG-NEXT: TRUNC * T0.W, PV.W, +; EG-NEXT: MULADD_IEEE T1.W, -PV.W, T0.Y, T0.X, +; EG-NEXT: TRUNC * T0.W, PV.W, +; EG-NEXT: SETGE * T1.W, |PV.W|, T0.Y, +; EG-NEXT: CNDE T1.W, PV.W, 0.0, literal.x, +; EG-NEXT: FLT_TO_UINT * T0.X, T0.W, +; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, PS, PV.W, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PS, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45) +; EG-NEXT: LSHL T0.X, PV.W, PS, +; EG-NEXT: LSHL * T0.W, literal.x, PS, +; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: MOV * T0.Z, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %den_ptr = getelementptr i8, ptr addrspace(1) %in, i8 1 %num = load i8, ptr addrspace(1) %in %den = load i8, ptr addrspace(1) %den_ptr @@ -59,17 +312,101 @@ define amdgpu_kernel void @udiv24_i8_denorm_flush_in(ptr addrspace(1) %out, ptr ret void } -; FUNC-LABEL: {{^}}udiv24_i8_denorm_flush_out: -; SI: v_cvt_f32_ubyte -; SI-DAG: v_cvt_f32_ubyte -; SI-DAG: v_rcp_iflag_f32 -; SI: v_cvt_u32_f32 - -; EG: UINT_TO_FLT -; EG-DAG: UINT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_UINT define amdgpu_kernel void @udiv24_i8_denorm_flush_out(ptr addrspace(1) %out, ptr addrspace(1) %in) #2 { +; SI-LABEL: udiv24_i8_denorm_flush_out: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 +; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_mul_f32_e32 v2, v0, v2 +; SI-NEXT: v_trunc_f32_e32 v2, v2 +; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 +; SI-NEXT: v_cvt_u32_f32_e32 v2, v2 +; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; SI-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: udiv24_i8_denorm_flush_out: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1 +; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 +; VI-NEXT: v_mul_f32_e32 v2, v1, v2 +; VI-NEXT: v_trunc_f32_e32 v2, v2 +; VI-NEXT: v_cvt_u32_f32_e32 v3, v2 +; VI-NEXT: v_mad_f32 v1, -v2, v0, v1 +; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; EG-LABEL: udiv24_i8_denorm_flush_out: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 23, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_8 T1.X, T0.X, 1, #1 +; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: UINT_TO_FLT * T0.Y, T1.X, +; EG-NEXT: RECIP_IEEE * T0.Z, PS, +; EG-NEXT: UINT_TO_FLT * T0.X, T0.X, +; EG-NEXT: MUL_IEEE * T0.W, PS, T0.Z, +; EG-NEXT: TRUNC * T0.W, PV.W, +; EG-NEXT: MULADD_IEEE T1.W, -PV.W, T0.Y, T0.X, +; EG-NEXT: TRUNC * T0.W, PV.W, +; EG-NEXT: SETGE * T1.W, |PV.W|, T0.Y, +; EG-NEXT: CNDE T1.W, PV.W, 0.0, literal.x, +; EG-NEXT: FLT_TO_UINT * T0.X, T0.W, +; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, PS, PV.W, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PS, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45) +; EG-NEXT: LSHL T0.X, PV.W, PS, +; EG-NEXT: LSHL * T0.W, literal.x, PS, +; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: MOV * T0.Z, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %den_ptr = getelementptr i8, ptr addrspace(1) %in, i8 1 %num = load i8, ptr addrspace(1) %in %den = load i8, ptr addrspace(1) %den_ptr @@ -78,17 +415,101 @@ define amdgpu_kernel void @udiv24_i8_denorm_flush_out(ptr addrspace(1) %out, ptr ret void } -; FUNC-LABEL: {{^}}udiv24_i16: -; SI: v_cvt_f32_u32 -; SI: v_cvt_f32_u32 -; SI: v_rcp_iflag_f32 -; SI: v_cvt_u32_f32 - -; EG: UINT_TO_FLT -; EG-DAG: UINT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_UINT define amdgpu_kernel void @udiv24_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; SI-LABEL: udiv24_i16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:2 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_u32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_u32_e32 v1, v1 +; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_mul_f32_e32 v2, v0, v2 +; SI-NEXT: v_trunc_f32_e32 v2, v2 +; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 +; SI-NEXT: v_cvt_u32_f32_e32 v2, v2 +; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; SI-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: udiv24_i16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2 +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cvt_f32_u32_e32 v0, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_u32_e32 v1, v1 +; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_mul_f32_e32 v2, v1, v2 +; VI-NEXT: v_trunc_f32_e32 v2, v2 +; VI-NEXT: v_cvt_u32_f32_e32 v3, v2 +; VI-NEXT: v_mad_f32 v1, -v2, v0, v1 +; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; EG-LABEL: udiv24_i16: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 23, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1 +; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: UINT_TO_FLT * T0.Y, T1.X, +; EG-NEXT: RECIP_IEEE * T0.Z, PS, +; EG-NEXT: UINT_TO_FLT * T0.X, T0.X, +; EG-NEXT: MUL_IEEE * T0.W, PS, T0.Z, +; EG-NEXT: TRUNC * T0.W, PV.W, +; EG-NEXT: MULADD_IEEE T1.W, -PV.W, T0.Y, T0.X, +; EG-NEXT: TRUNC * T0.W, PV.W, +; EG-NEXT: SETGE * T1.W, |PV.W|, T0.Y, +; EG-NEXT: CNDE T1.W, PV.W, 0.0, literal.x, +; EG-NEXT: FLT_TO_UINT * T0.X, T0.W, +; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, PS, PV.W, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PS, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) +; EG-NEXT: LSHL T0.X, PV.W, PS, +; EG-NEXT: LSHL * T0.W, literal.x, PS, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: MOV * T0.Z, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %den_ptr = getelementptr i16, ptr addrspace(1) %in, i16 1 %num = load i16, ptr addrspace(1) %in, align 2 %den = load i16, ptr addrspace(1) %den_ptr, align 2 @@ -97,17 +518,85 @@ define amdgpu_kernel void @udiv24_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ret void } -; FUNC-LABEL: {{^}}udiv23_i32: -; SI: v_cvt_f32_u32 -; SI-DAG: v_cvt_f32_u32 -; SI-DAG: v_rcp_iflag_f32 -; SI: v_cvt_u32_f32 - -; EG: UINT_TO_FLT -; EG-DAG: UINT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_UINT define amdgpu_kernel void @udiv23_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; SI-LABEL: udiv23_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_and_b32 s4, s4, 0x7fffff +; SI-NEXT: s_and_b32 s5, s5, 0x7fffff +; SI-NEXT: v_cvt_f32_u32_e32 v0, s4 +; SI-NEXT: v_cvt_f32_u32_e32 v1, s5 +; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_mul_f32_e32 v2, v0, v2 +; SI-NEXT: v_trunc_f32_e32 v2, v2 +; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 +; SI-NEXT: v_cvt_u32_f32_e32 v2, v2 +; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; SI-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; SI-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: udiv23_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_and_b32 s3, s3, 0x7fffff +; VI-NEXT: v_cvt_f32_u32_e32 v0, s3 +; VI-NEXT: s_and_b32 s2, s2, 0x7fffff +; VI-NEXT: v_cvt_f32_u32_e32 v1, s2 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mul_f32_e32 v2, v1, v2 +; VI-NEXT: v_trunc_f32_e32 v2, v2 +; VI-NEXT: v_cvt_u32_f32_e32 v3, v2 +; VI-NEXT: v_mad_f32 v1, -v2, v0, v1 +; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; VI-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; EG-LABEL: udiv23_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 18, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: AND_INT * T0.W, T0.Y, literal.x, +; EG-NEXT: 8388607(1.175494e-38), 0(0.000000e+00) +; EG-NEXT: UINT_TO_FLT * T0.Y, PV.W, +; EG-NEXT: AND_INT T0.W, T0.X, literal.x, +; EG-NEXT: RECIP_IEEE * T0.X, PS, +; EG-NEXT: 8388607(1.175494e-38), 0(0.000000e+00) +; EG-NEXT: UINT_TO_FLT * T0.Z, PV.W, +; EG-NEXT: MUL_IEEE * T0.W, PS, T0.X, +; EG-NEXT: TRUNC * T0.W, PV.W, +; EG-NEXT: MULADD_IEEE T1.W, -PV.W, T0.Y, T0.Z, +; EG-NEXT: TRUNC * T0.W, PV.W, +; EG-NEXT: SETGE * T1.W, |PV.W|, T0.Y, +; EG-NEXT: CNDE T1.W, PV.W, 0.0, literal.x, +; EG-NEXT: FLT_TO_UINT * T0.X, T0.W, +; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.W, PS, PV.W, +; EG-NEXT: AND_INT T0.X, PV.W, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 8388607(1.175494e-38), 2(2.802597e-45) %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 %num = load i32, ptr addrspace(1) %in, align 4 %den = load i32, ptr addrspace(1) %den_ptr, align 4 @@ -120,11 +609,88 @@ define amdgpu_kernel void @udiv23_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ret void } -; FUNC-LABEL: {{^}}udiv24_i32: -; SI: v_rcp_iflag -; SI-NOT: v_rcp_f32 -; EG-NOT: RECIP_IEEE define amdgpu_kernel void @udiv24_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; SI-LABEL: udiv24_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_and_b32 s4, s4, 0xffffff +; SI-NEXT: s_and_b32 s5, s5, 0xffffff +; SI-NEXT: v_cvt_f32_u32_e32 v0, s4 +; SI-NEXT: v_cvt_f32_u32_e32 v1, s5 +; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_mul_f32_e32 v2, v0, v2 +; SI-NEXT: v_trunc_f32_e32 v2, v2 +; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 +; SI-NEXT: v_cvt_u32_f32_e32 v2, v2 +; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; SI-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; SI-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: udiv24_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_and_b32 s3, s3, 0xffffff +; VI-NEXT: v_cvt_f32_u32_e32 v0, s3 +; VI-NEXT: s_and_b32 s2, s2, 0xffffff +; VI-NEXT: v_cvt_f32_u32_e32 v1, s2 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mul_f32_e32 v2, v1, v2 +; VI-NEXT: v_trunc_f32_e32 v2, v2 +; VI-NEXT: v_cvt_u32_f32_e32 v3, v2 +; VI-NEXT: v_mad_f32 v1, -v2, v0, v1 +; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; VI-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; EG-LABEL: udiv24_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 21, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: AND_INT * T0.W, T0.Y, literal.x, +; EG-NEXT: 16777215(2.350989e-38), 0(0.000000e+00) +; EG-NEXT: SUB_INT T1.W, 0.0, PV.W, +; EG-NEXT: RECIP_UINT * T0.Y, PV.W, +; EG-NEXT: MULLO_INT * T0.Z, PV.W, PS, +; EG-NEXT: MULHI * T0.Z, T0.Y, PS, +; EG-NEXT: ADD_INT T1.W, T0.Y, PS, +; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, +; EG-NEXT: 16777215(2.350989e-38), 0(0.000000e+00) +; EG-NEXT: MULHI * T0.X, PS, PV.W, +; EG-NEXT: MULLO_INT * T0.Y, PS, T0.W, +; EG-NEXT: SUB_INT * T1.W, T2.W, PS, +; EG-NEXT: ADD_INT T0.Z, T0.X, 1, +; EG-NEXT: SETGE_UINT T2.W, PV.W, T0.W, +; EG-NEXT: SUB_INT * T3.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T1.W, PV.W, T1.W, PS, +; EG-NEXT: CNDE_INT * T2.W, PV.W, T0.X, PV.Z, +; EG-NEXT: ADD_INT T3.W, PS, 1, +; EG-NEXT: SETGE_UINT * T0.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.X, PS, T2.W, PV.W, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 %num = load i32, ptr addrspace(1) %in, align 4 %den = load i32, ptr addrspace(1) %den_ptr, align 4 @@ -137,11 +703,88 @@ define amdgpu_kernel void @udiv24_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ret void } -; FUNC-LABEL: {{^}}no_udiv24_u23_u24_i32: -; SI: v_rcp_iflag -; SI-NOT: v_rcp_f32 -; EG-NOT: RECIP_IEEE define amdgpu_kernel void @no_udiv24_u23_u24_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; SI-LABEL: no_udiv24_u23_u24_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_and_b32 s4, s4, 0x7fffff +; SI-NEXT: s_and_b32 s5, s5, 0xffffff +; SI-NEXT: v_cvt_f32_u32_e32 v0, s4 +; SI-NEXT: v_cvt_f32_u32_e32 v1, s5 +; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_mul_f32_e32 v2, v0, v2 +; SI-NEXT: v_trunc_f32_e32 v2, v2 +; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 +; SI-NEXT: v_cvt_u32_f32_e32 v2, v2 +; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; SI-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; SI-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: no_udiv24_u23_u24_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_and_b32 s3, s3, 0xffffff +; VI-NEXT: v_cvt_f32_u32_e32 v0, s3 +; VI-NEXT: s_and_b32 s2, s2, 0x7fffff +; VI-NEXT: v_cvt_f32_u32_e32 v1, s2 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mul_f32_e32 v2, v1, v2 +; VI-NEXT: v_trunc_f32_e32 v2, v2 +; VI-NEXT: v_cvt_u32_f32_e32 v3, v2 +; VI-NEXT: v_mad_f32 v1, -v2, v0, v1 +; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; VI-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; EG-LABEL: no_udiv24_u23_u24_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 21, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: AND_INT * T0.W, T0.Y, literal.x, +; EG-NEXT: 16777215(2.350989e-38), 0(0.000000e+00) +; EG-NEXT: SUB_INT T1.W, 0.0, PV.W, +; EG-NEXT: RECIP_UINT * T0.Y, PV.W, +; EG-NEXT: MULLO_INT * T0.Z, PV.W, PS, +; EG-NEXT: MULHI * T0.Z, T0.Y, PS, +; EG-NEXT: ADD_INT T1.W, T0.Y, PS, +; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, +; EG-NEXT: 8388607(1.175494e-38), 0(0.000000e+00) +; EG-NEXT: MULHI * T0.X, PS, PV.W, +; EG-NEXT: MULLO_INT * T0.Y, PS, T0.W, +; EG-NEXT: SUB_INT * T1.W, T2.W, PS, +; EG-NEXT: ADD_INT T0.Z, T0.X, 1, +; EG-NEXT: SETGE_UINT T2.W, PV.W, T0.W, +; EG-NEXT: SUB_INT * T3.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T1.W, PV.W, T1.W, PS, +; EG-NEXT: CNDE_INT * T2.W, PV.W, T0.X, PV.Z, +; EG-NEXT: ADD_INT T3.W, PS, 1, +; EG-NEXT: SETGE_UINT * T0.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.X, PS, T2.W, PV.W, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 %num = load i32, ptr addrspace(1) %in, align 4 %den = load i32, ptr addrspace(1) %den_ptr, align 4 @@ -154,11 +797,88 @@ define amdgpu_kernel void @no_udiv24_u23_u24_i32(ptr addrspace(1) %out, ptr addr ret void } -; FUNC-LABEL: {{^}}no_udiv24_u24_u23_i32: -; SI: v_rcp_iflag -; SI-NOT: v_rcp_f32 -; EG-NOT: RECIP_IEEE define amdgpu_kernel void @no_udiv24_u24_u23_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; SI-LABEL: no_udiv24_u24_u23_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_and_b32 s4, s4, 0xffffff +; SI-NEXT: s_and_b32 s5, s5, 0x7fffff +; SI-NEXT: v_cvt_f32_u32_e32 v0, s4 +; SI-NEXT: v_cvt_f32_u32_e32 v1, s5 +; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_mul_f32_e32 v2, v0, v2 +; SI-NEXT: v_trunc_f32_e32 v2, v2 +; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 +; SI-NEXT: v_cvt_u32_f32_e32 v2, v2 +; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; SI-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; SI-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: no_udiv24_u24_u23_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_and_b32 s3, s3, 0x7fffff +; VI-NEXT: v_cvt_f32_u32_e32 v0, s3 +; VI-NEXT: s_and_b32 s2, s2, 0xffffff +; VI-NEXT: v_cvt_f32_u32_e32 v1, s2 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mul_f32_e32 v2, v1, v2 +; VI-NEXT: v_trunc_f32_e32 v2, v2 +; VI-NEXT: v_cvt_u32_f32_e32 v3, v2 +; VI-NEXT: v_mad_f32 v1, -v2, v0, v1 +; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; VI-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; EG-LABEL: no_udiv24_u24_u23_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 21, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: AND_INT * T0.W, T0.Y, literal.x, +; EG-NEXT: 8388607(1.175494e-38), 0(0.000000e+00) +; EG-NEXT: SUB_INT T1.W, 0.0, PV.W, +; EG-NEXT: RECIP_UINT * T0.Y, PV.W, +; EG-NEXT: MULLO_INT * T0.Z, PV.W, PS, +; EG-NEXT: MULHI * T0.Z, T0.Y, PS, +; EG-NEXT: ADD_INT T1.W, T0.Y, PS, +; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, +; EG-NEXT: 16777215(2.350989e-38), 0(0.000000e+00) +; EG-NEXT: MULHI * T0.X, PS, PV.W, +; EG-NEXT: MULLO_INT * T0.Y, PS, T0.W, +; EG-NEXT: SUB_INT * T1.W, T2.W, PS, +; EG-NEXT: ADD_INT T0.Z, T0.X, 1, +; EG-NEXT: SETGE_UINT T2.W, PV.W, T0.W, +; EG-NEXT: SUB_INT * T3.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T1.W, PV.W, T1.W, PS, +; EG-NEXT: CNDE_INT * T2.W, PV.W, T0.X, PV.Z, +; EG-NEXT: ADD_INT T3.W, PS, 1, +; EG-NEXT: SETGE_UINT * T0.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.X, PS, T2.W, PV.W, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 %num = load i32, ptr addrspace(1) %in, align 4 %den = load i32, ptr addrspace(1) %den_ptr, align 4 @@ -171,14 +891,113 @@ define amdgpu_kernel void @no_udiv24_u24_u23_i32(ptr addrspace(1) %out, ptr addr ret void } -; FUNC-LABEL: {{^}}udiv25_i32: ; RCP_IFLAG is for URECIP in the full 32b alg -; SI: v_rcp_iflag -; SI-NOT: v_rcp_f32 - -; EG-NOT: UINT_TO_FLT -; EG-NOT: RECIP_IEEE define amdgpu_kernel void @udiv25_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; SI-LABEL: udiv25_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_and_b32 s4, s4, 0x1ffffff +; SI-NEXT: s_and_b32 s5, s5, 0x1ffffff +; SI-NEXT: v_cvt_f32_u32_e32 v0, s5 +; SI-NEXT: s_sub_i32 s6, 0, s5 +; SI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 +; SI-NEXT: v_mul_lo_u32 v1, s6, v0 +; SI-NEXT: v_mul_hi_u32 v1, v0, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; SI-NEXT: v_mul_hi_u32 v0, s4, v0 +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 1, v0 +; SI-NEXT: s_mul_i32 s6, s6, s5 +; SI-NEXT: s_sub_i32 s4, s4, s6 +; SI-NEXT: s_sub_i32 s6, s4, s5 +; SI-NEXT: s_cmp_ge_u32 s4, s5 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: s_cselect_b32 s4, s6, s4 +; SI-NEXT: v_add_i32_e32 v1, vcc, 1, v0 +; SI-NEXT: s_cmp_ge_u32 s4, s5 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: udiv25_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_and_b32 s4, s3, 0x1ffffff +; VI-NEXT: v_cvt_f32_u32_e32 v0, s4 +; VI-NEXT: s_sub_i32 s3, 0, s4 +; VI-NEXT: s_and_b32 s5, s2, 0x1ffffff +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 +; VI-NEXT: v_mul_lo_u32 v1, s3, v0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: v_mul_hi_u32 v1, v0, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: v_mul_hi_u32 v0, s5, v0 +; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: s_mul_i32 s6, s6, s4 +; VI-NEXT: s_sub_i32 s5, s5, s6 +; VI-NEXT: s_sub_i32 s6, s5, s4 +; VI-NEXT: v_add_u32_e32 v1, vcc, 1, v0 +; VI-NEXT: s_cmp_ge_u32 s5, s4 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; VI-NEXT: s_cselect_b32 s5, s6, s5 +; VI-NEXT: v_add_u32_e32 v1, vcc, 1, v0 +; VI-NEXT: s_cmp_ge_u32 s5, s4 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; EG-LABEL: udiv25_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 21, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: AND_INT * T0.W, T0.Y, literal.x, +; EG-NEXT: 33554431(9.403954e-38), 0(0.000000e+00) +; EG-NEXT: SUB_INT T1.W, 0.0, PV.W, +; EG-NEXT: RECIP_UINT * T0.Y, PV.W, +; EG-NEXT: MULLO_INT * T0.Z, PV.W, PS, +; EG-NEXT: MULHI * T0.Z, T0.Y, PS, +; EG-NEXT: ADD_INT T1.W, T0.Y, PS, +; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, +; EG-NEXT: 33554431(9.403954e-38), 0(0.000000e+00) +; EG-NEXT: MULHI * T0.X, PS, PV.W, +; EG-NEXT: MULLO_INT * T0.Y, PS, T0.W, +; EG-NEXT: SUB_INT * T1.W, T2.W, PS, +; EG-NEXT: ADD_INT T0.Z, T0.X, 1, +; EG-NEXT: SETGE_UINT T2.W, PV.W, T0.W, +; EG-NEXT: SUB_INT * T3.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T1.W, PV.W, T1.W, PS, +; EG-NEXT: CNDE_INT * T2.W, PV.W, T0.X, PV.Z, +; EG-NEXT: ADD_INT T3.W, PS, 1, +; EG-NEXT: SETGE_UINT * T0.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.X, PS, T2.W, PV.W, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 %num = load i32, ptr addrspace(1) %in, align 4 %den = load i32, ptr addrspace(1) %den_ptr, align 4 @@ -191,14 +1010,113 @@ define amdgpu_kernel void @udiv25_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ret void } -; FUNC-LABEL: {{^}}test_no_udiv24_i32_1: ; RCP_IFLAG is for URECIP in the full 32b alg -; SI: v_rcp_iflag -; SI-NOT: v_rcp_f32 - -; EG-NOT: UINT_TO_FLT -; EG-NOT: RECIP_IEEE define amdgpu_kernel void @test_no_udiv24_i32_1(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; SI-LABEL: test_no_udiv24_i32_1: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_and_b32 s4, s4, 0xffffff +; SI-NEXT: s_and_b32 s5, s5, 0x1ffffff +; SI-NEXT: v_cvt_f32_u32_e32 v0, s5 +; SI-NEXT: s_sub_i32 s6, 0, s5 +; SI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 +; SI-NEXT: v_mul_lo_u32 v1, s6, v0 +; SI-NEXT: v_mul_hi_u32 v1, v0, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; SI-NEXT: v_mul_hi_u32 v0, s4, v0 +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 1, v0 +; SI-NEXT: s_mul_i32 s6, s6, s5 +; SI-NEXT: s_sub_i32 s4, s4, s6 +; SI-NEXT: s_sub_i32 s6, s4, s5 +; SI-NEXT: s_cmp_ge_u32 s4, s5 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: s_cselect_b32 s4, s6, s4 +; SI-NEXT: v_add_i32_e32 v1, vcc, 1, v0 +; SI-NEXT: s_cmp_ge_u32 s4, s5 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_no_udiv24_i32_1: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_and_b32 s4, s3, 0x1ffffff +; VI-NEXT: v_cvt_f32_u32_e32 v0, s4 +; VI-NEXT: s_sub_i32 s3, 0, s4 +; VI-NEXT: s_and_b32 s5, s2, 0xffffff +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 +; VI-NEXT: v_mul_lo_u32 v1, s3, v0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: v_mul_hi_u32 v1, v0, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: v_mul_hi_u32 v0, s5, v0 +; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: s_mul_i32 s6, s6, s4 +; VI-NEXT: s_sub_i32 s5, s5, s6 +; VI-NEXT: s_sub_i32 s6, s5, s4 +; VI-NEXT: v_add_u32_e32 v1, vcc, 1, v0 +; VI-NEXT: s_cmp_ge_u32 s5, s4 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; VI-NEXT: s_cselect_b32 s5, s6, s5 +; VI-NEXT: v_add_u32_e32 v1, vcc, 1, v0 +; VI-NEXT: s_cmp_ge_u32 s5, s4 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; EG-LABEL: test_no_udiv24_i32_1: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 21, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: AND_INT * T0.W, T0.Y, literal.x, +; EG-NEXT: 33554431(9.403954e-38), 0(0.000000e+00) +; EG-NEXT: SUB_INT T1.W, 0.0, PV.W, +; EG-NEXT: RECIP_UINT * T0.Y, PV.W, +; EG-NEXT: MULLO_INT * T0.Z, PV.W, PS, +; EG-NEXT: MULHI * T0.Z, T0.Y, PS, +; EG-NEXT: ADD_INT T1.W, T0.Y, PS, +; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, +; EG-NEXT: 16777215(2.350989e-38), 0(0.000000e+00) +; EG-NEXT: MULHI * T0.X, PS, PV.W, +; EG-NEXT: MULLO_INT * T0.Y, PS, T0.W, +; EG-NEXT: SUB_INT * T1.W, T2.W, PS, +; EG-NEXT: ADD_INT T0.Z, T0.X, 1, +; EG-NEXT: SETGE_UINT T2.W, PV.W, T0.W, +; EG-NEXT: SUB_INT * T3.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T1.W, PV.W, T1.W, PS, +; EG-NEXT: CNDE_INT * T2.W, PV.W, T0.X, PV.Z, +; EG-NEXT: ADD_INT T3.W, PS, 1, +; EG-NEXT: SETGE_UINT * T0.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.X, PS, T2.W, PV.W, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 %num = load i32, ptr addrspace(1) %in, align 4 %den = load i32, ptr addrspace(1) %den_ptr, align 4 @@ -211,14 +1129,113 @@ define amdgpu_kernel void @test_no_udiv24_i32_1(ptr addrspace(1) %out, ptr addrs ret void } -; FUNC-LABEL: {{^}}test_no_udiv24_i32_2: ; RCP_IFLAG is for URECIP in the full 32b alg -; SI: v_rcp_iflag -; SI-NOT: v_rcp_f32 - -; EG-NOT: UINT_TO_FLT -; EG-NOT: RECIP_IEEE define amdgpu_kernel void @test_no_udiv24_i32_2(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; SI-LABEL: test_no_udiv24_i32_2: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_and_b32 s4, s4, 0x1ffffff +; SI-NEXT: s_and_b32 s5, s5, 0xffffff +; SI-NEXT: v_cvt_f32_u32_e32 v0, s5 +; SI-NEXT: s_sub_i32 s6, 0, s5 +; SI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 +; SI-NEXT: v_mul_lo_u32 v1, s6, v0 +; SI-NEXT: v_mul_hi_u32 v1, v0, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; SI-NEXT: v_mul_hi_u32 v0, s4, v0 +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 1, v0 +; SI-NEXT: s_mul_i32 s6, s6, s5 +; SI-NEXT: s_sub_i32 s4, s4, s6 +; SI-NEXT: s_sub_i32 s6, s4, s5 +; SI-NEXT: s_cmp_ge_u32 s4, s5 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: s_cselect_b32 s4, s6, s4 +; SI-NEXT: v_add_i32_e32 v1, vcc, 1, v0 +; SI-NEXT: s_cmp_ge_u32 s4, s5 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_no_udiv24_i32_2: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_and_b32 s4, s3, 0xffffff +; VI-NEXT: v_cvt_f32_u32_e32 v0, s4 +; VI-NEXT: s_sub_i32 s3, 0, s4 +; VI-NEXT: s_and_b32 s5, s2, 0x1ffffff +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 +; VI-NEXT: v_mul_lo_u32 v1, s3, v0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: v_mul_hi_u32 v1, v0, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: v_mul_hi_u32 v0, s5, v0 +; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: s_mul_i32 s6, s6, s4 +; VI-NEXT: s_sub_i32 s5, s5, s6 +; VI-NEXT: s_sub_i32 s6, s5, s4 +; VI-NEXT: v_add_u32_e32 v1, vcc, 1, v0 +; VI-NEXT: s_cmp_ge_u32 s5, s4 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; VI-NEXT: s_cselect_b32 s5, s6, s5 +; VI-NEXT: v_add_u32_e32 v1, vcc, 1, v0 +; VI-NEXT: s_cmp_ge_u32 s5, s4 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; EG-LABEL: test_no_udiv24_i32_2: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 21, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: AND_INT * T0.W, T0.Y, literal.x, +; EG-NEXT: 16777215(2.350989e-38), 0(0.000000e+00) +; EG-NEXT: SUB_INT T1.W, 0.0, PV.W, +; EG-NEXT: RECIP_UINT * T0.Y, PV.W, +; EG-NEXT: MULLO_INT * T0.Z, PV.W, PS, +; EG-NEXT: MULHI * T0.Z, T0.Y, PS, +; EG-NEXT: ADD_INT T1.W, T0.Y, PS, +; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, +; EG-NEXT: 33554431(9.403954e-38), 0(0.000000e+00) +; EG-NEXT: MULHI * T0.X, PS, PV.W, +; EG-NEXT: MULLO_INT * T0.Y, PS, T0.W, +; EG-NEXT: SUB_INT * T1.W, T2.W, PS, +; EG-NEXT: ADD_INT T0.Z, T0.X, 1, +; EG-NEXT: SETGE_UINT T2.W, PV.W, T0.W, +; EG-NEXT: SUB_INT * T3.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T1.W, PV.W, T1.W, PS, +; EG-NEXT: CNDE_INT * T2.W, PV.W, T0.X, PV.Z, +; EG-NEXT: ADD_INT T3.W, PS, 1, +; EG-NEXT: SETGE_UINT * T0.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.X, PS, T2.W, PV.W, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 %num = load i32, ptr addrspace(1) %in, align 4 %den = load i32, ptr addrspace(1) %den_ptr, align 4 @@ -231,17 +1248,107 @@ define amdgpu_kernel void @test_no_udiv24_i32_2(ptr addrspace(1) %out, ptr addrs ret void } -; FUNC-LABEL: {{^}}urem24_i8: -; SI: v_cvt_f32_ubyte -; SI-DAG: v_cvt_f32_ubyte -; SI-DAG: v_rcp_iflag_f32 -; SI: v_cvt_u32_f32 - -; EG: UINT_TO_FLT -; EG-DAG: UINT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_UINT define amdgpu_kernel void @urem24_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; SI-LABEL: urem24_i8: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 +; SI-NEXT: v_rcp_iflag_f32_e32 v4, v3 +; SI-NEXT: v_mul_f32_e32 v4, v2, v4 +; SI-NEXT: v_trunc_f32_e32 v4, v4 +; SI-NEXT: v_fma_f32 v2, -v4, v3, v2 +; SI-NEXT: v_cvt_u32_f32_e32 v4, v4 +; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc +; SI-NEXT: v_mul_lo_u32 v1, v2, v1 +; SI-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: urem24_i8: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1 +; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 +; VI-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v1 +; VI-NEXT: v_mul_f32_e32 v3, v4, v3 +; VI-NEXT: v_trunc_f32_e32 v3, v3 +; VI-NEXT: v_cvt_u32_f32_e32 v5, v3 +; VI-NEXT: v_mad_f32 v3, -v3, v2, v4 +; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc +; VI-NEXT: v_mul_lo_u32 v0, v2, v0 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; EG-LABEL: urem24_i8: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 25, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_8 T1.X, T0.X, 1, #1 +; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: UINT_TO_FLT * T0.Y, T1.X, +; EG-NEXT: RECIP_IEEE * T0.Z, PS, +; EG-NEXT: UINT_TO_FLT * T0.W, T0.X, +; EG-NEXT: MUL_IEEE * T1.W, PS, T0.Z, +; EG-NEXT: TRUNC * T1.W, PV.W, +; EG-NEXT: MULADD_IEEE T0.W, -PV.W, T0.Y, T0.W, +; EG-NEXT: TRUNC * T1.W, PV.W, +; EG-NEXT: SETGE * T0.W, |PV.W|, T0.Y, +; EG-NEXT: CNDE T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: FLT_TO_UINT * T0.Y, T1.W, +; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.W, PS, PV.W, +; EG-NEXT: MULLO_INT * T0.Y, PV.W, T1.X, +; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, +; EG-NEXT: SUB_INT * T1.W, T0.X, PS, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PS, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45) +; EG-NEXT: LSHL T0.X, PV.W, PS, +; EG-NEXT: LSHL * T0.W, literal.x, PS, +; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: MOV * T0.Z, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %den_ptr = getelementptr i8, ptr addrspace(1) %in, i8 1 %num = load i8, ptr addrspace(1) %in %den = load i8, ptr addrspace(1) %den_ptr @@ -250,17 +1357,107 @@ define amdgpu_kernel void @urem24_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ret void } -; FUNC-LABEL: {{^}}urem24_i16: -; SI: v_cvt_f32_u32 -; SI: v_cvt_f32_u32 -; SI: v_rcp_iflag_f32 -; SI: v_cvt_u32_f32 - -; EG: UINT_TO_FLT -; EG-DAG: UINT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_UINT define amdgpu_kernel void @urem24_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; SI-LABEL: urem24_i16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:2 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_u32_e32 v2, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_u32_e32 v3, v1 +; SI-NEXT: v_rcp_iflag_f32_e32 v4, v3 +; SI-NEXT: v_mul_f32_e32 v4, v2, v4 +; SI-NEXT: v_trunc_f32_e32 v4, v4 +; SI-NEXT: v_fma_f32 v2, -v4, v3, v2 +; SI-NEXT: v_cvt_u32_f32_e32 v4, v4 +; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc +; SI-NEXT: v_mul_lo_u32 v1, v2, v1 +; SI-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: urem24_i16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2 +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cvt_f32_u32_e32 v2, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_u32_e32 v3, v1 +; VI-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; VI-NEXT: v_mul_f32_e32 v4, v3, v4 +; VI-NEXT: v_trunc_f32_e32 v4, v4 +; VI-NEXT: v_cvt_u32_f32_e32 v5, v4 +; VI-NEXT: v_mad_f32 v3, -v4, v2, v3 +; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc +; VI-NEXT: v_mul_lo_u32 v0, v2, v0 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; EG-LABEL: urem24_i16: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 25, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1 +; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: UINT_TO_FLT * T0.Y, T1.X, +; EG-NEXT: RECIP_IEEE * T0.Z, PS, +; EG-NEXT: UINT_TO_FLT * T0.W, T0.X, +; EG-NEXT: MUL_IEEE * T1.W, PS, T0.Z, +; EG-NEXT: TRUNC * T1.W, PV.W, +; EG-NEXT: MULADD_IEEE T0.W, -PV.W, T0.Y, T0.W, +; EG-NEXT: TRUNC * T1.W, PV.W, +; EG-NEXT: SETGE * T0.W, |PV.W|, T0.Y, +; EG-NEXT: CNDE T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: FLT_TO_UINT * T0.Y, T1.W, +; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.W, PS, PV.W, +; EG-NEXT: MULLO_INT * T0.Y, PV.W, T1.X, +; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, +; EG-NEXT: SUB_INT * T1.W, T0.X, PS, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PS, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) +; EG-NEXT: LSHL T0.X, PV.W, PS, +; EG-NEXT: LSHL * T0.W, literal.x, PS, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: MOV * T0.Z, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %den_ptr = getelementptr i16, ptr addrspace(1) %in, i16 1 %num = load i16, ptr addrspace(1) %in, align 2 %den = load i16, ptr addrspace(1) %den_ptr, align 2 @@ -269,10 +1466,90 @@ define amdgpu_kernel void @urem24_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ret void } -; FUNC-LABEL: {{^}}urem24_i32: -; SI-NOT: v_rcp_f32 -; EG-NOT: RECIP_IEEE define amdgpu_kernel void @urem24_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; SI-LABEL: urem24_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_and_b32 s6, s4, 0xffffff +; SI-NEXT: s_and_b32 s7, s5, 0xffffff +; SI-NEXT: v_cvt_f32_u32_e32 v0, s6 +; SI-NEXT: v_cvt_f32_u32_e32 v1, s7 +; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_mul_f32_e32 v2, v0, v2 +; SI-NEXT: v_trunc_f32_e32 v2, v2 +; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 +; SI-NEXT: v_cvt_u32_f32_e32 v2, v2 +; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; SI-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; SI-NEXT: v_mul_lo_u32 v0, v0, s5 +; SI-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: urem24_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_and_b32 s2, s5, 0xffffff +; VI-NEXT: v_cvt_f32_u32_e32 v0, s2 +; VI-NEXT: s_and_b32 s2, s4, 0xffffff +; VI-NEXT: v_cvt_f32_u32_e32 v1, s2 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_mul_f32_e32 v2, v1, v2 +; VI-NEXT: v_trunc_f32_e32 v2, v2 +; VI-NEXT: v_cvt_u32_f32_e32 v3, v2 +; VI-NEXT: v_mad_f32 v1, -v2, v0, v1 +; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; VI-NEXT: v_mul_lo_u32 v0, v0, s5 +; VI-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 +; VI-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; EG-LABEL: urem24_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 19, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: AND_INT * T0.W, T0.Y, literal.x, +; EG-NEXT: 16777215(2.350989e-38), 0(0.000000e+00) +; EG-NEXT: SUB_INT T1.W, 0.0, PV.W, +; EG-NEXT: RECIP_UINT * T0.Y, PV.W, +; EG-NEXT: MULLO_INT * T0.Z, PV.W, PS, +; EG-NEXT: MULHI * T0.Z, T0.Y, PS, +; EG-NEXT: ADD_INT T1.W, T0.Y, PS, +; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, +; EG-NEXT: 16777215(2.350989e-38), 0(0.000000e+00) +; EG-NEXT: MULHI * T0.X, PS, PV.W, +; EG-NEXT: MULLO_INT * T0.X, PS, T0.W, +; EG-NEXT: SUB_INT * T1.W, T2.W, PS, +; EG-NEXT: SETGE_UINT T2.W, PV.W, T0.W, +; EG-NEXT: SUB_INT * T3.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT * T1.W, PV.W, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.W, PV.W, T0.W, +; EG-NEXT: SUB_INT * T0.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.X, PV.W, T1.W, PS, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 %num = load i32, ptr addrspace(1) %in, align 4 %den = load i32, ptr addrspace(1) %den_ptr, align 4 @@ -285,14 +1562,105 @@ define amdgpu_kernel void @urem24_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ret void } -; FUNC-LABEL: {{^}}urem25_i32: ; RCP_IFLAG is for URECIP in the full 32b alg -; SI: v_rcp_iflag -; SI-NOT: v_rcp_f32 - -; EG-NOT: UINT_TO_FLT -; EG-NOT: RECIP_IEEE define amdgpu_kernel void @urem25_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; SI-LABEL: urem25_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_and_b32 s2, s4, 0x1ffffff +; SI-NEXT: s_and_b32 s4, s5, 0x1ffffff +; SI-NEXT: v_cvt_f32_u32_e32 v0, s4 +; SI-NEXT: s_sub_i32 s5, 0, s4 +; SI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 +; SI-NEXT: v_mul_lo_u32 v1, s5, v0 +; SI-NEXT: v_mul_hi_u32 v1, v0, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; SI-NEXT: v_mul_hi_u32 v0, s2, v0 +; SI-NEXT: v_readfirstlane_b32 s5, v0 +; SI-NEXT: s_mul_i32 s5, s5, s4 +; SI-NEXT: s_sub_i32 s2, s2, s5 +; SI-NEXT: s_sub_i32 s5, s2, s4 +; SI-NEXT: s_cmp_ge_u32 s2, s4 +; SI-NEXT: s_cselect_b32 s2, s5, s2 +; SI-NEXT: s_sub_i32 s5, s2, s4 +; SI-NEXT: s_cmp_ge_u32 s2, s4 +; SI-NEXT: s_cselect_b32 s4, s5, s2 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: urem25_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_and_b32 s4, s3, 0x1ffffff +; VI-NEXT: v_cvt_f32_u32_e32 v0, s4 +; VI-NEXT: s_sub_i32 s3, 0, s4 +; VI-NEXT: s_and_b32 s5, s2, 0x1ffffff +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 +; VI-NEXT: v_mul_lo_u32 v1, s3, v0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: v_mul_hi_u32 v1, v0, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: v_mul_hi_u32 v0, s5, v0 +; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: s_mul_i32 s6, s6, s4 +; VI-NEXT: s_sub_i32 s5, s5, s6 +; VI-NEXT: s_sub_i32 s6, s5, s4 +; VI-NEXT: s_cmp_ge_u32 s5, s4 +; VI-NEXT: s_cselect_b32 s5, s6, s5 +; VI-NEXT: s_sub_i32 s6, s5, s4 +; VI-NEXT: s_cmp_ge_u32 s5, s4 +; VI-NEXT: s_cselect_b32 s4, s6, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; EG-LABEL: urem25_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 19, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: AND_INT * T0.W, T0.Y, literal.x, +; EG-NEXT: 33554431(9.403954e-38), 0(0.000000e+00) +; EG-NEXT: SUB_INT T1.W, 0.0, PV.W, +; EG-NEXT: RECIP_UINT * T0.Y, PV.W, +; EG-NEXT: MULLO_INT * T0.Z, PV.W, PS, +; EG-NEXT: MULHI * T0.Z, T0.Y, PS, +; EG-NEXT: ADD_INT T1.W, T0.Y, PS, +; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, +; EG-NEXT: 33554431(9.403954e-38), 0(0.000000e+00) +; EG-NEXT: MULHI * T0.X, PS, PV.W, +; EG-NEXT: MULLO_INT * T0.X, PS, T0.W, +; EG-NEXT: SUB_INT * T1.W, T2.W, PS, +; EG-NEXT: SETGE_UINT T2.W, PV.W, T0.W, +; EG-NEXT: SUB_INT * T3.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT * T1.W, PV.W, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.W, PV.W, T0.W, +; EG-NEXT: SUB_INT * T0.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.X, PV.W, T1.W, PS, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 %num = load i32, ptr addrspace(1) %in, align 4 %den = load i32, ptr addrspace(1) %den_ptr, align 4 @@ -305,14 +1673,105 @@ define amdgpu_kernel void @urem25_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ret void } -; FUNC-LABEL: {{^}}test_no_urem24_i32_1: ; RCP_IFLAG is for URECIP in the full 32b alg -; SI: v_rcp_iflag -; SI-NOT: v_rcp_f32 - -; EG-NOT: UINT_TO_FLT -; EG-NOT: RECIP_IEEE define amdgpu_kernel void @test_no_urem24_i32_1(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; SI-LABEL: test_no_urem24_i32_1: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_and_b32 s2, s4, 0xffffff +; SI-NEXT: s_and_b32 s4, s5, 0x1ffffff +; SI-NEXT: v_cvt_f32_u32_e32 v0, s4 +; SI-NEXT: s_sub_i32 s5, 0, s4 +; SI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 +; SI-NEXT: v_mul_lo_u32 v1, s5, v0 +; SI-NEXT: v_mul_hi_u32 v1, v0, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; SI-NEXT: v_mul_hi_u32 v0, s2, v0 +; SI-NEXT: v_readfirstlane_b32 s5, v0 +; SI-NEXT: s_mul_i32 s5, s5, s4 +; SI-NEXT: s_sub_i32 s2, s2, s5 +; SI-NEXT: s_sub_i32 s5, s2, s4 +; SI-NEXT: s_cmp_ge_u32 s2, s4 +; SI-NEXT: s_cselect_b32 s2, s5, s2 +; SI-NEXT: s_sub_i32 s5, s2, s4 +; SI-NEXT: s_cmp_ge_u32 s2, s4 +; SI-NEXT: s_cselect_b32 s4, s5, s2 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_no_urem24_i32_1: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_and_b32 s4, s3, 0x1ffffff +; VI-NEXT: v_cvt_f32_u32_e32 v0, s4 +; VI-NEXT: s_sub_i32 s3, 0, s4 +; VI-NEXT: s_and_b32 s5, s2, 0xffffff +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 +; VI-NEXT: v_mul_lo_u32 v1, s3, v0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: v_mul_hi_u32 v1, v0, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: v_mul_hi_u32 v0, s5, v0 +; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: s_mul_i32 s6, s6, s4 +; VI-NEXT: s_sub_i32 s5, s5, s6 +; VI-NEXT: s_sub_i32 s6, s5, s4 +; VI-NEXT: s_cmp_ge_u32 s5, s4 +; VI-NEXT: s_cselect_b32 s5, s6, s5 +; VI-NEXT: s_sub_i32 s6, s5, s4 +; VI-NEXT: s_cmp_ge_u32 s5, s4 +; VI-NEXT: s_cselect_b32 s4, s6, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; EG-LABEL: test_no_urem24_i32_1: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 19, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: AND_INT * T0.W, T0.Y, literal.x, +; EG-NEXT: 33554431(9.403954e-38), 0(0.000000e+00) +; EG-NEXT: SUB_INT T1.W, 0.0, PV.W, +; EG-NEXT: RECIP_UINT * T0.Y, PV.W, +; EG-NEXT: MULLO_INT * T0.Z, PV.W, PS, +; EG-NEXT: MULHI * T0.Z, T0.Y, PS, +; EG-NEXT: ADD_INT T1.W, T0.Y, PS, +; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, +; EG-NEXT: 16777215(2.350989e-38), 0(0.000000e+00) +; EG-NEXT: MULHI * T0.X, PS, PV.W, +; EG-NEXT: MULLO_INT * T0.X, PS, T0.W, +; EG-NEXT: SUB_INT * T1.W, T2.W, PS, +; EG-NEXT: SETGE_UINT T2.W, PV.W, T0.W, +; EG-NEXT: SUB_INT * T3.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT * T1.W, PV.W, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.W, PV.W, T0.W, +; EG-NEXT: SUB_INT * T0.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.X, PV.W, T1.W, PS, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 %num = load i32, ptr addrspace(1) %in, align 4 %den = load i32, ptr addrspace(1) %den_ptr, align 4 @@ -325,14 +1784,105 @@ define amdgpu_kernel void @test_no_urem24_i32_1(ptr addrspace(1) %out, ptr addrs ret void } -; FUNC-LABEL: {{^}}test_no_urem24_i32_2: ; RCP_IFLAG is for URECIP in the full 32b alg -; SI: v_rcp_iflag -; SI-NOT: v_rcp_f32 - -; EG-NOT: UINT_TO_FLT -; EG-NOT: RECIP_IEEE define amdgpu_kernel void @test_no_urem24_i32_2(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; SI-LABEL: test_no_urem24_i32_2: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_and_b32 s2, s4, 0x1ffffff +; SI-NEXT: s_and_b32 s4, s5, 0xffffff +; SI-NEXT: v_cvt_f32_u32_e32 v0, s4 +; SI-NEXT: s_sub_i32 s5, 0, s4 +; SI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 +; SI-NEXT: v_mul_lo_u32 v1, s5, v0 +; SI-NEXT: v_mul_hi_u32 v1, v0, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; SI-NEXT: v_mul_hi_u32 v0, s2, v0 +; SI-NEXT: v_readfirstlane_b32 s5, v0 +; SI-NEXT: s_mul_i32 s5, s5, s4 +; SI-NEXT: s_sub_i32 s2, s2, s5 +; SI-NEXT: s_sub_i32 s5, s2, s4 +; SI-NEXT: s_cmp_ge_u32 s2, s4 +; SI-NEXT: s_cselect_b32 s2, s5, s2 +; SI-NEXT: s_sub_i32 s5, s2, s4 +; SI-NEXT: s_cmp_ge_u32 s2, s4 +; SI-NEXT: s_cselect_b32 s4, s5, s2 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_no_urem24_i32_2: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_and_b32 s4, s3, 0xffffff +; VI-NEXT: v_cvt_f32_u32_e32 v0, s4 +; VI-NEXT: s_sub_i32 s3, 0, s4 +; VI-NEXT: s_and_b32 s5, s2, 0x1ffffff +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 +; VI-NEXT: v_mul_lo_u32 v1, s3, v0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: v_mul_hi_u32 v1, v0, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: v_mul_hi_u32 v0, s5, v0 +; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: s_mul_i32 s6, s6, s4 +; VI-NEXT: s_sub_i32 s5, s5, s6 +; VI-NEXT: s_sub_i32 s6, s5, s4 +; VI-NEXT: s_cmp_ge_u32 s5, s4 +; VI-NEXT: s_cselect_b32 s5, s6, s5 +; VI-NEXT: s_sub_i32 s6, s5, s4 +; VI-NEXT: s_cmp_ge_u32 s5, s4 +; VI-NEXT: s_cselect_b32 s4, s6, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; EG-LABEL: test_no_urem24_i32_2: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 19, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: AND_INT * T0.W, T0.Y, literal.x, +; EG-NEXT: 16777215(2.350989e-38), 0(0.000000e+00) +; EG-NEXT: SUB_INT T1.W, 0.0, PV.W, +; EG-NEXT: RECIP_UINT * T0.Y, PV.W, +; EG-NEXT: MULLO_INT * T0.Z, PV.W, PS, +; EG-NEXT: MULHI * T0.Z, T0.Y, PS, +; EG-NEXT: ADD_INT T1.W, T0.Y, PS, +; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, +; EG-NEXT: 33554431(9.403954e-38), 0(0.000000e+00) +; EG-NEXT: MULHI * T0.X, PS, PV.W, +; EG-NEXT: MULLO_INT * T0.X, PS, T0.W, +; EG-NEXT: SUB_INT * T1.W, T2.W, PS, +; EG-NEXT: SETGE_UINT T2.W, PV.W, T0.W, +; EG-NEXT: SUB_INT * T3.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT * T1.W, PV.W, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.W, PV.W, T0.W, +; EG-NEXT: SUB_INT * T0.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.X, PV.W, T1.W, PS, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 %num = load i32, ptr addrspace(1) %in, align 4 %den = load i32, ptr addrspace(1) %den_ptr, align 4 @@ -345,12 +1895,85 @@ define amdgpu_kernel void @test_no_urem24_i32_2(ptr addrspace(1) %out, ptr addrs ret void } -; FUNC-LABEL: {{^}}test_udiv24_u16_u23_i32: -; SI: v_rcp_iflag_f32 -; SI: v_and_b32_e32 v{{[0-9]+}}, 0x7fffff, - -; EG: RECIP_IEEE define amdgpu_kernel void @test_udiv24_u16_u23_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; SI-LABEL: test_udiv24_u16_u23_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_and_b32 s5, s5, 0x7fffff +; SI-NEXT: v_cvt_f32_u32_e32 v0, s4 +; SI-NEXT: v_cvt_f32_u32_e32 v1, s5 +; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_mul_f32_e32 v2, v0, v2 +; SI-NEXT: v_trunc_f32_e32 v2, v2 +; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 +; SI-NEXT: v_cvt_u32_f32_e32 v2, v2 +; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; SI-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; SI-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_udiv24_u16_u23_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_and_b32 s3, s3, 0x7fffff +; VI-NEXT: v_cvt_f32_u32_e32 v0, s3 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: v_cvt_f32_u32_e32 v1, s2 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mul_f32_e32 v2, v1, v2 +; VI-NEXT: v_trunc_f32_e32 v2, v2 +; VI-NEXT: v_cvt_u32_f32_e32 v3, v2 +; VI-NEXT: v_mad_f32 v1, -v2, v0, v1 +; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; VI-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; EG-LABEL: test_udiv24_u16_u23_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 18, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: AND_INT * T0.W, T0.Y, literal.x, +; EG-NEXT: 8388607(1.175494e-38), 0(0.000000e+00) +; EG-NEXT: UINT_TO_FLT * T0.Y, PV.W, +; EG-NEXT: AND_INT T0.W, T0.X, literal.x, +; EG-NEXT: RECIP_IEEE * T0.X, PS, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: UINT_TO_FLT * T0.Z, PV.W, +; EG-NEXT: MUL_IEEE * T0.W, PS, T0.X, +; EG-NEXT: TRUNC * T0.W, PV.W, +; EG-NEXT: MULADD_IEEE T1.W, -PV.W, T0.Y, T0.Z, +; EG-NEXT: TRUNC * T0.W, PV.W, +; EG-NEXT: SETGE * T1.W, |PV.W|, T0.Y, +; EG-NEXT: CNDE T1.W, PV.W, 0.0, literal.x, +; EG-NEXT: FLT_TO_UINT * T0.X, T0.W, +; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.W, PS, PV.W, +; EG-NEXT: AND_INT T0.X, PV.W, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 8388607(1.175494e-38), 2(2.802597e-45) %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 %num = load i32, ptr addrspace(1) %in, align 4 %den = load i32, ptr addrspace(1) %den_ptr, align 4 @@ -363,12 +1986,85 @@ define amdgpu_kernel void @test_udiv24_u16_u23_i32(ptr addrspace(1) %out, ptr ad ret void } -; FUNC-LABEL: {{^}}test_udiv24_u23_u16_i32: -; SI: v_rcp_iflag_f32 -; SI: v_and_b32_e32 v{{[0-9]+}}, 0x7fffff, - -; EG: RECIP_IEEE define amdgpu_kernel void @test_udiv24_u23_u16_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; SI-LABEL: test_udiv24_u23_u16_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_and_b32 s4, s4, 0x7fffff +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_cvt_f32_u32_e32 v0, s4 +; SI-NEXT: v_cvt_f32_u32_e32 v1, s5 +; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_mul_f32_e32 v2, v0, v2 +; SI-NEXT: v_trunc_f32_e32 v2, v2 +; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 +; SI-NEXT: v_cvt_u32_f32_e32 v2, v2 +; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; SI-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; SI-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_udiv24_u23_u16_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_and_b32 s3, s3, 0xffff +; VI-NEXT: v_cvt_f32_u32_e32 v0, s3 +; VI-NEXT: s_and_b32 s2, s2, 0x7fffff +; VI-NEXT: v_cvt_f32_u32_e32 v1, s2 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mul_f32_e32 v2, v1, v2 +; VI-NEXT: v_trunc_f32_e32 v2, v2 +; VI-NEXT: v_cvt_u32_f32_e32 v3, v2 +; VI-NEXT: v_mad_f32 v1, -v2, v0, v1 +; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; VI-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; EG-LABEL: test_udiv24_u23_u16_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 18, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: AND_INT * T0.W, T0.Y, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: UINT_TO_FLT * T0.Y, PV.W, +; EG-NEXT: AND_INT T0.W, T0.X, literal.x, +; EG-NEXT: RECIP_IEEE * T0.X, PS, +; EG-NEXT: 8388607(1.175494e-38), 0(0.000000e+00) +; EG-NEXT: UINT_TO_FLT * T0.Z, PV.W, +; EG-NEXT: MUL_IEEE * T0.W, PS, T0.X, +; EG-NEXT: TRUNC * T0.W, PV.W, +; EG-NEXT: MULADD_IEEE T1.W, -PV.W, T0.Y, T0.Z, +; EG-NEXT: TRUNC * T0.W, PV.W, +; EG-NEXT: SETGE * T1.W, |PV.W|, T0.Y, +; EG-NEXT: CNDE T1.W, PV.W, 0.0, literal.x, +; EG-NEXT: FLT_TO_UINT * T0.X, T0.W, +; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.W, PS, PV.W, +; EG-NEXT: AND_INT T0.X, PV.W, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 8388607(1.175494e-38), 2(2.802597e-45) %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 %num = load i32, ptr addrspace(1) %in, align 4 %den = load i32, ptr addrspace(1) %den_ptr, align 4 |
