diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
24 files changed, 5846 insertions, 5280 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll index e411c23..7b5621f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -27,11 +27,11 @@ define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) { ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1 -; GCN-NEXT: v_mov_b32_e32 v0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0 %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) @@ -68,12 +68,12 @@ define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) { ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, v1 +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0 %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll index 1cd9c0b..2351c96 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll @@ -165,10 +165,10 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-NEXT: global_load_dword v3, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v2, 0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v2, v3, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm @@ -179,15 +179,15 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] +; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v3, v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v1, v0, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 637aaf7..7f10ee4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -546,10 +546,11 @@ define i64 @v_mul_i64(i64 %num, i64 %den) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v4, v0 -; GCN-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2] -; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GCN-NEXT: v_mov_b32_e32 v5, v2 +; GCN-NEXT: v_mov_b32_e32 v6, v1 +; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GCN-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2] +; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8] ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i64: @@ -742,10 +743,10 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0 ; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v4, v[0:1] ; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v3, 0 -; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v3, v[8:9] -; GCN-NEXT: v_mov_b32_e32 v2, v8 -; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v4, v[1:2] -; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[1:2] +; GCN-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v3, v[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, v10 +; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v4, v[1:2] +; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[8:9] ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i96: @@ -758,8 +759,8 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v6, v3, 0 ; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v2, v3, v[8:9] ; GFX10-NEXT: v_mov_b32_e32 v2, v8 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v6, v4, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v3, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v6, v4, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v3, v[8:9] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_mul_i96: @@ -771,8 +772,8 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v3, 0 ; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v2, v3, v[8:9] ; GFX11-NEXT: v_mov_b32_e32 v2, v9 -; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[1:2] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v6, v4, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[8:9] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_mul_i96: @@ -791,8 +792,8 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v3, v[8:9] ; GFX12-NEXT: v_mov_b32_e32 v2, v8 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v6, v4, v[1:2] -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v6, v4, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[8:9] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: v_mul_i96: @@ -808,10 +809,10 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX1250-NEXT: v_mad_u32 v9, v2, v3, v5 ; GFX1250-NEXT: v_mov_b32_e32 v8, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], v6, v4, v[8:9] -; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], v7, v3, v[4:5] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v6, v4, v[8:9] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], v7, v3, v[10:11] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3 +; GFX1250-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = mul i96 %num, %den ret i96 %result @@ -1071,18 +1072,20 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX7-NEXT: v_mov_b32_e32 v9, v1 ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX7-NEXT: v_mov_b32_e32 v10, v2 +; GFX7-NEXT: v_mov_b32_e32 v11, v3 +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v12, v4 +; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 +; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3] ; GFX7-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0 -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12] ; GFX7-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX7-NEXT: v_mov_b32_e32 v2, v11 -; GFX7-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2] -; GFX7-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc -; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7] -; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6] +; GFX7-NEXT: v_mov_b32_e32 v2, v13 +; GFX7-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2] +; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4] +; GFX7-NEXT: v_addc_u32_e64 v3, s[4:5], v14, v7, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc +; GFX7-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] +; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i128: @@ -1092,18 +1095,20 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX8-NEXT: v_mov_b32_e32 v10, v2 +; GFX8-NEXT: v_mov_b32_e32 v11, v3 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v12, v4 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 +; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3] ; GFX8-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0 -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12] ; GFX8-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX8-NEXT: v_mov_b32_e32 v2, v11 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2] -; GFX8-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7] -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6] +; GFX8-NEXT: v_mov_b32_e32 v2, v13 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4] +; GFX8-NEXT: v_addc_u32_e64 v3, s[4:5], v14, v7, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i128: @@ -1113,18 +1118,20 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX9-NEXT: v_mov_b32_e32 v9, v1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3] ; GFX9-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12] ; GFX9-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX9-NEXT: v_mov_b32_e32 v2, v11 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2] -; GFX9-NEXT: v_addc_co_u32_e64 v7, s[4:5], v12, v7, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v6, vcc -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v2, v13 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4] +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], v14, v7, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i128: @@ -1138,11 +1145,11 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX10-NEXT: v_mul_lo_u32 v6, v9, v6 ; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v9, v5, v[0:1] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v4, 0 -; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v10, v4, v[11:12] -; GFX10-NEXT: v_mov_b32_e32 v2, v11 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[1:2] -; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v12, v7, s4 +; GFX10-NEXT: v_mad_u64_u32 v[13:14], s4, v10, v4, v[11:12] +; GFX10-NEXT: v_mov_b32_e32 v2, v13 +; GFX10-NEXT: v_mad_u64_u32 v[11:12], vcc_lo, v8, v5, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[11:12] +; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v14, v7, s4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v6, vcc_lo ; GFX10-NEXT: v_mad_u64_u32 v[5:6], s4, v10, v5, v[6:7] ; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v4, v[5:6] @@ -1155,15 +1162,16 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX11-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v4 ; GFX11-NEXT: v_mov_b32_e32 v12, v3 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v6, 0 -; GFX11-NEXT: v_mul_lo_u32 v4, v9, v6 -; GFX11-NEXT: v_mul_lo_u32 v6, v8, v7 +; GFX11-NEXT: v_mul_lo_u32 v7, v8, v7 +; GFX11-NEXT: v_mul_lo_u32 v6, v9, v6 ; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v9, v5, v[0:1] ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v11, 0 -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v10, v11, v[2:3] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[1:2] -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, s0 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v4, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v10, v11, v[2:3] +; GFX11-NEXT: v_mov_b32_e32 v2, v13 +; GFX11-NEXT: v_mad_u64_u32 v[3:4], vcc_lo, v8, v5, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[3:4] +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v14, v7, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, vcc_lo ; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v10, v5, v[3:4] ; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v12, v11, v[6:7] ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1184,14 +1192,14 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v9, v5, v[0:1] ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v4, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v10, v4, v[11:12] +; GFX12-NEXT: v_mad_co_u64_u32 v[13:14], null, v10, v4, v[11:12] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v2, v11 -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] +; GFX12-NEXT: v_mov_b32_e32 v2, v13 +; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], vcc_lo, v8, v5, v[1:2] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[11:12] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v12, v7, s0 +; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v14, v7, s0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, v7, v6, vcc_lo @@ -1210,16 +1218,16 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v9, v5, v[0:1] ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v8, v4, 0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v2, v4, v[10:11] -; GFX1250-NEXT: v_mov_b32_e32 v12, v1 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[12:13], v2, v4, v[10:11] +; GFX1250-NEXT: v_mov_b32_e32 v10, v1 ; GFX1250-NEXT: v_mul_lo_u32 v1, v9, v6 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mov_b32_e32 v13, v10 -; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], vcc_lo, v8, v5, v[12:13] +; GFX1250-NEXT: v_mov_b32_e32 v11, v12 +; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], vcc_lo, v8, v5, v[10:11] ; GFX1250-NEXT: v_mul_lo_u32 v8, v8, v7 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[12:13] -; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v11, v8, s0 +; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[14:15] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v13, v8, s0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v8, v1, vcc_lo ; GFX1250-NEXT: v_mad_u32 v1, v2, v5, v1 @@ -2401,207 +2409,216 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] +; GFX7-NEXT: v_mul_lo_u32 v28, v3, v12 +; GFX7-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21] ; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17] -; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX7-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21] -; GFX7-NEXT: v_addc_u32_e32 v23, vcc, 0, v22, vcc -; GFX7-NEXT: v_mov_b32_e32 v22, v18 -; GFX7-NEXT: v_mov_b32_e32 v18, v19 -; GFX7-NEXT: v_mov_b32_e32 v19, v16 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX7-NEXT: v_mul_lo_u32 v16, v6, v9 +; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23] +; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19] +; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23] +; GFX7-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21] +; GFX7-NEXT: v_addc_u32_e32 v26, vcc, 0, v24, vcc +; GFX7-NEXT: v_mov_b32_e32 v21, v22 +; GFX7-NEXT: v_mov_b32_e32 v22, v23 +; GFX7-NEXT: v_mov_b32_e32 v23, v18 +; GFX7-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23] +; GFX7-NEXT: v_mul_lo_u32 v18, v6, v9 ; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21] -; GFX7-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v6, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] -; GFX7-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX7-NEXT: v_mul_lo_u32 v27, v3, v12 -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] -; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22] -; GFX7-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX7-NEXT: v_mul_lo_u32 v28, v2, v13 -; GFX7-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12] +; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17] +; GFX7-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v6, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25] +; GFX7-NEXT: v_mov_b32_e32 v20, v23 +; GFX7-NEXT: v_mul_lo_u32 v25, v4, v11 +; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24] +; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17] +; GFX7-NEXT: v_addc_u32_e64 v24, s[10:11], 0, v23, s[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12] ; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX7-NEXT: v_addc_u32_e64 v22, s[10:11], 0, v6, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19] -; GFX7-NEXT: v_mov_b32_e32 v21, v20 -; GFX7-NEXT: v_mov_b32_e32 v20, v11 -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] -; GFX7-NEXT: v_addc_u32_e64 v2, s[10:11], 0, v22, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] -; GFX7-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v2, s[10:11] +; GFX7-NEXT: v_mov_b32_e32 v12, v22 +; GFX7-NEXT: v_mul_lo_u32 v2, v2, v13 +; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12] +; GFX7-NEXT: v_addc_u32_e64 v13, s[10:11], 0, v24, s[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17] +; GFX7-NEXT: v_addc_u32_e64 v16, s[10:11], 0, v13, s[10:11] ; GFX7-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21] -; GFX7-NEXT: v_addc_u32_e64 v3, s[10:11], v12, v3, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23] +; GFX7-NEXT: v_addc_u32_e64 v3, s[10:11], v3, v4, s[10:11] ; GFX7-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX7-NEXT: v_addc_u32_e64 v4, s[10:11], v24, v4, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v5, s[10:11], v11, v5, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v6, s[10:11], v23, v6, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v17, v0, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v4, s[10:11], v6, v5, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v5, s[10:11], v16, v11, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v6, s[10:11], v26, v12, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v19, v0, s[10:11] ; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v28, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v27, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7] -; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc -; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v2, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v28, s[8:9] +; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v25, s[6:7] +; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v18, vcc +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v0, v10 +; GFX7-NEXT: v_mov_b32_e32 v1, v13 +; GFX7-NEXT: v_mov_b32_e32 v2, v14 +; GFX7-NEXT: v_mov_b32_e32 v7, v11 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i256: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] +; GFX8-NEXT: v_mul_lo_u32 v28, v3, v12 +; GFX8-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21] ; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17] -; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21] -; GFX8-NEXT: v_addc_u32_e32 v23, vcc, 0, v22, vcc -; GFX8-NEXT: v_mov_b32_e32 v22, v18 -; GFX8-NEXT: v_mov_b32_e32 v18, v19 -; GFX8-NEXT: v_mov_b32_e32 v19, v16 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX8-NEXT: v_mul_lo_u32 v16, v6, v9 +; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23] +; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19] +; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23] +; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21] +; GFX8-NEXT: v_addc_u32_e32 v26, vcc, 0, v24, vcc +; GFX8-NEXT: v_mov_b32_e32 v21, v22 +; GFX8-NEXT: v_mov_b32_e32 v22, v23 +; GFX8-NEXT: v_mov_b32_e32 v23, v18 +; GFX8-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23] +; GFX8-NEXT: v_mul_lo_u32 v18, v6, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21] -; GFX8-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v6, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] -; GFX8-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX8-NEXT: v_mul_lo_u32 v27, v3, v12 -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22] -; GFX8-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX8-NEXT: v_mul_lo_u32 v28, v2, v13 -; GFX8-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12] +; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17] +; GFX8-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v6, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25] +; GFX8-NEXT: v_mov_b32_e32 v20, v23 +; GFX8-NEXT: v_mul_lo_u32 v25, v4, v11 +; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24] +; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11] +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17] +; GFX8-NEXT: v_addc_u32_e64 v24, s[10:11], 0, v23, s[10:11] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12] ; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX8-NEXT: v_addc_u32_e64 v22, s[10:11], 0, v6, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19] -; GFX8-NEXT: v_mov_b32_e32 v21, v20 -; GFX8-NEXT: v_mov_b32_e32 v20, v11 -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] -; GFX8-NEXT: v_addc_u32_e64 v2, s[10:11], 0, v22, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] -; GFX8-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v2, s[10:11] +; GFX8-NEXT: v_mov_b32_e32 v12, v22 +; GFX8-NEXT: v_mul_lo_u32 v2, v2, v13 +; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12] +; GFX8-NEXT: v_addc_u32_e64 v13, s[10:11], 0, v24, s[10:11] +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17] +; GFX8-NEXT: v_addc_u32_e64 v16, s[10:11], 0, v13, s[10:11] ; GFX8-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21] -; GFX8-NEXT: v_addc_u32_e64 v3, s[10:11], v12, v3, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23] +; GFX8-NEXT: v_addc_u32_e64 v3, s[10:11], v3, v4, s[10:11] ; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX8-NEXT: v_addc_u32_e64 v4, s[10:11], v24, v4, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v5, s[10:11], v11, v5, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v6, s[10:11], v23, v6, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v17, v0, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v4, s[10:11], v6, v5, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v5, s[10:11], v16, v11, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v6, s[10:11], v26, v12, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v19, v0, s[10:11] ; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v28, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v27, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7] -; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc -; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v2, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v28, s[8:9] +; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v25, s[6:7] +; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v18, vcc +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, v10 +; GFX8-NEXT: v_mov_b32_e32 v1, v13 +; GFX8-NEXT: v_mov_b32_e32 v2, v14 +; GFX8-NEXT: v_mov_b32_e32 v7, v11 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i256: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] +; GFX9-NEXT: v_mul_lo_u32 v28, v3, v12 +; GFX9-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21] ; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v22, vcc -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v22, vcc -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v22, vcc -; GFX9-NEXT: v_mov_b32_e32 v22, v18 -; GFX9-NEXT: v_mov_b32_e32 v18, v19 -; GFX9-NEXT: v_mov_b32_e32 v19, v16 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX9-NEXT: v_mul_lo_u32 v16, v6, v9 +; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23] +; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19] +; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v24, vcc +; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23] +; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v24, vcc +; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21] +; GFX9-NEXT: v_addc_co_u32_e32 v26, vcc, 0, v24, vcc +; GFX9-NEXT: v_mov_b32_e32 v21, v22 +; GFX9-NEXT: v_mov_b32_e32 v22, v23 +; GFX9-NEXT: v_mov_b32_e32 v23, v18 +; GFX9-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23] +; GFX9-NEXT: v_mul_lo_u32 v18, v6, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e64 v24, s[4:5], 0, v6, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] -; GFX9-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX9-NEXT: v_mul_lo_u32 v27, v3, v12 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22] -; GFX9-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX9-NEXT: v_mul_lo_u32 v28, v2, v13 -; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12] +; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17] +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, v6, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25] +; GFX9-NEXT: v_mov_b32_e32 v20, v23 +; GFX9-NEXT: v_mul_lo_u32 v25, v4, v11 +; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24] +; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17] +; GFX9-NEXT: v_addc_co_u32_e64 v24, s[10:11], 0, v23, s[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12] ; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX9-NEXT: v_addc_co_u32_e64 v22, s[10:11], 0, v6, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19] -; GFX9-NEXT: v_mov_b32_e32 v21, v20 -; GFX9-NEXT: v_mov_b32_e32 v20, v11 -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e64 v2, s[10:11], 0, v22, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[10:11], 0, v2, s[10:11] +; GFX9-NEXT: v_mov_b32_e32 v12, v22 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, v13 +; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12] +; GFX9-NEXT: v_addc_co_u32_e64 v13, s[10:11], 0, v24, s[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17] +; GFX9-NEXT: v_addc_co_u32_e64 v16, s[10:11], 0, v13, s[10:11] ; GFX9-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[10:11], v12, v3, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23] +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v4, s[10:11] ; GFX9-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[10:11], v24, v4, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[10:11], v11, v5, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[10:11], v23, v6, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v17, v0, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[10:11], v6, v5, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[10:11], v16, v11, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[10:11], v26, v12, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v19, v0, s[10:11] ; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v28, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v27, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v26, s[6:7] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v25, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v16, vcc -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v2, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v28, s[8:9] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v25, s[6:7] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v27, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v18, vcc +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v10 +; GFX9-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-NEXT: v_mov_b32_e32 v7, v11 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i256: @@ -2609,68 +2626,69 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v16, v0 ; GFX10-NEXT: v_mov_b32_e32 v17, v1 -; GFX10-NEXT: v_mul_lo_u32 v27, v6, v9 -; GFX10-NEXT: v_mul_lo_u32 v28, v5, v10 +; GFX10-NEXT: v_mul_lo_u32 v29, v4, v11 +; GFX10-NEXT: v_mul_lo_u32 v31, v3, v12 +; GFX10-NEXT: v_mul_lo_u32 v30, v2, v13 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v14, 0 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v16, v12, 0 -; GFX10-NEXT: v_mul_lo_u32 v30, v17, v14 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v13, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v12, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v11, v[18:19] -; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s4 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v3, v11, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v16, v10, 0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v10, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v5, v9, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v6, v8, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[20:21] -; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4 -; GFX10-NEXT: v_mov_b32_e32 v20, v22 -; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] -; GFX10-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v13, v[19:20] -; GFX10-NEXT: v_mov_b32_e32 v20, v18 +; GFX10-NEXT: v_mul_lo_u32 v28, v17, v14 +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v13, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v12, 0 +; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v2, v12, v[18:19] +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v11, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, 1, s4 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v3, v11, v[20:21] +; GFX10-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19] +; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v22, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v4, v10, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] +; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v22, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v5, v9, v[18:19] +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v16, v10, 0 +; GFX10-NEXT: v_mad_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1] +; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v26, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[24:25], s4, v6, v8, v[20:21] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[18:19] +; GFX10-NEXT: v_mov_b32_e32 v18, v23 +; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4 +; GFX10-NEXT: v_mul_lo_u32 v23, v6, v9 +; GFX10-NEXT: v_mov_b32_e32 v19, v24 +; GFX10-NEXT: v_mul_lo_u32 v24, v5, v10 +; GFX10-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v8, v[0:1] +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v27, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v13, v[18:19] ; GFX10-NEXT: v_mov_b32_e32 v19, v22 -; GFX10-NEXT: v_mul_lo_u32 v22, v16, v15 -; GFX10-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v16, v11, v[19:20] +; GFX10-NEXT: v_mul_lo_u32 v27, v16, v15 +; GFX10-NEXT: v_mov_b32_e32 v18, v21 +; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v16, v11, v[18:19] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v16, v8, 0 -; GFX10-NEXT: v_mul_lo_u32 v20, v4, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s5, v2, v11, v[24:25] -; GFX10-NEXT: v_mul_lo_u32 v25, v3, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v32, 0, 1, s6 +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s5, v2, v11, v[21:22] ; GFX10-NEXT: v_mad_u64_u32 v[11:12], s6, v17, v10, v[14:15] -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6 -; GFX10-NEXT: v_mul_lo_u32 v24, v2, v13 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s7, v3, v10, v[18:19] ; GFX10-NEXT: v_mov_b32_e32 v13, v1 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s6, v2, v9, v[11:12] -; GFX10-NEXT: v_mov_b32_e32 v14, v21 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6 -; GFX10-NEXT: v_mad_u64_u32 v[10:11], s6, v4, v9, v[18:19] -; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v16, v9, v[13:14] -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s8 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s8, v3, v8, v[1:2] -; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s8, 0, v6, s8 -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s8, v5, v8, v[10:11] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[12:13] -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v9, v3, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v29, v4, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v14, v5, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v26, v6, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v23, v22, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v30, s8 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v24, s6 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v25, s7 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v20, s5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v27, s4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v32, s6 +; GFX10-NEXT: v_mov_b32_e32 v14, v20 +; GFX10-NEXT: v_mad_u64_u32 v[21:22], s7, v3, v10, v[18:19] +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s6, v2, v9, v[11:12] +; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s6, 0, v1, s6 +; GFX10-NEXT: v_mad_u64_u32 v[10:11], s8, v16, v9, v[13:14] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s6, v4, v9, v[21:22] +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s8 +; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v3, v8, v[18:19] +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, 0, v15, s8 +; GFX10-NEXT: v_mad_u64_u32 v[14:15], s8, v5, v8, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[10:11] +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v4, v12, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v6, v13, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v9, v14, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v26, v15, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v25, v27, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v28, s8 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v30, s6 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v31, s7 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v29, s5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v24, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v23, s4 ; GFX10-NEXT: v_mad_u64_u32 v[7:8], s4, v7, v8, v[9:10] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2681,66 +2699,65 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX11-NEXT: v_dual_mov_b32 v18, v8 :: v_dual_mov_b32 v19, v7 ; GFX11-NEXT: v_mul_lo_u32 v30, v4, v11 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v14, 0 -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v16, v12, 0 +; GFX11-NEXT: v_mul_lo_u32 v28, v16, v15 ; GFX11-NEXT: v_mul_lo_u32 v29, v17, v14 -; GFX11-NEXT: v_mul_lo_u32 v28, v5, v10 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v17, v13, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], s0, v17, v11, v[7:8] -; GFX11-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v12, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v2, v10, v[7:8] -; GFX11-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v16, v10, 0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v3, v9, v[7:8] -; GFX11-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v10, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v4, v18, v[7:8] -; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v24, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[22:23], null, v6, v18, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[20:21] -; GFX11-NEXT: v_mov_b32_e32 v20, v8 -; GFX11-NEXT: v_cndmask_b32_e64 v26, 0, 1, s0 -; GFX11-NEXT: v_mov_b32_e32 v21, v22 -; GFX11-NEXT: v_mul_lo_u32 v22, v6, v9 -; GFX11-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v2, v18, v[0:1] +; GFX11-NEXT: v_mul_lo_u32 v32, v3, v12 +; GFX11-NEXT: v_mul_lo_u32 v31, v2, v13 +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v17, v13, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v12, 0 +; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v2, v12, v[7:8] +; GFX11-NEXT: v_mad_u64_u32 v[7:8], s0, v17, v11, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[20:21] +; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v10, v[7:8] +; GFX11-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v4, v10, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] +; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v5, v9, v[7:8] +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v16, v10, 0 +; GFX11-NEXT: v_mad_u64_u32 v[22:23], vcc_lo, v4, v18, v[0:1] ; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[20:21] -; GFX11-NEXT: v_mov_b32_e32 v6, v25 -; GFX11-NEXT: v_mul_lo_u32 v25, v16, v15 -; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v17, v12, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v16, v11, v[6:7] +; GFX11-NEXT: v_mad_u64_u32 v[24:25], null, v6, v18, v[20:21] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[7:8] +; GFX11-NEXT: v_mov_b32_e32 v7, v23 +; GFX11-NEXT: v_cndmask_b32_e64 v27, 0, 1, s0 +; GFX11-NEXT: v_mul_lo_u32 v23, v6, v9 +; GFX11-NEXT: v_mov_b32_e32 v8, v24 +; GFX11-NEXT: v_mul_lo_u32 v24, v5, v10 +; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v18, v[0:1] +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[7:8] +; GFX11-NEXT: v_dual_mov_b32 v7, v22 :: v_dual_mov_b32 v6, v21 +; GFX11-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[14:15], s2, v16, v11, v[6:7] ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v18, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s2 -; GFX11-NEXT: v_mad_u64_u32 v[14:15], s1, v2, v11, v[20:21] -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v17, v10, v[6:7] -; GFX11-NEXT: v_mul_lo_u32 v20, v2, v13 -; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, 0, v8, s2 -; GFX11-NEXT: v_mov_b32_e32 v11, v1 -; GFX11-NEXT: v_mad_u64_u32 v[13:14], s3, v3, v10, v[14:15] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v2, v9, v[6:7] -; GFX11-NEXT: v_mul_lo_u32 v21, v3, v12 -; GFX11-NEXT: v_mov_b32_e32 v12, v24 -; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v8, s2 -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v4, v9, v[13:14] -; GFX11-NEXT: v_mad_u64_u32 v[8:9], s4, v16, v9, v[11:12] -; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v18, v[1:2] -; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v10, s4 -; GFX11-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v18, v[6:7] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v18, v[8:9] -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v11, v3, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v26, v4, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v10, v5, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v27, v6, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v23, v25, s5 +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s1, v2, v11, v[21:22] +; GFX11-NEXT: v_mad_u64_u32 v[11:12], s2, v17, v10, v[14:15] +; GFX11-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v20 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v8, s2 +; GFX11-NEXT: v_mad_u64_u32 v[21:22], s3, v3, v10, v[6:7] +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v2, v9, v[11:12] +; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v1, s2 +; GFX11-NEXT: v_mad_u64_u32 v[10:11], s4, v16, v9, v[13:14] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v4, v9, v[21:22] +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 +; GFX11-NEXT: v_mad_u64_u32 v[8:9], s4, v3, v18, v[6:7] +; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v12, s4 +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s4, v5, v18, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v18, v[10:11] +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v4, v8, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v27, v9, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v12, v6, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v7, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v25, v28, s5 ; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v29, s4 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v20, s2 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v21, s3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v31, s2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v32, s3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v30, s1 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v28, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v22, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v24, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v23, s0 ; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v19, v18, v[9:10] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2752,101 +2769,103 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 -; GFX12-NEXT: v_mul_lo_u32 v27, v6, v9 -; GFX12-NEXT: v_mul_lo_u32 v28, v5, v10 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mul_lo_u32 v29, v4, v11 +; GFX12-NEXT: v_mul_lo_u32 v31, v3, v12 +; GFX12-NEXT: v_mul_lo_u32 v30, v2, v13 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v14, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v12, 0 -; GFX12-NEXT: v_mul_lo_u32 v30, v17, v14 -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19] +; GFX12-NEXT: v_mul_lo_u32 v28, v17, v14 +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v17, v13, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v12, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v2, v12, v[18:19] +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[0:1] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v16, v10, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[20:21] +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21] -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0 -; GFX12-NEXT: v_mov_b32_e32 v20, v22 +; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v4, v10, v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v25, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20] +; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v5, v9, v[18:19] +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v10, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], null, v6, v8, v[20:21] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[18:19] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_mov_b32_e32 v18, v23 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: v_cndmask_b32_e64 v27, 0, 1, s0 +; GFX12-NEXT: v_mul_lo_u32 v23, v6, v9 +; GFX12-NEXT: v_mov_b32_e32 v19, v24 +; GFX12-NEXT: v_mul_lo_u32 v24, v5, v10 +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v8, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v27, vcc_lo +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[18:19] ; GFX12-NEXT: v_mov_b32_e32 v19, v22 -; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] +; GFX12-NEXT: v_mul_lo_u32 v27, v16, v15 +; GFX12-NEXT: v_mov_b32_e32 v18, v21 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[18:19] ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0 -; GFX12-NEXT: v_mov_b32_e32 v20, v18 -; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[19:20] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[24:25] -; GFX12-NEXT: v_mul_lo_u32 v20, v4, v11 -; GFX12-NEXT: v_mul_lo_u32 v25, v3, v12 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 -; GFX12-NEXT: v_mul_lo_u32 v24, v2, v13 +; GFX12-NEXT: v_cndmask_b32_e64 v32, 0, 1, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[21:22] ; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s2, v17, v10, v[14:15] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s3, v3, v10, v[18:19] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v20 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2 -; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v21 +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v32, s2 +; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], s3, v3, v10, v[18:19] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12] +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s2, v2, v9, v[11:12] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2 -; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s2, v4, v9, v[18:19] -; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v16, v9, v[13:14] +; GFX12-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v1, s2 +; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s4, v16, v9, v[13:14] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v4, v9, v[21:22] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4 -; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s4, v3, v8, v[1:2] +; GFX12-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 +; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v3, v8, v[18:19] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v14, null, 0, v6, s4 -; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11] -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13] +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v15, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s4, v5, v8, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[10:11] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v4, v12, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v6, v13, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v14, v5, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v9, v14, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v6, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v15, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v23, v22, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v30, s4 -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v24, s2 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v25, v27, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, s4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v25, s3 -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v20, s1 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v30, s2 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v31, s3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v29, s1 ; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v24, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, vcc_lo -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v27, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v23, s0 ; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2855,87 +2874,89 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 -; GFX1250-NEXT: v_mul_lo_u32 v27, v5, v10 -; GFX1250-NEXT: v_mul_lo_u32 v29, v3, v12 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mul_lo_u32 v30, v4, v11 +; GFX1250-NEXT: v_mul_lo_u32 v29, v5, v10 +; GFX1250-NEXT: v_mul_lo_u32 v31, v3, v12 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v14, 0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v16, v12, 0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v17, v13, v[0:1] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19] -; GFX1250-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v2, v12, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo -; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v16, v10, 0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v3, v11, v[0:1] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] -; GFX1250-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo +; GFX1250-NEXT: v_mul_lo_u32 v32, v2, v13 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v17, v13, v[0:1] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v12, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v2, v12, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v3, v11, v[20:21] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v4, v10, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v4, v10, v[0:1] +; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v5, v9, v[0:1] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23], v6, v8, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21] -; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22 -; GFX1250-NEXT: v_mul_lo_u32 v22, v6, v9 -; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo +; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v5, v9, v[18:19] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v16, v10, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v28, null, 0, v26, vcc_lo ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[20:21] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[24:25], v6, v8, v[20:21] +; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[18:19] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_dual_mov_b32 v18, v23 :: v_dual_mov_b32 v19, v24 +; GFX1250-NEXT: v_mul_lo_u32 v24, v6, v9 +; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v2, v8, v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s0 ; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v12, v[20:21] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_dual_mov_b32 v20, v25 :: v_dual_mov_b32 v21, v18 -; GFX1250-NEXT: v_mul_lo_u32 v25, v4, v11 -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v16, v11, v[20:21] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_cndmask_b32_e64 v28, 0, 1, s2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22 +; GFX1250-NEXT: v_mov_b32_e32 v13, v18 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], s2, v16, v11, v[20:21] ; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[0:1] +; GFX1250-NEXT: v_cndmask_b32_e64 v11, 0, 1, s2 ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v8, 0 -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v17, v10, v[18:19] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21] -; GFX1250-NEXT: v_mul_lo_u32 v20, v2, v13 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v28, s2 -; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19] -; GFX1250-NEXT: v_dual_mov_b32 v18, v1 :: v_dual_mov_b32 v19, v24 -; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v21, s2 -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v16, v9, v[18:19] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_co_u64_u32 v[26:27], s2, v17, v10, v[22:23] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v33, null, 0, v11, s2 +; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], s3, v3, v10, v[20:21] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mov_b32_e32 v12, v1 +; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s2, v2, v9, v[26:27] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s6, v16, v9, v[12:13] +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s4, v4, v9, v[22:23] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v33, s2 ; GFX1250-NEXT: v_mul_lo_u32 v2, v16, v15 -; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13] -; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 ; GFX1250-NEXT: v_mul_lo_u32 v9, v17, v14 -; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11] +; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[10:11] +; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s2 -; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[20:21] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2 ; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v6, v13, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v1, v10, s2 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v26, v11, s2 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v28, v11, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v23, v2, s2 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v25, v2, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, v15 ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v20, s4 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v29, s3 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v32, s4 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v31, s3 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v25, s1 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v27, s0 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v30, s1 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v29, s0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v22, vcc_lo +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v24, vcc_lo ; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v14 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -2949,60 +2970,60 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: buffer_load_dword v2, v[2:3], s[0:3], 0 addr64 -; GFX7-NEXT: v_mov_b32_e32 v3, 0x50 +; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 +; GFX7-NEXT: v_mov_b32_e32 v5, 0x50 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v3, 0 +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 ; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_mul_u64_zext_with_vregs: ; GFX8: ; %bb.0: -; GFX8-NEXT: flat_load_dword v2, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v3, 0x50 +; GFX8-NEXT: flat_load_dword v4, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v5, 0x50 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: s_mul_u64_zext_with_vregs: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v3, 0x50 +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, 0x50 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_u64_zext_with_vregs: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v2, v[2:3], off +; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v2, 0 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v4, 0 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_mul_u64_zext_with_vregs: ; GFX11: ; %bb.0: -; GFX11-NEXT: global_load_b32 v2, v[2:3], off +; GFX11-NEXT: global_load_b32 v4, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v2, 0 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v4, 0 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: s_mul_u64_zext_with_vregs: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_b32 v2, v[2:3], off +; GFX12-NEXT: global_load_b32 v4, v[2:3], off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v4, 0 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-NEXT: s_endpgm ; ; GFX1250-LABEL: s_mul_u64_zext_with_vregs: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: global_load_b32 v2, v[2:3], off +; GFX1250-NEXT: global_load_b32 v4, v[2:3], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v2, 0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v4, 0 ; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX1250-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 @@ -3130,33 +3151,36 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 -; GFX7-NEXT: v_mov_b32_e32 v5, 0x50 +; GFX7-NEXT: v_mov_b32_e32 v6, 0x50 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 -; GFX7-NEXT: v_ashrrev_i32_e32 v4, 31, v4 -; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v5, v[3:4] +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v6, 0 +; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v4 +; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v6, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_mul_u64_sext_with_vregs: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v4, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v5, 0x50 +; GFX8-NEXT: v_mov_b32_e32 v6, 0x50 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 -; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v4 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0 +; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v4 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: s_mul_u64_sext_with_vregs: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v4, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v5, 0x50 +; GFX9-NEXT: v_mov_b32_e32 v6, 0x50 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 -; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v4 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0 +; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v4 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4] +; GFX9-NEXT: v_mov_b32_e32 v3, v4 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm ; @@ -3183,17 +3207,17 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; ; GFX12-LABEL: s_mul_u64_sext_with_vregs: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_b32 v2, v[2:3], off +; GFX12-NEXT: global_load_b32 v4, v[2:3], off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0 +; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v4, 0 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-NEXT: s_endpgm ; ; GFX1250-LABEL: s_mul_u64_sext_with_vregs: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: global_load_b32 v2, v[2:3], off +; GFX1250-NEXT: global_load_b32 v4, v[2:3], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v2, 0 +; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v4, 0 ; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX1250-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 4f2c454..2843f72 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -31,128 +31,128 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_xor_b32_e32 v1, v3, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v2 ; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1 -; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v2 -; CHECK-NEXT: v_subb_u32_e32 v11, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v2 +; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v6 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v3 ; CHECK-NEXT: v_trunc_f32_e32 v8, v6 ; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v8 -; CHECK-NEXT: v_cvt_u32_f32_e32 v9, v3 -; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v8 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v14, v8 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v11, 0 ; CHECK-NEXT: v_mov_b32_e32 v3, v7 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6 -; CHECK-NEXT: v_mul_lo_u32 v13, v9, v7 -; CHECK-NEXT: v_mul_lo_u32 v14, v12, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v14, v6 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] +; CHECK-NEXT: v_mul_hi_u32 v7, v11, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v14, v6 +; CHECK-NEXT: v_mul_lo_u32 v8, v11, v9 +; CHECK-NEXT: v_mul_lo_u32 v10, v14, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v11, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v13, v3 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v14, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; CHECK-NEXT: v_mul_hi_u32 v7, v12, v7 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_mul_hi_u32 v8, v14, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v3 -; CHECK-NEXT: v_addc_u32_e32 v12, vcc, v12, v6, vcc -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v3 +; CHECK-NEXT: v_addc_u32_e32 v14, vcc, v14, v6, vcc +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v11, 0 ; CHECK-NEXT: v_mov_b32_e32 v3, v7 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v10, 31, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v10 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v10, vcc -; CHECK-NEXT: v_xor_b32_e32 v8, v3, v10 -; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6 -; CHECK-NEXT: v_mul_lo_u32 v5, v9, v7 -; CHECK-NEXT: v_xor_b32_e32 v11, v4, v10 -; CHECK-NEXT: v_mul_hi_u32 v4, v9, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v12 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v12, vcc +; CHECK-NEXT: v_xor_b32_e32 v10, v3, v12 +; CHECK-NEXT: v_mul_lo_u32 v3, v14, v6 +; CHECK-NEXT: v_mul_lo_u32 v5, v11, v9 +; CHECK-NEXT: v_xor_b32_e32 v13, v4, v12 +; CHECK-NEXT: v_mul_hi_u32 v4, v11, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v14, v6 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v12, v7 +; CHECK-NEXT: v_mul_lo_u32 v4, v14, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v5, v9, v7 +; CHECK-NEXT: v_mul_hi_u32 v5, v11, v9 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v12, v7 +; CHECK-NEXT: v_mul_hi_u32 v6, v14, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v11, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, v8, v4 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v11, v3 -; CHECK-NEXT: v_mul_hi_u32 v9, v11, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v11, v3 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v13, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v10, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, v10, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v13, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v11, v4 +; CHECK-NEXT: v_mul_lo_u32 v7, v13, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v8, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v4 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v3, v5 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v7, 0 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v7, v13, v4 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v9, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v9, v5 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v7, v[4:5] -; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v11, v4, vcc -; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v11, v4 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v7, v5 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[4:5] +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v10, v3 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v9, v[5:6] +; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v7, vcc +; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v7 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 ; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v8, vcc, 1, v7 -; CHECK-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v9 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v8 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v9, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v10, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v6 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v7, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v12, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -220,65 +220,65 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; CHECK-NEXT: v_trunc_f32_e32 v2, v1 ; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v0 -; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v6, v3, v1 -; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v2 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v2, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0 +; CHECK-NEXT: v_mul_lo_u32 v3, v6, v4 +; CHECK-NEXT: v_mul_lo_u32 v5, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v3, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0 +; CHECK-NEXT: v_mul_lo_u32 v2, v6, v4 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v6, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v5, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_mul_hi_u32 v2, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; CHECK-NEXT: v_mul_lo_u32 v2, s13, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, s12, v1 ; CHECK-NEXT: v_mul_hi_u32 v4, s12, v0 ; CHECK-NEXT: v_mul_hi_u32 v0, s13, v0 -; CHECK-NEXT: v_mul_hi_u32 v5, s13, v1 +; CHECK-NEXT: v_mov_b32_e32 v7, s13 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -291,39 +291,39 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v0, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v4, s13, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v2, v[1:2] -; CHECK-NEXT: v_mov_b32_e32 v5, s13 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v4, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2] -; CHECK-NEXT: v_mov_b32_e32 v3, s11 -; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v5, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[0:1], s13, v1 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] +; CHECK-NEXT: v_mov_b32_e32 v1, s11 +; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v7, v4, vcc +; CHECK-NEXT: v_sub_i32_e64 v3, s[0:1], s13, v4 ; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v2 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v0 ; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v2 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s11, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v2, v4, v5, s[0:1] +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s10, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s11, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v3 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; CHECK-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9] ; CHECK-NEXT: v_xor_b32_e32 v0, s0, v0 ; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 @@ -382,263 +382,263 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_xor_b32_e32 v4, v5, v8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v10 ; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v10 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v4, vcc +; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v10 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v4, vcc ; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v5 ; GISEL-NEXT: v_trunc_f32_e32 v9, v9 ; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v5 ; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 ; GISEL-NEXT: v_mov_b32_e32 v5, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v9, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v9, v[5:6] ; GISEL-NEXT: v_mul_lo_u32 v5, v9, v11 -; GISEL-NEXT: v_mul_hi_u32 v17, v14, v11 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] +; GISEL-NEXT: v_mul_hi_u32 v13, v16, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 +; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v17, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_mul_lo_u32 v13, v9, v14 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_mul_hi_u32 v12, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v14 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v5 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v5 +; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v11, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_mov_b32_e32 v5, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[5:6] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] -; GISEL-NEXT: v_xor_b32_e32 v15, v0, v9 -; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11 -; GISEL-NEXT: v_mul_lo_u32 v5, v14, v12 -; GISEL-NEXT: v_xor_b32_e32 v16, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v9 +; GISEL-NEXT: v_mul_lo_u32 v0, v19, v11 +; GISEL-NEXT: v_mul_lo_u32 v5, v16, v14 +; GISEL-NEXT: v_xor_b32_e32 v18, v1, v9 +; GISEL-NEXT: v_mul_hi_u32 v1, v16, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 +; GISEL-NEXT: v_mul_lo_u32 v1, v19, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v14, v12 +; GISEL-NEXT: v_mul_hi_u32 v5, v16, v14 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v12 +; GISEL-NEXT: v_mul_hi_u32 v11, v19, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v16, v0 -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v12, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v16, v1 +; GISEL-NEXT: v_mul_lo_u32 v12, v18, v1 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, v15, v1 +; GISEL-NEXT: v_mul_hi_u32 v11, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v12, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 +; GISEL-NEXT: v_mul_hi_u32 v1, v18, v1 ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v0, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v5 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v1, v5 ; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v10, v14, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v10, v16, v[1:2] ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v7 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v5 ; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v0, v[12:13] ; GISEL-NEXT: v_xor_b32_e32 v7, v1, v5 ; GISEL-NEXT: v_xor_b32_e32 v6, v6, v5 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v4, v0, v[12:13] ; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11 -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12 -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v17, v11 +; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v18, v14 +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v12 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v4, vcc +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v18, v14, vcc +; GISEL-NEXT: v_subb_u32_e32 v14, vcc, v11, v4, vcc ; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v16, v11 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16 +; GISEL-NEXT: v_trunc_f32_e32 v15, v11 +; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v15 ; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 ; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v7 ; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v6, vcc ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v10 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v10 -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc +; GISEL-NEXT: v_cvt_u32_f32_e32 v22, v15 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v10 +; GISEL-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v14, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v10 ; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v4 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v10 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v22, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v1, v22, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v21, v4 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v20, v18, v[12:13] +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v18, v12 +; GISEL-NEXT: v_mul_lo_u32 v10, v18, v14 ; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10 ; GISEL-NEXT: v_mul_hi_u32 v10, v18, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v11, v22, v11 ; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v4 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v21, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v15, v21, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, v13, v15, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v10 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v13, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v17, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v15, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v15, v15, v21, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v13, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_mul_hi_u32 v13, v18, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_mul_lo_u32 v12, v22, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v12, v18, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v14, v22, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v18, v10 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v16, v11, vcc +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v22, v11, vcc ; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v19, v12, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v13, v[0:1] -; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, v[0:1] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v11 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v11, vcc -; GISEL-NEXT: v_xor_b32_e32 v15, v1, v11 -; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v16, v13, vcc +; GISEL-NEXT: v_xor_b32_e32 v13, v9, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v14, v[0:1] +; GISEL-NEXT: v_xor_b32_e32 v11, v1, v13 +; GISEL-NEXT: v_ashrrev_i32_e32 v15, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, v[8:9] +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v15 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v15, vcc +; GISEL-NEXT: v_xor_b32_e32 v16, v1, v15 +; GISEL-NEXT: v_mul_lo_u32 v1, v14, v10 ; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 -; GISEL-NEXT: v_xor_b32_e32 v16, v2, v11 +; GISEL-NEXT: v_xor_b32_e32 v17, v2, v15 ; GISEL-NEXT: v_mul_hi_u32 v2, v12, v10 -; GISEL-NEXT: v_xor_b32_e32 v9, v4, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v10 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v13, v0, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1 -; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 -; GISEL-NEXT: v_xor_b32_e32 v10, v14, v8 +; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v14, v0, vcc +; GISEL-NEXT: v_mul_lo_u32 v2, v17, v1 +; GISEL-NEXT: v_mul_lo_u32 v3, v16, v0 +; GISEL-NEXT: v_xor_b32_e32 v8, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v4, v16, v1 +; GISEL-NEXT: v_mul_hi_u32 v1, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v4, v17, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v3, v16, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v1, v2 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v12, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v1, v2 +; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v1 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v13, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v9, v8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v12, v[3:4] -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v8, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v15, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v12, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v11, v13 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v13, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v10, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v16, v2 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v17, v8, vcc +; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v17, v8 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v6 +; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v4, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v8, v9, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v12 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v6 +; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v9, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v10 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v12, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v8 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v8 ; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v6, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v11, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v6, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v15, v5 ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -667,100 +667,100 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v1, v3, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v1 -; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 ; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; CGP-NEXT: v_trunc_f32_e32 v5, v4 ; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v15, v5 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5] -; CGP-NEXT: v_mul_hi_u32 v16, v12, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v5, v15, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 -; CGP-NEXT: v_mul_lo_u32 v17, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v18, v15, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16 -; CGP-NEXT: v_mul_hi_u32 v16, v12, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v17, v5 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v15, v14, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[4:5] +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[12:13] +; CGP-NEXT: v_mul_lo_u32 v5, v17, v3 +; CGP-NEXT: v_mul_hi_u32 v12, v14, v3 +; CGP-NEXT: v_mul_lo_u32 v13, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v3, v17, v3 +; CGP-NEXT: v_mul_lo_u32 v18, v17, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v14, v4 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v17, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v18, v3 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v4, v17, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v3 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v4, vcc -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v11 -; CGP-NEXT: v_mul_hi_u32 v16, v12, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5] -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v13 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v13 -; CGP-NEXT: v_mul_lo_u32 v5, v15, v3 -; CGP-NEXT: v_mul_lo_u32 v14, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 -; CGP-NEXT: v_xor_b32_e32 v10, v10, v13 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v3 +; CGP-NEXT: v_addc_u32_e32 v17, vcc, v17, v4, vcc +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v15, v14, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v15, 31, v11 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[12:13] +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v15 +; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v15, vcc +; CGP-NEXT: v_xor_b32_e32 v12, v5, v15 +; CGP-NEXT: v_mul_lo_u32 v5, v17, v3 +; CGP-NEXT: v_mul_lo_u32 v11, v14, v4 +; CGP-NEXT: v_xor_b32_e32 v13, v10, v15 +; CGP-NEXT: v_mul_hi_u32 v10, v14, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v17, v3 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v15, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; CGP-NEXT: v_mul_hi_u32 v14, v12, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v16, v3 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v17, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v11, v14, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v3 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_mul_hi_u32 v4, v17, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v15, v4, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v10, v3 -; CGP-NEXT: v_mul_lo_u32 v12, v11, v4 -; CGP-NEXT: v_mul_hi_u32 v14, v11, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v10, v3 -; CGP-NEXT: v_mul_hi_u32 v15, v10, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v10, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_mul_hi_u32 v12, v11, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v17, v4, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v10, v12, v4 +; CGP-NEXT: v_mul_hi_u32 v11, v12, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v13, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v12, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v11, v3 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v3, v5 +; CGP-NEXT: v_mul_hi_u32 v11, v13, v4 ; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v14, 0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v5 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v12, v[4:5] -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v11, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v14, v[4:5] -; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v10, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v10, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v11, v5 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v16, v[4:5] +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v12, v3 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v14, v[10:11] +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v13, v4, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v13, v4 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 ; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] @@ -771,13 +771,13 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc ; CGP-NEXT: v_cndmask_b32_e64 v5, v10, v11, s[4:5] ; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v14 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc +; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v16, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v15, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v10 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 @@ -785,8 +785,8 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v13, v0 -; CGP-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v3, v15, v0 +; CGP-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v1, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v2, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -840,126 +840,126 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v3, v5, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v5, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 -; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; CGP-NEXT: v_trunc_f32_e32 v7, v6 ; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v10, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7] -; CGP-NEXT: v_mul_hi_u32 v14, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7] -; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v15, v10, v6 -; CGP-NEXT: v_mul_lo_u32 v16, v13, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v5 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v7 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[6:7] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[10:11] +; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v12, v5 +; CGP-NEXT: v_mul_lo_u32 v11, v12, v6 +; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 +; CGP-NEXT: v_mul_lo_u32 v16, v15, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v12, v6 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v15, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v6, v15, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v6, vcc -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7] -; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7] -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v11 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v11, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v7, v11 -; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 -; CGP-NEXT: v_xor_b32_e32 v8, v8, v11 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v5 +; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v6, vcc +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[6:7] +; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v9 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[10:11] +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v13 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 +; CGP-NEXT: v_mul_lo_u32 v9, v12, v6 +; CGP-NEXT: v_xor_b32_e32 v14, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v8, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v13, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CGP-NEXT: v_mul_hi_u32 v12, v10, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_mul_lo_u32 v8, v15, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_mul_hi_u32 v9, v12, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v6, v15, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v8, v5 -; CGP-NEXT: v_mul_lo_u32 v10, v9, v6 -; CGP-NEXT: v_mul_hi_u32 v12, v9, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v8, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; CGP-NEXT: v_mul_hi_u32 v10, v9, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v15, v6, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v14, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v11, v6 +; CGP-NEXT: v_mul_hi_u32 v9, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v14, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v14, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_mul_hi_u32 v8, v11, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v5, v7 +; CGP-NEXT: v_mul_hi_u32 v9, v14, v6 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v12, 0 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v7 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, v[6:7] -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[6:7] -; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v8, v6, vcc -; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v8, v6 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v7 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v15, v[6:7] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v12, v[7:8] +; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v14, v9, vcc +; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v14, v9 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, v8, v9, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc +; CGP-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[4:5] ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v12 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v15, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v8 ; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v11, v2 -; CGP-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v13, v2 +; CGP-NEXT: v_cndmask_b32_e32 v4, v15, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v3, v5 ; CGP-NEXT: v_xor_b32_e32 v3, v4, v5 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 @@ -1049,82 +1049,82 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705 +; CHECK-NEXT: v_mov_b32_e32 v9, 0xffed2705 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v3 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6 +; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 -; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 -; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9 +; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6 +; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9 +; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x12d8fb +; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 +; CHECK-NEXT: v_mov_b32_e32 v6, 0x12d8fb ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1 +; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 @@ -1133,40 +1133,40 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v1 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v7, 0 +; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v3, v[1:2] +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v8, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v7 -; CHECK-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 +; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v1, -1, v3, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v7 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v4 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v3 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v4, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 %num, 1235195 ret i64 %result @@ -1215,46 +1215,46 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 ; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc ; GISEL-NEXT: v_mov_b32_e32 v4, v14 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v7, v[4:5] ; GISEL-NEXT: v_mul_lo_u32 v4, v7, v13 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v15, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v14, v18, v16 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v15 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_mul_hi_u32 v13, v7, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v15 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v7, v14 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v14, v7, v16 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v18, v16 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v17 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v14 +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v15 +; GISEL-NEXT: v_mul_hi_u32 v13, v7, v16 ; GISEL-NEXT: v_xor_b32_e32 v19, v1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v14, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 -; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 ; GISEL-NEXT: v_mov_b32_e32 v7, 0x12d8fb ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 @@ -1263,46 +1263,46 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v15, 0 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v19, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v18, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v7, v16, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v0 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14] -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13 +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v15, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v7, v20, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v17, v0 +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v15, vcc +; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v15 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v15 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v18 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v20, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v19, -1, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v14, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14] +; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v16 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0 ; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v1, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v16, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -1319,74 +1319,74 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v16, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v15, v13, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v10, v4 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v20, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v18, v14, vcc +; GISEL-NEXT: v_xor_b32_e32 v1, v12, v4 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9] +; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v8, v2, v12 +; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 +; GISEL-NEXT: v_xor_b32_e32 v9, v3, v12 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v9, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v8, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v8, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_xor_b32_e32 v10, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v9, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v8, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v6, v9, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v11, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v6, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v11, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v13, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v9, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v11, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v9, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v9, v3 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 @@ -1394,8 +1394,8 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v9 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v11, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v13, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 @@ -1406,12 +1406,12 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_oddk_denom: @@ -1430,112 +1430,112 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 ; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10] ; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11] ; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc ; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v4, v19, v13 +; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v9, v18, v16 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v18, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 +; CGP-NEXT: v_mul_hi_u32 v9, v19, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v19, v16 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v14, v18, v16 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 ; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 -; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 +; CGP-NEXT: v_xor_b32_e32 v15, v0, v9 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v19, v16 +; CGP-NEXT: v_xor_b32_e32 v17, v1, v9 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 -; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v17, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 +; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v17, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 +; CGP-NEXT: v_mul_lo_u32 v16, v17, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v16, v19, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v0, v13 +; CGP-NEXT: v_mul_hi_u32 v18, v17, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v18, v0 -; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc -; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13 +; CGP-NEXT: v_add_i32_e32 v18, vcc, v18, v13 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v18, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v15, v0 +; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v17, v13, vcc +; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v17, v13 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, 1, v15 -; CGP-NEXT: v_addc_u32_e32 v18, vcc, 0, v16, vcc +; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; CGP-NEXT: v_add_i32_e32 v17, vcc, 1, v16 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, 0, v18, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v19, vcc +; CGP-NEXT: v_cndmask_b32_e64 v15, -1, v14, s[4:5] +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1] +; CGP-NEXT: v_cndmask_b32_e64 v20, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14] +; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v20, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v17 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v18, vcc +; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v19, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc +; CGP-NEXT: v_cndmask_b32_e32 v14, v17, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 @@ -1553,72 +1553,72 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v17, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v16, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 -; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1 +; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v14, vcc +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] +; CGP-NEXT: v_cndmask_b32_e32 v12, v18, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v13, v5, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8] +; CGP-NEXT: v_xor_b32_e32 v1, v12, v9 +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v2, v12 +; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v10, v5 +; CGP-NEXT: v_xor_b32_e32 v8, v3, v12 +; CGP-NEXT: v_mul_hi_u32 v3, v10, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v11, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v11, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v6, v7, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v8, v13, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v3, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v2 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v10, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -1626,24 +1626,24 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5] -; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v7 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v8, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v9 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v6 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc +; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; CGP-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i64> %num, <i64 1235195, i64 1235195> ret <2 x i64> %result @@ -1679,126 +1679,126 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_xor_b32_e32 v1, v5, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v2 ; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1 -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v2 -; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v2 +; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CHECK-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; CHECK-NEXT: v_trunc_f32_e32 v7, v6 ; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v5 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v12, v8, v5 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_mul_lo_u32 v7, v11, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v13, v8, v6 -; CHECK-NEXT: v_mul_lo_u32 v14, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CHECK-NEXT: v_mul_hi_u32 v12, v8, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v14, v7 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v11, 0 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[6:7] +; CHECK-NEXT: v_mul_lo_u32 v6, v14, v5 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] +; CHECK-NEXT: v_mul_hi_u32 v7, v11, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v14, v5 +; CHECK-NEXT: v_mul_lo_u32 v8, v11, v9 +; CHECK-NEXT: v_mul_lo_u32 v10, v14, v9 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v11, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v5 -; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v6, vcc -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7] -; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v7, v3, v9 -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v10, v8, v6 -; CHECK-NEXT: v_xor_b32_e32 v12, v4, v9 -; CHECK-NEXT: v_mul_hi_u32 v4, v8, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_mul_hi_u32 v8, v14, v9 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v5 +; CHECK-NEXT: v_addc_u32_e32 v14, vcc, v14, v6, vcc +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v11, 0 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[6:7] +; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v12, vcc +; CHECK-NEXT: v_xor_b32_e32 v10, v3, v12 +; CHECK-NEXT: v_mul_lo_u32 v3, v14, v5 +; CHECK-NEXT: v_mul_lo_u32 v6, v11, v9 +; CHECK-NEXT: v_xor_b32_e32 v13, v4, v12 +; CHECK-NEXT: v_mul_hi_u32 v4, v11, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v14, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v10, v3 -; CHECK-NEXT: v_mul_hi_u32 v10, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v4, v14, v9 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v9 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v14, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v11, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v12, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, v7, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v12, v3 -; CHECK-NEXT: v_mul_hi_u32 v10, v12, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v11, v3 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v13, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v10, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, v10, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v13, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v8, v12, v4 +; CHECK-NEXT: v_mul_lo_u32 v7, v13, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v7, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v3, v5 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v8, 0 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v7, v13, v4 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v9, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v5 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v8, v[4:5] -; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v4, vcc -; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v4 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v7, v5 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[4:5] +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v10, v3 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v9, v[5:6] +; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v7, vcc +; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v7 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 ; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v7, vcc, 1, v8 -; CHECK-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v9 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v7 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v10, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v6 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v7, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v9, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v12, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -1850,8 +1850,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_xor_b32_e32 v5, v7, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v8 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v5, vcc +; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v8 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v5, vcc ; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v11 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v10 @@ -1859,182 +1859,183 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v7 ; GISEL-NEXT: v_trunc_f32_e32 v13, v11 ; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v13 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 ; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8] -; GISEL-NEXT: v_mul_lo_u32 v7, v17, v11 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v7, v19, v11 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] +; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v16, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v18, v7 -; GISEL-NEXT: v_mul_hi_u32 v18, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v19, v14 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 -; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v19, v14 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v7 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v7 +; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v19, v11, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 ; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[7:8] ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v15, v0, v7 -; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11 -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 -; GISEL-NEXT: v_xor_b32_e32 v16, v1, v7 -; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7 +; GISEL-NEXT: v_mul_lo_u32 v0, v19, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14 +; GISEL-NEXT: v_xor_b32_e32 v18, v1, v7 +; GISEL-NEXT: v_mul_hi_u32 v1, v16, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12 +; GISEL-NEXT: v_mul_lo_u32 v1, v19, v14 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v12, v16, v14 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v19, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v16, v0 -; GISEL-NEXT: v_mul_lo_u32 v12, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v12, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 ; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v16, v1 +; GISEL-NEXT: v_mul_lo_u32 v13, v18, v1 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v1 +; GISEL-NEXT: v_mul_hi_u32 v12, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v12 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 +; GISEL-NEXT: v_mul_hi_u32 v1, v18, v1 ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v0, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v13 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v14, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v16, v[1:2] ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v6 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v10, v6, vcc -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v5, v0, v[12:13] ; GISEL-NEXT: v_xor_b32_e32 v10, v1, v6 ; GISEL-NEXT: v_xor_b32_e32 v9, v9, v6 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v5, v0, v[12:13] ; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v10 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v9 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11 -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12 -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v9 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v17, v11 +; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v18, v14 +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v12 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v5, vcc +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v18, v14, vcc +; GISEL-NEXT: v_subb_u32_e32 v14, vcc, v11, v5, vcc ; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v16, v11 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16 +; GISEL-NEXT: v_trunc_f32_e32 v15, v11 +; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v15 ; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 ; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v10 ; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v9, vcc ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v8 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v8 -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc +; GISEL-NEXT: v_cvt_u32_f32_e32 v22, v15 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v8 +; GISEL-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v14, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v8 ; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v5 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v8 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v22, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v1, v22, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v21, v5 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v20, v18, v[12:13] +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v18, v12 +; GISEL-NEXT: v_mul_lo_u32 v8, v18, v14 ; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8 ; GISEL-NEXT: v_mul_hi_u32 v8, v18, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v11, v22, v11 ; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v5 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v21, v5 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v15, v21, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v8 -; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v15, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v8 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v13, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v17, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v21, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; GISEL-NEXT: v_mul_lo_u32 v15, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v15, v11 -; GISEL-NEXT: v_mul_hi_u32 v15, v18, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v15, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v8, v13, v17, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_mul_lo_u32 v13, v22, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_mul_hi_u32 v13, v18, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; GISEL-NEXT: v_mul_hi_u32 v14, v22, v14 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v11 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v16, v12, vcc +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v22, v12, vcc ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v19, v13, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v15, v[0:1] -; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v14, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v13, v[0:1] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v12 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v12, vcc -; GISEL-NEXT: v_xor_b32_e32 v14, v1, v12 -; GISEL-NEXT: v_mul_lo_u32 v1, v15, v11 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v14, v[0:1] +; GISEL-NEXT: v_xor_b32_e32 v12, v1, v7 +; GISEL-NEXT: v_ashrrev_i32_e32 v15, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v13, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v15 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v15, vcc +; GISEL-NEXT: v_xor_b32_e32 v5, v1, v15 +; GISEL-NEXT: v_mul_lo_u32 v1, v14, v11 ; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v16, v2, v12 +; GISEL-NEXT: v_xor_b32_e32 v16, v2, v15 ; GISEL-NEXT: v_mul_hi_u32 v2, v13, v11 -; GISEL-NEXT: v_mul_hi_u32 v4, v15, v11 +; GISEL-NEXT: v_mul_hi_u32 v4, v14, v11 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -2042,25 +2043,25 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v15, v0, vcc +; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v14, v0, vcc ; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1 -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v14, v1 +; GISEL-NEXT: v_mul_lo_u32 v3, v5, v0 +; GISEL-NEXT: v_mul_hi_u32 v4, v5, v1 ; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 -; GISEL-NEXT: v_xor_b32_e32 v5, v5, v7 +; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v3, v5, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 @@ -2074,39 +2075,38 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v13, v[0:1] -; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v5, v7 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v7 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v9 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v9, v11, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v16, v7, vcc +; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v16, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v9 +; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v4, v9, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v10 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v9 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 +; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v7, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11 ; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v5 ; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v7, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v12, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v15, v6 ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -2138,126 +2138,126 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v1, v10, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v10, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v11, v1 -; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v17, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v18, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 ; CGP-NEXT: v_rcp_iflag_f32_e32 v10, v10 ; CGP-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 ; CGP-NEXT: v_mul_f32_e32 v11, 0x2f800000, v10 ; CGP-NEXT: v_trunc_f32_e32 v12, v11 ; CGP-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v16, v12 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12] -; CGP-NEXT: v_mul_hi_u32 v17, v13, v10 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_mul_lo_u32 v12, v16, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v18, v13, v11 -; CGP-NEXT: v_mul_lo_u32 v19, v16, v11 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; CGP-NEXT: v_mul_hi_u32 v17, v13, v11 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v18, v12 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v19, v10 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; CGP-NEXT: v_mul_hi_u32 v11, v16, v11 +; CGP-NEXT: v_cvt_u32_f32_e32 v16, v10 +; CGP-NEXT: v_cvt_u32_f32_e32 v19, v12 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v16, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[11:12] +; CGP-NEXT: v_mul_lo_u32 v11, v19, v10 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] +; CGP-NEXT: v_mul_hi_u32 v12, v16, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v19, v10 +; CGP-NEXT: v_mul_lo_u32 v13, v16, v14 +; CGP-NEXT: v_mul_lo_u32 v15, v19, v14 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v16, v14 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v10 -; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v11, vcc -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12] -; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v8, v14 -; CGP-NEXT: v_mul_lo_u32 v8, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v15, v13, v11 -; CGP-NEXT: v_xor_b32_e32 v17, v9, v14 -; CGP-NEXT: v_mul_hi_u32 v9, v13, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v13, v19, v14 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v10 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, v19, v11, vcc +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v16, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[11:12] +; CGP-NEXT: v_ashrrev_i32_e32 v17, 31, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v17 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v17, vcc +; CGP-NEXT: v_xor_b32_e32 v15, v8, v17 +; CGP-NEXT: v_mul_lo_u32 v8, v19, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v16, v14 +; CGP-NEXT: v_xor_b32_e32 v18, v9, v17 +; CGP-NEXT: v_mul_hi_u32 v9, v16, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v19, v10 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v16, v11 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 -; CGP-NEXT: v_mul_hi_u32 v15, v13, v11 +; CGP-NEXT: v_mul_lo_u32 v9, v19, v14 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v16, v14 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; CGP-NEXT: v_mul_hi_u32 v11, v16, v11 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v19, v14 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v16, v9, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v17, v8 -; CGP-NEXT: v_mul_lo_u32 v11, v12, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v12, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v17, v8 -; CGP-NEXT: v_mul_hi_u32 v15, v17, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v16, v8 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v19, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v18, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v15, v9 +; CGP-NEXT: v_mul_hi_u32 v12, v15, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v18, v8 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v17, v9 +; CGP-NEXT: v_mul_lo_u32 v12, v18, v9 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v11, v12, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v11, v15, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v8, v10 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v13, 0 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v8, v10 +; CGP-NEXT: v_mul_hi_u32 v12, v18, v9 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v14, 0 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v15, v10 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v11, v[9:10] -; CGP-NEXT: v_sub_i32_e32 v8, vcc, v12, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v13, v[9:10] -; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v17, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v17, v9 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 -; CGP-NEXT: v_subb_u32_e32 v9, vcc, v9, v1, vcc -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_add_i32_e32 v16, vcc, v12, v10 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v4, v16, v[9:10] +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v1, v14, v[10:11] +; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v18, v12, vcc +; CGP-NEXT: v_sub_i32_e64 v10, s[4:5], v18, v12 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1 +; CGP-NEXT: v_subb_u32_e32 v10, vcc, v10, v1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v4 ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v4 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v1 -; CGP-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc -; CGP-NEXT: v_cndmask_b32_e64 v10, v12, v15, s[4:5] -; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v13 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, 0, v11, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v1 +; CGP-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v10, vcc +; CGP-NEXT: v_cndmask_b32_e64 v9, v11, v12, s[4:5] +; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v14 +; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v16, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v10, v1 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v16, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v12 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v15, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v10, v1 +; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v11 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v12, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v15, v8, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v8, v14, v0 -; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v12, v8, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc +; CGP-NEXT: v_xor_b32_e32 v8, v17, v0 +; CGP-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v1, v8 ; CGP-NEXT: v_xor_b32_e32 v1, v4, v8 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 @@ -2313,128 +2313,128 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v3, v6, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v8 ; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 ; CGP-NEXT: v_trunc_f32_e32 v10, v8 ; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v11, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v10 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v16, v10 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 ; CGP-NEXT: v_mov_b32_e32 v6, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7] -; CGP-NEXT: v_mul_lo_u32 v6, v14, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v10, v11, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 -; CGP-NEXT: v_mul_lo_u32 v15, v11, v9 -; CGP-NEXT: v_mul_lo_u32 v16, v14, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v16, v[6:7] +; CGP-NEXT: v_mul_lo_u32 v6, v16, v8 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] +; CGP-NEXT: v_mul_hi_u32 v9, v13, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v16, v8 +; CGP-NEXT: v_mul_lo_u32 v10, v13, v11 +; CGP-NEXT: v_mul_lo_u32 v12, v16, v11 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v13, v11 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v15, v6 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v16, v8 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v14, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v16, v11 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v6 -; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v8, vcc -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v6 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v8, vcc +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 ; CGP-NEXT: v_mov_b32_e32 v6, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v10, v5, v12 -; CGP-NEXT: v_mul_lo_u32 v5, v14, v8 -; CGP-NEXT: v_mul_lo_u32 v7, v11, v9 -; CGP-NEXT: v_xor_b32_e32 v13, v6, v12 -; CGP-NEXT: v_mul_hi_u32 v6, v11, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v16, v[6:7] +; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v12, v5, v14 +; CGP-NEXT: v_mul_lo_u32 v5, v16, v8 +; CGP-NEXT: v_mul_lo_u32 v7, v13, v11 +; CGP-NEXT: v_xor_b32_e32 v15, v6, v14 +; CGP-NEXT: v_mul_hi_u32 v6, v13, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v16, v8 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v14, v9 +; CGP-NEXT: v_mul_lo_u32 v6, v16, v11 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v11, v9 +; CGP-NEXT: v_mul_hi_u32 v7, v13, v11 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v9 +; CGP-NEXT: v_mul_hi_u32 v8, v16, v11 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v14, v6, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v8, v10, v6 -; CGP-NEXT: v_mul_hi_u32 v9, v10, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 -; CGP-NEXT: v_mul_hi_u32 v11, v13, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v16, v6, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v12, v6 +; CGP-NEXT: v_mul_hi_u32 v9, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v13, v6 +; CGP-NEXT: v_mul_lo_u32 v9, v15, v6 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v10, v6 +; CGP-NEXT: v_mul_hi_u32 v8, v12, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v5, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, 0 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v5, v7 +; CGP-NEXT: v_mul_hi_u32 v9, v15, v6 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, 0 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v7 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v8, v[6:7] -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v9, v[6:7] -; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v13, v6, vcc -; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v13, v6 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CGP-NEXT: v_add_i32_e32 v13, vcc, v9, v7 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v13, v[6:7] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v12, v5 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v11, v[7:8] +; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v15, v9, vcc +; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v15, v9 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[4:5] -; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v9 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc +; CGP-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[4:5] +; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v11 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v10 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v11, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v5, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v12, v2 -; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v8 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; CGP-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v14, v2 +; CGP-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v3, v5 ; CGP-NEXT: v_xor_b32_e32 v3, v4, v5 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 @@ -2504,15 +2504,15 @@ define i64 @v_sdiv_i64_24bit(i64 %num, i64 %den) { ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3 -; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0 ; CGP-NEXT: v_rcp_f32_e32 v1, v1 ; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v1 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3 -; CGP-NEXT: v_mul_lo_u32 v1, v1, v4 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v1, 0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v2 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 +; CGP-NEXT: v_mul_lo_u32 v5, v1, v4 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v5, 0 +; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0 ; CGP-NEXT: v_mul_lo_u32 v0, v1, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v1 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v5, v0 @@ -2537,198 +2537,198 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v9, 0 -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v1 -; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v9 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v10, 0 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 +; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v10 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; GISEL-NEXT: v_trunc_f32_e32 v5, v4 ; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5] -; GISEL-NEXT: v_mul_hi_u32 v12, v7, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 -; GISEL-NEXT: v_mul_lo_u32 v13, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v7, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v9, v3 +; GISEL-NEXT: v_mul_lo_u32 v8, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 +; GISEL-NEXT: v_mul_lo_u32 v14, v13, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v9, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v3 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 -; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v7, v4 -; GISEL-NEXT: v_mul_hi_u32 v0, v7, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v3 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] +; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v9, v3 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 +; GISEL-NEXT: v_mul_lo_u32 v7, v9, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v8, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_mul_hi_u32 v7, v9, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc ; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v10, v3 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 -; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v2 +; GISEL-NEXT: v_and_b32_e32 v13, 0xffffff, v2 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GISEL-NEXT: v_mul_lo_u32 v5, 0, v3 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v0, 0 ; GISEL-NEXT: v_mul_hi_u32 v4, 0, v3 ; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v0, 0 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v3 ; GISEL-NEXT: v_mov_b32_e32 v5, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v4, v[5:6] -; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v8 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v0, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v10, v7 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v4, v[5:6] +; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v10 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v12 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v0, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v11, v7 ; GISEL-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v2 ; GISEL-NEXT: v_trunc_f32_e32 v8, v6 ; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v3 -; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v2 +; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v15, s[4:5], 0, v3 +; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v8 ; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5 ; GISEL-NEXT: v_mov_b32_e32 v2, v7 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v15, v[2:3] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v17, v[2:3] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v14, v12, v[7:8] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v8, -1, v2, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v2, v15, v6 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v7 -; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v6 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v14, v[7:8] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v2, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v2, v17, v6 +; GISEL-NEXT: v_mul_lo_u32 v7, v14, v9 +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v5, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v6 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v15, v7 -; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v7 +; GISEL-NEXT: v_mul_lo_u32 v5, v17, v9 +; GISEL-NEXT: v_mul_hi_u32 v6, v17, v6 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_mul_hi_u32 v7, v14, v9 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_mul_hi_u32 v7, v15, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v17, v9 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v2 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v10, 0 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v9, v1 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v2 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v17, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v12, 0 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v11, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, v[2:3] -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v16, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v10, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v14, v[2:3] +; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v12, v[6:7] ; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v4, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1 -; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v10, v5 +; GISEL-NEXT: v_mul_lo_u32 v6, v14, v5 +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v8 +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v4, vcc +; GISEL-NEXT: v_mul_hi_u32 v9, v12, v5 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 ; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v12, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v14, v8 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v6 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v12, v6, vcc +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v14, v6, vcc ; GISEL-NEXT: v_mul_lo_u32 v6, 0, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, v11, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, v11, v5 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v2 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v13, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_mul_lo_u32 v9, 0, v7 +; GISEL-NEXT: v_mul_lo_u32 v8, v13, v7 +; GISEL-NEXT: v_mul_hi_u32 v12, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v2 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v15, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v7 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v11, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v13, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v5, v6 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v5, v6 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v12, 0 +; GISEL-NEXT: v_mul_hi_u32 v14, 0, v7 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc -; GISEL-NEXT: v_mul_hi_u32 v10, 0, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc ; GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v10, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v14, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v11, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v12, v[6:7] ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v12, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v9, v[6:7] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v5 -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v6 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v6, vcc +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v13, v5 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v8 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v8, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v3 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 @@ -2736,8 +2736,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v9 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v12 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v14, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 @@ -2748,8 +2748,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_24bit: @@ -2757,47 +2757,47 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3 -; CGP-NEXT: v_and_b32_e32 v4, 0xffffff, v6 -; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v3 +; CGP-NEXT: v_and_b32_e32 v4, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v6 +; CGP-NEXT: v_and_b32_e32 v9, 0xffffff, v2 ; CGP-NEXT: v_rcp_f32_e32 v1, v1 -; CGP-NEXT: v_and_b32_e32 v8, 0xffffff, v0 -; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v1 -; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4 -; CGP-NEXT: v_mul_lo_u32 v6, v6, v5 -; CGP-NEXT: v_rcp_f32_e32 v7, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0 -; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7 +; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v1 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v5, v1, v3 -; CGP-NEXT: v_mul_lo_u32 v0, v0, v6 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3 +; CGP-NEXT: v_cvt_f32_u32_e32 v0, v5 +; CGP-NEXT: v_mul_lo_u32 v7, v1, v6 +; CGP-NEXT: v_rcp_f32_e32 v8, v0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 +; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v8 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v1 +; CGP-NEXT: v_cvt_u32_f32_e32 v2, v0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v6, 0 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v5 +; CGP-NEXT: v_mul_lo_u32 v6, v0, v2 +; CGP-NEXT: v_mul_lo_u32 v0, v1, v3 ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v1 -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v4, v0 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v3 ; CGP-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0 -; CGP-NEXT: v_and_b32_e32 v8, 0xffffff, v2 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v5, v3 -; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v6, v1 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v1, 0 -; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v6, 0 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v4, v3 +; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v2, v1 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, 0 +; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v7 -; CGP-NEXT: v_mul_lo_u32 v5, v2, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v2, v5 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 ; CGP-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v4 -; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v9, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v5 +; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; CGP-NEXT: s_setpc_b64 s[30:31] %num.mask = and <2 x i64> %num, <i64 16777215, i64 16777215> diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index 1441591..f4489c2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -175,65 +175,65 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s15, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v6, 0 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s15, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s11, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s10, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s10, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s11, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -246,36 +246,36 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 +; GFX8-NEXT: v_mul_hi_u32 v4, s11, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v6, s11 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2] ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s10, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v5, s9 -; GFX8-NEXT: v_subb_u32_e64 v2, s[0:1], v6, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s11, v1 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_subb_u32_e64 v2, s[0:1], v8, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s11, v4 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v2 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s8, v0 -; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v1, vcc -; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4 -; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v0 +; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v3, vcc +; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v6 +; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v7, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v5 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v7 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9 ; GFX8-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc @@ -283,20 +283,20 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v0, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v9, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v7, v10, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v0, v3, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1] ; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[12:13] ; GFX8-NEXT: v_xor_b32_e32 v0, s0, v4 -; GFX8-NEXT: v_xor_b32_e32 v1, s1, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_xor_b32_e32 v1, s1, v6 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; GFX8-NEXT: v_xor_b32_e32 v3, s2, v5 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3 ; GFX8-NEXT: v_xor_b32_e32 v4, s2, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, s2 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s2, v3 @@ -312,6 +312,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-LABEL: sdivrem_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s2, s17, 31 ; GFX9-NEXT: s_ashr_i32 s4, s19, 31 @@ -335,64 +336,63 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, s7 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s9, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s9, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s9, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 @@ -400,67 +400,67 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_mul_lo_u32 v4, s9, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, s8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v5, 0 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v6, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s6, v8, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s8, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v5, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v6, v1, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v2 -; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s7, v6, v[2:3] +; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v1, v4, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v1 +; GFX9-NEXT: v_sub_u32_e32 v2, s9, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s6, v0 -; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v1 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v0 +; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v2, vcc +; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v8, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v9 -; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v5 +; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10 -; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v11, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v0, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] -; GFX9-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX9-NEXT: v_xor_b32_e32 v1, s1, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, s0, v3 +; GFX9-NEXT: v_xor_b32_e32 v1, s1, v6 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v3, s2, v6 -; GFX9-NEXT: v_xor_b32_e32 v5, s2, v2 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NEXT: v_xor_b32_e32 v3, s2, v4 +; GFX9-NEXT: v_xor_b32_e32 v4, s2, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[14:15] +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v5, vcc +; GFX9-NEXT: global_store_dwordx2 v9, v[0:1], s[12:13] +; GFX9-NEXT: global_store_dwordx2 v9, v[2:3], s[14:15] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_i64: @@ -1311,68 +1311,68 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX8-NEXT: s_xor_b64 s[16:17], s[4:5], s[6:7] ; GFX8-NEXT: s_ashr_i32 s6, s19, 31 ; GFX8-NEXT: s_mov_b32 s7, s6 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s11, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s10, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s10, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s11, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -1385,38 +1385,38 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 +; GFX8-NEXT: v_mul_hi_u32 v4, s11, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v6, s11 -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s10, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2] +; GFX8-NEXT: v_sub_u32_e32 v9, vcc, s10, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: s_ashr_i32 s10, s3, 31 -; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s11, v1 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc +; GFX8-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s11, v4 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s8, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1] -; GFX8-NEXT: v_subbrev_u32_e64 v9, s[0:1], 0, v0, vcc -; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], 1, v4 -; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v9 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s8, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; GFX8-NEXT: v_subbrev_u32_e64 v10, s[0:1], 0, v0, vcc +; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], 1, v6 +; GFX8-NEXT: v_addc_u32_e64 v5, s[0:1], 0, v7, s[0:1] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v9 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v1 -; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v3 +; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v5, s[0:1] ; GFX8-NEXT: s_add_u32 s0, s18, s6 ; GFX8-NEXT: s_addc_u32 s1, s19, s6 ; GFX8-NEXT: s_add_u32 s2, s2, s10 @@ -1424,15 +1424,15 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: s_addc_u32 s3, s3, s10 ; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] ; GFX8-NEXT: v_cvt_f32_u32_e32 v14, s3 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc -; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s2 -; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s8, v8 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s8, v4 ; GFX8-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v0, vcc ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v14 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v1, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc ; GFX8-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 @@ -1441,151 +1441,151 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v0 ; GFX8-NEXT: s_sub_u32 s5, 0, s2 -; GFX8-NEXT: s_subb_u32 s20, 0, s3 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GFX8-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v12, 0 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] -; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v10, v3, v10, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v15, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s5, v5, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v3, v5, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v12, v[1:2] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v9, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v8, v12, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v11, v[1:2] +; GFX8-NEXT: s_subb_u32 s20, 0, s3 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v12, v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v1, v11, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v12, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[0:1] ; GFX8-NEXT: v_mul_hi_u32 v2, v12, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v5, v0 -; GFX8-NEXT: v_xor_b32_e32 v9, s17, v10 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v3, v5, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; GFX8-NEXT: v_mul_hi_u32 v8, v12, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v11, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8 -; GFX8-NEXT: v_mul_hi_u32 v1, v5, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v12, v0 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v8, 0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v1, s16, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v2, v11, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_mul_hi_u32 v3, v12, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_mul_hi_u32 v3, v11, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v12, v0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v10, 0 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v11, v1, vcc +; GFX8-NEXT: v_xor_b32_e32 v1, s16, v6 ; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v5, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v10, s17 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v11, v[0:1] +; GFX8-NEXT: v_xor_b32_e32 v5, s17, v7 +; GFX8-NEXT: v_mov_b32_e32 v6, s17 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s16, v1 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v8, v[3:4] -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v9, v10, vcc -; GFX8-NEXT: v_xor_b32_e32 v4, s4, v7 -; GFX8-NEXT: v_mul_lo_u32 v7, v5, v2 -; GFX8-NEXT: v_mul_lo_u32 v9, v8, v3 -; GFX8-NEXT: v_mul_hi_u32 v11, v8, v2 -; GFX8-NEXT: v_mul_hi_u32 v2, v5, v2 -; GFX8-NEXT: v_xor_b32_e32 v6, s4, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v11 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v5, v6, vcc +; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s20, v10, v[3:4] +; GFX8-NEXT: v_mul_lo_u32 v4, v11, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s4, v9 +; GFX8-NEXT: v_mul_lo_u32 v7, v10, v5 +; GFX8-NEXT: v_mul_hi_u32 v9, v10, v2 +; GFX8-NEXT: v_mul_hi_u32 v2, v11, v2 +; GFX8-NEXT: v_xor_b32_e32 v6, s4, v8 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v11, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 -; GFX8-NEXT: v_mul_hi_u32 v9, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v11, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v9 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v9, v11, v5 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v7, v10, v5 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v11, v9 -; GFX8-NEXT: v_mul_hi_u32 v3, v5, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; GFX8-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NEXT: v_mul_lo_u32 v7, s9, v2 -; GFX8-NEXT: v_mul_lo_u32 v8, s8, v3 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v10, vcc -; GFX8-NEXT: v_mul_hi_u32 v6, s8, v2 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_mul_hi_u32 v5, v11, v5 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v10, v2 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v11, v4, vcc +; GFX8-NEXT: v_mul_lo_u32 v9, s9, v2 +; GFX8-NEXT: v_mul_lo_u32 v10, s8, v7 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v3 +; GFX8-NEXT: v_mul_hi_u32 v3, s8, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, s4 +; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v8, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v9, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s9, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v6, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v6, s9, v7 ; GFX8-NEXT: v_mul_hi_u32 v2, s9, v2 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 -; GFX8-NEXT: v_mul_hi_u32 v8, s8, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v8, v3 +; GFX8-NEXT: v_mul_hi_u32 v8, s8, v7 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v2, v6 -; GFX8-NEXT: v_mul_hi_u32 v9, s9, v3 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v8, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v9, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v10, s9 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v2, v3 +; GFX8-NEXT: v_mul_hi_u32 v7, s9, v7 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, v7, v6 +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v12, s9 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v2 -; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v8, v[6:7] +; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s3, v10, v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_subb_u32_e64 v7, s[0:1], v10, v6, vcc -; GFX8-NEXT: v_sub_u32_e64 v6, s[0:1], s9, v6 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] +; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v12, v8, vcc +; GFX8-NEXT: v_sub_u32_e64 v7, s[0:1], s9, v8 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v7 -; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, s2, v2 -; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6 +; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v9, vcc, s2, v2 +; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v7, vcc ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v8 -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc -; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v9, s[0:1] -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v14 +; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v10 +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc +; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v11, s[0:1] +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 1, v14 ; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s2, v11 +; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s2, v9 ; GFX8-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v14, v7, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v9, v14, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v13, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v7, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] ; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11] -; GFX8-NEXT: v_xor_b32_e32 v2, s0, v6 +; GFX8-NEXT: v_xor_b32_e32 v2, s0, v7 ; GFX8-NEXT: v_xor_b32_e32 v3, s1, v8 -; GFX8-NEXT: v_mov_b32_e32 v6, s1 +; GFX8-NEXT: v_mov_b32_e32 v7, s1 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v2 -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc -; GFX8-NEXT: v_xor_b32_e32 v6, s6, v9 -; GFX8-NEXT: v_xor_b32_e32 v7, s6, v7 -; GFX8-NEXT: v_mov_b32_e32 v8, s6 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v6 -; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v8, vcc +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GFX8-NEXT: v_xor_b32_e32 v7, s6, v9 +; GFX8-NEXT: v_xor_b32_e32 v8, s6, v6 +; GFX8-NEXT: v_mov_b32_e32 v9, s6 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v7 +; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v8, v9, vcc ; GFX8-NEXT: v_mov_b32_e32 v8, s12 ; GFX8-NEXT: v_mov_b32_e32 v9, s13 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -1622,66 +1622,67 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX9-NEXT: s_xor_b64 s[16:17], s[4:5], s[6:7] ; GFX9-NEXT: s_ashr_i32 s6, s19, 31 ; GFX9-NEXT: s_mov_b32 s7, s6 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s11, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s11, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s11, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 @@ -1693,51 +1694,50 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s10, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_add3_u32 v7, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2] +; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s10, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: s_ashr_i32 s10, s3, 31 -; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc -; GFX9-NEXT: v_sub_u32_e32 v0, s11, v1 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc +; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v8, v4, vcc +; GFX9-NEXT: v_sub_u32_e32 v0, s11, v4 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 -; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s8, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v2, s[0:1] -; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[0:1], 0, v0, vcc -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 +; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s8, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[0:1] +; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v0, vcc +; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[0:1], 0, v7, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v12, s[0:1], 1, v2 +; GFX9-NEXT: v_addc_co_u32_e64 v13, s[0:1], 0, v3, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s18, s6 ; GFX9-NEXT: s_addc_u32 s1, s19, s6 ; GFX9-NEXT: s_add_u32 s2, s2, s10 ; GFX9-NEXT: s_mov_b32 s11, s10 ; GFX9-NEXT: s_addc_u32 s3, s3, s10 ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v14, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s2 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc -; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v14 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v15 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s8, v9 +; GFX9-NEXT: v_subrev_co_u32_e32 v14, vcc, s8, v10 ; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 @@ -1747,31 +1747,31 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v0 ; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] ; GFX9-NEXT: s_sub_u32 s5, 0, s2 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v17, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v2, v13, vcc -; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v12, vcc +; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v3, v13, vcc ; GFX9-NEXT: s_subb_u32 s20, 0, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v13, v[1:2] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, v11, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v17, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v13, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v15, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v17, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v12, v[1:2] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, v5, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v17, v[2:3] +; GFX9-NEXT: v_mul_lo_u32 v2, v12, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v14, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v17, v4 ; GFX9-NEXT: v_mul_hi_u32 v10, v17, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v13, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v13, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v15, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, v13, v1 +; GFX9-NEXT: v_mul_lo_u32 v10, v12, v4 +; GFX9-NEXT: v_mul_hi_u32 v0, v12, v0 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, v17, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v13, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, v17, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v12, v4 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 @@ -1779,119 +1779,119 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_add_u32_e32 v3, v10, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v2, v3, v2, v4 ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v17, v0 -; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v12, v2, vcc ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v10, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v13, v1, vcc +; GFX9-NEXT: v_xor_b32_e32 v1, s16, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v4, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v11, v[0:1] -; GFX9-NEXT: v_xor_b32_e32 v5, s16, v5 -; GFX9-NEXT: v_xor_b32_e32 v8, s17, v8 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v10, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v9, s17 -; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v5 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v7 -; GFX9-NEXT: v_mul_lo_u32 v5, v11, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, v10, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v11, v[0:1] +; GFX9-NEXT: v_xor_b32_e32 v8, s17, v5 +; GFX9-NEXT: v_mov_b32_e32 v12, s17 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s20, v10, v[3:4] +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v1 +; GFX9-NEXT: v_mul_lo_u32 v3, v11, v2 +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v5 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v12, vcc ; GFX9-NEXT: v_mul_hi_u32 v8, v10, v2 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, v11, v3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, v11, v5 ; GFX9-NEXT: v_mul_hi_u32 v2, v11, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v10, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, v11, v3 +; GFX9-NEXT: v_add_u32_e32 v3, v6, v3 +; GFX9-NEXT: v_mul_hi_u32 v6, v10, v5 +; GFX9-NEXT: v_mul_hi_u32 v5, v11, v5 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v3, v7, v5, v3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 +; GFX9-NEXT: v_add_u32_e32 v6, v8, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v3, v6, v3, v5 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v11, v3, vcc ; GFX9-NEXT: v_mul_lo_u32 v5, s9, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, s8, v3 +; GFX9-NEXT: v_mul_lo_u32 v6, s8, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, s4, v9 ; GFX9-NEXT: v_mul_hi_u32 v9, s8, v2 ; GFX9-NEXT: v_mul_hi_u32 v2, s9, v2 -; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_mul_lo_u32 v9, s9, v3 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, s8, v3 -; GFX9-NEXT: v_xor_b32_e32 v6, s4, v6 +; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_mul_hi_u32 v6, s8, v3 +; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v5 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 -; GFX9-NEXT: v_mov_b32_e32 v8, s4 ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v6, v9, v6 +; GFX9-NEXT: v_xor_b32_e32 v7, s4, v7 +; GFX9-NEXT: v_mov_b32_e32 v8, s4 ; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v8, vcc -; GFX9-NEXT: v_add_u32_e32 v6, v9, v7 -; GFX9-NEXT: v_add3_u32 v8, v6, v11, v12 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v8, v[3:4] -; GFX9-NEXT: v_mov_b32_e32 v9, s9 +; GFX9-NEXT: v_add3_u32 v11, v6, v11, v12 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v8, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[3:4] +; GFX9-NEXT: v_mov_b32_e32 v12, s9 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v2 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v10, v[6:7] +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s3, v10, v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_subb_co_u32_e64 v7, s[0:1], v9, v6, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7 -; GFX9-NEXT: v_sub_u32_e32 v6, s9, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] +; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v12, v8, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6 +; GFX9-NEXT: v_sub_u32_e32 v7, s9, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s2, v2 -; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s2, v2 +; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v7, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v10 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v6, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v8, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 1, v14 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v11, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v14 ; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v15, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v14, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s2, v11 +; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s2, v9 ; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v15, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v14, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v7, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11] -; GFX9-NEXT: v_xor_b32_e32 v2, s0, v6 +; GFX9-NEXT: v_xor_b32_e32 v2, s0, v7 ; GFX9-NEXT: v_xor_b32_e32 v3, s1, v8 -; GFX9-NEXT: v_mov_b32_e32 v6, s1 +; GFX9-NEXT: v_mov_b32_e32 v7, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc -; GFX9-NEXT: v_xor_b32_e32 v6, s6, v9 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc +; GFX9-NEXT: v_xor_b32_e32 v7, s6, v9 ; GFX9-NEXT: v_mov_b32_e32 v13, 0 -; GFX9-NEXT: v_xor_b32_e32 v7, s6, v7 -; GFX9-NEXT: v_mov_b32_e32 v8, s6 -; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v6 -; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v8, vcc +; GFX9-NEXT: v_xor_b32_e32 v8, s6, v6 +; GFX9-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v8, v9, vcc ; GFX9-NEXT: global_store_dwordx4 v13, v[0:3], s[12:13] ; GFX9-NEXT: global_store_dwordx4 v13, v[4:7], s[14:15] ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 40b5db0..6f42239 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -31,128 +31,128 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v1 -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 -; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v11, vcc, 0, v0 +; CHECK-NEXT: v_subb_u32_e32 v12, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v6, v3 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v6 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v6 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2 -; CHECK-NEXT: v_mul_lo_u32 v12, v8, v6 -; CHECK-NEXT: v_mul_lo_u32 v13, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v13, v6 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v13, v2 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v13, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v8 +; CHECK-NEXT: v_mul_lo_u32 v9, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v12, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v13, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_mul_hi_u32 v7, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v9 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v5, v3, v9 -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, v8, v6 -; CHECK-NEXT: v_xor_b32_e32 v10, v4, v9 -; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v2 +; CHECK-NEXT: v_addc_u32_e32 v13, vcc, v13, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v11, 31, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v11 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v11, vcc +; CHECK-NEXT: v_xor_b32_e32 v9, v3, v11 +; CHECK-NEXT: v_mul_lo_u32 v3, v13, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v10, v8 +; CHECK-NEXT: v_xor_b32_e32 v12, v4, v11 +; CHECK-NEXT: v_mul_hi_u32 v4, v10, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v13, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v4, v13, v8 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v7, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, v12, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v9, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v9, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v12, v2 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v10, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_mul_hi_u32 v6, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_mul_lo_u32 v6, v12, v3 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_mul_hi_u32 v5, v9, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v2, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v7, 0 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v2, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, v12, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v8, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v7, v[3:4] -; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc -; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v4 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v6, v[3:4] +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[4:5] +; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v6, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v6 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1 +; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] ; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v2, v0 -; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc +; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v4, vcc ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v11 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v11 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v11, vcc ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr4 ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -214,65 +214,65 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; CHECK-NEXT: v_trunc_f32_e32 v2, v1 ; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v0 -; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v6, v3, v1 -; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v2 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v2, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0 +; CHECK-NEXT: v_mul_lo_u32 v3, v6, v4 +; CHECK-NEXT: v_mul_lo_u32 v5, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v3, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0 +; CHECK-NEXT: v_mul_lo_u32 v2, v6, v4 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v6, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v5, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_mul_hi_u32 v2, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; CHECK-NEXT: v_mul_lo_u32 v2, s11, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, s10, v1 ; CHECK-NEXT: v_mul_hi_u32 v4, s10, v0 ; CHECK-NEXT: v_mul_hi_u32 v0, s11, v0 -; CHECK-NEXT: v_mul_hi_u32 v5, s11, v1 +; CHECK-NEXT: v_mov_b32_e32 v7, s11 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -285,19 +285,19 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v0, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v4, s11, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v2, v[1:2] -; CHECK-NEXT: v_mov_b32_e32 v5, s11 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v4, v[1:2] +; CHECK-NEXT: v_mov_b32_e32 v1, s9 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2] -; CHECK-NEXT: v_mov_b32_e32 v3, s9 -; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v5, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[0:1], s11, v1 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] +; CHECK-NEXT: v_sub_i32_e64 v3, s[0:1], s11, v4 +; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v7, v4, vcc +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 ; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] @@ -372,84 +372,84 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5 ; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v5 -; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v5 +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v8, vcc ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v4 ; GISEL-NEXT: v_trunc_f32_e32 v11, v9 ; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v11 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v11 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v14, 0 ; GISEL-NEXT: v_mov_b32_e32 v4, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v15, v9 -; GISEL-NEXT: v_mul_hi_u32 v16, v12, v9 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] -; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 -; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v17, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v4, v17, v9 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[10:11] +; GISEL-NEXT: v_mul_hi_u32 v11, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v14, v12 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v16, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v11, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v10, v14, v12 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v11, v17, v12 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v4 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v15, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, 0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v4 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v14, 0 ; GISEL-NEXT: v_mov_b32_e32 v4, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v17, v[4:5] ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[10:11] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_xor_b32_e32 v13, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v15, v9 -; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10 -; GISEL-NEXT: v_xor_b32_e32 v14, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v12, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v0, v17, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v14, v12 +; GISEL-NEXT: v_xor_b32_e32 v15, v1, v4 +; GISEL-NEXT: v_mul_hi_u32 v1, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10 +; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v14, v12 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v17, v12 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v15, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v15, v0 ; GISEL-NEXT: v_mul_lo_u32 v10, v13, v1 ; GISEL-NEXT: v_mul_hi_u32 v11, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v14, v1 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_mul_hi_u32 v10, v13, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 @@ -457,148 +457,148 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v1, v14, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v9 +; GISEL-NEXT: v_mul_hi_u32 v1, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v1, v0 -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v6, v6, v10 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v10 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v1, v0 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v9 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v9 ; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v7 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, 0 -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 -; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v15 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v9, v[1:2] +; GISEL-NEXT: v_cvt_f32_u32_e32 v16, v7 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 +; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v16 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v11, v[1:2] ; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v12 -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v8, v11, v[9:10] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v14, v[9:10] ; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v12, v10 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v1 +; GISEL-NEXT: v_trunc_f32_e32 v12, v9 ; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v13, v0 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 -; GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v12, v[0:1] -; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v14, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v15, v[0:1] -; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v14, v9 -; GISEL-NEXT: v_mul_lo_u32 v9, v12, v10 -; GISEL-NEXT: v_mul_lo_u32 v14, v15, v0 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v11, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v12 +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v14, 0 +; GISEL-NEXT: v_sub_i32_e32 v19, vcc, v13, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v10 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v18, v[0:1] +; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], v15, v11, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v14, v[12:13] +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v15, v11 +; GISEL-NEXT: v_mul_lo_u32 v11, v18, v9 +; GISEL-NEXT: v_mul_lo_u32 v12, v14, v0 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v8 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v15, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v10 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v13, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v11, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[6:7] -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v13, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v14, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v8 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v14, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_hi_u32 v9, v18, v9 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v19, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v10, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[6:7] +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v19, v5 +; GISEL-NEXT: v_subbrev_u32_e64 v15, s[6:7], 0, v1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v8 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v8 -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v14, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v15, v8 +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v12, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7] ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v8, v19, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v12, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v14, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v18, v1 -; GISEL-NEXT: v_mul_hi_u32 v18, v15, v0 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v1 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v12, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_mul_hi_u32 v13, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v1 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v18, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v12, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v10, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v13, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc ; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v10, v[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v12, v[8:9] ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v13, v2, v5 -; GISEL-NEXT: v_mul_lo_u32 v2, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 -; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_xor_b32_e32 v11, v2, v5 +; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v12, v10 +; GISEL-NEXT: v_xor_b32_e32 v15, v3, v5 +; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v8 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v10 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v12, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v13, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_xor_b32_e32 v10, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_xor_b32_e32 v10, v14, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v14, v2 +; GISEL-NEXT: v_mul_lo_u32 v9, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; GISEL-NEXT: v_mul_hi_u32 v8, v13, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v9, v15, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v12, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v9, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v8, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v13, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v11, v[8:9] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v13, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v12, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 @@ -651,128 +651,128 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v1, v2, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v0 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v0 +; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CGP-NEXT: v_trunc_f32_e32 v4, v3 ; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v4 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4] -; CGP-NEXT: v_mul_hi_u32 v15, v5, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4] -; CGP-NEXT: v_mul_lo_u32 v4, v14, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 -; CGP-NEXT: v_mul_lo_u32 v16, v5, v3 -; CGP-NEXT: v_mul_lo_u32 v17, v14, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 -; CGP-NEXT: v_mul_hi_u32 v15, v5, v3 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v16, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v17, v2 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v17, v4 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v15, v14, 0 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v17, v[3:4] +; CGP-NEXT: v_mul_lo_u32 v3, v17, v2 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5] +; CGP-NEXT: v_mul_hi_u32 v4, v14, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v17, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v14, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v17, v12 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v3, vcc -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; CGP-NEXT: v_mul_hi_u32 v15, v5, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4] -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v12 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v4, v12 -; CGP-NEXT: v_mul_lo_u32 v4, v14, v2 -; CGP-NEXT: v_mul_lo_u32 v13, v5, v3 -; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 -; CGP-NEXT: v_xor_b32_e32 v10, v10, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v14, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v5, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 +; CGP-NEXT: v_mul_hi_u32 v4, v14, v12 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CGP-NEXT: v_mul_hi_u32 v5, v17, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v2 +; CGP-NEXT: v_addc_u32_e32 v17, vcc, v17, v3, vcc +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v15, v14, 0 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v17, v[3:4] +; CGP-NEXT: v_ashrrev_i32_e32 v15, 31, v11 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v15 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5] +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v11, v15, vcc +; CGP-NEXT: v_xor_b32_e32 v13, v3, v15 +; CGP-NEXT: v_mul_lo_u32 v3, v17, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v14, v12 +; CGP-NEXT: v_xor_b32_e32 v16, v4, v15 +; CGP-NEXT: v_mul_hi_u32 v4, v14, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v17, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v14, v3, vcc -; CGP-NEXT: v_mul_lo_u32 v4, v10, v2 -; CGP-NEXT: v_mul_lo_u32 v5, v11, v3 -; CGP-NEXT: v_mul_hi_u32 v13, v11, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v10, v2 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v3 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v4, v17, v12 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_mul_hi_u32 v5, v14, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v17, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v14, v2 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v17, v3, vcc +; CGP-NEXT: v_mul_lo_u32 v4, v16, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 +; CGP-NEXT: v_mul_hi_u32 v10, v13, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v16, v2 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v10, v3 +; CGP-NEXT: v_mul_lo_u32 v10, v16, v3 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_mul_hi_u32 v5, v11, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v5, v13, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v2 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v2, v4 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v13, 0 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v2, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v16, v3 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v12, 0 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v13, v[3:4] -; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v4 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v10, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v13, v2 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v1, v12, v[4:5] +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v16, v10, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v16, v10 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1 +; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[4:5] ; CGP-NEXT: v_sub_i32_e32 v10, vcc, v2, v0 -; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v3, vcc +; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v4, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v0 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v1 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v10, v0 -; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; CGP-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v12 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v12 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v15 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v15 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v15 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v15, vcc ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr10 ; CGP-NEXT: .LBB2_2: ; %Flow1 @@ -820,128 +820,128 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v3, v4, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v5, v3 -; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v6, v5 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6] -; CGP-NEXT: v_mul_hi_u32 v13, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6] -; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v5 -; CGP-NEXT: v_mul_lo_u32 v15, v12, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v5 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v6 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v13, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v6 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[5:6] +; CGP-NEXT: v_mul_lo_u32 v5, v15, v4 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7] +; CGP-NEXT: v_mul_hi_u32 v6, v12, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v12, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v15, v10 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v4 -; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6] -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v10 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v6, v10 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v11, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 -; CGP-NEXT: v_xor_b32_e32 v8, v8, v10 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v12, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6 -; CGP-NEXT: v_mul_hi_u32 v11, v7, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v12, v10 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_mul_hi_u32 v7, v15, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v4 +; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v5, vcc +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[5:6] +; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v13 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7] +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v9, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v5, v13 +; CGP-NEXT: v_mul_lo_u32 v5, v15, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v12, v10 +; CGP-NEXT: v_xor_b32_e32 v14, v6, v13 +; CGP-NEXT: v_mul_hi_u32 v6, v12, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v12, v5, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v7, v9, v5 -; CGP-NEXT: v_mul_hi_u32 v11, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v8, v4 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v6, v15, v10 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v7, v12, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v15, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v15, v5, vcc +; CGP-NEXT: v_mul_lo_u32 v6, v14, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v8, v11, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v14, v4 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v14, v5 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_mul_hi_u32 v7, v9, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v7, v11, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v11, 0 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v4, v6 +; CGP-NEXT: v_mul_hi_u32 v8, v14, v5 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v11, v[5:6] -; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v8, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v6 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[5:6] +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v10, v[6:7] +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v14, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v14, v8 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 -; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v3 +; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v4, v2 -; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc +; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 -; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v13 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v13 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc ; CGP-NEXT: ; implicit-def: $vgpr6 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -977,82 +977,82 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000 ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0xfffff000 +; CHECK-NEXT: v_mov_b32_e32 v9, 0xfffff000 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v3 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6 +; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 -; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 -; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9 +; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6 +; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9 +; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x1000 +; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 +; CHECK-NEXT: v_mov_b32_e32 v6, 0x1000 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1 +; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 @@ -1060,39 +1060,39 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, v9, v1 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2] +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v7 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5 +; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v6 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6 +; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v2, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[4:5] -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %num, 4096 ret i64 %result @@ -1141,92 +1141,92 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 +; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v7, vcc ; GISEL-NEXT: v_mov_b32_e32 v4, v14 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v4, v19, v13 +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v7, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v7, v18, v16 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v16, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v7, v18, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v7, v17, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14 +; GISEL-NEXT: v_mul_hi_u32 v7, v19, v13 +; GISEL-NEXT: v_mul_lo_u32 v13, v19, v16 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v16 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v7, v15 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v7, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v7 ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14 -; GISEL-NEXT: v_xor_b32_e32 v19, v1, v7 +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v19, v16 +; GISEL-NEXT: v_xor_b32_e32 v20, v1, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 -; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v20, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0 ; GISEL-NEXT: v_mov_b32_e32 v4, 0x1000 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 +; GISEL-NEXT: v_mul_lo_u32 v15, v20, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v20, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v18, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14] -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v17, v0 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v20, v15 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v20, v15, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 +; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v19, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -1243,74 +1243,74 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v10, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v17, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v16, v14, vcc +; GISEL-NEXT: v_xor_b32_e32 v1, v12, v7 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9] +; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v9, v2, v12 +; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 +; GISEL-NEXT: v_xor_b32_e32 v14, v3, v12 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v11, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_xor_b32_e32 v8, v13, v7 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v6, v14, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v7 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v14, v7, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1330,10 +1330,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_pow2k_denom: @@ -1352,110 +1352,110 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 ; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10] ; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11] ; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc ; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v4, v19, v13 +; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v9, v18, v16 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v18, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 +; CGP-NEXT: v_mul_hi_u32 v9, v19, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v19, v16 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v14, v18, v16 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 ; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 -; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 +; CGP-NEXT: v_xor_b32_e32 v15, v0, v9 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v19, v16 +; CGP-NEXT: v_xor_b32_e32 v17, v1, v9 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 -; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v17, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 +; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v17, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x1000 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 +; CGP-NEXT: v_mul_lo_u32 v16, v17, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v15, v19, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13 +; CGP-NEXT: v_mul_hi_u32 v18, v17, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v16 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v13 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v15, vcc, v15, v0 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v13 +; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v17, v13, vcc ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 -; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 +; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 +; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 +; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1] +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v1, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14] +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 +; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v19, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; CGP-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 @@ -1473,72 +1473,72 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 -; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1 +; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v14, vcc +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] +; CGP-NEXT: v_cndmask_b32_e32 v12, v16, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v13, v5, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8] +; CGP-NEXT: v_xor_b32_e32 v1, v12, v9 +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v2, v12 +; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v10, v5 +; CGP-NEXT: v_xor_b32_e32 v8, v3, v12 +; CGP-NEXT: v_mul_hi_u32 v3, v10, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v11, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v11, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v6, v7, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v13, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v2 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v5 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1558,10 +1558,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i64> %num, <i64 4096, i64 4096> ret <2 x i64> %result @@ -1573,82 +1573,82 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705 +; CHECK-NEXT: v_mov_b32_e32 v9, 0xffed2705 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v3 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6 +; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 -; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 -; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9 +; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6 +; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9 +; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x12d8fb +; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 +; CHECK-NEXT: v_mov_b32_e32 v6, 0x12d8fb ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1 +; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 @@ -1656,39 +1656,39 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, v9, v1 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2] +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v7 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5 +; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v6 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6 +; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v2, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[4:5] -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %num, 1235195 ret i64 %result @@ -1737,92 +1737,92 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 +; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v7, vcc ; GISEL-NEXT: v_mov_b32_e32 v4, v14 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v4, v19, v13 +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v7, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v7, v18, v16 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v16, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v7, v18, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v7, v17, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14 +; GISEL-NEXT: v_mul_hi_u32 v7, v19, v13 +; GISEL-NEXT: v_mul_lo_u32 v13, v19, v16 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v16 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v7, v15 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v7, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v7 ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14 -; GISEL-NEXT: v_xor_b32_e32 v19, v1, v7 +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v19, v16 +; GISEL-NEXT: v_xor_b32_e32 v20, v1, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 -; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v20, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0 ; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 +; GISEL-NEXT: v_mul_lo_u32 v15, v20, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v20, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v18, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14] -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v17, v0 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v20, v15 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v20, v15, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 +; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v19, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -1839,74 +1839,74 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v10, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v17, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v16, v14, vcc +; GISEL-NEXT: v_xor_b32_e32 v1, v12, v7 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9] +; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v9, v2, v12 +; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 +; GISEL-NEXT: v_xor_b32_e32 v14, v3, v12 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v11, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_xor_b32_e32 v8, v13, v7 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v6, v14, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v7 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v14, v7, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1926,10 +1926,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_oddk_denom: @@ -1948,110 +1948,110 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 ; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10] ; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11] ; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc ; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v4, v19, v13 +; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v9, v18, v16 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v18, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 +; CGP-NEXT: v_mul_hi_u32 v9, v19, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v19, v16 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v14, v18, v16 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 ; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 -; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 +; CGP-NEXT: v_xor_b32_e32 v15, v0, v9 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v19, v16 +; CGP-NEXT: v_xor_b32_e32 v17, v1, v9 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 -; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v17, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 +; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v17, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 +; CGP-NEXT: v_mul_lo_u32 v16, v17, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v15, v19, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13 +; CGP-NEXT: v_mul_hi_u32 v18, v17, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v16 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v13 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v15, vcc, v15, v0 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v13 +; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v17, v13, vcc ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 -; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 +; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 +; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 +; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1] +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v1, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14] +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 +; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v19, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; CGP-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 @@ -2069,72 +2069,72 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 -; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1 +; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v14, vcc +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] +; CGP-NEXT: v_cndmask_b32_e32 v12, v16, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v13, v5, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8] +; CGP-NEXT: v_xor_b32_e32 v1, v12, v9 +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v2, v12 +; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v10, v5 +; CGP-NEXT: v_xor_b32_e32 v8, v3, v12 +; CGP-NEXT: v_mul_hi_u32 v3, v10, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v11, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v11, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v6, v7, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v13, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v2 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v5 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -2154,10 +2154,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i64> %num, <i64 1235195, i64 1235195> ret <2 x i64> %result @@ -2193,130 +2193,130 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v1 -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 -; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v11, vcc, 0, v0 +; CHECK-NEXT: v_subb_u32_e32 v12, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v5 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v7, v5 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v7 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v13, v7 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, v6 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3] -; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v12, v8, v6 -; CHECK-NEXT: v_mul_lo_u32 v13, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[2:3] +; CHECK-NEXT: v_mul_lo_u32 v2, v13, v5 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v13, v5 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v8 +; CHECK-NEXT: v_mul_lo_u32 v9, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v12, v2 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_mul_hi_u32 v7, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v5, vcc -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v2 +; CHECK-NEXT: v_addc_u32_e32 v13, vcc, v13, v5, vcc +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, v6 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3] -; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v9 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v7, v2, v9 -; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v4, v8, v6 -; CHECK-NEXT: v_xor_b32_e32 v10, v3, v9 -; CHECK-NEXT: v_mul_hi_u32 v3, v8, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[2:3] +; CHECK-NEXT: v_ashrrev_i32_e32 v11, 31, v4 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v11 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v11, vcc +; CHECK-NEXT: v_xor_b32_e32 v9, v2, v11 +; CHECK-NEXT: v_mul_lo_u32 v2, v13, v5 +; CHECK-NEXT: v_mul_lo_u32 v4, v10, v8 +; CHECK-NEXT: v_xor_b32_e32 v12, v3, v11 +; CHECK-NEXT: v_mul_hi_u32 v3, v10, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v13, v5 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v6 +; CHECK-NEXT: v_mul_lo_u32 v3, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; CHECK-NEXT: v_mul_hi_u32 v4, v8, v6 +; CHECK-NEXT: v_mul_hi_u32 v4, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v6 +; CHECK-NEXT: v_mul_hi_u32 v5, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2 -; CHECK-NEXT: v_mul_lo_u32 v5, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v6, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, v12, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v9, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v9, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v12, v2 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v10, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v12, v3 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v5, v7, v3 +; CHECK-NEXT: v_mul_hi_u32 v5, v9, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v2, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v6, 0 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v2, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, v12, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v8, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v6, v[3:4] -; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc -; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v4 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v6, v[3:4] +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[4:5] +; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v6, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v6 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1 +; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] ; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v2, v0 -; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc +; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v4, vcc ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v11 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v11 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v11, vcc ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr3 ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -2361,85 +2361,85 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_xor_b32_e32 v7, v10, v7 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5 ; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v7 -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v5 -; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v7, vcc +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v5 +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v10 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4 ; GISEL-NEXT: v_trunc_f32_e32 v12, v10 ; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v12 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v12 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 ; GISEL-NEXT: v_mov_b32_e32 v4, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v16, v10 -; GISEL-NEXT: v_mul_hi_u32 v17, v13, v10 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10 -; GISEL-NEXT: v_mul_lo_u32 v12, v13, v11 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v4, v18, v10 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v18, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v17, v16, v11 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; GISEL-NEXT: v_mul_hi_u32 v12, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v17, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_mul_lo_u32 v12, v18, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v12, v18, v13 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v4 -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v16, v10, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 +; GISEL-NEXT: v_addc_u32_e32 v18, vcc, v18, v10, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 ; GISEL-NEXT: v_mov_b32_e32 v4, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_xor_b32_e32 v12, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v16, v10 -; GISEL-NEXT: v_mul_lo_u32 v14, v13, v11 -; GISEL-NEXT: v_xor_b32_e32 v15, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v13, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v0, v18, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 +; GISEL-NEXT: v_xor_b32_e32 v16, v1, v4 +; GISEL-NEXT: v_mul_hi_u32 v1, v15, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v18, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v14, v13, v11 +; GISEL-NEXT: v_mul_lo_u32 v1, v18, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v18, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v0 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v16, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v15, v10 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v0 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v18, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v16, v10 ; GISEL-NEXT: v_mul_lo_u32 v14, v12, v11 ; GISEL-NEXT: v_lshl_b64 v[0:1], v[8:9], v6 ; GISEL-NEXT: v_mul_hi_u32 v6, v12, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v15, v11 +; GISEL-NEXT: v_mul_lo_u32 v8, v16, v11 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; GISEL-NEXT: v_mul_hi_u32 v9, v12, v11 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 @@ -2448,127 +2448,127 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v8, v6 -; GISEL-NEXT: v_mul_hi_u32 v8, v15, v11 +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v8, v6 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v8, v6 ; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc ; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v13, 0 ; GISEL-NEXT: v_xor_b32_e32 v6, v0, v8 ; GISEL-NEXT: v_xor_b32_e32 v8, v1, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v16, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v8 ; GISEL-NEXT: v_mov_b32_e32 v0, v10 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, v[0:1] -; GISEL-NEXT: v_mac_f32_e32 v14, 0x4f800000, v16 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v14 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v13, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v10 +; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v5, v14, v[0:1] +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v15 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v14, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v13, v[10:11] +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v8, vcc +; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v14 ; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1 ; GISEL-NEXT: v_trunc_f32_e32 v13, v10 ; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v14, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v13 ; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v15, 0 +; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], v16, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v13, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10 -; GISEL-NEXT: v_subb_u32_e64 v18, s[4:5], v15, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v14, v[11:12] -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v12, v14, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v18, v7 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v19, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v1, v19, v10 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v16, v0 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v15, v[11:12] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v20, v7 ; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v14, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[6:7] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v20, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v1, v11, s[6:7] ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v9, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v0, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v14, s[6:7], 0, v0, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v14, v7 ; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v7 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v14, v7 ; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v16, v16, v21, s[6:7] ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v7, v14, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_hi_u32 v1, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v15, v14, v11 +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v19, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v13 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; GISEL-NEXT: v_mul_hi_u32 v11, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v19, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v11, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v0 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v19, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v14, v[1:2] ; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v11, v[9:10] -; GISEL-NEXT: v_cndmask_b32_e32 v7, v18, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v18, v13, v[9:10] +; GISEL-NEXT: v_cndmask_b32_e32 v7, v20, v7, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc ; GISEL-NEXT: v_xor_b32_e32 v12, v2, v5 -; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v10, v11, v9 -; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v13, v11 +; GISEL-NEXT: v_xor_b32_e32 v15, v3, v5 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v9 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v11 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v13, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v9, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v14, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 ; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2 ; GISEL-NEXT: v_mul_hi_u32 v10, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 ; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v14, v2 +; GISEL-NEXT: v_mul_lo_u32 v10, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 ; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 @@ -2577,19 +2577,19 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v10, v14, v2 +; GISEL-NEXT: v_mul_hi_u32 v10, v15, v2 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v13, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v4, vcc ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v11, v[9:10] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 @@ -2645,103 +2645,103 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v1, v4, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v10, v1 -; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v0 -; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 +; CGP-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v10 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v12, v10 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v16, v12 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v18, v12 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 ; CGP-NEXT: v_mov_b32_e32 v4, v11 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v16, v10 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_mul_hi_u32 v12, v13, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v17, v13, v11 -; CGP-NEXT: v_mul_lo_u32 v18, v16, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v4, v18, v10 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] +; CGP-NEXT: v_mul_hi_u32 v11, v15, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v18, v10 +; CGP-NEXT: v_mul_lo_u32 v12, v15, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v18, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v13, v11 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v15, v13 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v17, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v18, v10 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; CGP-NEXT: v_mul_hi_u32 v11, v16, v11 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v12, v18, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v4 -; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v10, vcc -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v4 +; CGP-NEXT: v_addc_u32_e32 v18, vcc, v18, v10, vcc +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 ; CGP-NEXT: v_mov_b32_e32 v4, v11 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v14 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v14, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v4, v14 -; CGP-NEXT: v_mul_lo_u32 v4, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v9, v13, v11 -; CGP-NEXT: v_xor_b32_e32 v15, v8, v14 -; CGP-NEXT: v_mul_hi_u32 v8, v13, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v16, 31, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v16 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v16, vcc +; CGP-NEXT: v_xor_b32_e32 v14, v4, v16 +; CGP-NEXT: v_mul_lo_u32 v4, v18, v10 +; CGP-NEXT: v_mul_lo_u32 v9, v15, v13 +; CGP-NEXT: v_xor_b32_e32 v17, v8, v16 +; CGP-NEXT: v_mul_hi_u32 v8, v15, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v18, v10 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v16, v11 +; CGP-NEXT: v_mul_lo_u32 v8, v18, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v9, v13, v11 +; CGP-NEXT: v_mul_hi_u32 v9, v15, v13 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v11 +; CGP-NEXT: v_mul_hi_u32 v10, v18, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v16, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v15, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v12, v8 -; CGP-NEXT: v_mul_hi_u32 v11, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v15, v8 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v18, v8, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v17, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v14, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v17, v4 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v15, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v17, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v12, v8 +; CGP-NEXT: v_mul_hi_u32 v10, v14, v8 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v11, 0 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v11, v17, v8 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v13, 0 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v4 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v4 ; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v10, v[4:5] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v11, v[9:10] -; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v15, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v15, v9 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v11, v[4:5] +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v14, v8 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v1, v13, v[9:10] +; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v17, v11, vcc +; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v17, v11 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v0 @@ -2754,11 +2754,11 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v1 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v1 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v9, v1, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v0 -; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v15, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; CGP-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc @@ -2766,10 +2766,10 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v14 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v14 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v14 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v16 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v16 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v16, vcc ; CGP-NEXT: ; implicit-def: $vgpr11_vgpr12 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: .LBB8_2: ; %Flow1 @@ -2819,117 +2819,117 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v3, v4, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v6, v6 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v11, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 ; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5] +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v6, v[4:5] ; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v10, v11, v8 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] +; CGP-NEXT: v_mul_hi_u32 v9, v13, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v14, v11, v9 -; CGP-NEXT: v_mul_lo_u32 v15, v6, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v13, v11 +; CGP-NEXT: v_mul_lo_u32 v12, v6, v11 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v13, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v6, v11 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v4 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v4 ; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 ; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v12 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v4, v12 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v6, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v14 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v10, v4, v14 ; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v10, v11, v9 -; CGP-NEXT: v_xor_b32_e32 v13, v5, v12 -; CGP-NEXT: v_mul_hi_u32 v5, v11, v8 +; CGP-NEXT: v_mul_lo_u32 v7, v13, v11 +; CGP-NEXT: v_xor_b32_e32 v12, v5, v14 +; CGP-NEXT: v_mul_hi_u32 v5, v13, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v6, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 +; CGP-NEXT: v_mul_lo_u32 v5, v6, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_mul_hi_u32 v7, v13, v11 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_mul_hi_u32 v8, v6, v11 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 ; CGP-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v13, v4 -; CGP-NEXT: v_mul_lo_u32 v8, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v13, v5 +; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v8, v10, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v13, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_mul_hi_u32 v8, v7, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_mul_lo_u32 v8, v12, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_mul_hi_u32 v7, v10, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v4, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v6 +; CGP-NEXT: v_mul_hi_u32 v8, v12, v5 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v11, 0 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[5:6] -; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v6 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[5:6] +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[6:7] +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v12, v8 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 -; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v3 +; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v4, v2 -; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc +; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 ; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc @@ -2938,11 +2938,11 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v14 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v14 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v14 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc ; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; CGP-NEXT: ; implicit-def: $vgpr5 ; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -3004,15 +3004,15 @@ define i64 @v_srem_i64_24bit(i64 %num, i64 %den) { ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3 -; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0 ; CGP-NEXT: v_rcp_f32_e32 v1, v1 ; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v1 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3 -; CGP-NEXT: v_mul_lo_u32 v1, v1, v4 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v1, 0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v2 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 +; CGP-NEXT: v_mul_lo_u32 v5, v1, v4 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v5, 0 +; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0 ; CGP-NEXT: v_mul_lo_u32 v0, v1, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v5, v0 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v3 @@ -3035,196 +3035,196 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v7, 0 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v1 -; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v7 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v9, 0 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 +; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; GISEL-NEXT: v_trunc_f32_e32 v5, v4 ; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5] -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 -; GISEL-NEXT: v_mul_lo_u32 v13, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v10, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3 +; GISEL-NEXT: v_mul_lo_u32 v8, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 +; GISEL-NEXT: v_mul_lo_u32 v14, v13, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v3 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5] -; GISEL-NEXT: v_mul_hi_u32 v9, v8, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5] -; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0 -; GISEL-NEXT: v_mul_lo_u32 v0, v11, v3 -; GISEL-NEXT: v_mul_lo_u32 v5, v8, v4 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v3 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v10, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] +; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v0 +; GISEL-NEXT: v_mul_lo_u32 v0, v13, v3 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[7:8] +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 +; GISEL-NEXT: v_mul_lo_u32 v5, v10, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v11, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v10, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v3, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v13, v3, vcc ; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v5, v11, v7 ; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v6, v11, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_mul_lo_u32 v5, 0, v8 +; GISEL-NEXT: v_mul_lo_u32 v5, 0, v7 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v6, v11, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v4 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v9, 0 -; GISEL-NEXT: v_mul_hi_u32 v6, 0, v8 -; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v7 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v12, 0 +; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 +; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v6, v[0:1] -; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v7 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v7, v[0:1] +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v8 ; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v11, v7 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v0 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] +; GISEL-NEXT: v_trunc_f32_e32 v9, v7 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v0 +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v9 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v14, v13, 0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], 0, v12, v[5:6] ; GISEL-NEXT: v_mov_b32_e32 v0, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] -; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v12, v[8:9] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v10, v4 -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], 0, v5 -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v7 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v8 -; GISEL-NEXT: v_mul_hi_u32 v9, v12, v7 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v16, v[0:1] +; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v11, v4 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v13, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v4, v16, v7 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v7 +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v10 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, v9, vcc +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], 0, v9 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v9 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v9, v11, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v11, v7 +; GISEL-NEXT: v_mul_lo_u32 v8, v16, v10 +; GISEL-NEXT: v_mul_hi_u32 v7, v16, v7 ; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v8 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_hi_u32 v5, v13, v10 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v10 ; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v4 -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v11, v5, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v7, 0 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v0, vcc +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v4 +; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v16, v5, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, 0 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 +; GISEL-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v9, -1, v6, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v8, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v1 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v7, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v12, v8, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v11, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v11, -1, v6, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v10, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v9, v[5:6] ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v8, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v8, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v4 +; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v14, -1, v5, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v10, v5, vcc ; GISEL-NEXT: v_mul_lo_u32 v5, 0, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v2, v7 -; GISEL-NEXT: v_mul_hi_u32 v13, v2, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v2, v6 +; GISEL-NEXT: v_mul_hi_u32 v9, v2, v4 ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v0, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v11, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_mul_lo_u32 v8, 0, v7 +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v13, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_lo_u32 v7, 0, v6 ; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v2, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v4, v5 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, 0, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v4, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0 +; GISEL-NEXT: v_mul_hi_u32 v7, 0, v6 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v6, v[0:1] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v8, v[5:6] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e32 v8, v13, v8, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v12, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v9, v[5:6] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v7 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v7, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v3 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -3264,15 +3264,15 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v5, 0 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v0, v0, v6 -; CGP-NEXT: v_mul_lo_u32 v5, v1, v3 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0 -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v6, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0 +; CGP-NEXT: v_mul_lo_u32 v5, v0, v6 +; CGP-NEXT: v_mul_lo_u32 v7, v1, v3 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0 +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v6, 0 ; CGP-NEXT: v_sub_i32_e32 v7, vcc, v5, v3 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 ; CGP-NEXT: v_mul_lo_u32 v6, v1, v4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index 9e412b6..23ef596 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -132,65 +132,64 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s9, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -203,54 +202,55 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 +; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v3, v[1:2] -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s8, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s9 -; GFX8-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NEXT: v_subb_u32_e64 v7, s[0:1], v2, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v1 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v6 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s8, v0 +; GFX8-NEXT: v_subb_u32_e64 v5, s[0:1], v2, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v4 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v7 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s10, v6 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v5 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s10, v3 ; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v0, vcc -; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4 -; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v6 +; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v7, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v4 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v8 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s10, v2 +; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, s10, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9 -; GFX8-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v0, vcc +; GFX8-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v0, vcc ; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v14, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v15, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v10, v13, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v9, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v14, vcc ; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm @@ -271,63 +271,64 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v4 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s17, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s17, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s17, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 @@ -339,53 +340,52 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v5, 0 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v6, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v3, v[1:2] -; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s16, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v5, v[1:2] +; GFX9-NEXT: v_add3_u32 v7, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s18, v7, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s19, v6, v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s19 -; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v2, v1, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v8 -; GFX9-NEXT: v_sub_u32_e32 v0, s17, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v7 +; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s16, v0 +; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v2, v4, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v5 +; GFX9-NEXT: v_sub_u32_e32 v0, s17, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s18, v7 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v5 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s18, v3 ; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v0, vcc -; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v7, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v9 -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s18, v2 +; GFX9-NEXT: v_subrev_co_u32_e32 v15, vcc, s18, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10 -; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v16, vcc, 0, v0, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v13, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v14, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v15, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[0:1] -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[12:13] -; GFX9-NEXT: global_store_dwordx2 v6, v[2:3], s[14:15] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v14, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1] +; GFX9-NEXT: global_store_dwordx2 v8, v[0:1], s[12:13] +; GFX9-NEXT: global_store_dwordx2 v8, v[2:3], s[14:15] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_i64: @@ -1005,72 +1005,72 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v10, s13 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 ; GFX8-NEXT: s_sub_u32 s2, 0, s14 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX8-NEXT: s_subb_u32 s3, 0, s15 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, s13 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -1083,136 +1083,136 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v7, 0 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v8, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v8, v[1:2] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s13, v7, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v4, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v9, v[1:2] ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s8, v0 -; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v3, v2, vcc -; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s9, v2 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s13, v8, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v2, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s9, v4 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v9, v3, v4, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v3, v4, s[0:1] ; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s15 ; GFX8-NEXT: v_cvt_f32_u32_e32 v4, s14 -; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v2, v5, vcc +; GFX8-NEXT: v_subb_u32_e32 v12, vcc, v2, v10, vcc ; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v1 -; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v6, vcc +; GFX8-NEXT: v_subrev_u32_e32 v13, vcc, s12, v1 +; GFX8-NEXT: v_subbrev_u32_e64 v14, s[0:1], 0, v12, vcc ; GFX8-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GFX8-NEXT: v_trunc_f32_e32 v4, v3 ; GFX8-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4 ; GFX8-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v2 -; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v7 -; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v8, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v4 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v5, vcc -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[3:4] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v4, v15, v2 -; GFX8-NEXT: v_mul_lo_u32 v17, v12, v3 -; GFX8-NEXT: v_mul_hi_u32 v6, v12, v2 -; GFX8-NEXT: v_mul_hi_u32 v2, v15, v2 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v17 -; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v2 +; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], 1, v8 +; GFX8-NEXT: v_addc_u32_e64 v17, s[0:1], 0, v9, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, 0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v18, v4 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v14 +; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v18, v[3:4] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v13 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v15, v[4:5] +; GFX8-NEXT: v_mul_lo_u32 v4, v18, v2 +; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v12, v10, vcc +; GFX8-NEXT: v_mul_lo_u32 v5, v15, v6 +; GFX8-NEXT: v_mul_hi_u32 v10, v15, v2 +; GFX8-NEXT: v_mul_hi_u32 v2, v18, v2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v14 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v15, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v17, v4 -; GFX8-NEXT: v_mul_hi_u32 v17, v12, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v17 -; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v17 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 1, v13 -; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v14, vcc -; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v10 -; GFX8-NEXT: v_mul_hi_u32 v3, v15, v3 -; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v5, vcc +; GFX8-NEXT: v_mul_lo_u32 v10, v18, v6 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: v_mul_hi_u32 v5, v15, v6 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v10, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v10, v5 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 1, v16 +; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v17, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v13 +; GFX8-NEXT: v_mul_hi_u32 v6, v18, v6 +; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v7, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 -; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v2 -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v12, 0 -; GFX8-NEXT: v_addc_u32_e32 v15, vcc, v15, v3, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v2 +; GFX8-NEXT: v_addc_u32_e32 v18, vcc, v18, v4, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, 0 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v10, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, v5 -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v15, v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v18, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s3, v12, v[5:6] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v8, v13, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v7, v15, v4 -; GFX8-NEXT: v_mul_lo_u32 v8, v12, v5 -; GFX8-NEXT: v_mul_hi_u32 v9, v12, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v19, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v10, v11, v20, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v18, v[2:3] +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v3, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[2:3], s3, v15, v[5:6] +; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v12, vcc +; GFX8-NEXT: v_mul_lo_u32 v6, v18, v4 +; GFX8-NEXT: v_mul_lo_u32 v8, v15, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v10, s[0:1] +; GFX8-NEXT: v_mul_hi_u32 v9, v15, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v19, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v10, v14, v20, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v9, v15, v5 -; GFX8-NEXT: v_mul_hi_u32 v4, v15, v4 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 -; GFX8-NEXT: v_mul_hi_u32 v8, v12, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v9, v18, v7 +; GFX8-NEXT: v_mul_hi_u32 v4, v18, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 +; GFX8-NEXT: v_mul_hi_u32 v8, v15, v7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v9, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v9, v8 -; GFX8-NEXT: v_mul_hi_u32 v5, v15, v5 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v7 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v12, v4 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v15, v5, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s11, v4 -; GFX8-NEXT: v_mul_lo_u32 v8, s10, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v6, s[0:1] +; GFX8-NEXT: v_mul_hi_u32 v7, v18, v7 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v15, v4 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v18, v6, vcc +; GFX8-NEXT: v_mul_lo_u32 v8, s11, v4 +; GFX8-NEXT: v_mul_lo_u32 v9, s10, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v5, s[0:1] ; GFX8-NEXT: v_mul_hi_u32 v1, s10, v4 ; GFX8-NEXT: v_mul_hi_u32 v4, s11, v4 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v8, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v7, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s11, v5 +; GFX8-NEXT: v_mul_lo_u32 v5, s11, v7 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v8, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, s10, v5 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: v_mul_hi_u32 v8, s10, v7 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v5, v8 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, s11, v5 +; GFX8-NEXT: v_mul_hi_u32 v7, s11, v7 ; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s14, v11, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v7, v1 -; GFX8-NEXT: v_add_u32_e32 v12, vcc, v8, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v8, v1 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, v7, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s14, v12, v[1:2] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v0, v10, s[0:1] @@ -1279,60 +1279,61 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v4 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 ; GFX9-NEXT: s_sub_u32 s2, 0, s6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX9-NEXT: s_subb_u32 s3, 0, s7 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0 @@ -1349,114 +1350,113 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v9, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s4, v8, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v7, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_add3_u32 v10, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s4, v10, v[1:2] ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s16, v0 -; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v3, v2, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v9, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v2, v4, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v1 +; GFX9-NEXT: v_sub_u32_e32 v2, s17, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v3, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v4, s[0:1] ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 -; GFX9-NEXT: v_sub_u32_e32 v2, s17, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v5, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, v2, v8, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3 ; GFX9-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s4, v1 -; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v6, vcc +; GFX9-NEXT: v_subrev_co_u32_e32 v13, vcc, s4, v1 +; GFX9-NEXT: v_subbrev_co_u32_e64 v14, s[0:1], 0, v12, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GFX9-NEXT: v_trunc_f32_e32 v4, v3 ; GFX9-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4 ; GFX9-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v2 -; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v7 -; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v8, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v4 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v5, vcc -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[3:4] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v4, v15, v2 -; GFX9-NEXT: v_mul_lo_u32 v17, v12, v3 -; GFX9-NEXT: v_mul_hi_u32 v5, v12, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, v15, v2 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v17 -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v2 +; GFX9-NEXT: v_add_co_u32_e64 v16, s[0:1], 1, v9 +; GFX9-NEXT: v_addc_co_u32_e64 v17, s[0:1], 0, v10, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, 0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v18, v4 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v14 +; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v18, v[3:4] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v13 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v15, v[4:5] +; GFX9-NEXT: v_mul_lo_u32 v4, v18, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v12, v8, vcc +; GFX9-NEXT: v_mul_lo_u32 v5, v15, v6 +; GFX9-NEXT: v_mul_hi_u32 v8, v15, v2 +; GFX9-NEXT: v_mul_hi_u32 v2, v18, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v14 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, v15, v3 -; GFX9-NEXT: v_add_u32_e32 v4, v17, v4 -; GFX9-NEXT: v_mul_hi_u32 v17, v12, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, v15, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2 +; GFX9-NEXT: v_mul_lo_u32 v8, v18, v6 +; GFX9-NEXT: v_add_u32_e32 v4, v5, v4 +; GFX9-NEXT: v_mul_hi_u32 v5, v15, v6 +; GFX9-NEXT: v_mul_hi_u32 v6, v18, v6 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v17 -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v5, v5, v17 -; GFX9-NEXT: v_add_co_u32_e32 v17, vcc, 1, v13 -; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v14, vcc +; GFX9-NEXT: v_add_u32_e32 v5, v8, v5 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 1, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v17, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v2 -; GFX9-NEXT: v_add3_u32 v3, v5, v4, v3 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v12, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v3, vcc +; GFX9-NEXT: v_add3_u32 v4, v5, v4, v6 +; GFX9-NEXT: v_add_co_u32_e32 v15, vcc, v15, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1] +; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, v18, v4, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, 0 +; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v13 ; GFX9-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, v[2:3] -; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v10 -; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v6, vcc -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s3, v12, v[2:3] -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v13, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v6, v15, v4 -; GFX9-NEXT: v_mul_lo_u32 v7, v12, v5 -; GFX9-NEXT: v_mul_hi_u32 v9, v12, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v15, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v18, vcc -; GFX9-NEXT: v_add_co_u32_e64 v6, s[2:3], v6, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v6, s[2:3], v6, v9 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v18, v[2:3] +; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v16, v8, vcc +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s3, v15, v[5:6] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 +; GFX9-NEXT: v_mul_lo_u32 v5, v18, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v15, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v9, v2, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v9, v15, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v18, v4 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[2:3], v5, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v9, v15, v5 -; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 -; GFX9-NEXT: v_mul_hi_u32 v7, v12, v5 -; GFX9-NEXT: v_mul_hi_u32 v5, v15, v5 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[2:3], v5, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3] +; GFX9-NEXT: v_mul_lo_u32 v9, v18, v7 +; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_mul_hi_u32 v6, v15, v7 +; GFX9-NEXT: v_mul_hi_u32 v7, v18, v7 ; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v9, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[2:3] ; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v6 -; GFX9-NEXT: v_add_u32_e32 v7, v9, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] -; GFX9-NEXT: v_add3_u32 v5, v7, v6, v5 -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v12, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[2:3], v15, v5, s[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v5 +; GFX9-NEXT: v_add_u32_e32 v6, v9, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3] +; GFX9-NEXT: v_add3_u32 v5, v6, v5, v7 +; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v15, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[2:3], v18, v5, s[2:3] ; GFX9-NEXT: v_mul_lo_u32 v6, s19, v4 ; GFX9-NEXT: v_mul_lo_u32 v7, s18, v5 ; GFX9-NEXT: v_mul_hi_u32 v9, s18, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v14, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v19, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v20, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v13, v19, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v20, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v9 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll new file mode 100644 index 0000000..d7d623a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor %s | FileCheck %s + +; Make sure we do not infer anything about implicit inputs through an +; intrinsic call which is not nocallback. + +declare zeroext i32 @return_i32() + +define i32 @test_i32_return() gc "statepoint-example" { +; CHECK-LABEL: define i32 @test_i32_return( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] gc "statepoint-example" { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SAFEPOINT_TOKEN:%.*]] = tail call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(i32 ()) @return_i32, i32 0, i32 0, i32 0, i32 0) +; CHECK-NEXT: [[CALL1:%.*]] = call zeroext i32 @llvm.experimental.gc.result.i32(token [[SAFEPOINT_TOKEN]]) +; CHECK-NEXT: ret i32 [[CALL1]] +; +entry: + %safepoint_token = tail call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(i32 ()) @return_i32, i32 0, i32 0, i32 0, i32 0) + %call1 = call zeroext i32 @llvm.experimental.gc.result.i32(token %safepoint_token) + ret i32 %call1 +} + +declare token @llvm.experimental.gc.statepoint.p0(i64 immarg, i32 immarg, ptr, i32 immarg, i32 immarg, ...) +declare i32 @llvm.experimental.gc.result.i32(token) #0 + +attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) } +;. +; CHECK: attributes #[[ATTR0]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" } +;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll new file mode 100644 index 0000000..71c509a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll @@ -0,0 +1,74 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5 +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-attributor -mcpu=gfx90a %s | FileCheck %s + +; Make sure we infer no inputs are used through some intrinsics + +define void @use_fake_use(i32 %arg) { +; CHECK-LABEL: define void @use_fake_use( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[ARG]]) +; CHECK-NEXT: ret void +; + call void (...) @llvm.fake.use(i32 %arg) + ret void +} + +define void @use_donothing() { +; CHECK-LABEL: define void @use_donothing( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: ret void +; + call void @llvm.donothing() + ret void +} + +define void @use_assume(i1 %arg) { +; CHECK-LABEL: define void @use_assume( +; CHECK-SAME: i1 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.assume(i1 [[ARG]]) +; CHECK-NEXT: ret void +; + call void @llvm.assume(i1 %arg) + ret void +} + +define void @use_trap() { +; CHECK-LABEL: define void @use_trap( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: call void @llvm.trap() +; CHECK-NEXT: ret void +; + call void @llvm.trap() + ret void +} + +define void @use_debugtrap() { +; CHECK-LABEL: define void @use_debugtrap( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @llvm.debugtrap() +; CHECK-NEXT: ret void +; + call void @llvm.debugtrap() + ret void +} + +define void @use_ubsantrap() { +; CHECK-LABEL: define void @use_ubsantrap( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @llvm.ubsantrap(i8 0) +; CHECK-NEXT: ret void +; + call void @llvm.ubsantrap(i8 0) + ret void +} + +;. +; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { nounwind "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR6:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) "target-cpu"="gfx90a" } +;. diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll index 0fc54ae..26f7789 100644 --- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -2407,51 +2407,52 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v30, v31, 0 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v30, v18, 0 -; GISEL-NEXT: v_mul_lo_u32 v24, v30, v19 -; GISEL-NEXT: v_mul_lo_u32 v25, v29, v18 +; GISEL-NEXT: v_mul_lo_u32 v27, v30, v19 +; GISEL-NEXT: v_mul_lo_u32 v36, v29, v18 ; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v35, v20, 0 ; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v2, 0 -; GISEL-NEXT: v_mul_lo_u32 v26, v35, v3 -; GISEL-NEXT: v_mul_lo_u32 v27, v34, v2 +; GISEL-NEXT: v_mul_lo_u32 v37, v35, v3 +; GISEL-NEXT: v_mul_lo_u32 v38, v34, v2 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v29, v32, v[14:15] ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v34, v21, v[22:23] ; GISEL-NEXT: v_mov_b32_e32 v22, v19 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v31, v[2:3] -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v4, v20, v[14:15] -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v30, v32, v[1:2] -; GISEL-NEXT: v_mov_b32_e32 v23, v14 -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v21, v[22:23] -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v29, v31, v[1:2] -; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], v3, v24, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v34, v20, v[22:23] -; GISEL-NEXT: v_addc_u32_e64 v14, s[6:7], v15, v26, s[6:7] -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v25, vcc +; GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v10, v31, v[2:3] +; GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v4, v20, v[14:15] +; GISEL-NEXT: v_mov_b32_e32 v2, v23 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], vcc, v30, v32, v[1:2] +; GISEL-NEXT: v_mov_b32_e32 v23, v25 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v35, v21, v[22:23] +; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v29, v31, v[14:15] +; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], v24, v27, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v34, v20, v[1:2] +; GISEL-NEXT: v_addc_u32_e64 v2, s[6:7], v26, v37, s[6:7] +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v3, v36, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_xor_b32_e32 v15, v0, v28 -; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v14, v27, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v18 -; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v13, v22, s[4:5] -; GISEL-NEXT: v_xor_b32_e32 v16, v12, v33 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v10, v32, v[3:4] -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v28 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v4, v21, v[0:1] -; GISEL-NEXT: v_xor_b32_e32 v14, v14, v33 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v11, v31, v[12:13] -; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v15, v28 +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v17, v22, vcc +; GISEL-NEXT: v_xor_b32_e32 v19, v0, v28 +; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v2, v38, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v12, v18 +; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v13, v14, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v18, v2, v33 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v10, v32, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v16, v28 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v4, v21, v[0:1] +; GISEL-NEXT: v_xor_b32_e32 v10, v14, v33 +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v11, v31, v[2:3] +; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v19, v28 ; GISEL-NEXT: v_subb_u32_e64 v1, s[6:7], v1, v28, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v5, v20, v[3:4] -; GISEL-NEXT: v_sub_i32_e64 v4, s[8:9], v16, v33 -; GISEL-NEXT: v_subb_u32_e64 v5, s[8:9], v14, v33, s[8:9] -; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v8, v2, vcc -; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v9, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v28 -; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v23, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v5, v20, v[12:13] +; GISEL-NEXT: v_sub_i32_e64 v4, s[8:9], v18, v33 +; GISEL-NEXT: v_subb_u32_e64 v5, s[8:9], v10, v33, s[8:9] +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v8, v23, vcc +; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v9, v16, vcc +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v28 +; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v15, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v7, v2, vcc ; GISEL-NEXT: v_xor_b32_e32 v6, v6, v33 ; GISEL-NEXT: v_xor_b32_e32 v7, v8, v28 -; GISEL-NEXT: v_xor_b32_e32 v8, v3, v33 -; GISEL-NEXT: v_subb_u32_e64 v2, vcc, v2, v28, s[6:7] +; GISEL-NEXT: v_xor_b32_e32 v8, v2, v33 +; GISEL-NEXT: v_subb_u32_e64 v2, vcc, v3, v28, s[6:7] ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v7, v28, vcc ; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v33, s[8:9] ; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v8, v33, vcc @@ -3216,36 +3217,38 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v32, 0 ; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v8, v20, 0 -; GISEL-NEXT: v_mul_lo_u32 v28, v8, v21 -; GISEL-NEXT: v_mul_lo_u32 v29, v9, v20 +; GISEL-NEXT: v_mul_lo_u32 v34, v8, v21 +; GISEL-NEXT: v_mul_lo_u32 v35, v9, v20 ; GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v12, v24, 0 ; GISEL-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v12, v18, 0 -; GISEL-NEXT: v_mul_lo_u32 v30, v12, v19 -; GISEL-NEXT: v_mul_lo_u32 v31, v13, v18 +; GISEL-NEXT: v_mul_lo_u32 v36, v12, v19 +; GISEL-NEXT: v_mul_lo_u32 v37, v13, v18 ; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v33, v[22:23] ; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v13, v25, v[26:27] -; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v10, v32, v[18:19] -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v14, v24, v[22:23] -; GISEL-NEXT: v_mad_u64_u32 v[17:18], vcc, v8, v33, v[17:18] -; GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v12, v25, v[21:22] -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v9, v32, v[17:18] -; GISEL-NEXT: v_addc_u32_e64 v17, s[6:7], v19, v28, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v13, v24, v[21:22] -; GISEL-NEXT: v_addc_u32_e64 v18, s[6:7], v23, v30, s[6:7] -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v29, vcc +; GISEL-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v10, v32, v[18:19] +; GISEL-NEXT: v_mad_u64_u32 v[28:29], s[4:5], v14, v24, v[22:23] +; GISEL-NEXT: v_mov_b32_e32 v18, v26 +; GISEL-NEXT: v_mad_u64_u32 v[30:31], vcc, v8, v33, v[17:18] +; GISEL-NEXT: v_mov_b32_e32 v22, v28 +; GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v12, v25, v[21:22] +; GISEL-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v9, v32, v[30:31] +; GISEL-NEXT: v_addc_u32_e64 v12, s[6:7], v27, v34, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v13, v24, v[17:18] +; GISEL-NEXT: v_addc_u32_e64 v13, s[6:7], v29, v36, s[6:7] +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v35, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v18, v31, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v21, vcc +; GISEL-NEXT: v_addc_u32_e64 v13, s[4:5], v13, v37, s[4:5] ; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v20 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v12, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v10, v33, v[17:18] -; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v14, v25, v[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v11, v32, v[16:17] -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v15, v24, v[18:19] -; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v9, vcc -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v13, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v11, vcc +; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v8, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v10, v33, v[12:13] +; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v14, v25, v[13:14] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v11, v32, v[16:17] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v15, v24, v[18:19] +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v22, vcc +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v9, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v10, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] %shl = urem <2 x i128> %lhs, %rhs ret <2 x i128> %shl diff --git a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir index 0548bcf..279f429 100644 --- a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir +++ b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir @@ -34,3 +34,16 @@ body: | $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec dead $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec ... + +--- +name: test_tied +body: | + bb.0: + ; CHECK-LABEL: name: test_tied + ; CHECK: BUNDLE implicit-def %0, implicit-def %2, implicit %1:vgpr_32(tied-def 1), implicit $mode, implicit $exec { + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %1:vgpr_32 + ; CHECK-NEXT: [[V_FMAC_F16_e32_:%[0-9]+]]:vgpr_32 = V_FMAC_F16_e32 internal [[COPY]], internal [[COPY]], %1:vgpr_32, implicit $mode, implicit $exec + ; CHECK-NEXT: } + %1:vgpr_32 = COPY %0:vgpr_32 + %2:vgpr_32 = V_FMAC_F16_e32 %1, %1, %0, implicit $mode, implicit $exec +... diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index e042157..460f121 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -237,31 +237,31 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 ; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffb8d, v6 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr9 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr8 @@ -275,17 +275,18 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v9, 0 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3] -; GISEL-NEXT: v_mul_lo_u32 v6, v5, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7] -; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v6, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v1, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, v[2:3] +; GISEL-NEXT: v_mul_lo_u32 v10, v7, v9 +; GISEL-NEXT: v_mov_b32_e32 v2, v4 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], vcc, v6, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v6, v6, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[3:4] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v5, v6, s[6:7] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: .LBB0_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: .LBB0_7: ; %Flow2 @@ -604,31 +605,31 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 ; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffb8d, v6 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr9 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr8 @@ -642,17 +643,18 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v9, 0 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3] -; GISEL-NEXT: v_mul_lo_u32 v6, v5, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7] -; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v6, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v1, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, v[2:3] +; GISEL-NEXT: v_mul_lo_u32 v10, v7, v9 +; GISEL-NEXT: v_mov_b32_e32 v2, v4 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], vcc, v6, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v6, v6, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[3:4] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v5, v6, s[6:7] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: .LBB1_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: .LBB1_7: ; %Flow2 @@ -962,31 +964,31 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v8, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v8, 0 ; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v8, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v9, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v8 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v8, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v9, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v8 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v8 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v9, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v8, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v9, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr9 @@ -999,12 +1001,14 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v8, 0 -; GISEL-NEXT: v_mul_lo_u32 v5, v4, v8 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v6, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v6, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GISEL-NEXT: v_mul_lo_u32 v7, v6, v8 +; GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v6, v8, v[4:5] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc ; GISEL-NEXT: .LBB2_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: .LBB2_7: ; %Flow2 @@ -1314,31 +1318,31 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v8, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v8, 0 ; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v8, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v9, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v8 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v8, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v9, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v8 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v8 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v9, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v8, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v9, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr9 @@ -1351,12 +1355,14 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v8, 0 -; GISEL-NEXT: v_mul_lo_u32 v5, v4, v8 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v6, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v6, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GISEL-NEXT: v_mul_lo_u32 v7, v6, v8 +; GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v6, v8, v[4:5] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc ; GISEL-NEXT: .LBB3_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: .LBB3_7: ; %Flow2 @@ -1702,31 +1708,31 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_add_u32_e32 v10, 0xffffff7a, v5 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v10, v[6:7] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 ; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, v5 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v10 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v12, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: .LBB6_4: ; %Flow @@ -2050,31 +2056,31 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_add_u32_e32 v10, 0xffffff7a, v5 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v10, v[6:7] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 ; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, v5 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v10 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v12, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: .LBB7_4: ; %Flow diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index 31b6b53..f705a2f 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -5775,28 +5775,28 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX7-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2] -; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, v0, v6 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v9, vcc, v4, v7, vcc -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7] -; GFX7-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v5 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2] +; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] +; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v8 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v11, vcc, v6, v9, vcc +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] +; GFX7-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc +; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[4:5] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_imad_pat_i64: @@ -5831,28 +5831,28 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX8-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2] -; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, v0, v6 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, v4, v7, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7] -; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v7, vcc, 1, v5 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2] +; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] +; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, v0, v8 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, v6, v9, vcc +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] +; GFX8-GISEL-NEXT: v_add_u32_e32 v7, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 1, v4 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[4:5] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-SDAG-LABEL: clpeak_imad_pat_i64: @@ -5883,28 +5883,28 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX900-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX900-GISEL: ; %bb.0: ; %entry ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, v0, v6 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, v4, v7, vcc -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0 -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v5 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v2, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, v0, v8 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, v6, v9, vcc +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v4 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v9, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[4:5] ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-SDAG-LABEL: clpeak_imad_pat_i64: @@ -5935,29 +5935,29 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX90A-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX90A-GISEL: ; %bb.0: ; %entry ; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v4 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, v0, v6 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, v1, v7, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v2, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, v3, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v2, v[6:7] -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: v_add_u32_e32 v5, v5, v2 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v4 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v5, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v6, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, v[0:1] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v4, v3, v0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v7, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v8, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, v[2:3] +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v6 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, v0, v8 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v10, vcc, v1, v9, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v2, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v2, v[6:7] +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 +; GFX90A-GISEL-NEXT: v_add_u32_e32 v6, v5, v8 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v4 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v6, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, v[0:1] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v6, v3, v4 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v8, v[4:5] ; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6408,52 +6408,52 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX7-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v2 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] +; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc +; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v2 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] -; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v12 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX7-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, v2, v14 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9] +; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v14 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11] +; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v12, v15, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0 +; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, v2, v16 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v10 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] -; GFX7-GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2] +; GFX7-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v8, v17, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15] +; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v12, vcc ; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v2 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v10 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] -; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v13 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] +; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v9 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] +; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v16, vcc +; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v13 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v3, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6513,52 +6513,52 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX8-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v2 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] +; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v2 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] -; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, v0, v12 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, v2, v14 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9] +; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, v0, v14 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v12, v15, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, v2, v16 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v10 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] -; GFX8-GISEL-NEXT: v_add_u32_e32 v11, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v8, v17, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15] +; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v12, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v2 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v10 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v13 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] +; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v9 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v16, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, 1, v13 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6610,52 +6610,52 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX900-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX900-GISEL: ; %bb.0: ; %entry ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v2 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v1, vcc +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v2 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v3, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, v0, v12 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, v8, v13, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v17, vcc, v2, v14 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, v0, v14 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11] +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v12, v15, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0 +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, v2, v16 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v10 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v9, v15, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v11, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v8, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2] +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v19, vcc, v8, v17, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v12, vcc ; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v2 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v8, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v9, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v10 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v13 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v4, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v9 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v16, vcc +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v17, vcc, 1, v13 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v3, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6707,54 +6707,54 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX90A-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX90A-GISEL: ; %bb.0: ; %entry ; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v2 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v3, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v5, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v4, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v4, v[2:3] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v7, 0 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, v0, v10 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v6, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v6, v[8:9] -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v14, vcc, v1, v11, vcc -; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v8 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, v2, v12 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v4, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v10, v5, 0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, v3, v13, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v4, v[10:11] -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v7, 0 -; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v9, v4 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v6, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v6, v[10:11] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v5, v5, v6 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v2 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v3, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v8 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v6, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v6, v[0:1] -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v9, vcc -; GFX90A-GISEL-NEXT: v_add_u32_e32 v8, v3, v0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v11, 0 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v4 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v10, v[0:1] -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v5, vcc -; GFX90A-GISEL-NEXT: v_add_u32_e32 v7, v7, v0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v12, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v13, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v12, v[2:3] -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v15, 0 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v2 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[2:3] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v8 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v7, 0 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, v0, v12 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v6, v[8:9] +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, v1, v13, vcc +; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v10 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, v2, v14 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v5, 0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, v3, v15, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v4, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v4, v[10:11] +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, 0 +; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v9, v12 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v6, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v6, v[10:11] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v10, v5, v12 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v11, vcc, 1, v2 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v3, vcc +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v13, vcc, 1, v8 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v9, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v5, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v5, v[0:1] +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v12, 0 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v15, vcc, 1, v4 +; GFX90A-GISEL-NEXT: v_add_u32_e32 v8, v3, v6 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v11, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v11, v[0:1] +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v10, vcc +; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v7, v4 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v14, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v13, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v13, v[4:5] +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v16, 0 ; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v14, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v14, v[4:5] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v15, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v15, v[4:5] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v6 ; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: clpeak_imad_pat_v2i64: diff --git a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir index fa52b96..02eda2c 100644 --- a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir +++ b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir @@ -6,40 +6,12 @@ # No more registers shall be defined --- name: main -alignment: 1 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false tracksRegLiveness: true registers: - - { id: 1, class: sreg_32_xm0, preferred-register: '%1' } - - { id: 2, class: vreg_64, preferred-register: '%2' } - - { id: 3, class: vreg_64 } - - { id: 4, class: vreg_64 } - - { id: 5, class: vreg_64 } - - { id: 6, class: vreg_96 } - - { id: 7, class: vreg_96 } - - { id: 8, class: vreg_128 } - - { id: 9, class: vreg_128 } -liveins: - - { reg: '$sgpr6', virtual-reg: '%1' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false + - { id: 0, class: sreg_32_xm0, preferred-register: '%0' } + - { id: 1, class: vreg_64, preferred-register: '%1' } body: | - bb.0.entry: + bb.0: liveins: $sgpr0, $vgpr0_vgpr1 ; CHECK-LABEL: name: main @@ -59,20 +31,21 @@ body: | ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1_sub2:vreg_128 = COPY [[DEF2]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub3:vreg_128 = COPY [[DEF]].sub0 ; CHECK-NEXT: FLAT_STORE_DWORDX4 $vgpr0_vgpr1, [[COPY3]], 0, 0, implicit $exec, implicit $flat_scr - %3 = IMPLICIT_DEF - undef %4.sub0 = COPY $sgpr0 - %4.sub1 = COPY %3.sub0 - undef %5.sub0 = COPY %4.sub1 - %5.sub1 = COPY %4.sub0 - FLAT_STORE_DWORDX2 $vgpr0_vgpr1, killed %5, 0, 0, implicit $exec, implicit $flat_scr + %2:vreg_64 = IMPLICIT_DEF + undef %3.sub0:vreg_64 = COPY $sgpr0 + %3.sub1:vreg_64 = COPY %2.sub0 + undef %4.sub0:vreg_64 = COPY %3.sub1 + %4.sub1:vreg_64 = COPY %3.sub0 + FLAT_STORE_DWORDX2 $vgpr0_vgpr1, killed %4, 0, 0, implicit $exec, implicit $flat_scr - %6 = IMPLICIT_DEF - undef %7.sub0_sub1 = COPY %6 - %7.sub2 = COPY %3.sub0 - FLAT_STORE_DWORDX3 $vgpr0_vgpr1, killed %7, 0, 0, implicit $exec, implicit $flat_scr + %5:vreg_96 = IMPLICIT_DEF + undef %6.sub0_sub1:vreg_96 = COPY %5 + %6.sub2:vreg_96 = COPY %2.sub0 + FLAT_STORE_DWORDX3 $vgpr0_vgpr1, killed %6, 0, 0, implicit $exec, implicit $flat_scr + + %7:vreg_128 = IMPLICIT_DEF + undef %8.sub0_sub1_sub2:vreg_128 = COPY %7 + %8.sub3:vreg_128 = COPY %2.sub0 + FLAT_STORE_DWORDX4 $vgpr0_vgpr1, killed %8, 0, 0, implicit $exec, implicit $flat_scr - %8 = IMPLICIT_DEF - undef %9.sub0_sub1_sub2 = COPY %8 - %9.sub3 = COPY %3.sub0 - FLAT_STORE_DWORDX4 $vgpr0_vgpr1, killed %9, 0, 0, implicit $exec, implicit $flat_scr ... diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll index 704ea37..8e7389a 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll @@ -9,11 +9,11 @@ ; RUN: | FileCheck -check-prefix=GCN-O3 %s -; GCN-O0: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O0>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O0: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,require<runtime-libcall-info>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O0>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) -; GCN-O2: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O2>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt,amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O2: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,require<runtime-libcall-info>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O2>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt,amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) -; GCN-O3: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O3>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt,amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O3: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,require<runtime-libcall-info>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O3>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt,amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) define void @empty() { ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll index 1ab4cb0..d82d6bc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll @@ -781,16 +781,23 @@ define amdgpu_cs_chain void @wwm_write_to_arg_reg(<3 x i32> inreg %sgpr, ptr inr ; GISEL12-NEXT: v_dual_mov_b32 v14, v38 :: v_dual_mov_b32 v15, v39 ; GISEL12-NEXT: s_wait_kmcnt 0x0 ; GISEL12-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GISEL12-NEXT: v_dual_mov_b32 v24, v0 :: v_dual_mov_b32 v25, v1 -; GISEL12-NEXT: v_dual_mov_b32 v26, v2 :: v_dual_mov_b32 v27, v3 -; GISEL12-NEXT: v_dual_mov_b32 v28, v4 :: v_dual_mov_b32 v29, v5 -; GISEL12-NEXT: v_dual_mov_b32 v30, v6 :: v_dual_mov_b32 v31, v7 -; GISEL12-NEXT: v_dual_mov_b32 v32, v8 :: v_dual_mov_b32 v33, v9 -; GISEL12-NEXT: v_dual_mov_b32 v34, v10 :: v_dual_mov_b32 v35, v11 -; GISEL12-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v37, v13 -; GISEL12-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v39, v15 +; GISEL12-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 +; GISEL12-NEXT: v_dual_mov_b32 v42, v2 :: v_dual_mov_b32 v43, v3 +; GISEL12-NEXT: v_dual_mov_b32 v44, v4 :: v_dual_mov_b32 v45, v5 +; GISEL12-NEXT: v_dual_mov_b32 v46, v6 :: v_dual_mov_b32 v47, v7 +; GISEL12-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v49, v9 +; GISEL12-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v11 +; GISEL12-NEXT: v_dual_mov_b32 v52, v12 :: v_dual_mov_b32 v53, v13 +; GISEL12-NEXT: v_dual_mov_b32 v54, v14 :: v_dual_mov_b32 v55, v15 ; GISEL12-NEXT: s_mov_b32 exec_lo, s9 -; GISEL12-NEXT: ; kill: def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $exec +; GISEL12-NEXT: v_dual_mov_b32 v24, v40 :: v_dual_mov_b32 v25, v41 +; GISEL12-NEXT: v_dual_mov_b32 v26, v42 :: v_dual_mov_b32 v27, v43 +; GISEL12-NEXT: v_dual_mov_b32 v28, v44 :: v_dual_mov_b32 v29, v45 +; GISEL12-NEXT: v_dual_mov_b32 v30, v46 :: v_dual_mov_b32 v31, v47 +; GISEL12-NEXT: v_dual_mov_b32 v32, v48 :: v_dual_mov_b32 v33, v49 +; GISEL12-NEXT: v_dual_mov_b32 v34, v50 :: v_dual_mov_b32 v35, v51 +; GISEL12-NEXT: v_dual_mov_b32 v36, v52 :: v_dual_mov_b32 v37, v53 +; GISEL12-NEXT: v_dual_mov_b32 v38, v54 :: v_dual_mov_b32 v39, v55 ; GISEL12-NEXT: .LBB5_2: ; %tail ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s4 @@ -946,24 +953,39 @@ define amdgpu_cs_chain void @wwm_write_to_arg_reg(<3 x i32> inreg %sgpr, ptr inr ; GISEL10-NEXT: s_mov_b64 s[2:3], s[50:51] ; GISEL10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL10-NEXT: s_swappc_b64 s[30:31], s[12:13] -; GISEL10-NEXT: v_mov_b32_e32 v24, v0 -; GISEL10-NEXT: v_mov_b32_e32 v25, v1 -; GISEL10-NEXT: v_mov_b32_e32 v26, v2 -; GISEL10-NEXT: v_mov_b32_e32 v27, v3 -; GISEL10-NEXT: v_mov_b32_e32 v28, v4 -; GISEL10-NEXT: v_mov_b32_e32 v29, v5 -; GISEL10-NEXT: v_mov_b32_e32 v30, v6 -; GISEL10-NEXT: v_mov_b32_e32 v31, v7 -; GISEL10-NEXT: v_mov_b32_e32 v32, v8 -; GISEL10-NEXT: v_mov_b32_e32 v33, v9 -; GISEL10-NEXT: v_mov_b32_e32 v34, v10 -; GISEL10-NEXT: v_mov_b32_e32 v35, v11 -; GISEL10-NEXT: v_mov_b32_e32 v36, v12 -; GISEL10-NEXT: v_mov_b32_e32 v37, v13 -; GISEL10-NEXT: v_mov_b32_e32 v38, v14 -; GISEL10-NEXT: v_mov_b32_e32 v39, v15 +; GISEL10-NEXT: v_mov_b32_e32 v40, v0 +; GISEL10-NEXT: v_mov_b32_e32 v41, v1 +; GISEL10-NEXT: v_mov_b32_e32 v42, v2 +; GISEL10-NEXT: v_mov_b32_e32 v43, v3 +; GISEL10-NEXT: v_mov_b32_e32 v44, v4 +; GISEL10-NEXT: v_mov_b32_e32 v45, v5 +; GISEL10-NEXT: v_mov_b32_e32 v46, v6 +; GISEL10-NEXT: v_mov_b32_e32 v47, v7 +; GISEL10-NEXT: v_mov_b32_e32 v48, v8 +; GISEL10-NEXT: v_mov_b32_e32 v49, v9 +; GISEL10-NEXT: v_mov_b32_e32 v50, v10 +; GISEL10-NEXT: v_mov_b32_e32 v51, v11 +; GISEL10-NEXT: v_mov_b32_e32 v52, v12 +; GISEL10-NEXT: v_mov_b32_e32 v53, v13 +; GISEL10-NEXT: v_mov_b32_e32 v54, v14 +; GISEL10-NEXT: v_mov_b32_e32 v55, v15 ; GISEL10-NEXT: s_mov_b32 exec_lo, s9 -; GISEL10-NEXT: ; kill: def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $exec +; GISEL10-NEXT: v_mov_b32_e32 v24, v40 +; GISEL10-NEXT: v_mov_b32_e32 v25, v41 +; GISEL10-NEXT: v_mov_b32_e32 v26, v42 +; GISEL10-NEXT: v_mov_b32_e32 v27, v43 +; GISEL10-NEXT: v_mov_b32_e32 v28, v44 +; GISEL10-NEXT: v_mov_b32_e32 v29, v45 +; GISEL10-NEXT: v_mov_b32_e32 v30, v46 +; GISEL10-NEXT: v_mov_b32_e32 v31, v47 +; GISEL10-NEXT: v_mov_b32_e32 v32, v48 +; GISEL10-NEXT: v_mov_b32_e32 v33, v49 +; GISEL10-NEXT: v_mov_b32_e32 v34, v50 +; GISEL10-NEXT: v_mov_b32_e32 v35, v51 +; GISEL10-NEXT: v_mov_b32_e32 v36, v52 +; GISEL10-NEXT: v_mov_b32_e32 v37, v53 +; GISEL10-NEXT: v_mov_b32_e32 v38, v54 +; GISEL10-NEXT: v_mov_b32_e32 v39, v55 ; GISEL10-NEXT: .LBB5_2: ; %tail ; GISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GISEL10-NEXT: v_mov_b32_e32 v8, v24 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll index 18c462f..dd2cffd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll @@ -77,17 +77,53 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) { ; GFX10CHECK-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10CHECK-NEXT: s_endpgm ; -; GFX11CHECK-LABEL: sgpr_isnan_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_clause 0x1 -; GFX11CHECK-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0 -; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s2, s2, 3 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 -; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11CHECK-NEXT: s_endpgm +; GFX11SELDAG-TRUE16-LABEL: sgpr_isnan_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_clause 0x1 +; GFX11SELDAG-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11SELDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11SELDAG-TRUE16-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 0 +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, s2, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11SELDAG-TRUE16-NEXT: s_endpgm +; +; GFX11SELDAG-FAKE16-LABEL: sgpr_isnan_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_clause 0x1 +; GFX11SELDAG-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11SELDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11SELDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s2, s2, 3 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 +; GFX11SELDAG-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11SELDAG-FAKE16-NEXT: s_endpgm +; +; GFX11GLISEL-TRUE16-LABEL: sgpr_isnan_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_clause 0x1 +; GFX11GLISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11GLISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11GLISEL-TRUE16-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 0 +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, s2, v0.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11GLISEL-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11GLISEL-TRUE16-NEXT: s_endpgm +; +; GFX11GLISEL-FAKE16-LABEL: sgpr_isnan_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_clause 0x1 +; GFX11GLISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11GLISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11GLISEL-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s2, s2, 3 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 +; GFX11GLISEL-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11GLISEL-FAKE16-NEXT: s_endpgm %result = call i1 @llvm.is.fpclass.f16(half %x, i32 3) %sext = sext i1 %result to i32 store i32 %sext, ptr addrspace(1) %out, align 4 @@ -212,8 +248,9 @@ define i1 @snan_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: snan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 1 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 1 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: snan_f16: @@ -226,8 +263,9 @@ define i1 @snan_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: snan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 1 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 1 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: snan_f16: @@ -285,8 +323,9 @@ define i1 @qnan_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: qnan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 2 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 2 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: qnan_f16: @@ -299,8 +338,9 @@ define i1 @qnan_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: qnan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 2 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 2 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: qnan_f16: @@ -358,8 +398,9 @@ define i1 @posinf_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: posinf_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x200 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x200 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: posinf_f16: @@ -372,8 +413,9 @@ define i1 @posinf_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: posinf_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x200 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x200 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: posinf_f16: @@ -429,8 +471,9 @@ define i1 @neginf_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: neginf_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 4 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 4 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: neginf_f16: @@ -443,8 +486,9 @@ define i1 @neginf_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: neginf_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 4 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 4 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: neginf_f16: @@ -514,8 +558,9 @@ define i1 @posnormal_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: posnormal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x100 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x100 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: posnormal_f16: @@ -528,8 +573,9 @@ define i1 @posnormal_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: posnormal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x100 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x100 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: posnormal_f16: @@ -597,8 +643,9 @@ define i1 @negnormal_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: negnormal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 8 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 8 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: negnormal_f16: @@ -611,8 +658,9 @@ define i1 @negnormal_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: negnormal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 8 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 8 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: negnormal_f16: @@ -673,8 +721,9 @@ define i1 @possubnormal_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: possubnormal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x80 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x80 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: possubnormal_f16: @@ -687,8 +736,9 @@ define i1 @possubnormal_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: possubnormal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x80 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x80 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: possubnormal_f16: @@ -755,8 +805,9 @@ define i1 @negsubnormal_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: negsubnormal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 16 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 16 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: negsubnormal_f16: @@ -769,8 +820,9 @@ define i1 @negsubnormal_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: negsubnormal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 16 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 16 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: negsubnormal_f16: @@ -824,8 +876,9 @@ define i1 @poszero_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: poszero_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 64 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 64 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: poszero_f16: @@ -838,8 +891,9 @@ define i1 @poszero_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: poszero_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 64 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 64 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: poszero_f16: @@ -895,8 +949,9 @@ define i1 @negzero_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: negzero_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 32 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 32 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: negzero_f16: @@ -909,8 +964,9 @@ define i1 @negzero_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: negzero_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 32 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 32 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: negzero_f16: @@ -968,8 +1024,9 @@ define i1 @posfinite_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: posfinite_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1c0 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x1c0 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: posfinite_f16: @@ -982,8 +1039,9 @@ define i1 @posfinite_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: posfinite_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1c0 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x1c0 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: posfinite_f16: @@ -1047,8 +1105,9 @@ define i1 @negfinite_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: negfinite_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 56 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 56 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: negfinite_f16: @@ -1061,8 +1120,9 @@ define i1 @negfinite_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: negfinite_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 56 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 56 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: negfinite_f16: @@ -1120,8 +1180,9 @@ define i1 @isnan_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: isnan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 3 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 3 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: isnan_f16: @@ -1134,8 +1195,9 @@ define i1 @isnan_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: isnan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 3 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 3 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isnan_f16: @@ -1195,8 +1257,9 @@ define i1 @not_isnan_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_isnan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3fc -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x3fc +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_isnan_f16: @@ -1209,8 +1272,9 @@ define i1 @not_isnan_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_isnan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3fc -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x3fc +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_isnan_f16: @@ -1336,11 +1400,13 @@ define <2 x i1> @isnan_v2f16(<2 x half> %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: isnan_v2f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 3 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.h, 3 +; GFX11GLISEL-TRUE16-NEXT: v_dual_mov_b32 v1, 3 :: v_dual_mov_b32 v2, 3 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.h, v3.l ; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v2 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isnan_v2f16: @@ -1499,13 +1565,17 @@ define <3 x i1> @isnan_v3f16(<3 x half> %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: isnan_v3f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 3 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.h, 3 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v1.l, 3 -; GFX11GLISEL-TRUE16-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 3 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v4, 3 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v2.l +; GFX11GLISEL-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.h, v3.l +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v1.l, v5.l +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, v3 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isnan_v3f16: @@ -1693,16 +1763,20 @@ define <4 x i1> @isnan_v4f16(<4 x half> %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: isnan_v4f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 3 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.h, 3 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v1.l, 3 +; GFX11GLISEL-TRUE16-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 3 +; GFX11GLISEL-TRUE16-NEXT: v_dual_mov_b32 v4, 3 :: v_dual_mov_b32 v5, 3 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v2.l +; GFX11GLISEL-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11GLISEL-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.h, v3.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v1.l, v6.l ; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v4 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v1.h, 3 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v1.h, v7.l ; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, v5 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isnan_v4f16: @@ -1771,8 +1845,9 @@ define i1 @isnan_f16_strictfp(half %x) strictfp nounwind { ; GFX11SELDAG-TRUE16-LABEL: isnan_f16_strictfp: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 3 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 3 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: isnan_f16_strictfp: @@ -1785,8 +1860,9 @@ define i1 @isnan_f16_strictfp(half %x) strictfp nounwind { ; GFX11GLISEL-TRUE16-LABEL: isnan_f16_strictfp: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 3 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 3 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isnan_f16_strictfp: @@ -1846,8 +1922,9 @@ define i1 @isinf_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: isinf_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x204 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x204 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: isinf_f16: @@ -1860,8 +1937,9 @@ define i1 @isinf_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: isinf_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x204 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x204 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isinf_f16: @@ -1921,8 +1999,9 @@ define i1 @isfinite_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: isfinite_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1f8 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x1f8 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: isfinite_f16: @@ -1935,8 +2014,9 @@ define i1 @isfinite_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: isfinite_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1f8 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x1f8 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isfinite_f16: @@ -1994,8 +2074,9 @@ define i1 @issubnormal_or_zero_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: issubnormal_or_zero_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0xf0 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0xf0 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: issubnormal_or_zero_f16: @@ -2008,8 +2089,9 @@ define i1 @issubnormal_or_zero_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: issubnormal_or_zero_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0xf0 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0xf0 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: issubnormal_or_zero_f16: @@ -2074,8 +2156,9 @@ define i1 @not_issubnormal_or_zero_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_issubnormal_or_zero_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x30f -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x30f +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_issubnormal_or_zero_f16: @@ -2088,8 +2171,9 @@ define i1 @not_issubnormal_or_zero_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_issubnormal_or_zero_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x30f -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x30f +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_issubnormal_or_zero_f16: @@ -2153,8 +2237,9 @@ define i1 @isnormal_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: isnormal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x108 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x108 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: isnormal_f16: @@ -2167,8 +2252,9 @@ define i1 @isnormal_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: isnormal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x108 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x108 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isnormal_f16: @@ -2236,8 +2322,9 @@ define i1 @not_isnormal_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_isnormal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x2f7 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x2f7 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_isnormal_f16: @@ -2250,8 +2337,9 @@ define i1 @not_isnormal_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_isnormal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x2f7 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x2f7 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_isnormal_f16: @@ -2330,8 +2418,9 @@ define i1 @not_is_plus_normal_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_is_plus_normal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x2ff -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x2ff +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_is_plus_normal_f16: @@ -2344,8 +2433,9 @@ define i1 @not_is_plus_normal_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_is_plus_normal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x2ff -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x2ff +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_is_plus_normal_f16: @@ -2424,8 +2514,9 @@ define i1 @not_is_neg_normal_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_is_neg_normal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3f7 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x3f7 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_is_neg_normal_f16: @@ -2438,8 +2529,9 @@ define i1 @not_is_neg_normal_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_is_neg_normal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3f7 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x3f7 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_is_neg_normal_f16: @@ -2501,8 +2593,9 @@ define i1 @issubnormal_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: issubnormal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x90 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x90 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: issubnormal_f16: @@ -2515,8 +2608,9 @@ define i1 @issubnormal_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: issubnormal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x90 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x90 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: issubnormal_f16: @@ -2586,8 +2680,9 @@ define i1 @not_issubnormal_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_issubnormal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x36f -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x36f +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_issubnormal_f16: @@ -2600,8 +2695,9 @@ define i1 @not_issubnormal_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_issubnormal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x36f -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x36f +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_issubnormal_f16: @@ -2659,8 +2755,9 @@ define i1 @iszero_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: iszero_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x60 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x60 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: iszero_f16: @@ -2673,8 +2770,9 @@ define i1 @iszero_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: iszero_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x60 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x60 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: iszero_f16: @@ -2745,8 +2843,9 @@ define i1 @not_iszero_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_iszero_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39f -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39f +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_iszero_f16: @@ -2759,8 +2858,9 @@ define i1 @not_iszero_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_iszero_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39f -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39f +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_iszero_f16: @@ -2818,8 +2918,9 @@ define i1 @ispositive_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: ispositive_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3c0 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x3c0 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: ispositive_f16: @@ -2832,8 +2933,9 @@ define i1 @ispositive_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: ispositive_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3c0 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x3c0 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: ispositive_f16: @@ -2907,8 +3009,9 @@ define i1 @not_ispositive_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_ispositive_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 63 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 63 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_ispositive_f16: @@ -2921,8 +3024,9 @@ define i1 @not_ispositive_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_ispositive_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 63 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 63 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_ispositive_f16: @@ -2992,8 +3096,9 @@ define i1 @isnegative_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: isnegative_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 60 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 60 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: isnegative_f16: @@ -3006,8 +3111,9 @@ define i1 @isnegative_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: isnegative_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 60 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 60 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isnegative_f16: @@ -3074,8 +3180,9 @@ define i1 @not_isnegative_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_isnegative_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3c3 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x3c3 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_isnegative_f16: @@ -3088,8 +3195,9 @@ define i1 @not_isnegative_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_isnegative_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3c3 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x3c3 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_isnegative_f16: @@ -3152,8 +3260,9 @@ define i1 @iszero_or_nan_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: iszero_or_nan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x63 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x63 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: iszero_or_nan_f16: @@ -3166,8 +3275,9 @@ define i1 @iszero_or_nan_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: iszero_or_nan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x63 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x63 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: iszero_or_nan_f16: @@ -3231,8 +3341,9 @@ define i1 @iszero_or_nan_f_daz(half %x) #0 { ; GFX11SELDAG-TRUE16-LABEL: iszero_or_nan_f_daz: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x63 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x63 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: iszero_or_nan_f_daz: @@ -3245,8 +3356,9 @@ define i1 @iszero_or_nan_f_daz(half %x) #0 { ; GFX11GLISEL-TRUE16-LABEL: iszero_or_nan_f_daz: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x63 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x63 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: iszero_or_nan_f_daz: @@ -3310,8 +3422,9 @@ define i1 @iszero_or_nan_f_maybe_daz(half %x) #1 { ; GFX11SELDAG-TRUE16-LABEL: iszero_or_nan_f_maybe_daz: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x63 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x63 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: iszero_or_nan_f_maybe_daz: @@ -3324,8 +3437,9 @@ define i1 @iszero_or_nan_f_maybe_daz(half %x) #1 { ; GFX11GLISEL-TRUE16-LABEL: iszero_or_nan_f_maybe_daz: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x63 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x63 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: iszero_or_nan_f_maybe_daz: @@ -3398,8 +3512,9 @@ define i1 @not_iszero_or_nan_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_nan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39c -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39c +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_nan_f16: @@ -3412,8 +3527,9 @@ define i1 @not_iszero_or_nan_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_iszero_or_nan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39c -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39c +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_iszero_or_nan_f16: @@ -3486,8 +3602,9 @@ define i1 @not_iszero_or_nan_f_daz(half %x) #0 { ; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_nan_f_daz: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39c -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39c +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_nan_f_daz: @@ -3500,8 +3617,9 @@ define i1 @not_iszero_or_nan_f_daz(half %x) #0 { ; GFX11GLISEL-TRUE16-LABEL: not_iszero_or_nan_f_daz: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39c -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39c +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_iszero_or_nan_f_daz: @@ -3574,8 +3692,9 @@ define i1 @not_iszero_or_nan_f_maybe_daz(half %x) #1 { ; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_nan_f_maybe_daz: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39c -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39c +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_nan_f_maybe_daz: @@ -3588,8 +3707,9 @@ define i1 @not_iszero_or_nan_f_maybe_daz(half %x) #1 { ; GFX11GLISEL-TRUE16-LABEL: not_iszero_or_nan_f_maybe_daz: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39c -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39c +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_iszero_or_nan_f_maybe_daz: @@ -3653,8 +3773,9 @@ define i1 @iszero_or_qnan_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: iszero_or_qnan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x62 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x62 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: iszero_or_qnan_f16: @@ -3667,8 +3788,9 @@ define i1 @iszero_or_qnan_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: iszero_or_qnan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x62 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x62 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: iszero_or_qnan_f16: @@ -3737,8 +3859,9 @@ define i1 @iszero_or_snan_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: iszero_or_snan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x61 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x61 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: iszero_or_snan_f16: @@ -3751,8 +3874,9 @@ define i1 @iszero_or_snan_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: iszero_or_snan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x61 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x61 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: iszero_or_snan_f16: @@ -3841,8 +3965,9 @@ define i1 @not_iszero_or_qnan_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_qnan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39d -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39d +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_qnan_f16: @@ -3855,8 +3980,9 @@ define i1 @not_iszero_or_qnan_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_iszero_or_qnan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39d -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39d +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_iszero_or_qnan_f16: @@ -3942,8 +4068,9 @@ define i1 @not_iszero_or_snan_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_snan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39e -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39e +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_snan_f16: @@ -3956,8 +4083,9 @@ define i1 @not_iszero_or_snan_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_iszero_or_snan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39e -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39e +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_iszero_or_snan_f16: @@ -4018,8 +4146,9 @@ define i1 @isinf_or_nan_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: isinf_or_nan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x207 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x207 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: isinf_or_nan_f16: @@ -4032,8 +4161,9 @@ define i1 @isinf_or_nan_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: isinf_or_nan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x207 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x207 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isinf_or_nan_f16: @@ -4094,8 +4224,9 @@ define i1 @not_isinf_or_nan_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_isinf_or_nan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1f8 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x1f8 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_isinf_or_nan_f16: @@ -4108,8 +4239,9 @@ define i1 @not_isinf_or_nan_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_isinf_or_nan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1f8 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x1f8 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_isinf_or_nan_f16: @@ -4170,8 +4302,9 @@ define i1 @isfinite_or_nan_f(half %x) { ; GFX11SELDAG-TRUE16-LABEL: isfinite_or_nan_f: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1fb -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x1fb +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: isfinite_or_nan_f: @@ -4184,8 +4317,9 @@ define i1 @isfinite_or_nan_f(half %x) { ; GFX11GLISEL-TRUE16-LABEL: isfinite_or_nan_f: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1fb -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x1fb +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isfinite_or_nan_f: @@ -4246,8 +4380,9 @@ define i1 @not_isfinite_or_nan_f(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_isfinite_or_nan_f: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x204 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x204 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_isfinite_or_nan_f: @@ -4260,8 +4395,9 @@ define i1 @not_isfinite_or_nan_f(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_isfinite_or_nan_f: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x204 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x204 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_isfinite_or_nan_f: diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll index 4fa7c29..7100522 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll @@ -481,3 +481,15 @@ define void @dominance_not_in_program_order(ptr addrspace(7) inreg %arg) { %lsr.iv11 = phi ptr addrspace(7) [ %arg, %.loopexit ], [ %arg, %.preheader15 ] br label %.loopexit } + +;; iree-org/iree#22551 - crash on something that reduces to the below non-canonical select. +define ptr addrspace(7) @noncanonical_const_cond(ptr addrspace(7) %x) { +; CHECK-LABEL: define { ptr addrspace(8), i32 } @noncanonical_const_cond +; CHECK-SAME: ({ ptr addrspace(8), i32 } [[RET:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[X_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[RET]], 0 +; CHECK-NEXT: [[X_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[RET]], 1 +; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[RET]] +; + %ret = select i1 false, ptr addrspace(7) %x, ptr addrspace(7) %x + ret ptr addrspace(7) %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/packetizer.ll b/llvm/test/CodeGen/AMDGPU/packetizer.ll index aab035f..b9bf138 100644 --- a/llvm/test/CodeGen/AMDGPU/packetizer.ll +++ b/llvm/test/CodeGen/AMDGPU/packetizer.ll @@ -1,13 +1,49 @@ -; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s -; RUN: llc < %s -mtriple=r600 -mcpu=cayman | FileCheck %s - -; CHECK: {{^}}test: -; CHECK: BIT_ALIGN_INT T{{[0-9]}}.X -; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Y -; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Z -; CHECK: BIT_ALIGN_INT * T{{[0-9]}}.W +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s -check-prefix=R600 +; RUN: llc < %s -mtriple=r600 -mcpu=cayman | FileCheck %s -check-prefix=CM define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) { +; R600-LABEL: test: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: ADD_INT T0.Y, KC0[3].X, 1, +; R600-NEXT: ADD_INT T0.Z, KC0[3].Y, 1, +; R600-NEXT: ADD_INT T0.W, KC0[2].Z, 1, +; R600-NEXT: ADD_INT * T1.W, KC0[2].W, 1, +; R600-NEXT: BIT_ALIGN_INT T0.X, PS, PS, KC0[3].Z, +; R600-NEXT: BIT_ALIGN_INT T1.Y, PV.W, PV.W, KC0[3].Z, +; R600-NEXT: BIT_ALIGN_INT T0.Z, PV.Z, PV.Z, KC0[3].Z, +; R600-NEXT: BIT_ALIGN_INT * T0.W, PV.Y, PV.Y, KC0[3].Z, +; R600-NEXT: OR_INT T0.W, PV.W, PV.Z, +; R600-NEXT: OR_INT * T1.W, PV.Y, PV.X, +; R600-NEXT: OR_INT T0.X, PS, PV.W, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; CM-LABEL: test: +; CM: ; %bb.0: ; %entry +; CM-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: ALU clause starting at 4: +; CM-NEXT: ADD_INT T0.X, KC0[3].X, 1, +; CM-NEXT: ADD_INT T0.Y, KC0[3].Y, 1, +; CM-NEXT: ADD_INT T0.Z, KC0[2].Z, 1, +; CM-NEXT: ADD_INT * T0.W, KC0[2].W, 1, +; CM-NEXT: BIT_ALIGN_INT T1.X, PV.W, PV.W, KC0[3].Z, +; CM-NEXT: BIT_ALIGN_INT T1.Y, PV.Z, PV.Z, KC0[3].Z, +; CM-NEXT: BIT_ALIGN_INT T0.Z, PV.Y, PV.Y, KC0[3].Z, +; CM-NEXT: BIT_ALIGN_INT * T0.W, PV.X, PV.X, KC0[3].Z, +; CM-NEXT: OR_INT T0.Z, PV.W, PV.Z, +; CM-NEXT: OR_INT * T0.W, PV.Y, PV.X, +; CM-NEXT: OR_INT * T0.X, PV.W, PV.Z, +; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %shl = sub i32 32, %e %x = add i32 %x_arg, 1 diff --git a/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir b/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir new file mode 100644 index 0000000..381cb8c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir @@ -0,0 +1,131 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=register-coalescer -verify-coalescing -o - %s | FileCheck %s + +# This test is to check fix for failure with "Bad machine code: Defining instruction does not modify register" due to corrupt lane mask. + +--- +name: reg_coalescer_subreg_liveness +tracksRegLiveness: true +liveins: +body: | + ; CHECK-LABEL: name: reg_coalescer_subreg_liveness + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: undef [[S_LOAD_DWORD_IMM:%[0-9]+]].sub1:sgpr_128 = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1 + ; CHECK-NEXT: undef [[S_MOV_B32_1:%[0-9]+]].sub0:sgpr_256 = S_MOV_B32 0 + ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0 + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub0 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub0 + ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_LOAD_DWORD_IMM]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + ; CHECK-NEXT: $vcc_lo = COPY $exec_lo + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0 + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 1 + ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc_lo, implicit $vcc_lo, implicit $vcc_lo + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x80000000) + liveins: $sgpr4_sgpr5 + + %0:sgpr_64 = COPY killed $sgpr4_sgpr5 + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed %0, 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + %2:sreg_32 = S_MOV_B32 1 + undef %3.sub0:sgpr_128 = COPY %2 + %4:sreg_32 = S_MOV_B32 0 + undef %5.sub0:sgpr_256 = COPY %4 + TENSOR_LOAD_TO_LDS_D2 %3, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + %6:sgpr_128 = COPY killed %3 + %6.sub1:sgpr_128 = COPY killed %1 + %7:sreg_32 = COPY $exec_lo + %8:sreg_32 = COPY %2 + %9:sreg_32 = COPY %4 + + bb.1: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + + %10:sreg_32 = COPY killed %8 + undef %11.sub0:sgpr_128 = COPY %2 + %11.sub1:sgpr_128 = COPY killed %10 + %11.sub2:sgpr_128 = COPY %2 + %11.sub3:sgpr_128 = COPY %2 + TENSOR_LOAD_TO_LDS_D2 killed %11, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + %12:sreg_32 = COPY killed %9 + %13:sgpr_128 = COPY %6 + %13.sub2:sgpr_128 = COPY killed %12 + TENSOR_LOAD_TO_LDS_D2 killed %13, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + $vcc_lo = COPY %7 + %8:sreg_32 = COPY %4 + %9:sreg_32 = COPY %2 + S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc_lo, implicit $vcc_lo, implicit $vcc + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0 +... +--- +name: reg_coalescer_subreg_liveness_2 +tracksRegLiveness: true +liveins: +body: | + ; CHECK-LABEL: name: reg_coalescer_subreg_liveness_2 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: undef [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub1:sgpr_128 = S_LOAD_DWORD_IMM [[COPY]], 24, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1 + ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_256 = S_MOV_B32 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit [[S_LOAD_DWORD_IMM]], implicit [[S_MOV_B32_]] + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x80000000) + liveins: $sgpr4_sgpr5 + + %0:sgpr_64 = COPY killed $sgpr4_sgpr5 + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0, 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed %0, 24, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4) + %3:sreg_32 = S_MOV_B32 1 + undef %4.sub0:sgpr_128 = COPY %3 + %5:sgpr_128 = COPY %4 + %5.sub1:sgpr_128 = COPY killed %2 + %6:sgpr_128 = COPY %5 + %6.sub2:sgpr_128 = COPY killed %1 + %7:sreg_32 = S_MOV_B32 0 + undef %8.sub0:sgpr_256 = COPY %7 + %9:sreg_32 = COPY %3 + + bb.1: + successors: %bb.2(0x80000000) + + %10:sreg_32 = COPY killed %9 + undef %11.sub0:sgpr_128 = COPY %3 + %11.sub1:sgpr_128 = COPY killed %10 + S_NOP 0, implicit %5, implicit %8 + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/true16-fold.mir b/llvm/test/CodeGen/AMDGPU/true16-fold.mir index 9484417..6706de1 100644 --- a/llvm/test/CodeGen/AMDGPU/true16-fold.mir +++ b/llvm/test/CodeGen/AMDGPU/true16-fold.mir @@ -48,7 +48,9 @@ body: | ; CHECK-LABEL: name: sgpr_lo16 ; CHECK: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_ALIGNBIT_B32_t16_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_t16_e64 0, [[DEF]], 0, killed [[DEF1]], 0, 30, 0, 0, implicit $exec + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 30 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_16 = COPY [[S_MOV_B32_]] + ; CHECK-NEXT: [[V_ALIGNBIT_B32_t16_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_t16_e64 0, [[DEF]], 0, killed [[DEF1]], 0, killed [[COPY]], 0, 0, implicit $exec ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_t16_e64_]] %0:sreg_32 = IMPLICIT_DEF %1:sreg_32 = IMPLICIT_DEF @@ -232,3 +234,34 @@ body: | $vgpr0 = COPY %3 S_ENDPGM 0, implicit $vgpr0 ... + +# Make sure the immediate materialized by the v_mov_b16 isn't +# incorrectly folded into the bfi as 0. + +# FIXME: %4:vgpr_32 = COPY %3 is a direct copy from v16 to v32 and +# should probably fail the verifier +--- +name: mov_v16_copy_v32_fold_b32_regression +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: mov_v16_copy_v32_fold_b32_regression + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_MOV_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64 0, 15360, 0, implicit $exec + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B16_t16_e64_]] + ; CHECK-NEXT: [[V_BFI_B32_e64_:%[0-9]+]]:vgpr_32 = V_BFI_B32_e64 32767, [[COPY2]], [[COPY1]], implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_BFI_B32_e64_]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:vgpr_32 = COPY $vgpr1 + %1:vgpr_32 = COPY $vgpr0 + %3:vgpr_16 = V_MOV_B16_t16_e64 0, 15360, 0, implicit $exec + %4:vgpr_32 = COPY %3 + %5:vgpr_32 = V_BFI_B32_e64 32767, %4, %1, implicit $exec + $vgpr0 = COPY %5 + SI_RETURN implicit $vgpr0 +... diff --git a/llvm/test/CodeGen/AMDGPU/true16-imm-folded-to-0-regression.ll b/llvm/test/CodeGen/AMDGPU/true16-imm-folded-to-0-regression.ll new file mode 100644 index 0000000..0bebb58 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/true16-imm-folded-to-0-regression.ll @@ -0,0 +1,29 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s + +; Make sure that the 16-bit constant 0x3c00 isn't folded as 0 into +; v_bfi_b32. +define i32 @mov16_bfi_fold_regression(half %arg, i32 %arg1) { +; CHECK-LABEL: bfi_fold_regression: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b16_e32 v2.l, 0x3c00 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_bfi_b32 v0, 0x7fff, v2, v0 +; CHECK-NEXT: v_cndmask_b16 v0.l, 0x3c00, v0.l, vcc_lo +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) +; CHECK-NEXT: v_pack_b32_f16 v0, v0.l, 0 +; CHECK-NEXT: s_setpc_b64 s[30:31] +bb: + %cmp = icmp eq i32 %arg1, 0 + %call = call half @llvm.copysign.f16(half 0xH3C00, half %arg) + %select = select i1 %cmp, half 0xH3C00, half %call + %insertelement = insertelement <2 x half> zeroinitializer, half %select, i64 0 + %bitcast = bitcast <2 x half> %insertelement to i32 + ret i32 %bitcast +} + +declare half @llvm.copysign.f16(half, half) #0 + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll index 4d5ade4..1b4ed67 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll @@ -2481,10 +2481,11 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v6, v1 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v2i64: @@ -2502,10 +2503,11 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) { ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v2i64: @@ -2524,8 +2526,8 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) { ; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v4, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v4, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v5, v2, v[6:7] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v5, v2, v[6:7] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v8 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v2i64: @@ -2626,9 +2628,9 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v8 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[1:2] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v4, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v2, v[8:9] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v4, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v2, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v5, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v4, v[8:9] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2654,9 +2656,9 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v8 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[1:2] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v4, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v2, v[8:9] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v4, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v2, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v5, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v4, v[8:9] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2677,12 +2679,12 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v0, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v2, v[8:9] -; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v1, v2, v[8:9] +; GFX9-GISEL-NEXT: v_add_u32_e32 v8, v7, v10 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v4, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v7, v4, v[2:3] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v8, v4, v[2:3] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2816,10 +2818,10 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v2, v7, v[0:1] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v11, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v6, v[15:16] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v4, v[13:14] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v2, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v11, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[15:16] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v17, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v11, v[3:4] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -2853,10 +2855,10 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v2, v7, v[0:1] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v11, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v6, v[15:16] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v4, v[13:14] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v2, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v11, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[15:16] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v17, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v11, v[3:4] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -2881,16 +2883,16 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v0, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v4, v[10:11] -; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v9, v0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v1, v4, v[10:11] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v7, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, v6, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v3, v6, v[0:1] -; GFX9-GISEL-NEXT: v_add_u32_e32 v2, v5, v0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v8, v2, 0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v3, v6, v[0:1] +; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v10 +; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v9, v12 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v8, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v8, v4, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v9, v4, v[2:3] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v9, v4, v[2:3] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -3068,31 +3070,29 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v8, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v13, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v12, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v19, v16, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v13 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v13, v20 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[13:14] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v4, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v8, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v4, v13, v[17:18] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v5, v12, v[18:19] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v16, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v20, v22, v[5:6] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v21 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v2, v10, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[5:6] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v20 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v15, v[0:1] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v11, v[6:7] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v14, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v19, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v10, v[20:21] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v0, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v19, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v16, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v2, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v8, v[17:18] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v15, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v10, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v14, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v19, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v3, v10, v[5:6] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v8 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v0, v1, v[2:3] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v7, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v19, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v21, v16, v[12:13] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v2, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v7, v[8:9] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3139,31 +3139,29 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v8, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v13, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v12, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v19, v16, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v13 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v13, v20 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[13:14] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v4, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v8, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v4, v13, v[17:18] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v5, v12, v[18:19] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v16, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v20, v22, v[5:6] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v21 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v2, v10, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[5:6] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v20 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v15, v[0:1] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v11, v[6:7] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v14, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v19, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v10, v[20:21] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v0, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v19, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v16, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v2, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v8, v[17:18] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v15, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v10, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v14, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v19, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v3, v10, v[5:6] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v0, v1, v[2:3] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v7, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v19, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v21, v16, v[12:13] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v2, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v7, v[8:9] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3204,32 +3202,32 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v0, v9, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v0, v8, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v8, v[18:19] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v1, v8, v[18:19] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v2, v11, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v17, v17, v0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, v10, v[8:9] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v3, v10, v[8:9] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v13, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v12, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v5, v12, v[8:9] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v3, v4 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v5, v12, v[8:9] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v6, v15, 0 +; GFX9-GISEL-NEXT: v_add_u32_e32 v18, v1, v18 +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v3, v10 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v14, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v7, v14, v[8:9] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v7, v14, v[8:9] +; GFX9-GISEL-NEXT: v_add_u32_e32 v17, v17, v20 +; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v10 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v16, v1, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v6 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v17, v2, v[8:9] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v16, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v17, v2, v[8:9] -; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v2 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, v5, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v10, v4, v[0:1] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v18, v4, v[8:9] ; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v3, v0 +; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v7, v10 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v7, v2, v[4:5] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v10, v2, v[4:5] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3550,63 +3548,63 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v8, v24, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v25, v[32:33] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v24, v[32:33] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v16, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v24, v31, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v9, v33 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v24, v8, v[9:10] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v24, v25 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v17, v[24:25] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v16, v[24:25] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v18, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v31, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v8, v25, v[32:33] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v9, v24, v[33:34] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v16, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v8, v31, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v35, v[25:26] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v8, v9 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[4:5], v0, v17, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[34:35] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v27, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v26, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v0, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v9, v[1:2] -; GFX7-GISEL-NEXT: buffer_load_dword v9, off, s[0:3], s32 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v17 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v19, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[32:33] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v27, v[1:2] +; GFX7-GISEL-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v2, v18, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v11, v26, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v0, 0 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v9 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v31, v33, v[1:2] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v32 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v2, v19, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[25:26] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v0, v[24:25] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v20, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v1, v0, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v20, 0 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v29, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v2, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v18 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v21, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v12, v29, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v2, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[18:19] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v10 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v30, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v20, v[3:4] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v0, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, v13 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v6, v22, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v2, v[0:1] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v18 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v21, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v30, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v0, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v20, v[12:13] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v22, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v2, v[18:19] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v3, 0 ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v12, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v15, v30, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v23, v[3:4] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v17, v1, v[2:3] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v22, v[3:4] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v32, v11, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v12, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v32, v0, v[5:6] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v11, v[5:6] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v9, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v27, v[1:2] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v5 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v30, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v23, v[2:3] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v22, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v24, v11, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v0, v3, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v12, 0 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v10 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v20, v[0:1] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v8, v1, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v3, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v17, v12, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v11, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v3, v[4:5] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -3695,63 +3693,63 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v8, v24, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v25, v[32:33] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v24, v[32:33] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v16, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v24, v31, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v9, v33 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v24, v8, v[9:10] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v24, v25 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v17, v[24:25] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v16, v[24:25] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v18, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v31, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v8, v25, v[32:33] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v9, v24, v[33:34] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v16, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v8, v31, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v35, v[25:26] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v8, v9 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[4:5], v0, v17, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[34:35] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v27, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v26, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v0, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v9, v[1:2] -; GFX8-GISEL-NEXT: buffer_load_dword v9, off, s[0:3], s32 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v17 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v19, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[32:33] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v27, v[1:2] +; GFX8-GISEL-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v2, v18, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v11, v26, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v0, 0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v9 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v31, v33, v[1:2] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v32 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v2, v19, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[25:26] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v0, v[24:25] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v20, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v1, v0, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v20, 0 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v29, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v2, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v18 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v21, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v12, v29, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v2, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[18:19] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v10 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v30, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v20, v[3:4] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v0, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, v13 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v6, v22, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v2, v[0:1] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v18 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v21, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v30, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v0, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v20, v[12:13] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v22, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v2, v[18:19] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v3, 0 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v12, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v15, v30, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v23, v[3:4] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v17, v1, v[2:3] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v22, v[3:4] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v32, v11, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v12, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v32, v0, v[5:6] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v11, v[5:6] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v9, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v27, v[1:2] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v30, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v23, v[2:3] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v22, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v24, v11, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v0, v3, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v12, 0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v10 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v20, v[0:1] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v8, v1, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v3, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v17, v12, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v11, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v3, v[4:5] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -3827,65 +3825,65 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX9-GISEL-NEXT: scratch_load_dword v31, off, s32 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[0:1], v0, v17, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[0:1], v0, v16, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v16, v[34:35] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[36:37], s[0:1], v1, v16, v[34:35] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v2, v19, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v33, v33, v0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v18, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, v18, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[0:1], v3, v18, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v4, v21, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v18, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v18, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v20, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v5, v20, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v5, v20, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v6, v23, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v19, v3, v4 +; GFX9-GISEL-NEXT: v_add_u32_e32 v20, v3, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v22, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v7, v22, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v7, v22, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v8, v25, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v20, v5, v6 +; GFX9-GISEL-NEXT: v_add_u32_e32 v21, v5, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v8, v24, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v9, v24, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v9, v24, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v10, v27, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v7, v8 +; GFX9-GISEL-NEXT: v_add_u32_e32 v34, v1, v34 +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v7, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v10, v26, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v11, v26, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v11, v26, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v12, v29, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v9, v10 +; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v9, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v12, v28, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v13, v28, v[16:17] -; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v11, v12 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v13, v28, v[16:17] +; GFX9-GISEL-NEXT: v_add_u32_e32 v33, v33, v36 +; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v11, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v14, v30, 0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v14, v31, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v15, v30, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v15, v30, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v32, v1, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v13, v14 +; GFX9-GISEL-NEXT: v_add_u32_e32 v11, v13, v18 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v33, v6, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v0, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v32, v6, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v33, v6, v[16:17] -; GFX9-GISEL-NEXT: v_add_u32_e32 v11, v15, v6 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v0, v8, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, v3, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v18, v8, v[0:1] -; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v34, v8, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v2, v5, 0 +; GFX9-GISEL-NEXT: v_add_u32_e32 v13, v15, v18 +; GFX9-GISEL-NEXT: v_add_u32_e32 v15, v7, v0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v5, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v19, v10, v[2:3] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v20, v10, v[8:9] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v11, 0 ; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v12, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v4, v9, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v20, v12, v[4:5] -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v14, v1, 0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v21, v12, v[8:9] ; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v14, v1, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v14, v0, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v11, v0, v[8:9] -; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v13, v0, v[8:9] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v6, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v7, v2, v[0:1] -; GFX9-GISEL-NEXT: v_add_u32_e32 v2, v9, v0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v2, 0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v15, v2, v[0:1] +; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v5, v10 +; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v9, v6 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v4, v8, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v5, v8, v[2:3] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v10, v8, v[2:3] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v4 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v16i64: diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir index a1381ec..f964480 100644 --- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir +++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir @@ -1069,6 +1069,51 @@ body: | $sgpr0 = S_MOV_B32 $sgpr0 ... +# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0. +--- +name: mixed_pending_events +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; GCN-LABEL: name: mixed_pending_events + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: liveins: $sgpr2, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAIT_LOADCNT 1 + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GCN-NEXT: S_WAIT_KMCNT 0 + ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + bb.0: + liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + S_CBRANCH_SCC1 %bb.2, implicit $scc + bb.1: + liveins: $vgpr0_vgpr1, $sgpr2 + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + bb.2: + liveins: $sgpr2, $vgpr2 + $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec + $sgpr2 = S_MOV_B32 $sgpr2 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec +... + --- name: pending_vmem_event_between_block tracksRegLiveness: true |
