; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s
; RUN: not --crash llc -mtriple=r600 -mcpu=redwood < %s 2>&1 | FileCheck -check-prefix=R600-ERR %s

; R600-ERR: LLVM ERROR: unsupported library call operation

; fneg spelled as "fsub -0.0, x" on a scalar kernel argument; the checks expect
; a sign-bit xor with 0x80000000.
define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) {
; SI-LABEL: s_fneg_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_xor_b32 s4, s6, 0x80000000
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_xor_b32 s2, s2, 0x80000000
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fneg_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_xor_b32 s2, s2, 0x80000000
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %fneg = fsub float -0.000000e+00, %in
  store float %fneg, ptr addrspace(1) %out
  ret void
}

; Same fsub-from-negative-zero pattern on a <2 x float> kernel argument.
define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x float> %in) {
; SI-LABEL: s_fneg_v2f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_xor_b32 s0, s3, 0x80000000
; SI-NEXT:    s_xor_b32 s1, s2, 0x80000000
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_v2f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_xor_b32 s3, s3, 0x80000000
; VI-NEXT:    s_xor_b32 s2, s2, 0x80000000
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fneg_v2f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_xor_b32 s2, s2, 0x80000000
; GFX11-NEXT:    s_xor_b32 s3, s3, 0x80000000
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %fneg = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %in
  store <2 x float> %fneg, ptr addrspace(1) %out
  ret void
}

; Same fsub-from-negative-zero pattern on a <4 x float> kernel argument.
define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x float> %in) {
; SI-LABEL: s_fneg_v4f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_xor_b32 s3, s3, 0x80000000
; SI-NEXT:    s_xor_b32 s2, s2, 0x80000000
; SI-NEXT:    s_xor_b32 s1, s1, 0x80000000
; SI-NEXT:    s_xor_b32 s0, s0, 0x80000000
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    v_mov_b32_e32 v3, s3
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_v4f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_xor_b32 s3, s3, 0x80000000
; VI-NEXT:    s_xor_b32 s2, s2, 0x80000000
; VI-NEXT:    s_xor_b32 s1, s1, 0x80000000
; VI-NEXT:    s_xor_b32 s0, s0, 0x80000000
; VI-NEXT:    v_mov_b32_e32 v4, s4
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    v_mov_b32_e32 v5, s5
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fneg_v4f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_xor_b32 s3, s3, 0x80000000
; GFX11-NEXT:    s_xor_b32 s2, s2, 0x80000000
; GFX11-NEXT:    s_xor_b32 s0, s0, 0x80000000
; GFX11-NEXT:    s_xor_b32 s1, s1, 0x80000000
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
; GFX11-NEXT:    s_endpgm
  %fneg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %in
  store <4 x float> %fneg, ptr addrspace(1) %out
  ret void
}

; Subtraction from positive zero ("fsub 0.0, x") is not treated as fneg; the
; checks expect a real v_sub_f32.
define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: fsub0_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_sub_f32_e64 v0, 0, s6
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fsub0_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_sub_f32_e64 v2, 0, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: fsub0_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_sub_f32_e64 v1, 0, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %bc = bitcast i32 %in to float
  %fsub = fsub float 0.0, %bc
  store float %fsub, ptr addrspace(1) %out
  ret void
}

; "fsub -0.0, x" where x is bitcast from an i32 argument; the checks expect the
; same sign-bit xor as for a float argument.
define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: fneg_free_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_xor_b32 s4, s6, 0x80000000
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fneg_free_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_xor_b32 s2, s2, 0x80000000
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: fneg_free_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_xor_b32 s2, s2, 0x80000000
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %bc = bitcast i32 %in to float
  %fsub = fsub float -0.0, %bc
  store float %fsub, ptr addrspace(1) %out
  ret void
}

; The negated value feeds an fmul; the checks expect the negation to appear as
; a source modifier on the multiply (-s, s) with no separate xor.
define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) {
; SI-LABEL: fneg_fold_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mul_f32_e64 v0, -s6, s6
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fneg_fold_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mul_f32_e64 v2, -s2, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: fneg_fold_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mul_f32_e64 v1, -s2, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %fsub = fsub float -0.0, %in
  %fmul = fmul float %fsub, %in
  store float %fmul, ptr addrspace(1) %out
  ret void
}

; Make sure we turn some integer operations back into fneg: the sign-bit xor
; below is expected to fold into the multiply as a -4.0 constant.
define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in) {
; SI-LABEL: bitpreserve_fneg_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mul_f32_e64 v0, s6, -4.0
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: bitpreserve_fneg_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mul_f32_e64 v2, s2, -4.0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: bitpreserve_fneg_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mul_f32_e64 v1, s2, -4.0
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %in.bc = bitcast float %in to i32
  %int.abs = xor i32 %in.bc, 2147483648
  %bc = bitcast i32 %int.abs to float
  %fadd = fmul float %bc, 4.0
  store float %fadd, ptr addrspace(1) %out
  ret void
}

; Integer sign-bit xor whose result is only used as an integer (stored as i32).
define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_fneg_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_xor_b32 s4, s6, 0x80000000
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_xor_b32 s2, s2, 0x80000000
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fneg_i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_xor_b32 s2, s2, 0x80000000
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %fneg = xor i32 %in, -2147483648
  store i32 %fneg, ptr addrspace(1) %out
  ret void
}

; Non-kernel variant of the i32 sign-bit xor; the value is returned as i32.
define i32 @v_fneg_i32(i32 %in) {
; GCN-LABEL: v_fneg_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %fneg = xor i32 %in, -2147483648
  ret i32 %fneg
}

; The i32 sign-bit xor is bitcast to float and used by an fadd; the checks
; expect a single subtract (2.0 - x) with no xor.
define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_fneg_i32_fp_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_sub_f32_e64 v0, 2.0, s6
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_i32_fp_use:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_sub_f32_e64 v2, 2.0, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fneg_i32_fp_use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_sub_f32_e64 v1, 2.0, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %fneg = xor i32 %in, -2147483648
  %bitcast = bitcast i32 %fneg to float
  %fadd = fadd float %bitcast, 2.0
  store float %fadd, ptr addrspace(1) %out
  ret void
}

; Non-kernel variant of the i32 xor with a floating-point use.
define float @v_fneg_i32_fp_use(i32 %in) {
; GCN-LABEL: v_fneg_i32_fp_use:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_sub_f32_e32 v0, 2.0, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %fneg = xor i32 %in, -2147483648
  %bitcast = bitcast i32 %fneg to float
  %fadd = fadd float %bitcast, 2.0
  ret float %fadd
}

; i64 sign-bit xor with an integer use; the checks expect only the high dword
; to be xor'ed.
define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) {
; SI-LABEL: s_fneg_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_xor_b32 s0, s3, 0x80000000
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    s_xor_b32 s0, s3, 0x80000000
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s0
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fneg_i64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_xor_b32 s3, s3, 0x80000000
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %fneg = xor i64 %in, -9223372036854775808
  store i64 %fneg, ptr addrspace(1) %out
  ret void
}

; Non-kernel variant of the i64 sign-bit xor; the value is returned as i64.
define i64 @v_fneg_i64(i64 %in) {
; GCN-LABEL: v_fneg_i64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %fneg = xor i64 %in, -9223372036854775808
  ret i64 %fneg
}

; The i64 sign-bit xor is bitcast to double and used by an fadd; the checks
; expect the negation as a source modifier on v_add_f64.
define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) {
; SI-LABEL: s_fneg_i64_fp_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_add_f64 v[0:1], -s[2:3], 2.0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_i64_fp_use:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_add_f64 v[0:1], -s[2:3], 2.0
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fneg_i64_fp_use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_add_f64 v[0:1], -s[2:3], 2.0
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %fneg = xor i64 %in, -9223372036854775808
  %bitcast = bitcast i64 %fneg to double
  %fadd = fadd double %bitcast, 2.0
  store double %fadd, ptr addrspace(1) %out
  ret void
}

; Non-kernel variant of the i64 xor with a floating-point use.
define double @v_fneg_i64_fp_use(i64 %in) {
; GCN-LABEL: v_fneg_i64_fp_use:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_add_f64 v[0:1], -v[0:1], 2.0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %fneg = xor i64 %in, -9223372036854775808
  %bitcast = bitcast i64 %fneg to double
  %fadd = fadd double %bitcast, 2.0
  ret double %fadd
}

; i16 sign-bit xor returned as an integer; the true16 and fake16 runs have
; separate expectations.
define i16 @v_fneg_i16(i16 %in) {
; SI-LABEL: v_fneg_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_fneg_i16:
; GFX11-TRUE16:       ; %bb.0:
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT:    v_xor_b16 v0.l, 0x8000, v0.l
; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_fneg_i16:
; GFX11-FAKE16:       ; %bb.0:
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
  %fneg = xor i16 %in, -32768
  ret i16 %fneg
}

; The i16 sign-bit xor is bitcast to half and used by an fadd; the checks
; expect a subtract from 2.0 (via f32 conversion on tahiti).
define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) {
; SI-LABEL: s_fneg_i16_fp_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s0, s[4:5], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    v_sub_f32_e32 v0, 2.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_i16_fp_use:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_sub_f16_e64 v2, 2.0, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-TRUE16-LABEL: s_fneg_i16_fp_use:
; GFX11-TRUE16:       ; %bb.0:
; GFX11-TRUE16-NEXT:    s_clause 0x1
; GFX11-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT:    v_sub_f16_e64 v0.l, 2.0, s2
; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT:    s_endpgm
;
; GFX11-FAKE16-LABEL: s_fneg_i16_fp_use:
; GFX11-FAKE16:       ; %bb.0:
; GFX11-FAKE16-NEXT:    s_clause 0x1
; GFX11-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT:    v_sub_f16_e64 v1, 2.0, s2
; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT:    s_endpgm
  %fneg = xor i16 %in, -32768
  %bitcast = bitcast i16 %fneg to half
  %fadd = fadd half %bitcast, 2.0
  store half %fadd, ptr addrspace(1) %out
  ret void
}

; Non-kernel variant of the i16 xor with a floating-point use.
define half @v_fneg_i16_fp_use(i16 %in) {
; SI-LABEL: v_fneg_i16_fp_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_sub_f32_e32 v0, 2.0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_i16_fp_use:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_sub_f16_e32 v0, 2.0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_fneg_i16_fp_use:
; GFX11-TRUE16:       ; %bb.0:
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT:    v_sub_f16_e32 v0.l, 2.0, v0.l
; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_fneg_i16_fp_use:
; GFX11-FAKE16:       ; %bb.0:
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT:    v_sub_f16_e32 v0, 2.0, v0
; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
  %fneg = xor i16 %in, -32768
  %bitcast = bitcast i16 %fneg to half
  %fadd = fadd half %bitcast, 2.0
  ret half %fadd
}

; <2 x i16> sign-bit xor with an integer use; the checks expect one 32-bit xor
; with 0x80008000.
define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) {
; SI-LABEL: s_fneg_v2i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_xor_b32 s4, s6, 0x80008000
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_xor_b32 s2, s2, 0x80008000
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fneg_v2i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_xor_b32 s2, s2, 0x80008000
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %in = bitcast i32 %arg to <2 x i16>
  %fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768>
  store <2 x i16> %fneg, ptr addrspace(1) %out
  ret void
}

; Non-kernel variant of the <2 x i16> sign-bit xor; the vector is returned.
define <2 x i16> @v_fneg_v2i16(<2 x i16> %in) {
; SI-LABEL: v_fneg_v2i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
; SI-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT:    v_or_b32_e32 v0, v0, v2
; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fneg_v2i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768>
  ret <2 x i16> %fneg
}

; The <2 x i16> sign-bit xor is bitcast to <2 x half> and used by an fadd; on
; gfx1100 the checks expect neg_lo/neg_hi modifiers on v_pk_add_f16.
define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg) {
; SI-LABEL: s_fneg_v2i16_fp_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s0, s[4:5], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s1, s0, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    v_sub_f32_e32 v0, 2.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_sub_f32_e32 v1, 2.0, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_v2i16_fp_use:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0x4000
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s3, s2, 16
; VI-NEXT:    v_mov_b32_e32 v2, s3
; VI-NEXT:    v_sub_f16_e64 v1, 2.0, s2
; VI-NEXT:    v_sub_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v2, v1, v0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fneg_v2i16_fp_use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_pk_add_f16 v1, s2, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %in = bitcast i32 %arg to <2 x i16>
  %fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768>
  %bitcast = bitcast <2 x i16> %fneg to <2 x half>
  %fadd = fadd <2 x half> %bitcast, <half 2.0, half 2.0>
  store <2 x half> %fadd, ptr addrspace(1) %out
  ret void
}

; Non-kernel variant of the <2 x i16> xor with a floating-point use.
define <2 x half> @v_fneg_v2i16_fp_use(i32 %arg) {
; SI-LABEL: v_fneg_v2i16_fp_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_sub_f32_e32 v0, 2.0, v0
; SI-NEXT:    v_sub_f32_e32 v1, 2.0, v1
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_v2i16_fp_use:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 0x4000
; VI-NEXT:    v_sub_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_sub_f16_e32 v0, 2.0, v0
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fneg_v2i16_fp_use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %in = bitcast i32 %arg to <2 x i16>
  %fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768>
  %bitcast = bitcast <2 x i16> %fneg to <2 x half>
  %fadd = fadd <2 x half> %bitcast, <half 2.0, half 2.0>
  ret <2 x half> %fadd
}