diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/fpext.f16.ll')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/fpext.f16.ll | 112 |
1 files changed, 56 insertions, 56 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll index d41e2c6..fc7d87e 100644 --- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,GFX9 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-TRUE16 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-FAKE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-FAKE16 %s define amdgpu_kernel void @fpext_f16_to_f32( ; SI-LABEL: fpext_f16_to_f32: @@ -53,7 +53,7 @@ define amdgpu_kernel void @fpext_f16_to_f32( ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 -; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: buffer_load_d16_b16 v0, off, s[8:11], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l @@ -136,7 +136,7 @@ define amdgpu_kernel void @fpext_f16_to_f64( ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 -; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: buffer_load_d16_b16 v0, off, s[8:11], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l @@ -184,12 +184,12 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f32( ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s2 ; SI-NEXT: s_mov_b32 s9, s3 -; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -465,7 +465,7 @@ define amdgpu_kernel void @fneg_fpext_f16_to_f32( ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 -; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: buffer_load_d16_b16 v0, off, s[8:11], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -v0.l @@ -547,7 +547,7 @@ define amdgpu_kernel void @fabs_fpext_f16_to_f32( ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 -; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: buffer_load_d16_b16 v0, off, s[8:11], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, |v0.l| @@ -629,7 +629,7 @@ define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32( ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 -; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: buffer_load_d16_b16 v0, off, s[8:11], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -|v0.l| @@ -681,7 +681,7 @@ define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32( ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -722,16 +722,16 @@ define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32( ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 -; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: buffer_load_d16_b16 v0, off, s[8:11], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l -; GFX11-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX11-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v0 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v2, -v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -v0.l -; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-TRUE16-NEXT: buffer_store_b32 v2, off, s[4:7], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_endpgm ; @@ -825,14 +825,14 @@ define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32( ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 -; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: buffer_load_d16_b16 v0, off, s[8:11], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mul_f16_e64 v1.l, -v0.l, v0.l -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -v0.l -; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v1, -v0.l +; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, -v0.l, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_endpgm ; @@ -925,16 +925,16 @@ define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32( ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 -; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: buffer_load_d16_b16 v0, off, s[8:11], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v2, |v0.l| ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, |v0.l| -; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-TRUE16-NEXT: buffer_store_b32 v2, off, s[4:7], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_endpgm ; @@ -985,13 +985,13 @@ define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32( ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v1, |v0| ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e64 v1, |v0|, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_short v1, off, s[4:7], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; @@ -1028,14 +1028,14 @@ define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32( ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 -; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: buffer_load_d16_b16 v0, off, s[8:11], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mul_f16_e64 v1.l, |v0.l|, v0.l -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, |v0.l| -; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v1, |v0.l| +; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, |v0.l|, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_endpgm ; @@ -1087,7 +1087,7 @@ define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32( ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; SI-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1128,16 +1128,16 @@ define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32( ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 -; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: buffer_load_d16_b16 v0, off, s[8:11], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x8000, v0 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v2, -|v0.l| ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -|v0.l| -; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-TRUE16-NEXT: buffer_store_b32 v2, off, s[4:7], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_endpgm ; @@ -1189,10 +1189,10 @@ define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32( ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e64 v1, -|v0|, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; SI-NEXT: v_mul_f32_e32 v1, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v0, 0x80000000, v0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_short v1, off, s[4:7], 0 @@ -1232,14 +1232,14 @@ define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32( ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 -; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: buffer_load_d16_b16 v0, off, s[8:11], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mul_f16_e64 v1.l, -|v0.l|, v0.l -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -|v0.l| -; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v1, -|v0.l| +; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, -|v0.l|, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_endpgm ; |
