author     Brox Chen <guochen2@amd.com>    2025-06-10 15:36:44 -0400
committer  GitHub <noreply@github.com>     2025-06-10 15:36:44 -0400
commit     e48731bc03419f133a85b50571a368a889c6dab2
tree       32e22f32f7868840fc73307f9468b45318a3a28e
parent     8345d62478054d4ab97c6f28cfea6d1ecca837da
[AMDGPU][True16][CodeGen] v_s_xxx_f16 t16 mode handling in movetoVALU process (#141152)
Add op_sel for v_s_xxx_f16 instructions when moving them to the VALU.
Update the related gfx12 codegen tests for true16 mode.
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp                                   |  23
-rw-r--r--  llvm/test/CodeGen/AMDGPU/frem.ll                                         | 893
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll                          | 101
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll                          | 101
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll                                |  88
-rw-r--r--  llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-fake16.ll | 113
-rw-r--r--  llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll | 123
-rw-r--r--  llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll            | 112
8 files changed, 1425 insertions, 129 deletions
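
The change below adds cases to SIInstrInfo::moveToVALUImpl for the scalar f16 transcendental pseudos (V_S_EXP_F16_e64, V_S_LOG_F16_e64, V_S_RCP_F16_e64, V_S_RSQ_F16_e64, V_S_SQRT_F16_e64): when one of them has to be rewritten to its VALU form, the destination comes from the 16-bit VGPR class and an extra op_sel immediate is appended in true16 mode, versus the 32-bit class and no op_sel otherwise. As a rough sketch of an input that reaches these opcodes, a kernel using one of the f16 transcendental intrinsics looks like the following (adapted from the llvm.amdgcn.rcp.f16.ll test at the end of this page; the load line is paraphrased, since that part of the hunk is truncated here):

declare half @llvm.amdgcn.rcp.f16(half)

define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
entry:
  ; load the f16 operand, take its reciprocal, and store the result
  %a.val = load half, ptr addrspace(1) %a
  %r.val = call half @llvm.amdgcn.rcp.f16(half %a.val)
  store half %r.val, ptr addrspace(1) %r
  ret void
}

Compiled for gfx1200 with -mattr=+real-true16, the reciprocal lands in a 16-bit register half (v_rcp_f16_e32 v0.l, v0.l in the GFX12-TRUE16 checks), while -mattr=-real-true16 keeps the 32-bit form (v_rcp_f16_e32 v0, v0).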
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 805f8e9..2ebf8b9 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7734,6 +7734,29 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
     Inst.eraseFromParent();
     return;
   }
+  case AMDGPU::V_S_EXP_F16_e64:
+  case AMDGPU::V_S_LOG_F16_e64:
+  case AMDGPU::V_S_RCP_F16_e64:
+  case AMDGPU::V_S_RSQ_F16_e64:
+  case AMDGPU::V_S_SQRT_F16_e64: {
+    const DebugLoc &DL = Inst.getDebugLoc();
+    Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
+                                                    ? &AMDGPU::VGPR_16RegClass
+                                                    : &AMDGPU::VGPR_32RegClass);
+    auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
+                        .addImm(0) // src0_modifiers
+                        .add(Inst.getOperand(2))
+                        .addImm(0)  // clamp
+                        .addImm(0); // omod
+    if (ST.useRealTrue16Insts())
+      NewInstr.addImm(0); // opsel0
+    MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
+    legalizeOperandsVALUt16(*NewInstr, MRI);
+    legalizeOperands(*NewInstr, MDT);
+    addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
+    Inst.eraseFromParent();
+    return;
+  }
   }
 
   if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 7a13511..d3432da 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -8,6 +8,8 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1150,GFX1150-TRUE16 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1150,GFX1150-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1200,GFX1200-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1200,GFX1200-FAKE16 %s
 
 define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
 ; SI-LABEL: frem_f16:
@@ -331,6 +333,82 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX1150-FAKE16-NEXT:    v_fmac_f16_e32 v1, v3, v2
 ; GFX1150-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX1150-FAKE16-NEXT:    s_endpgm
+;
+; GFX1200-TRUE16-LABEL: frem_f16:
+; GFX1200-TRUE16:       ; %bb.0:
+; GFX1200-TRUE16-NEXT:    s_clause 0x1
+; GFX1200-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1200-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-TRUE16-NEXT:    s_clause 0x1
+; GFX1200-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
+; GFX1200-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1200-TRUE16-NEXT:    s_wait_loadcnt 0x1
+; GFX1200-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v0.l
+; GFX1200-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v1.l
+; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v1.l
+; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v0.l
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) |
instid1(TRANS32_DEP_1) +; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 +; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l +; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX1200-TRUE16-NEXT: s_endpgm +; +; GFX1200-FAKE16-LABEL: frem_f16: +; GFX1200-FAKE16: ; %bb.0: +; GFX1200-FAKE16-NEXT: s_clause 0x1 +; GFX1200-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1200-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX1200-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-FAKE16-NEXT: s_clause 0x1 +; GFX1200-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX1200-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 +; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x1 +; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v4, v4 +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v3, v5, v4 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX1200-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v3, v3, v2, v1 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v3, v3 +; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2 +; GFX1200-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1200-FAKE16-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 %r0 = load half, ptr addrspace(1) %in1, align 4 @@ 
-537,6 +615,48 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2 ; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX1150-FAKE16-NEXT: s_endpgm +; +; GFX1200-TRUE16-LABEL: fast_frem_f16: +; GFX1200-TRUE16: ; %bb.0: +; GFX1200-TRUE16-NEXT: s_clause 0x1 +; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-TRUE16-NEXT: s_clause 0x1 +; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] +; GFX1200-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8 +; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1200-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l +; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h +; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX1200-TRUE16-NEXT: s_endpgm +; +; GFX1200-FAKE16-LABEL: fast_frem_f16: +; GFX1200-FAKE16: ; %bb.0: +; GFX1200-FAKE16-NEXT: s_clause 0x1 +; GFX1200-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1200-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX1200-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-FAKE16-NEXT: s_clause 0x1 +; GFX1200-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX1200-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 +; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1200-FAKE16-NEXT: v_rcp_f16_e32 v3, v2 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_mul_f16_e32 v3, v1, v3 +; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v3, v3 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2 +; GFX1200-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1200-FAKE16-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 %r0 = load half, ptr addrspace(1) %in1, align 4 @@ -743,6 +863,48 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2 ; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX1150-FAKE16-NEXT: s_endpgm +; +; GFX1200-TRUE16-LABEL: unsafe_frem_f16: +; GFX1200-TRUE16: ; %bb.0: +; GFX1200-TRUE16-NEXT: s_clause 0x1 +; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-TRUE16-NEXT: s_clause 0x1 +; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] +; GFX1200-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8 +; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1200-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l +; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h +; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX1200-TRUE16-NEXT: s_endpgm +; +; GFX1200-FAKE16-LABEL: unsafe_frem_f16: +; GFX1200-FAKE16: ; %bb.0: +; GFX1200-FAKE16-NEXT: s_clause 0x1 +; GFX1200-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1200-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX1200-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-FAKE16-NEXT: s_clause 0x1 +; GFX1200-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX1200-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 +; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1200-FAKE16-NEXT: v_rcp_f16_e32 v3, v2 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_mul_f16_e32 v3, v1, v3 +; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v3, v3 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2 +; GFX1200-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1200-FAKE16-NEXT: s_endpgm ptr addrspace(1) %in2) #1 { %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 %r0 = load half, ptr addrspace(1) %in1, align 4 @@ -985,6 +1147,42 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX1150-NEXT: v_fmac_f32_e32 v1, v3, v2 ; GFX1150-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1150-NEXT: s_endpgm +; +; GFX1200-LABEL: frem_f32: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_clause 0x1 +; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1200-NEXT: v_mov_b32_e32 v0, 0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: s_clause 0x1 +; GFX1200-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1200-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: v_div_scale_f32 v4, null, v2, v2, v1 +; GFX1200-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1200-NEXT: v_rcp_f32_e32 v5, v4 +; GFX1200-NEXT: s_denorm_mode 15 +; GFX1200-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fmac_f32_e32 v5, v6, v5 +; GFX1200-NEXT: v_mul_f32_e32 v6, v3, v5 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f32 v7, -v4, v6, v3 +; GFX1200-NEXT: v_fmac_f32_e32 v6, v7, v5 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f32 v3, -v4, v6, v3 +; GFX1200-NEXT: s_denorm_mode 12 +; GFX1200-NEXT: v_div_fmas_f32 v3, v3, v5, v6 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_div_fixup_f32 v3, v3, v2, v1 +; GFX1200-NEXT: v_trunc_f32_e32 v3, v3 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX1200-NEXT: v_fmac_f32_e32 v1, v3, v2 +; GFX1200-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1200-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 %r0 = load float, ptr addrspace(1) %in1, align 4 @@ -1142,6 +1340,27 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) 
%out, ptr addrspace(1) ; GFX1150-NEXT: v_fmac_f32_e32 v1, v3, v2 ; GFX1150-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1150-NEXT: s_endpgm +; +; GFX1200-LABEL: fast_frem_f32: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_clause 0x1 +; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1200-NEXT: v_mov_b32_e32 v0, 0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: s_clause 0x1 +; GFX1200-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1200-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: v_rcp_f32_e32 v3, v2 +; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX1200-NEXT: v_trunc_f32_e32 v3, v3 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX1200-NEXT: v_fmac_f32_e32 v1, v3, v2 +; GFX1200-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1200-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 %r0 = load float, ptr addrspace(1) %in1, align 4 @@ -1299,6 +1518,27 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; GFX1150-NEXT: v_fmac_f32_e32 v1, v3, v2 ; GFX1150-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1150-NEXT: s_endpgm +; +; GFX1200-LABEL: unsafe_frem_f32: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_clause 0x1 +; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1200-NEXT: v_mov_b32_e32 v0, 0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: s_clause 0x1 +; GFX1200-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1200-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: v_rcp_f32_e32 v3, v2 +; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX1200-NEXT: v_trunc_f32_e32 v3, v3 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX1200-NEXT: v_fmac_f32_e32 v1, v3, v2 +; GFX1200-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1200-NEXT: s_endpgm ptr addrspace(1) %in2) #1 { %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 %r0 = load float, ptr addrspace(1) %in1, align 4 @@ -1551,6 +1791,39 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX1150-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] ; GFX1150-NEXT: global_store_b64 v12, v[0:1], s[0:1] ; GFX1150-NEXT: s_endpgm +; +; GFX1200-LABEL: frem_f64: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_clause 0x1 +; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1200-NEXT: v_mov_b32_e32 v12, 0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: s_clause 0x1 +; GFX1200-NEXT: global_load_b64 v[0:1], v12, s[2:3] +; GFX1200-NEXT: global_load_b64 v[2:3], v12, s[4:5] +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX1200-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX1200-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; 
GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX1200-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1] +; GFX1200-NEXT: v_mul_f64_e32 v[10:11], v[8:9], v[6:7] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; GFX1200-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] +; GFX1200-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] +; GFX1200-NEXT: global_store_b64 v12, v[0:1], s[0:1] +; GFX1200-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %r0 = load double, ptr addrspace(1) %in1, align 8 %r1 = load double, ptr addrspace(1) %in2, align 8 @@ -1772,6 +2045,35 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; GFX1150-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] ; GFX1150-NEXT: global_store_b64 v10, v[0:1], s[0:1] ; GFX1150-NEXT: s_endpgm +; +; GFX1200-LABEL: fast_frem_f64: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_clause 0x1 +; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1200-NEXT: v_mov_b32_e32 v10, 0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: s_clause 0x1 +; GFX1200-NEXT: global_load_b64 v[0:1], v10, s[2:3] +; GFX1200-NEXT: global_load_b64 v[2:3], v10, s[4:5] +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX1200-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX1200-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_mul_f64_e32 v[6:7], v[0:1], v[4:5] +; GFX1200-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] +; GFX1200-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] +; GFX1200-NEXT: global_store_b64 v10, v[0:1], s[0:1] +; GFX1200-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %r0 = load double, ptr addrspace(1) %in1, align 8 %r1 = load double, ptr addrspace(1) %in2, align 8 @@ -1993,6 +2295,35 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; GFX1150-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] ; GFX1150-NEXT: global_store_b64 v10, v[0:1], s[0:1] ; GFX1150-NEXT: s_endpgm +; +; GFX1200-LABEL: unsafe_frem_f64: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_clause 0x1 +; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1200-NEXT: v_mov_b32_e32 v10, 0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: s_clause 0x1 +; GFX1200-NEXT: global_load_b64 v[0:1], v10, s[2:3] +; GFX1200-NEXT: global_load_b64 v[2:3], v10, s[4:5] +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 
+; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX1200-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX1200-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_mul_f64_e32 v[6:7], v[0:1], v[4:5] +; GFX1200-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] +; GFX1200-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] +; GFX1200-NEXT: global_store_b64 v10, v[0:1], s[0:1] +; GFX1200-NEXT: s_endpgm ptr addrspace(1) %in2) #1 { %r0 = load double, ptr addrspace(1) %in1, align 8 %r1 = load double, ptr addrspace(1) %in2, align 8 @@ -2514,6 +2845,131 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v3 ; GFX1150-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1150-FAKE16-NEXT: s_endpgm +; +; GFX1200-TRUE16-LABEL: frem_v2f16: +; GFX1200-TRUE16: ; %bb.0: +; GFX1200-TRUE16-NEXT: s_clause 0x1 +; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-TRUE16-NEXT: s_clause 0x1 +; GFX1200-TRUE16-NEXT: global_load_b32 v2, v1, s[2:3] +; GFX1200-TRUE16-NEXT: global_load_b32 v3, v1, s[4:5] offset:16 +; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1 +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.h +; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.h +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0 +; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v5.l, v4.l +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v4.l, v0.l, v5.l +; GFX1200-TRUE16-NEXT: 
v_cvt_f32_f16_e32 v5, v3.l +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l +; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v5, v5 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v5 +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v6, -v3, v0, v2 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v6, v5 +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v6, -v3, v0, v2 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v5, v6, v5 +; GFX1200-TRUE16-NEXT: v_and_b32_e32 v5, 0xff800000, v5 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v5, v0 +; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l +; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_pack_b32_f16 v0, v2.l, v4.l +; GFX1200-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1200-TRUE16-NEXT: s_endpgm +; +; GFX1200-FAKE16-LABEL: frem_v2f16: +; GFX1200-FAKE16: ; %bb.0: +; GFX1200-FAKE16-NEXT: s_clause 0x1 +; GFX1200-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1200-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX1200-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-FAKE16-NEXT: s_clause 0x1 +; GFX1200-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1200-FAKE16-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 +; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x1 +; GFX1200-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1200-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v6, v6 +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v4, v4, v6 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v4, v7, v6 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6 +; GFX1200-FAKE16-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v4, v4, v5, v3 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v4, v4 +; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v3, v4, v5 +; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v5, v5 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v4, v4, v5 +; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1] +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v4, v6, v5 +; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1] +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v5, v6, v5 +; GFX1200-FAKE16-NEXT: v_and_b32_e32 v5, 0xff800000, v5 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v4, v4, v2, v1 +; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v4, v4 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4 +; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v4, v2 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v3 +; GFX1200-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1200-FAKE16-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4 %r0 = load <2 x half>, ptr addrspace(1) %in1, align 8 @@ -3398,6 +3854,227 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v2 ; GFX1150-FAKE16-NEXT: global_store_b64 v4, v[0:1], s[0:1] ; GFX1150-FAKE16-NEXT: s_endpgm +; +; GFX1200-TRUE16-LABEL: frem_v4f16: +; GFX1200-TRUE16: ; %bb.0: +; GFX1200-TRUE16-NEXT: s_clause 0x1 +; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, 0 +; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-TRUE16-NEXT: s_clause 0x1 +; GFX1200-TRUE16-NEXT: global_load_b64 v[1:2], v5, s[2:3] +; GFX1200-TRUE16-NEXT: global_load_b64 v[3:4], v5, s[4:5] offset:32 +; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1 +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.h +; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v3.h +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.l +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v6, v6 +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v6 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v6 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v0, v1 
op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v7.l, v6.l +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v6.l, v0.l, v7.l +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v7, v3.l +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.l +; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v7, v7 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v7 +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v10, -v8, v0, v9 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v10, v7 +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v8, -v8, v0, v9 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v7, v8, v7 +; GFX1200-TRUE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v7, v0 +; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v1.l +; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1200-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v3.l, v1.l +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v4.h +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-TRUE16-NEXT: v_pack_b32_f16 v1, v0.l, v6.l +; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v3, v3 +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.h +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v6, v3 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v6, v3 +; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v3, v0 +; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; 
GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v6.l, v3.l +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v3.l, v0.l, v6.l +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v4.l +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l +; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v6, v6 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v6 +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v6 +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX1200-TRUE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v4.l, v2.l +; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1200-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v4.l, v2.l +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_pack_b32_f16 v2, v0.l, v3.l +; GFX1200-TRUE16-NEXT: global_store_b64 v5, v[1:2], s[0:1] +; GFX1200-TRUE16-NEXT: s_endpgm +; +; GFX1200-FAKE16-LABEL: frem_v4f16: +; GFX1200-FAKE16: ; %bb.0: +; GFX1200-FAKE16-NEXT: s_clause 0x1 +; GFX1200-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1200-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v4, 0 +; GFX1200-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-FAKE16-NEXT: s_clause 0x1 +; GFX1200-FAKE16-NEXT: global_load_b64 v[0:1], v4, s[2:3] +; GFX1200-FAKE16-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32 +; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x1 +; GFX1200-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1200-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v8, v7 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v8, v8 +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v6, v6, v8 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v6, v9, v8 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: 
v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_and_b32_e32 v8, 0xff800000, v8 +; GFX1200-FAKE16-NEXT: v_add_f32_e32 v6, v8, v6 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v6, v6, v7, v5 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v6, v6 +; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v6, 0x8000, v6 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v5, v6, v7 +; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v7, v7 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v6, v6, v7 +; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1] +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v6, v8, v7 +; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1] +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v7, v8, v7 +; GFX1200-FAKE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_add_f32_e32 v6, v7, v6 +; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v6, v6, v2, v0 +; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v6, v6 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v6, 0x8000, v6 +; GFX1200-FAKE16-NEXT: v_fma_f16 v0, v6, v2, v0 +; GFX1200-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX1200-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v5 +; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v7, v7 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v5, v5, v7 +; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v5, v8, v7 +; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v7, v8, v7 +; GFX1200-FAKE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX1200-FAKE16-NEXT: 
v_cvt_f16_f32_e32 v5, v5 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v5, v5, v6, v2 +; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v5, v5 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v5, 0x8000, v5 +; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v2, v5, v6 +; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v6, v6 +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v5, v7, v6 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6 +; GFX1200-FAKE16-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v5, v5, v3, v1 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v5, v5 +; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v5, 0x8000, v5 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v5, v3 +; GFX1200-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX1200-FAKE16-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX1200-FAKE16-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4 %r0 = load <4 x half>, ptr addrspace(1) %in1, align 16 @@ -3758,6 +4435,65 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: v_fmac_f32_e32 v0, v3, v2 ; GFX1150-NEXT: global_store_b64 v4, v[0:1], s[0:1] ; GFX1150-NEXT: s_endpgm +; +; GFX1200-LABEL: frem_v2f32: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_clause 0x1 +; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1200-NEXT: v_mov_b32_e32 v4, 0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: s_clause 0x1 +; GFX1200-NEXT: global_load_b64 v[0:1], v4, s[2:3] +; GFX1200-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32 +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: v_div_scale_f32 v6, null, v3, v3, v1 +; GFX1200-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1200-NEXT: v_rcp_f32_e32 v7, v6 +; GFX1200-NEXT: s_denorm_mode 15 +; GFX1200-NEXT: v_fma_f32 v8, -v6, v7, 1.0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fmac_f32_e32 v7, v8, v7 +; GFX1200-NEXT: v_mul_f32_e32 v8, v5, v7 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f32 v9, -v6, v8, v5 +; GFX1200-NEXT: v_fmac_f32_e32 v8, v9, v7 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f32 v5, -v6, v8, v5 +; GFX1200-NEXT: s_denorm_mode 12 +; GFX1200-NEXT: v_div_fmas_f32 v5, v5, v7, v8 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_div_fixup_f32 v5, v5, v3, v1 +; GFX1200-NEXT: v_trunc_f32_e32 v5, v5 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 +; GFX1200-NEXT: v_fma_f32 v1, v5, v3, v1 +; GFX1200-NEXT: v_div_scale_f32 v5, null, v2, v2, v0 +; GFX1200-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1200-NEXT: v_rcp_f32_e32 v6, v5 +; GFX1200-NEXT: s_denorm_mode 15 +; GFX1200-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fmac_f32_e32 v6, v7, v6 +; GFX1200-NEXT: v_mul_f32_e32 v7, v3, v6 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f32 v8, -v5, v7, v3 +; GFX1200-NEXT: v_fmac_f32_e32 v7, v8, v6 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f32 v3, -v5, v7, v3 +; GFX1200-NEXT: s_denorm_mode 12 +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: v_div_fmas_f32 v3, v3, v6, v7 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_div_fixup_f32 v3, v3, v2, v0 +; GFX1200-NEXT: v_trunc_f32_e32 v3, v3 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX1200-NEXT: v_fmac_f32_e32 v0, v3, v2 +; GFX1200-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX1200-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr <2 x float>, ptr addrspace(1) %in2, i32 4 %r0 = load <2 x float>, ptr addrspace(1) %in1, align 8 @@ -4354,6 +5090,111 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: v_fmac_f32_e32 v0, v5, v4 ; GFX1150-NEXT: global_store_b128 v8, v[0:3], s[0:1] ; GFX1150-NEXT: s_endpgm +; +; GFX1200-LABEL: frem_v4f32: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_clause 0x1 +; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1200-NEXT: v_mov_b32_e32 v8, 0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: s_clause 0x1 +; GFX1200-NEXT: global_load_b128 v[0:3], v8, s[2:3] +; GFX1200-NEXT: global_load_b128 v[4:7], v8, s[4:5] offset:64 +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: v_div_scale_f32 v10, null, v7, v7, v3 +; GFX1200-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1200-NEXT: v_rcp_f32_e32 v11, v10 +; GFX1200-NEXT: s_denorm_mode 15 +; GFX1200-NEXT: v_fma_f32 v12, -v10, v11, 1.0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fmac_f32_e32 v11, v12, v11 +; GFX1200-NEXT: v_mul_f32_e32 v12, v9, v11 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f32 v13, -v10, v12, v9 +; GFX1200-NEXT: v_fmac_f32_e32 v12, v13, v11 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f32 v9, -v10, v12, v9 +; GFX1200-NEXT: s_denorm_mode 12 +; GFX1200-NEXT: 
v_div_fmas_f32 v9, v9, v11, v12 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_div_fixup_f32 v9, v9, v7, v3 +; GFX1200-NEXT: v_trunc_f32_e32 v9, v9 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_xor_b32_e32 v9, 0x80000000, v9 +; GFX1200-NEXT: v_fma_f32 v3, v9, v7, v3 +; GFX1200-NEXT: v_div_scale_f32 v9, null, v6, v6, v2 +; GFX1200-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1200-NEXT: v_rcp_f32_e32 v10, v9 +; GFX1200-NEXT: s_denorm_mode 15 +; GFX1200-NEXT: v_fma_f32 v11, -v9, v10, 1.0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fmac_f32_e32 v10, v11, v10 +; GFX1200-NEXT: v_mul_f32_e32 v11, v7, v10 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f32 v12, -v9, v11, v7 +; GFX1200-NEXT: v_fmac_f32_e32 v11, v12, v10 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f32 v7, -v9, v11, v7 +; GFX1200-NEXT: s_denorm_mode 12 +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: v_div_fmas_f32 v7, v7, v10, v11 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_div_fixup_f32 v7, v7, v6, v2 +; GFX1200-NEXT: v_trunc_f32_e32 v7, v7 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 +; GFX1200-NEXT: v_fma_f32 v2, v7, v6, v2 +; GFX1200-NEXT: v_div_scale_f32 v7, null, v5, v5, v1 +; GFX1200-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1200-NEXT: v_rcp_f32_e32 v9, v7 +; GFX1200-NEXT: s_denorm_mode 15 +; GFX1200-NEXT: v_fma_f32 v10, -v7, v9, 1.0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fmac_f32_e32 v9, v10, v9 +; GFX1200-NEXT: v_mul_f32_e32 v10, v6, v9 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f32 v11, -v7, v10, v6 +; GFX1200-NEXT: v_fmac_f32_e32 v10, v11, v9 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f32 v6, -v7, v10, v6 +; GFX1200-NEXT: s_denorm_mode 12 +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: v_div_fmas_f32 v6, v6, v9, v10 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_div_fixup_f32 v6, v6, v5, v1 +; GFX1200-NEXT: v_trunc_f32_e32 v6, v6 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 +; GFX1200-NEXT: v_fma_f32 v1, v6, v5, v1 +; GFX1200-NEXT: v_div_scale_f32 v6, null, v4, v4, v0 +; GFX1200-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1200-NEXT: v_rcp_f32_e32 v7, v6 +; GFX1200-NEXT: s_denorm_mode 15 +; GFX1200-NEXT: v_fma_f32 v9, -v6, v7, 1.0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fmac_f32_e32 v7, v9, v7 +; GFX1200-NEXT: v_mul_f32_e32 v9, v5, v7 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: 
v_fma_f32 v10, -v6, v9, v5 +; GFX1200-NEXT: v_fmac_f32_e32 v9, v10, v7 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f32 v5, -v6, v9, v5 +; GFX1200-NEXT: s_denorm_mode 12 +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: v_div_fmas_f32 v5, v5, v7, v9 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_div_fixup_f32 v5, v5, v4, v0 +; GFX1200-NEXT: v_trunc_f32_e32 v5, v5 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 +; GFX1200-NEXT: v_fmac_f32_e32 v0, v5, v4 +; GFX1200-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX1200-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr <4 x float>, ptr addrspace(1) %in2, i32 4 %r0 = load <4 x float>, ptr addrspace(1) %in1, align 16 @@ -4734,6 +5575,58 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] ; GFX1150-NEXT: global_store_b128 v16, v[0:3], s[0:1] ; GFX1150-NEXT: s_endpgm +; +; GFX1200-LABEL: frem_v2f64: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_clause 0x1 +; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1200-NEXT: v_mov_b32_e32 v16, 0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: s_clause 0x1 +; GFX1200-NEXT: global_load_b128 v[0:3], v16, s[2:3] +; GFX1200-NEXT: global_load_b128 v[4:7], v16, s[4:5] offset:64 +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX1200-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX1200-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX1200-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3] +; GFX1200-NEXT: v_mul_f64_e32 v[14:15], v[12:13], v[10:11] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] +; GFX1200-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] +; GFX1200-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] +; GFX1200-NEXT: v_div_scale_f64 v[6:7], null, v[4:5], v[4:5], v[0:1] +; GFX1200-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX1200-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX1200-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX1200-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1] +; GFX1200-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_mul_f64_e32 v[12:13], v[10:11], v[8:9] +; GFX1200-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] +; GFX1200-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] +; GFX1200-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] +; GFX1200-NEXT: global_store_b128 v16, v[0:3], s[0:1] +; GFX1200-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr <2 x double>, ptr addrspace(1) %in2, i32 4 %r0 = load <2 x double>, ptr addrspace(1) %in1, align 16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll index c6ea12d..a2be749 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll @@ -1,17 +1,102 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-FAKE16 %s declare half @llvm.amdgcn.rcp.f16(half %a) -; GCN-LABEL: {{^}}rcp_f16 -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; VI: v_rcp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] -; GFX11-TRUE16: v_rcp_f16_e32 v[[A_F16:[0-9]+]].l, v[[A_F16]].l -; GFX11-FAKE16: v_rcp_f16_e32 v[[A_F16:[0-9]+]], v[[A_F16]] -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm define amdgpu_kernel void @rcp_f16( +; GCN-LABEL: rcp_f16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s10, s6 +; GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s8, s2 +; GCN-NEXT: s_mov_b32 s9, s3 +; GCN-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_rcp_f16_e32 v0, v0 +; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: rcp_f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v0.l, 
v0.l +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: rcp_f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_endpgm +; +; GFX12-TRUE16-LABEL: rcp_f16: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX12-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX12-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX12-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], null +; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_rcp_f16_e32 v0.l, v0.l +; GFX12-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], null +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: rcp_f16: +; GFX12-FAKE16: ; %bb.0: ; %entry +; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX12-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX12-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX12-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX12-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX12-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], null +; GFX12-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_rcp_f16_e32 v0, v0 +; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], null +; GFX12-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -20,3 +105,5 @@ entry: store half %r.val, ptr addrspace(1) %r ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; VI: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll index 0924e9a..bf37147 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll @@ -1,17 +1,102 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-FAKE16 %s declare half @llvm.amdgcn.rsq.f16(half %a) -; GCN-LABEL: {{^}}rsq_f16 -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; VI: v_rsq_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] -; GFX11-TRUE16: v_rsq_f16_e32 v[[A_F16:[0-9]+]].l, v[[A_F16]].l -; GFX11-FAKE16: v_rsq_f16_e32 v[[A_F16:[0-9]+]], v[[A_F16]] -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm define amdgpu_kernel void @rsq_f16( +; GCN-LABEL: rsq_f16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s10, s6 +; GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s8, s2 +; GCN-NEXT: s_mov_b32 s9, s3 +; GCN-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_rsq_f16_e32 v0, v0 +; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: rsq_f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_rsq_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: rsq_f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_rsq_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_endpgm +; +; 
GFX12-TRUE16-LABEL: rsq_f16: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX12-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX12-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX12-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], null +; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_rsq_f16_e32 v0.l, v0.l +; GFX12-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], null +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: rsq_f16: +; GFX12-FAKE16: ; %bb.0: ; %entry +; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX12-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX12-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX12-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX12-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX12-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], null +; GFX12-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_rsq_f16_e32 v0, v0 +; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], null +; GFX12-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -20,3 +105,5 @@ entry: store half %r.val, ptr addrspace(1) %r ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; VI: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll index 2996a4e..8604feb 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll @@ -1,8 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-FAKE16 %s declare half @llvm.sqrt.f16(half %a) declare <2 x 
half> @llvm.sqrt.v2f16(<2 x half> %a) @@ -81,6 +83,42 @@ define amdgpu_kernel void @sqrt_f16( ; GFX11-FAKE16-NEXT: v_sqrt_f16_e32 v0, v0 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: s_endpgm +; +; GFX12-TRUE16-LABEL: sqrt_f16: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX12-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX12-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX12-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], null +; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_sqrt_f16_e32 v0.l, v0.l +; GFX12-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], null +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: sqrt_f16: +; GFX12-FAKE16: ; %bb.0: ; %entry +; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX12-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX12-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX12-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX12-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX12-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], null +; GFX12-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_sqrt_f16_e32 v0, v0 +; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], null +; GFX12-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -189,6 +227,50 @@ define amdgpu_kernel void @sqrt_v2f16( ; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: s_endpgm +; +; GFX12-TRUE16-LABEL: sqrt_v2f16: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX12-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX12-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], null +; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-TRUE16-NEXT: v_sqrt_f16_e32 v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX12-TRUE16-NEXT: v_sqrt_f16_e32 v0.h, v1.l +; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX12-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], null +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: sqrt_v2f16: +; GFX12-FAKE16: ; %bb.0: ; %entry +; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX12-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX12-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX12-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX12-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], null +; GFX12-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; 
GFX12-FAKE16-NEXT: v_sqrt_f16_e32 v0, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX12-FAKE16-NEXT: v_sqrt_f16_e32 v1, v1 +; GFX12-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX12-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], null +; GFX12-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -197,5 +279,3 @@ entry: store <2 x half> %r.val, ptr addrspace(1) %r ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX11: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-fake16.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-fake16.ll new file mode 100644 index 0000000..a681935 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-fake16.ll @@ -0,0 +1,113 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck %s + +define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) { + ; CHECK-LABEL: name: exp_f16 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_EXP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_EXP_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_EXP_F16_fake16_e64_]] + ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %val = load volatile half, ptr addrspace(1) %ptr + %res = call half @llvm.amdgcn.exp2.f16(half %val) + store half %res, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_kernel void @log_f16(ptr addrspace(1) %ptr) { + ; CHECK-LABEL: name: log_f16 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_LOG_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_LOG_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit 
$exec + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_LOG_F16_fake16_e64_]] + ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %val = load volatile half, ptr addrspace(1) %ptr + %res = call half @llvm.amdgcn.log.f16(half %val) + store half %res, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) { + ; CHECK-LABEL: name: rcp_f16 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_RCP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_RCP_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RCP_F16_fake16_e64_]] + ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %val = load volatile half, ptr addrspace(1) %ptr + %res = call half @llvm.amdgcn.rcp.f16(half %val) + store half %res, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) { + ; CHECK-LABEL: name: rsq_f16 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_RSQ_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_RSQ_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RSQ_F16_fake16_e64_]] + ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %val = load volatile half, ptr addrspace(1) %ptr + %res = call half @llvm.amdgcn.rsq.f16(half %val) + store half %res, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_kernel void @sqrt_f16(ptr addrspace(1) %ptr) { + ; CHECK-LABEL: name: sqrt_f16 + ; CHECK: bb.0 (%ir-block.0): + 
; CHECK-NEXT: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_SQRT_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_SQRT_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_SQRT_F16_fake16_e64_]] + ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %val = load volatile half, ptr addrspace(1) %ptr + %res = call half @llvm.amdgcn.sqrt.f16(half %val) + store half %res, ptr addrspace(1) %ptr + ret void +} + +declare half @llvm.amdgcn.exp2.f16(half) +declare half @llvm.amdgcn.log.f16(half) +declare half @llvm.amdgcn.rcp.f16(half) +declare half @llvm.amdgcn.rsq.f16(half) +declare half @llvm.amdgcn.sqrt.f16(half) diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll new file mode 100644 index 0000000..b1b5b6b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll @@ -0,0 +1,123 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck %s + +define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) { + ; CHECK-LABEL: name: exp_f16 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; CHECK-NEXT: [[V_EXP_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_EXP_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_EXP_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]] + ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], 
[[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %val = load volatile half, ptr addrspace(1) %ptr + %res = call half @llvm.amdgcn.exp2.f16(half %val) + store half %res, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_kernel void @log_f16(ptr addrspace(1) %ptr) { + ; CHECK-LABEL: name: log_f16 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; CHECK-NEXT: [[V_LOG_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_LOG_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_LOG_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]] + ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %val = load volatile half, ptr addrspace(1) %ptr + %res = call half @llvm.amdgcn.log.f16(half %val) + store half %res, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) { + ; CHECK-LABEL: name: rcp_f16 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; CHECK-NEXT: [[V_RCP_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_RCP_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_RCP_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]] + ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1) + 
; CHECK-NEXT: S_ENDPGM 0 + %val = load volatile half, ptr addrspace(1) %ptr + %res = call half @llvm.amdgcn.rcp.f16(half %val) + store half %res, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) { + ; CHECK-LABEL: name: rsq_f16 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; CHECK-NEXT: [[V_RSQ_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_RSQ_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_RSQ_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]] + ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %val = load volatile half, ptr addrspace(1) %ptr + %res = call half @llvm.amdgcn.rsq.f16(half %val) + store half %res, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_kernel void @sqrt_f16(ptr addrspace(1) %ptr) { + ; CHECK-LABEL: name: sqrt_f16 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; CHECK-NEXT: [[V_SQRT_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_SQRT_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_SQRT_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]] + ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %val = load volatile half, ptr addrspace(1) %ptr + %res = call 
half @llvm.amdgcn.sqrt.f16(half %val) + store half %res, ptr addrspace(1) %ptr + ret void +} + +declare half @llvm.amdgcn.exp2.f16(half) +declare half @llvm.amdgcn.log.f16(half) +declare half @llvm.amdgcn.rcp.f16(half) +declare half @llvm.amdgcn.rsq.f16(half) +declare half @llvm.amdgcn.sqrt.f16(half) diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll index 9407c8a..56848ea 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck %s define amdgpu_kernel void @exp_f32(ptr addrspace(1) %ptr) { @@ -21,27 +21,6 @@ define amdgpu_kernel void @exp_f32(ptr addrspace(1) %ptr) { ret void } -define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) { - ; CHECK-LABEL: name: exp_f16 - ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr4_sgpr5 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) - ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) - ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_EXP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_EXP_F16_fake16_e64_]] - ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1) - ; CHECK-NEXT: S_ENDPGM 0 - %val = load volatile half, ptr addrspace(1) %ptr - %res = call half @llvm.amdgcn.exp2.f16(half %val) - store half %res, ptr addrspace(1) %ptr - ret void -} - define amdgpu_kernel void @log_f32(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: log_f32 ; CHECK: bb.0 (%ir-block.0): @@ -62,27 +41,6 @@ define amdgpu_kernel void @log_f32(ptr addrspace(1) %ptr) { ret void } -define amdgpu_kernel void @log_f16(ptr addrspace(1) %ptr) { - ; CHECK-LABEL: name: log_f16 - ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr4_sgpr5 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) - ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) - ; CHECK-NEXT: 
[[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_LOG_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_LOG_F16_fake16_e64_]] - ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1) - ; CHECK-NEXT: S_ENDPGM 0 - %val = load volatile half, ptr addrspace(1) %ptr - %res = call half @llvm.amdgcn.log.f16(half %val) - store half %res, ptr addrspace(1) %ptr - ret void -} - define amdgpu_kernel void @rcp_f32(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: rcp_f32 ; CHECK: bb.0 (%ir-block.0): @@ -103,27 +61,6 @@ define amdgpu_kernel void @rcp_f32(ptr addrspace(1) %ptr) { ret void } -define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) { - ; CHECK-LABEL: name: rcp_f16 - ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr4_sgpr5 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) - ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) - ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_RCP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RCP_F16_fake16_e64_]] - ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1) - ; CHECK-NEXT: S_ENDPGM 0 - %val = load volatile half, ptr addrspace(1) %ptr - %res = call half @llvm.amdgcn.rcp.f16(half %val) - store half %res, ptr addrspace(1) %ptr - ret void -} - define amdgpu_kernel void @rsq_f32(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: rsq_f32 ; CHECK: bb.0 (%ir-block.0): @@ -144,27 +81,6 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) %ptr) { ret void } -define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) { - ; CHECK-LABEL: name: rsq_f16 - ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr4_sgpr5 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) - ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) - ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_RSQ_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RSQ_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, 
implicit $exec - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RSQ_F16_fake16_e64_]] - ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1) - ; CHECK-NEXT: S_ENDPGM 0 - %val = load volatile half, ptr addrspace(1) %ptr - %res = call half @llvm.amdgcn.rsq.f16(half %val) - store half %res, ptr addrspace(1) %ptr - ret void -} - define amdgpu_kernel void @sqrt_f32(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: sqrt_f32 ; CHECK: bb.0 (%ir-block.0): @@ -185,34 +101,8 @@ define amdgpu_kernel void @sqrt_f32(ptr addrspace(1) %ptr) { ret void } -define amdgpu_kernel void @sqrt_f16(ptr addrspace(1) %ptr) { - ; CHECK-LABEL: name: sqrt_f16 - ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr4_sgpr5 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) - ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) - ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_SQRT_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_SQRT_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_SQRT_F16_fake16_e64_]] - ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1) - ; CHECK-NEXT: S_ENDPGM 0 - %val = load volatile half, ptr addrspace(1) %ptr - %res = call half @llvm.amdgcn.sqrt.f16(half %val) - store half %res, ptr addrspace(1) %ptr - ret void -} - declare float @llvm.amdgcn.exp2.f32(float) -declare half @llvm.amdgcn.exp2.f16(half) declare float @llvm.amdgcn.log.f32(float) -declare half @llvm.amdgcn.log.f16(half) declare float @llvm.amdgcn.rcp.f32(float) -declare half @llvm.amdgcn.rcp.f16(half) declare float @llvm.amdgcn.rsq.f32(float) -declare half @llvm.amdgcn.rsq.f16(half) declare float @llvm.amdgcn.sqrt.f32(float) -declare half @llvm.amdgcn.sqrt.f16(half)