diff options
author | Changpeng Fang <changpeng.fang@amd.com> | 2025-07-23 10:26:50 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-07-23 10:26:50 -0700 |
commit | 81185f7a2bd1a023ab7046a8adb4d132fdbd7ffd (patch) | |
tree | 0471aeea1c74028fca66e3d10a0893db24a0fb0c | |
parent | 32c985485500b214d57cb25306734eb73833d59b (diff) | |
download | llvm-81185f7a2bd1a023ab7046a8adb4d132fdbd7ffd.zip llvm-81185f7a2bd1a023ab7046a8adb4d132fdbd7ffd.tar.gz llvm-81185f7a2bd1a023ab7046a8adb4d132fdbd7ffd.tar.bz2 |
AMDGPU: Add packed fp32 instructions for gfx1250 (#150253)
Co-authored-by: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
-rw-r--r-- | llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 4 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/packed-fp32.ll | 1216 | ||||
-rw-r--r-- | llvm/test/MC/AMDGPU/gfx1250_asm_vop3p.s | 244 | ||||
-rw-r--r-- | llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3p.txt | 141 |
4 files changed, 1605 insertions, 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index a259e5c..ed8e547 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -2210,6 +2210,10 @@ multiclass VOP3P_Real_with_name_gfx12<bits<8> op, defm V_PK_MIN_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1b, "V_PK_MIN_F16", "v_pk_min_num_f16">; defm V_PK_MAX_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1c, "V_PK_MAX_F16", "v_pk_max_num_f16">; +defm V_PK_FMA_F32 : VOP3P_Real_gfx12<0x1f>; +defm V_PK_MUL_F32 : VOP3P_Real_gfx12<0x28>; +defm V_PK_ADD_F32 : VOP3P_Real_gfx12<0x29>; + defm V_PK_ADD_MAX_I16 : VOP3P_Real_gfx1250<0x14>; defm V_PK_ADD_MAX_U16 : VOP3P_Real_gfx1250<0x15>; defm V_PK_ADD_MIN_I16 : VOP3P_Real_gfx1250<0x2d>; diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index 0e1e5e4..1663994 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -4,6 +4,8 @@ ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL,GFX90A-GISEL %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG,GFX942-SDAG %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL,GFX942-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250,GFX1250-GISEL %s define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) { ; GFX900-LABEL: fadd_v2_vv: @@ -29,6 +31,17 @@ define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) { ; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[0:1] ; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-NEXT: s_endpgm +; +; GFX1250-LABEL: fadd_v2_vv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[0:1] +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -61,6 +74,17 @@ define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { ; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] ; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-NEXT: s_endpgm +; +; GFX1250-LABEL: fadd_v2_vs: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -112,6 +136,34 @@ define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { ; PACKED-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[2:3] ; PACKED-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; PACKED-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fadd_v4_vs: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[2:3] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[0:1] +; GFX1250-SDAG-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fadd_v4_vs: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id %load = load <4 x float>, ptr addrspace(1) %gep, align 16 @@ -277,6 +329,115 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 ; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 ; PACKED-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fadd_v32_vs: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_clause 0x7 +; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v32, s[0:1] offset:16 +; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v32, s[0:1] +; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v32, s[0:1] offset:48 +; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v32, s[0:1] offset:32 +; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v32, s[0:1] offset:80 +; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v32, s[0:1] offset:64 +; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v32, s[0:1] offset:112 +; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v32, s[0:1] offset:96 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xe4 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[12:13] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[14:15] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[6:7], v[6:7], s[10:11] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x4 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[20:21], v[20:21], s[16:17] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x3 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[12:13], v[12:13], s[40:41] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[18:19], v[18:19], s[38:39] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x1 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[24:25], v[24:25], s[48:49] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[28:29], v[28:29], s[44:45] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[30:31], v[30:31], s[46:47] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[26:27], v[26:27], s[50:51] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[16:17], v[16:17], s[36:37] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[14:15], v[14:15], s[42:43] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[22:23], v[22:23], s[18:19] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[8:9], v[8:9], s[20:21] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[10:11], v[10:11], s[22:23] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[4:5], v[4:5], s[8:9] +; GFX1250-SDAG-NEXT: s_clause 0x7 +; GFX1250-SDAG-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:96 +; GFX1250-SDAG-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:112 +; GFX1250-SDAG-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:64 +; GFX1250-SDAG-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:80 +; GFX1250-SDAG-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32 +; GFX1250-SDAG-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:48 +; GFX1250-SDAG-NEXT: global_store_b128 v32, v[4:7], s[0:1] +; GFX1250-SDAG-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:16 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fadd_v32_vs: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_clause 0x7 +; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v32, s[34:35] +; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v32, s[34:35] offset:16 +; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v32, s[34:35] offset:32 +; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v32, s[34:35] offset:48 +; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v32, s[34:35] offset:64 +; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v32, s[34:35] offset:80 +; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v32, s[34:35] offset:96 +; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v32, s[34:35] offset:112 +; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[16:17] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[18:19] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6 +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[4:5], v[4:5], s[20:21] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[6:7], v[6:7], s[22:23] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5 +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[8:9], v[8:9], s[24:25] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[10:11], v[10:11], s[26:27] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x4 +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[12:13], v[12:13], s[28:29] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[14:15], v[14:15], s[30:31] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x3 +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[16:17], v[16:17], s[0:1] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[18:19], v[18:19], s[2:3] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x2 +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[20:21], v[20:21], s[4:5] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[22:23], v[22:23], s[6:7] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x1 +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[24:25], v[24:25], s[8:9] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[26:27], v[26:27], s[10:11] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[28:29], v[28:29], s[12:13] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[30:31], v[30:31], s[14:15] +; GFX1250-GISEL-NEXT: s_clause 0x7 +; GFX1250-GISEL-NEXT: global_store_b128 v32, v[0:3], s[34:35] +; GFX1250-GISEL-NEXT: global_store_b128 v32, v[4:7], s[34:35] offset:16 +; GFX1250-GISEL-NEXT: global_store_b128 v32, v[8:11], s[34:35] offset:32 +; GFX1250-GISEL-NEXT: global_store_b128 v32, v[12:15], s[34:35] offset:48 +; GFX1250-GISEL-NEXT: global_store_b128 v32, v[16:19], s[34:35] offset:64 +; GFX1250-GISEL-NEXT: global_store_b128 v32, v[20:23], s[34:35] offset:80 +; GFX1250-GISEL-NEXT: global_store_b128 v32, v[24:27], s[34:35] offset:96 +; GFX1250-GISEL-NEXT: global_store_b128 v32, v[28:31], s[34:35] offset:112 +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id %load = load <32 x float>, ptr addrspace(1) %gep, align 128 @@ -325,6 +486,32 @@ define amdgpu_kernel void @fadd_v2_v_imm(ptr addrspace(1) %a) { ; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] ; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fadd_v2_v_imm: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_mov_b32 s2, 0x42c80000 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] +; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fadd_v2_v_imm: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0x42c80000 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -370,6 +557,30 @@ define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) { ; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[2:3], v[0:1] ; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fadd_v2_v_v_splat: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b64 v[2:3], v0, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[0:1] op_sel_hi:[1,0] +; GFX1250-SDAG-NEXT: global_store_b64 v0, v[2:3], s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fadd_v2_v_v_splat: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b64 v[2:3], v0, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[0:1] +; GFX1250-GISEL-NEXT: global_store_b64 v0, v[2:3], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -419,6 +630,31 @@ define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) { ; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] ; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fadd_v2_v_lit_splat: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], 1.0 op_sel_hi:[1,0] +; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fadd_v2_v_lit_splat: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1.0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -452,6 +688,29 @@ define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) { ; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] ; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fadd_v2_v_lit_hi0: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], 1.0 +; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fadd_v2_v_lit_hi0: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], 0x3f800000 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -486,6 +745,18 @@ define amdgpu_kernel void @fadd_v2_v_lit_lo0(ptr addrspace(1) %a) { ; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] ; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-NEXT: s_endpgm +; +; GFX1250-LABEL: fadd_v2_v_lit_lo0: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-NEXT: s_mov_b64 s[2:3], lit64(0x3f80000000000000) +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -520,6 +791,18 @@ define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(ptr addrspace(1) %a) { ; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] ; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-NEXT: s_endpgm +; +; GFX1250-LABEL: fadd_v2_v_unfoldable_lit: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-NEXT: s_mov_b64 s[2:3], lit64(0x400000003f800000) +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -570,6 +853,31 @@ define amdgpu_kernel void @fadd_v2_v_fneg(ptr addrspace(1) %a, float %x) { ; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] ; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fadd_v2_v_fneg: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fadd_v2_v_fneg: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: v_max_num_f32_e64 v2, -s2, -s2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v3, v2 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -622,6 +930,31 @@ define amdgpu_kernel void @fadd_v2_v_fneg_lo(ptr addrspace(1) %a, float %x) { ; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] ; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fadd_v2_v_fneg_lo: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_lo:[0,1] +; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fadd_v2_v_fneg_lo: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v3, s2 +; GFX1250-GISEL-NEXT: v_max_num_f32_e64 v2, -s2, -s2 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -674,6 +1007,31 @@ define amdgpu_kernel void @fadd_v2_v_fneg_hi(ptr addrspace(1) %a, float %x) { ; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] ; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fadd_v2_v_fneg_hi: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_hi:[0,1] +; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fadd_v2_v_fneg_hi: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-GISEL-NEXT: v_max_num_f32_e64 v3, -s2, -s2 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -723,6 +1081,31 @@ define amdgpu_kernel void @fadd_v2_v_fneg_lo2(ptr addrspace(1) %a, float %x, flo ; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] ; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fadd_v2_v_fneg_lo2: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] neg_lo:[0,1] +; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fadd_v2_v_fneg_lo2: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: v_max_num_f32_e64 v2, -s2, -s2 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v3, s3 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -772,6 +1155,31 @@ define amdgpu_kernel void @fadd_v2_v_fneg_hi2(ptr addrspace(1) %a, float %x, flo ; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] ; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fadd_v2_v_fneg_hi2: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel:[0,1] op_sel_hi:[1,0] neg_hi:[0,1] +; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fadd_v2_v_fneg_hi2: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: v_max_num_f32_e64 v3, -s2, -s2 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -807,6 +1215,17 @@ define amdgpu_kernel void @fmul_v2_vv(ptr addrspace(1) %a) { ; PACKED-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[0:1] ; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-NEXT: s_endpgm +; +; GFX1250-LABEL: fmul_v2_vv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[0:1] +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -839,6 +1258,17 @@ define amdgpu_kernel void @fmul_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { ; PACKED-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] ; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-NEXT: s_endpgm +; +; GFX1250-LABEL: fmul_v2_vs: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -890,6 +1320,34 @@ define amdgpu_kernel void @fmul_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { ; PACKED-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[2:3] ; PACKED-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; PACKED-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fmul_v4_vs: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[2:3] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[0:1] +; GFX1250-SDAG-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fmul_v4_vs: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id %load = load <4 x float>, ptr addrspace(1) %gep, align 16 @@ -1055,6 +1513,115 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 ; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 ; PACKED-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fmul_v32_vs: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_clause 0x7 +; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v32, s[0:1] offset:16 +; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v32, s[0:1] +; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v32, s[0:1] offset:48 +; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v32, s[0:1] offset:32 +; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v32, s[0:1] offset:80 +; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v32, s[0:1] offset:64 +; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v32, s[0:1] offset:112 +; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v32, s[0:1] offset:96 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xe4 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[12:13] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[14:15] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6 +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[6:7], v[6:7], s[10:11] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x4 +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[20:21], v[20:21], s[16:17] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x3 +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[12:13], v[12:13], s[40:41] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2 +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[18:19], v[18:19], s[38:39] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x1 +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[24:25], v[24:25], s[48:49] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[28:29], v[28:29], s[44:45] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[30:31], v[30:31], s[46:47] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[26:27], v[26:27], s[50:51] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[16:17], v[16:17], s[36:37] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[14:15], v[14:15], s[42:43] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[22:23], v[22:23], s[18:19] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[8:9], v[8:9], s[20:21] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[10:11], v[10:11], s[22:23] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[4:5], v[4:5], s[8:9] +; GFX1250-SDAG-NEXT: s_clause 0x7 +; GFX1250-SDAG-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:96 +; GFX1250-SDAG-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:112 +; GFX1250-SDAG-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:64 +; GFX1250-SDAG-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:80 +; GFX1250-SDAG-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32 +; GFX1250-SDAG-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:48 +; GFX1250-SDAG-NEXT: global_store_b128 v32, v[4:7], s[0:1] +; GFX1250-SDAG-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:16 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fmul_v32_vs: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_clause 0x7 +; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v32, s[34:35] +; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v32, s[34:35] offset:16 +; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v32, s[34:35] offset:32 +; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v32, s[34:35] offset:48 +; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v32, s[34:35] offset:64 +; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v32, s[34:35] offset:80 +; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v32, s[34:35] offset:96 +; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v32, s[34:35] offset:112 +; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[16:17] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[18:19] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6 +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[4:5], v[4:5], s[20:21] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[6:7], v[6:7], s[22:23] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5 +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[8:9], v[8:9], s[24:25] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[10:11], v[10:11], s[26:27] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x4 +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[12:13], v[12:13], s[28:29] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[14:15], v[14:15], s[30:31] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x3 +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[16:17], v[16:17], s[0:1] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[18:19], v[18:19], s[2:3] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x2 +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[20:21], v[20:21], s[4:5] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[22:23], v[22:23], s[6:7] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x1 +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[24:25], v[24:25], s[8:9] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[26:27], v[26:27], s[10:11] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[28:29], v[28:29], s[12:13] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[30:31], v[30:31], s[14:15] +; GFX1250-GISEL-NEXT: s_clause 0x7 +; GFX1250-GISEL-NEXT: global_store_b128 v32, v[0:3], s[34:35] +; GFX1250-GISEL-NEXT: global_store_b128 v32, v[4:7], s[34:35] offset:16 +; GFX1250-GISEL-NEXT: global_store_b128 v32, v[8:11], s[34:35] offset:32 +; GFX1250-GISEL-NEXT: global_store_b128 v32, v[12:15], s[34:35] offset:48 +; GFX1250-GISEL-NEXT: global_store_b128 v32, v[16:19], s[34:35] offset:64 +; GFX1250-GISEL-NEXT: global_store_b128 v32, v[20:23], s[34:35] offset:80 +; GFX1250-GISEL-NEXT: global_store_b128 v32, v[24:27], s[34:35] offset:96 +; GFX1250-GISEL-NEXT: global_store_b128 v32, v[28:31], s[34:35] offset:112 +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id %load = load <32 x float>, ptr addrspace(1) %gep, align 128 @@ -1102,6 +1669,32 @@ define amdgpu_kernel void @fmul_v2_v_imm(ptr addrspace(1) %a) { ; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] ; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fmul_v2_v_imm: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_mov_b32 s2, 0x42c80000 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] +; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fmul_v2_v_imm: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0x42c80000 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -1147,6 +1740,30 @@ define amdgpu_kernel void @fmul_v2_v_v_splat(ptr addrspace(1) %a) { ; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[2:3], v[0:1] ; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fmul_v2_v_v_splat: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b64 v[2:3], v0, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[0:1] op_sel_hi:[1,0] +; GFX1250-SDAG-NEXT: global_store_b64 v0, v[2:3], s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fmul_v2_v_v_splat: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b64 v[2:3], v0, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[0:1] +; GFX1250-GISEL-NEXT: global_store_b64 v0, v[2:3], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -1196,6 +1813,31 @@ define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) { ; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] ; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fmul_v2_v_lit_splat: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], 4.0 op_sel_hi:[1,0] +; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fmul_v2_v_lit_splat: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_mov_b32 s2, 4.0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -1230,6 +1872,18 @@ define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(ptr addrspace(1) %a) { ; PACKED-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] ; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-NEXT: s_endpgm +; +; GFX1250-LABEL: fmul_v2_v_unfoldable_lit: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-NEXT: s_mov_b64 s[2:3], lit64(0x4040000040800000) +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -1279,6 +1933,31 @@ define amdgpu_kernel void @fmul_v2_v_fneg(ptr addrspace(1) %a, float %x) { ; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3] ; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fmul_v2_v_fneg: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fmul_v2_v_fneg: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: v_max_num_f32_e64 v2, -s2, -s2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v3, v2 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3] +; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -1314,6 +1993,17 @@ define amdgpu_kernel void @fma_v2_vv(ptr addrspace(1) %a) { ; PACKED-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[0:1], v[0:1] ; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-NEXT: s_endpgm +; +; GFX1250-LABEL: fma_v2_vv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[0:1], v[0:1] +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -1346,6 +2036,17 @@ define amdgpu_kernel void @fma_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { ; PACKED-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[2:3] ; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-NEXT: s_endpgm +; +; GFX1250-LABEL: fma_v2_vs: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[2:3] +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -1397,6 +2098,34 @@ define amdgpu_kernel void @fma_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { ; PACKED-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[2:3], s[2:3] ; PACKED-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; PACKED-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fma_v4_vs: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[2:3], s[2:3] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[0:1], s[0:1] +; GFX1250-SDAG-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fma_v4_vs: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[0:1], s[0:1] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[2:3], s[2:3] +; GFX1250-GISEL-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id %load = load <4 x float>, ptr addrspace(1) %gep, align 16 @@ -1562,6 +2291,115 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 ; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 ; PACKED-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fma_v32_vs: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_clause 0x7 +; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v32, s[0:1] offset:16 +; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v32, s[0:1] +; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v32, s[0:1] offset:48 +; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v32, s[0:1] offset:32 +; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v32, s[0:1] offset:80 +; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v32, s[0:1] offset:64 +; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v32, s[0:1] offset:112 +; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v32, s[0:1] offset:96 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xe4 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[12:13], s[12:13] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[14:15], s[14:15] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6 +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[6:7], v[6:7], s[10:11], s[10:11] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x4 +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[20:21], v[20:21], s[16:17], s[16:17] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x3 +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[12:13], v[12:13], s[40:41], s[40:41] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2 +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[18:19], v[18:19], s[38:39], s[38:39] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x1 +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[24:25], v[24:25], s[48:49], s[48:49] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[28:29], v[28:29], s[44:45], s[44:45] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[30:31], v[30:31], s[46:47], s[46:47] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[26:27], v[26:27], s[50:51], s[50:51] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[16:17], v[16:17], s[36:37], s[36:37] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[14:15], v[14:15], s[42:43], s[42:43] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[22:23], v[22:23], s[18:19], s[18:19] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[8:9], v[8:9], s[20:21], s[20:21] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[10:11], v[10:11], s[22:23], s[22:23] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[4:5], v[4:5], s[8:9], s[8:9] +; GFX1250-SDAG-NEXT: s_clause 0x7 +; GFX1250-SDAG-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:96 +; GFX1250-SDAG-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:112 +; GFX1250-SDAG-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:64 +; GFX1250-SDAG-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:80 +; GFX1250-SDAG-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32 +; GFX1250-SDAG-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:48 +; GFX1250-SDAG-NEXT: global_store_b128 v32, v[4:7], s[0:1] +; GFX1250-SDAG-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:16 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fma_v32_vs: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_clause 0x7 +; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v32, s[34:35] +; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v32, s[34:35] offset:16 +; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v32, s[34:35] offset:32 +; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v32, s[34:35] offset:48 +; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v32, s[34:35] offset:64 +; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v32, s[34:35] offset:80 +; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v32, s[34:35] offset:96 +; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v32, s[34:35] offset:112 +; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[16:17], s[16:17] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[18:19], s[18:19] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6 +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[4:5], v[4:5], s[20:21], s[20:21] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[6:7], v[6:7], s[22:23], s[22:23] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5 +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[8:9], v[8:9], s[24:25], s[24:25] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[10:11], v[10:11], s[26:27], s[26:27] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x4 +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[12:13], v[12:13], s[28:29], s[28:29] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[14:15], v[14:15], s[30:31], s[30:31] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x3 +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[16:17], v[16:17], s[0:1], s[0:1] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[18:19], v[18:19], s[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x2 +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[20:21], v[20:21], s[4:5], s[4:5] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[22:23], v[22:23], s[6:7], s[6:7] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x1 +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[24:25], v[24:25], s[8:9], s[8:9] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[26:27], v[26:27], s[10:11], s[10:11] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[28:29], v[28:29], s[12:13], s[12:13] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[30:31], v[30:31], s[14:15], s[14:15] +; GFX1250-GISEL-NEXT: s_clause 0x7 +; GFX1250-GISEL-NEXT: global_store_b128 v32, v[0:3], s[34:35] +; GFX1250-GISEL-NEXT: global_store_b128 v32, v[4:7], s[34:35] offset:16 +; GFX1250-GISEL-NEXT: global_store_b128 v32, v[8:11], s[34:35] offset:32 +; GFX1250-GISEL-NEXT: global_store_b128 v32, v[12:15], s[34:35] offset:48 +; GFX1250-GISEL-NEXT: global_store_b128 v32, v[16:19], s[34:35] offset:64 +; GFX1250-GISEL-NEXT: global_store_b128 v32, v[20:23], s[34:35] offset:80 +; GFX1250-GISEL-NEXT: global_store_b128 v32, v[24:27], s[34:35] offset:96 +; GFX1250-GISEL-NEXT: global_store_b128 v32, v[28:31], s[34:35] offset:112 +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id %load = load <32 x float>, ptr addrspace(1) %gep, align 128 @@ -1632,6 +2470,36 @@ define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) { ; GFX942-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] ; GFX942-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX942-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fma_v2_v_imm: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_mov_b32 s2, 0x43480000 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_mov_b32 s4, 0x42c80000 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[4:5], s[2:3] op_sel_hi:[1,0,0] +; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fma_v2_v_imm: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0x42c80000 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0x43480000 +; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2 +; GFX1250-GISEL-NEXT: s_mov_b32 s5, s4 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[4:5] +; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -1677,6 +2545,30 @@ define amdgpu_kernel void @fma_v2_v_v_splat(ptr addrspace(1) %a) { ; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[2:3], v[0:1], v[0:1] ; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fma_v2_v_v_splat: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b64 v[2:3], v0, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[0:1], v[0:1] op_sel_hi:[1,0,0] +; GFX1250-SDAG-NEXT: global_store_b64 v0, v[2:3], s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fma_v2_v_v_splat: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b64 v[2:3], v0, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[0:1], v[0:1] +; GFX1250-GISEL-NEXT: global_store_b64 v0, v[2:3], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -1746,6 +2638,33 @@ define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) { ; GFX942-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] ; GFX942-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX942-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fma_v2_v_lit_splat: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], 4.0, 1.0 op_sel_hi:[1,0,0] +; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fma_v2_v_lit_splat: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_mov_b32 s2, 4.0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1.0 +; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2 +; GFX1250-GISEL-NEXT: s_mov_b32 s5, s4 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[4:5] +; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -1817,6 +2736,34 @@ define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) { ; GFX942-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] ; GFX942-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX942-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fma_v2_v_unfoldable_lit: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[2:3], lit64(0x400000003f800000) +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[4:5], lit64(0x4040000040800000) +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[4:5], s[2:3] +; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fma_v2_v_unfoldable_lit: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], lit64(0x4040000040800000) +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_mov_b64 s[4:5], lit64(0x400000003f800000) +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[4:5] +; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -1866,6 +2813,31 @@ define amdgpu_kernel void @fma_v2_v_fneg(ptr addrspace(1) %a, float %x) { ; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[2:3] ; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fma_v2_v_fneg: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[2:3] op_sel_hi:[1,0,0] neg_lo:[0,1,1] neg_hi:[0,1,1] +; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fma_v2_v_fneg: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: v_max_num_f32_e64 v2, -s2, -s2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v3, v2 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[2:3] +; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -1922,6 +2894,35 @@ define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(ptr addrspace(1) %ou ; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: add_vector_neg_bitcast_scalar_lo: +; GFX1250-SDAG: ; %bb.0: ; %bb +; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-SDAG-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1 +; GFX1250-SDAG-NEXT: ds_load_b32 v2, v2 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: add_vector_neg_bitcast_scalar_lo: +; GFX1250-GISEL: ; %bb.0: ; %bb +; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s3 +; GFX1250-GISEL-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1 +; GFX1250-GISEL-NEXT: ds_load_b32 v2, v2 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_max_num_f32_e64 v2, -v2, -v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v3, v2 +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: s_endpgm bb: %vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 4 %scalar0 = load volatile float, ptr addrspace(3) %arg2, align 4 @@ -1986,6 +2987,38 @@ define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspa ; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi: +; GFX1250-SDAG: ; %bb.0: ; %bb +; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v2, s2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v5, s3 +; GFX1250-SDAG-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 +; GFX1250-SDAG-NEXT: ds_load_2addr_b32 v[2:3], v2 offset0:2 offset1:3 +; GFX1250-SDAG-NEXT: ds_load_b32 v4, v5 +; GFX1250-SDAG-NEXT: ds_load_b32 v5, v5 offset:8 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-SDAG-NEXT: global_store_b64 v6, v[0:1], s[0:1] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi: +; GFX1250-GISEL: ; %bb.0: ; %bb +; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-GISEL-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 +; GFX1250-GISEL-NEXT: ds_load_2addr_b32 v[2:3], v2 offset0:2 offset1:3 +; GFX1250-GISEL-NEXT: ds_load_b32 v4, v5 +; GFX1250-GISEL-NEXT: ds_load_b32 v5, v5 offset:8 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[4:5], 1.0, v[4:5] op_sel_hi:[0,1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x float>, ptr addrspace(3) %lds, i32 1 %arg2.gep = getelementptr inbounds float, ptr addrspace(3) %arg2, i32 2 @@ -2048,6 +3081,31 @@ define amdgpu_kernel void @shuffle_add_f32(ptr addrspace(1) %out, ptr addrspace( ; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: shuffle_add_f32: +; GFX1250-SDAG: ; %bb.0: ; %bb +; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v2, s2 +; GFX1250-SDAG-NEXT: ds_load_b64 v[0:1], v2 +; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v2 offset:8 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0] +; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: shuffle_add_f32: +; GFX1250-GISEL: ; %bb.0: ; %bb +; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-GISEL-NEXT: ds_load_b64 v[0:1], v2 +; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v2 offset:8 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: s_endpgm bb: %vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 8 %lds.gep1 = getelementptr inbounds <2 x float>, ptr addrspace(3) %lds, i32 1 @@ -2111,6 +3169,39 @@ define amdgpu_kernel void @shuffle_neg_add_f32(ptr addrspace(1) %out, ptr addrsp ; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: shuffle_neg_add_f32: +; GFX1250-SDAG: ; %bb.0: ; %bb +; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v2, s2 +; GFX1250-SDAG-NEXT: ds_load_b64 v[0:1], v2 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: ds_load_b32 v3, v0 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v2 offset:8 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: shuffle_neg_add_f32: +; GFX1250-GISEL: ; %bb.0: ; %bb +; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-GISEL-NEXT: ds_load_b64 v[0:1], v2 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: ds_load_b32 v3, v0 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v2 offset:8 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], 1.0, v[2:3] op_sel_hi:[0,1] neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: s_endpgm bb: %vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 8 %lds.gep1 = getelementptr inbounds <2 x float>, ptr addrspace(3) %lds, i32 1 @@ -2174,6 +3265,30 @@ define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) { ; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX942-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fadd_fadd_fsub_0: +; GFX1250-SDAG: ; %bb.0: ; %bb +; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_add_f32 s1, s1, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX1250-SDAG-NEXT: s_add_f32 s1, s1, 0 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX1250-SDAG-NEXT: flat_store_b64 v[0:1], v[0:1] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fadd_fadd_fsub_0: +; GFX1250-GISEL: ; %bb.0: ; %bb +; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], 0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v0, v1 +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v3, v0 +; GFX1250-GISEL-NEXT: flat_store_b64 v[0:1], v[2:3] +; GFX1250-GISEL-NEXT: s_endpgm bb: %i12 = fadd <2 x float> zeroinitializer, %arg %shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison> @@ -2248,6 +3363,39 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p ; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX942-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fadd_fadd_fsub: +; GFX1250-SDAG: ; %bb.0: ; %bb +; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_add_f32 s6, s1, s3 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], s[2:3], s[6:7] op_sel_hi:[1,0] +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[2:3], s[2:3] neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[4:5] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fadd_fadd_fsub: +; GFX1250-GISEL: ; %bb.0: ; %bb +; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], s[2:3] +; GFX1250-GISEL-NEXT: s_sub_f32 s0, s0, s2 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_2) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v2, s0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], s[2:3], v[0:1] +; GFX1250-GISEL-NEXT: v_dual_subrev_f32 v3, s3, v0 :: v_dual_mov_b32 v0, 0 +; GFX1250-GISEL-NEXT: global_store_b64 v0, v[2:3], s[4:5] +; GFX1250-GISEL-NEXT: s_endpgm bb: %i12 = fadd <2 x float> %arg, %arg1 %shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison> @@ -2300,6 +3448,32 @@ define amdgpu_kernel void @fadd_shuffle_v4(ptr addrspace(1) %arg) { ; PACKED-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[4:5] ; PACKED-GISEL-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fadd_shuffle_v4: +; GFX1250-SDAG: ; %bb.0: ; %bb +; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v4, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[0:1] op_sel_hi:[1,0] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[0:1] op_sel_hi:[1,0] +; GFX1250-SDAG-NEXT: global_store_b128 v4, v[0:3], s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fadd_shuffle_v4: +; GFX1250-GISEL: ; %bb.0: ; %bb +; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v6, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v6, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[4:5] +; GFX1250-GISEL-NEXT: global_store_b128 v6, v[0:3], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_endpgm bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid @@ -2346,6 +3520,28 @@ define amdgpu_kernel void @fneg_v2f32_vec(ptr addrspace(1) %a) { ; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1] neg_lo:[0,1] neg_hi:[0,1] ; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fneg_v2f32_vec: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0 neg_lo:[1,1] neg_hi:[1,1] +; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fneg_v2f32_vec: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1] neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -2387,6 +3583,26 @@ define amdgpu_kernel void @fneg_v2f32_scalar(ptr addrspace(1) %a, <2 x float> %x ; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, s[2:3] op_sel_hi:[0,1] neg_lo:[0,1] neg_hi:[0,1] ; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm +; +; GFX1250-SDAG-LABEL: fneg_v2f32_scalar: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s2, 0x80000000 +; GFX1250-SDAG-NEXT: s_xor_b32 s3, s3, 0x80000000 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fneg_v2f32_scalar: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, s[2:3] op_sel_hi:[0,1] neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: s_endpgm %fneg = fsub <2 x float> <float -0.0, float -0.0>, %x store <2 x float> %fneg, ptr addrspace(1) %a, align 8 ret void diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3p.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3p.s index 69d46a6..4fd3d0d 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3p.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3p.s @@ -2,6 +2,250 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding < %s | FileCheck --check-prefix=GFX1250 %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s +v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] +// GFX1250: v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] ; encoding: [0x08,0x40,0x1f,0xcc,0x00,0x05,0x12,0x1c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] op_sel_hi:[0,0,0] +// GFX1250: v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x1f,0xcc,0x00,0x05,0x12,0x04] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] op_sel:[0,0,1] op_sel_hi:[0,0,1] +// GFX1250: v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0x1f,0xcc,0x00,0x05,0x12,0x04] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_lo:[1,1,1] +// GFX1250: v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_lo:[1,1,1] ; encoding: [0x08,0x40,0x1f,0xcc,0x00,0x05,0x12,0xfc] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_hi:[1,1,1] +// GFX1250: v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x1f,0xcc,0x00,0x05,0x12,0x1c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_lo:[1,1,1] neg_hi:[1,1,1] +// GFX1250: v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_lo:[1,1,1] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x1f,0xcc,0x00,0x05,0x12,0xfc] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] +// GFX1250: v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x1f,0xcc,0x00,0x05,0x12,0x3c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] +// GFX1250: v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x1f,0xcc,0x00,0x05,0x12,0x5c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +// GFX1250: v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x1f,0xcc,0x00,0x05,0x12,0x9c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] +// GFX1250: v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x1f,0xcc,0x00,0x05,0x12,0x1c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] +// GFX1250: v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x1f,0xcc,0x00,0x05,0x12,0x1c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] +// GFX1250: v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x1f,0xcc,0x00,0x05,0x12,0x1c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] clamp +// GFX1250: v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] clamp ; encoding: [0x08,0xc0,0x1f,0xcc,0x00,0x05,0x12,0x1c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_fma_f32 v[0:1], v[4:5], v[8:9], v[16:17] +// GFX1250: v_pk_fma_f32 v[0:1], v[4:5], v[8:9], v[16:17] ; encoding: [0x00,0x40,0x1f,0xcc,0x04,0x11,0x42,0x1c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_fma_f32 v[0:1], v[2:3], v[4:5], 1.0 +// GFX1250: v_pk_fma_f32 v[0:1], v[2:3], v[4:5], 1.0 ; encoding: [0x00,0x40,0x1f,0xcc,0x02,0x09,0xca,0x1b] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_mul_f32 v[254:255], v[8:9], v[16:17] +// GFX1250: v_pk_mul_f32 v[254:255], v[8:9], v[16:17] ; encoding: [0xfe,0x40,0x28,0xcc,0x08,0x21,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_mul_f32 v[4:5], v[254:255], v[16:17] +// GFX1250: v_pk_mul_f32 v[4:5], v[254:255], v[16:17] ; encoding: [0x04,0x40,0x28,0xcc,0xfe,0x21,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_mul_f32 v[4:5], v[2:3], v[16:17] +// GFX1250: v_pk_mul_f32 v[4:5], v[2:3], v[16:17] ; encoding: [0x04,0x40,0x28,0xcc,0x02,0x21,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_mul_f32 v[4:5], v[100:101], v[16:17] +// GFX1250: v_pk_mul_f32 v[4:5], v[100:101], v[16:17] ; encoding: [0x04,0x40,0x28,0xcc,0x64,0x21,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_mul_f32 v[4:5], v[8:9], v[254:255] +// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[254:255] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0xfd,0x03,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_mul_f32 v[4:5], v[8:9], v[2:3] +// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[2:3] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x05,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_mul_f32 v[4:5], v[8:9], v[100:101] +// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[100:101] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0xc9,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_mul_f32 v[4:5], v[8:9], v[16:17] +// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,0] +// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,0] ; encoding: [0x04,0x48,0x28,0xcc,0x08,0x21,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[0,1] +// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[0,1] ; encoding: [0x04,0x50,0x28,0xcc,0x08,0x21,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,1] +// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,1] ; encoding: [0x04,0x58,0x28,0xcc,0x08,0x21,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_mul_f32 v[4:5], v[8:9], v[16:17] +// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,0] +// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,0] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[1,0] +// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[1,0] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x08] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,1] +// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,1] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x10] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,0] +// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,0] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x38] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[0,1] +// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[0,1] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x58] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,1] +// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,1] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x78] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,0] +// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,0] ; encoding: [0x04,0x41,0x28,0xcc,0x08,0x21,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[0,1] +// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[0,1] ; encoding: [0x04,0x42,0x28,0xcc,0x08,0x21,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,1] +// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,1] ; encoding: [0x04,0x43,0x28,0xcc,0x08,0x21,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_mul_f32 v[4:5], v[8:9], v[16:17] clamp +// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] clamp ; encoding: [0x04,0xc0,0x28,0xcc,0x08,0x21,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_mul_f32 v[0:1], v[2:3], 1.0 +// GFX1250: v_pk_mul_f32 v[0:1], v[2:3], 1.0 ; encoding: [0x00,0x40,0x28,0xcc,0x02,0xe5,0x01,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_add_f32 v[254:255], v[8:9], v[16:17] +// GFX1250: v_pk_add_f32 v[254:255], v[8:9], v[16:17] ; encoding: [0xfe,0x40,0x29,0xcc,0x08,0x21,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_add_f32 v[4:5], v[254:255], v[16:17] +// GFX1250: v_pk_add_f32 v[4:5], v[254:255], v[16:17] ; encoding: [0x04,0x40,0x29,0xcc,0xfe,0x21,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_add_f32 v[4:5], v[2:3], v[16:17] +// GFX1250: v_pk_add_f32 v[4:5], v[2:3], v[16:17] ; encoding: [0x04,0x40,0x29,0xcc,0x02,0x21,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_add_f32 v[4:5], v[100:101], v[16:17] +// GFX1250: v_pk_add_f32 v[4:5], v[100:101], v[16:17] ; encoding: [0x04,0x40,0x29,0xcc,0x64,0x21,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_add_f32 v[4:5], v[8:9], v[254:255] +// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[254:255] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0xfd,0x03,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_add_f32 v[4:5], v[8:9], v[2:3] +// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[2:3] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x05,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_add_f32 v[4:5], v[8:9], v[100:101] +// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[100:101] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0xc9,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_add_f32 v[4:5], v[8:9], v[16:17] +// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,0] +// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,0] ; encoding: [0x04,0x48,0x29,0xcc,0x08,0x21,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[0,1] +// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[0,1] ; encoding: [0x04,0x50,0x29,0xcc,0x08,0x21,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,1] +// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,1] ; encoding: [0x04,0x58,0x29,0xcc,0x08,0x21,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_add_f32 v[4:5], v[8:9], v[16:17] +// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,0] +// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,0] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[1,0] +// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[1,0] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x08] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,1] +// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,1] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x10] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,0] +// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,0] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x38] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[0,1] +// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[0,1] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x58] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,1] +// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,1] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x78] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,0] +// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,0] ; encoding: [0x04,0x41,0x29,0xcc,0x08,0x21,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[0,1] +// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[0,1] ; encoding: [0x04,0x42,0x29,0xcc,0x08,0x21,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,1] +// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,1] ; encoding: [0x04,0x43,0x29,0xcc,0x08,0x21,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_add_f32 v[4:5], v[8:9], v[16:17] clamp +// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] clamp ; encoding: [0x04,0xc0,0x29,0xcc,0x08,0x21,0x02,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_add_f32 v[0:1], v[2:3], 1.0 +// GFX1250: v_pk_add_f32 v[0:1], v[2:3], 1.0 ; encoding: [0x00,0x40,0x29,0xcc,0x02,0xe5,0x01,0x18] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_pk_add_min_i16 v10, v1, v2, v3 // GFX1250: v_pk_add_min_i16 v10, v1, v2, v3 ; encoding: [0x0a,0x40,0x2d,0xcc,0x01,0x05,0x0e,0x1c] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3p.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3p.txt index ae3140d..5f42490 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3p.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3p.txt @@ -1,5 +1,146 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s +# GFX1250: v_pk_add_f32 v[0:1], v[2:3], 1.0 ; encoding: [0x00,0x40,0x29,0xcc,0x02,0xe5,0x01,0x18] +0x00,0x40,0x29,0xcc,0x02,0xe5,0x01,0x18 + +# GFX1250: v_pk_add_f32 v[254:255], v[8:9], v[16:17] ; encoding: [0xfe,0x40,0x29,0xcc,0x08,0x21,0x02,0x18] +0xfe,0x40,0x29,0xcc,0x08,0x21,0x02,0x18 + +# GFX1250: v_pk_add_f32 v[4:5], exec, v[16:17] ; encoding: [0x04,0x40,0x29,0xcc,0x7e,0x20,0x02,0x18] +0x04,0x40,0x29,0xcc,0x7e,0x20,0x02,0x18 + +# GFX1250: v_pk_add_f32 v[4:5], v[254:255], v[16:17] ; encoding: [0x04,0x40,0x29,0xcc,0xfe,0x21,0x02,0x18] +0x04,0x40,0x29,0xcc,0xfe,0x21,0x02,0x18 + +# GFX1250: v_pk_add_f32 v[4:5], v[8:9], exec ; encoding: [0x04,0x40,0x29,0xcc,0x08,0xfd,0x00,0x18] +0x04,0x40,0x29,0xcc,0x08,0xfd,0x00,0x18 + +# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x18] +0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x18 + +# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] clamp ; encoding: [0x04,0xc0,0x29,0xcc,0x08,0x21,0x02,0x18] +0x04,0xc0,0x29,0xcc,0x08,0x21,0x02,0x18 + +# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[0,1] ; encoding: [0x04,0x42,0x29,0xcc,0x08,0x21,0x02,0x18] +0x04,0x42,0x29,0xcc,0x08,0x21,0x02,0x18 + +# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,0] ; encoding: [0x04,0x41,0x29,0xcc,0x08,0x21,0x02,0x18] +0x04,0x41,0x29,0xcc,0x08,0x21,0x02,0x18 + +# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,1] ; encoding: [0x04,0x43,0x29,0xcc,0x08,0x21,0x02,0x18] +0x04,0x43,0x29,0xcc,0x08,0x21,0x02,0x18 + +# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[0,1] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x58] +0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x58 + +# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,0] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x38] +0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x38 + +# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,1] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x78] +0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x78 + +# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[0,1] ; encoding: [0x04,0x50,0x29,0xcc,0x08,0x21,0x02,0x18] +0x04,0x50,0x29,0xcc,0x08,0x21,0x02,0x18 + +# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,0] ; encoding: [0x04,0x48,0x29,0xcc,0x08,0x21,0x02,0x18] +0x04,0x48,0x29,0xcc,0x08,0x21,0x02,0x18 + +# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,1] ; encoding: [0x04,0x58,0x29,0xcc,0x08,0x21,0x02,0x18] +0x04,0x58,0x29,0xcc,0x08,0x21,0x02,0x18 + +# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,0] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x00] +0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x00 + +# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,1] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x10] +0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x10 + +# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[1,0] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x08] +0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x08 + +# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[254:255] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0xfd,0x03,0x18] +0x04,0x40,0x29,0xcc,0x08,0xfd,0x03,0x18 + +# GFX1250: v_pk_add_f32 v[4:5], v[8:9], vcc ; encoding: [0x04,0x40,0x29,0xcc,0x08,0xd5,0x00,0x18] +0x04,0x40,0x29,0xcc,0x08,0xd5,0x00,0x18 + +# GFX1250: v_pk_add_f32 v[4:5], vcc, v[16:17] ; encoding: [0x04,0x40,0x29,0xcc,0x6a,0x20,0x02,0x18] +0x04,0x40,0x29,0xcc,0x6a,0x20,0x02,0x18 + +# GFX1250: v_pk_fma_f32 v[0:1], v[2:3], v[4:5], 1.0 ; encoding: [0x00,0x40,0x1f,0xcc,0x02,0x09,0xca,0x1b] +0x00,0x40,0x1f,0xcc,0x02,0x09,0xca,0x1b + +# GFX1250: v_pk_fma_f32 v[0:1], v[4:5], v[8:9], v[16:17] ; encoding: [0x00,0x40,0x1f,0xcc,0x04,0x11,0x42,0x1c] +0x00,0x40,0x1f,0xcc,0x04,0x11,0x42,0x1c + +# GFX1250: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0x1f,0xcc,0x00,0x01,0x10,0x04] +0x08,0x60,0x1f,0xcc,0x00,0x01,0x10,0x04 + +# GFX1250: v_pk_mul_f32 v[0:1], v[2:3], 1.0 ; encoding: [0x00,0x40,0x28,0xcc,0x02,0xe5,0x01,0x18] +0x00,0x40,0x28,0xcc,0x02,0xe5,0x01,0x18 + +# GFX1250: v_pk_mul_f32 v[254:255], v[8:9], v[16:17] ; encoding: [0xfe,0x40,0x28,0xcc,0x08,0x21,0x02,0x18] +0xfe,0x40,0x28,0xcc,0x08,0x21,0x02,0x18 + +# GFX1250: v_pk_mul_f32 v[4:5], exec, v[16:17] ; encoding: [0x04,0x40,0x28,0xcc,0x7e,0x20,0x02,0x18] +0x04,0x40,0x28,0xcc,0x7e,0x20,0x02,0x18 + +# GFX1250: v_pk_mul_f32 v[4:5], v[254:255], v[16:17] ; encoding: [0x04,0x40,0x28,0xcc,0xfe,0x21,0x02,0x18] +0x04,0x40,0x28,0xcc,0xfe,0x21,0x02,0x18 + +# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], exec ; encoding: [0x04,0x40,0x28,0xcc,0x08,0xfd,0x00,0x18] +0x04,0x40,0x28,0xcc,0x08,0xfd,0x00,0x18 + +# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x18] +0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x18 + +# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] clamp ; encoding: [0x04,0xc0,0x28,0xcc,0x08,0x21,0x02,0x18] +0x04,0xc0,0x28,0xcc,0x08,0x21,0x02,0x18 + +# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[0,1] ; encoding: [0x04,0x42,0x28,0xcc,0x08,0x21,0x02,0x18] +0x04,0x42,0x28,0xcc,0x08,0x21,0x02,0x18 + +# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,0] ; encoding: [0x04,0x41,0x28,0xcc,0x08,0x21,0x02,0x18] +0x04,0x41,0x28,0xcc,0x08,0x21,0x02,0x18 + +# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,1] ; encoding: [0x04,0x43,0x28,0xcc,0x08,0x21,0x02,0x18] +0x04,0x43,0x28,0xcc,0x08,0x21,0x02,0x18 + +# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[0,1] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x58] +0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x58 + +# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,0] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x38] +0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x38 + +# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,1] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x78] +0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x78 + +# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[0,1] ; encoding: [0x04,0x50,0x28,0xcc,0x08,0x21,0x02,0x18] +0x04,0x50,0x28,0xcc,0x08,0x21,0x02,0x18 + +# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,0] ; encoding: [0x04,0x48,0x28,0xcc,0x08,0x21,0x02,0x18] +0x04,0x48,0x28,0xcc,0x08,0x21,0x02,0x18 + +# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,1] ; encoding: [0x04,0x58,0x28,0xcc,0x08,0x21,0x02,0x18] +0x04,0x58,0x28,0xcc,0x08,0x21,0x02,0x18 + +# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,0] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x00] +0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x00 + +# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,1] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x10] +0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x10 + +# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[1,0] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x08] +0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x08 + +# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[254:255] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0xfd,0x03,0x18] +0x04,0x40,0x28,0xcc,0x08,0xfd,0x03,0x18 + +# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], vcc ; encoding: [0x04,0x40,0x28,0xcc,0x08,0xd5,0x00,0x18] +0x04,0x40,0x28,0xcc,0x08,0xd5,0x00,0x18 + +# GFX1250: v_pk_mul_f32 v[4:5], vcc, v[16:17] ; encoding: [0x04,0x40,0x28,0xcc,0x6a,0x20,0x02,0x18] +0x04,0x40,0x28,0xcc,0x6a,0x20,0x02,0x18 + # GFX1250: v_pk_add_min_i16 v10, 0x64, 0x64, 0x64 ; encoding: [0x0a,0x40,0x2d,0xcc,0xff,0xfe,0xfd,0x1b,0x64,0x00,0x00,0x00] 0x0a,0x40,0x2d,0xcc,0xff,0xfe,0xfd,0x1b,0x64,0x00,0x00,0x00 |