about | summary | refs | log | tree | commit | diff
path: root/llvm/test/CodeGen/AMDGPU/fma-combine.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/fma-combine.ll')
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fma-combine.ll 2013
1 file changed, 1199 insertions, 814 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
index d7cf411..8fc6904 100644
--- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -enable-no-infs-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-FMA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA,TAHITI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA,VERDE %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-FMA %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-NOFMA %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=ieee -enable-no-infs-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FMA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FMA %s
; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be
; beneficial even without fp32 denormals, but they do require no-infs-fp-math
@@ -648,53 +648,53 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1)
; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))
define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
-; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
-; SI-FMA: ; %bb.0:
-; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-FMA-NEXT: s_mov_b32 s7, 0xf000
-; SI-FMA-NEXT: s_mov_b32 s6, 0
-; SI-FMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; SI-FMA-NEXT: v_mov_b32_e32 v1, 0
-; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
-; SI-FMA-NEXT: s_mov_b64 s[4:5], s[2:3]
-; SI-FMA-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
-; SI-FMA-NEXT: s_waitcnt vmcnt(0)
-; SI-FMA-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
-; SI-FMA-NEXT: s_waitcnt vmcnt(0)
-; SI-FMA-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
-; SI-FMA-NEXT: s_waitcnt vmcnt(0)
-; SI-FMA-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
-; SI-FMA-NEXT: s_waitcnt vmcnt(0)
-; SI-FMA-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
-; SI-FMA-NEXT: s_waitcnt vmcnt(0)
-; SI-FMA-NEXT: s_mov_b64 s[2:3], s[6:7]
-; SI-FMA-NEXT: v_fma_f64 v[6:7], v[8:9], v[10:11], -v[6:7]
-; SI-FMA-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; SI-FMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
-; SI-FMA-NEXT: s_endpgm
+; SI-LABEL: aggressive_combine_to_fma_fsub_0_f64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: v_fma_f64 v[6:7], v[8:9], v[10:11], -v[6:7]
+; SI-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7]
+; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_endpgm
;
-; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
-; GFX11-FMA: ; %bb.0:
-; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0
-; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc
-; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
-; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
-; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
-; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
-; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: v_fma_f64 v[4:5], v[6:7], v[8:9], -v[4:5]
-; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FMA-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; GFX11-FMA-NEXT: global_store_b64 v10, v[0:1], s[0:1]
-; GFX11-FMA-NEXT: s_endpgm
+; GFX11-LABEL: aggressive_combine_to_fma_fsub_0_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 3, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[8:9], -v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX11-NEXT: global_store_b64 v10, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
@@ -718,55 +718,55 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1)
}
define amdgpu_kernel void @no_aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
-; SI-NOFMA-LABEL: no_aggressive_combine_to_fma_fsub_0_f64:
-; SI-NOFMA: ; %bb.0:
-; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000
-; SI-NOFMA-NEXT: s_mov_b32 s6, 0
-; SI-NOFMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; SI-NOFMA-NEXT: v_mov_b32_e32 v1, 0
-; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NOFMA-NEXT: s_mov_b64 s[4:5], s[2:3]
-; SI-NOFMA-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: s_mov_b64 s[2:3], s[6:7]
-; SI-NOFMA-NEXT: v_mul_f64 v[8:9], v[8:9], v[10:11]
-; SI-NOFMA-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; SI-NOFMA-NEXT: v_add_f64 v[2:3], v[2:3], -v[6:7]
-; SI-NOFMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
-; SI-NOFMA-NEXT: s_endpgm
+; SI-LABEL: no_aggressive_combine_to_fma_fsub_0_f64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: v_mul_f64 v[8:9], v[8:9], v[10:11]
+; SI-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; SI-NEXT: v_add_f64 v[2:3], v[2:3], -v[6:7]
+; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_endpgm
;
-; GFX11-NOFMA-LABEL: no_aggressive_combine_to_fma_fsub_0_f64:
-; GFX11-NOFMA: ; %bb.0:
-; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NOFMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0
-; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: v_mul_f64 v[6:7], v[6:7], v[8:9]
-; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NOFMA-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7]
-; GFX11-NOFMA-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5]
-; GFX11-NOFMA-NEXT: global_store_b64 v10, v[0:1], s[0:1]
-; GFX11-NOFMA-NEXT: s_endpgm
+; GFX11-LABEL: no_aggressive_combine_to_fma_fsub_0_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 3, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mul_f64 v[6:7], v[6:7], v[8:9]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7]
+; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5]
+; GFX11-NEXT: global_store_b64 v10, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
@@ -792,53 +792,53 @@ define amdgpu_kernel void @no_aggressive_combine_to_fma_fsub_0_f64(ptr addrspace
; fold (fsub x, (fma y, z, (fmul u, v)))
; -> (fma (fneg y), z, (fma (fneg u), v, x))
define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
-; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
-; SI-FMA: ; %bb.0:
-; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-FMA-NEXT: s_mov_b32 s7, 0xf000
-; SI-FMA-NEXT: s_mov_b32 s6, 0
-; SI-FMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; SI-FMA-NEXT: v_mov_b32_e32 v1, 0
-; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
-; SI-FMA-NEXT: s_mov_b64 s[4:5], s[2:3]
-; SI-FMA-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
-; SI-FMA-NEXT: s_waitcnt vmcnt(0)
-; SI-FMA-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
-; SI-FMA-NEXT: s_waitcnt vmcnt(0)
-; SI-FMA-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
-; SI-FMA-NEXT: s_waitcnt vmcnt(0)
-; SI-FMA-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
-; SI-FMA-NEXT: s_waitcnt vmcnt(0)
-; SI-FMA-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
-; SI-FMA-NEXT: s_waitcnt vmcnt(0)
-; SI-FMA-NEXT: s_mov_b64 s[2:3], s[6:7]
-; SI-FMA-NEXT: v_fma_f64 v[2:3], -v[8:9], v[10:11], v[2:3]
-; SI-FMA-NEXT: v_fma_f64 v[2:3], -v[4:5], v[6:7], v[2:3]
-; SI-FMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
-; SI-FMA-NEXT: s_endpgm
+; SI-LABEL: aggressive_combine_to_fma_fsub_1_f64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: v_fma_f64 v[2:3], -v[8:9], v[10:11], v[2:3]
+; SI-NEXT: v_fma_f64 v[2:3], -v[4:5], v[6:7], v[2:3]
+; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_endpgm
;
-; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
-; GFX11-FMA: ; %bb.0:
-; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0
-; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc
-; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
-; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
-; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
-; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
-; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: v_fma_f64 v[0:1], -v[6:7], v[8:9], v[0:1]
-; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FMA-NEXT: v_fma_f64 v[0:1], -v[2:3], v[4:5], v[0:1]
-; GFX11-FMA-NEXT: global_store_b64 v10, v[0:1], s[0:1]
-; GFX11-FMA-NEXT: s_endpgm
+; GFX11-LABEL: aggressive_combine_to_fma_fsub_1_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 3, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_fma_f64 v[0:1], -v[6:7], v[8:9], v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_fma_f64 v[0:1], -v[2:3], v[4:5], v[0:1]
+; GFX11-NEXT: global_store_b64 v10, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
@@ -862,55 +862,55 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1)
ret void
}
define amdgpu_kernel void @no_aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
-; SI-NOFMA-LABEL: no_aggressive_combine_to_fma_fsub_1_f64:
-; SI-NOFMA: ; %bb.0:
-; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000
-; SI-NOFMA-NEXT: s_mov_b32 s6, 0
-; SI-NOFMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; SI-NOFMA-NEXT: v_mov_b32_e32 v1, 0
-; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NOFMA-NEXT: s_mov_b64 s[4:5], s[2:3]
-; SI-NOFMA-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: s_mov_b64 s[2:3], s[6:7]
-; SI-NOFMA-NEXT: v_mul_f64 v[8:9], v[8:9], v[10:11]
-; SI-NOFMA-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
-; SI-NOFMA-NEXT: v_add_f64 v[2:3], v[2:3], -v[4:5]
-; SI-NOFMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
-; SI-NOFMA-NEXT: s_endpgm
+; SI-LABEL: no_aggressive_combine_to_fma_fsub_1_f64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: v_mul_f64 v[8:9], v[8:9], v[10:11]
+; SI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
+; SI-NEXT: v_add_f64 v[2:3], v[2:3], -v[4:5]
+; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_endpgm
;
-; GFX11-NOFMA-LABEL: no_aggressive_combine_to_fma_fsub_1_f64:
-; GFX11-NOFMA: ; %bb.0:
-; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NOFMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0
-; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: v_mul_f64 v[6:7], v[6:7], v[8:9]
-; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NOFMA-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; GFX11-NOFMA-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
-; GFX11-NOFMA-NEXT: global_store_b64 v10, v[0:1], s[0:1]
-; GFX11-NOFMA-NEXT: s_endpgm
+; GFX11-LABEL: no_aggressive_combine_to_fma_fsub_1_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 3, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mul_f64 v[6:7], v[6:7], v[8:9]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7]
+; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX11-NEXT: global_store_b64 v10, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
@@ -939,7 +939,58 @@ define amdgpu_kernel void @no_aggressive_combine_to_fma_fsub_1_f64(ptr addrspace
;
define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out,
-; SI-NOFMA-LABEL: test_f32_mul_add_x_one_y:
+; SI-LABEL: test_f32_mul_add_x_one_y:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s14, s6
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
+; SI-NEXT: s_mov_b32 s15, s7
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_f32_mul_add_x_one_y:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
+ ptr addrspace(1) %in1,
+ ptr addrspace(1) %in2) {
+ %x = load volatile float, ptr addrspace(1) %in1
+ %y = load volatile float, ptr addrspace(1) %in2
+ %a = fadd contract float %x, 1.0
+ %m = fmul contract float %a, %y
+ store float %m, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_f32_mul_add_x_one_y_ninf(ptr addrspace(1) %out,
+; SI-NOFMA-LABEL: test_f32_mul_add_x_one_y_ninf:
; SI-NOFMA: ; %bb.0:
; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
@@ -958,12 +1009,11 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out,
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT: s_mov_b32 s4, s0
; SI-NOFMA-NEXT: s_mov_b32 s5, s1
-; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0
-; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1
-; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NOFMA-NEXT: v_mac_f32_e32 v1, v0, v1
+; SI-NOFMA-NEXT: buffer_store_dword v1, off, s[4:7], 0
; SI-NOFMA-NEXT: s_endpgm
;
-; SI-FMA-LABEL: test_f32_mul_add_x_one_y:
+; SI-FMA-LABEL: test_f32_mul_add_x_one_y_ninf:
; SI-FMA: ; %bb.0:
; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
@@ -986,49 +1036,83 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out,
; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-FMA-NEXT: s_endpgm
;
-; GFX11-NOFMA-LABEL: test_f32_mul_add_x_one_y:
-; GFX11-NOFMA: ; %bb.0:
-; GFX11-NOFMA-NEXT: s_clause 0x1
-; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1
-; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2
-; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NOFMA-NEXT: s_endpgm
+; GFX11-LABEL: test_f32_mul_add_x_one_y_ninf:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_fmac_f32_e32 v2, v1, v2
+; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NEXT: s_endpgm
+ ptr addrspace(1) %in1,
+ ptr addrspace(1) %in2) {
+ %x = load volatile float, ptr addrspace(1) %in1
+ %y = load volatile float, ptr addrspace(1) %in2
+ %a = fadd contract ninf float %x, 1.0
+ %m = fmul contract ninf float %a, %y
+ store float %m, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out,
+; SI-LABEL: test_f32_mul_y_add_x_one:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s14, s6
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
+; SI-NEXT: s_mov_b32 s15, s7
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v0, v1, v0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
;
-; GFX11-FMA-LABEL: test_f32_mul_add_x_one_y:
-; GFX11-FMA: ; %bb.0:
-; GFX11-FMA-NEXT: s_clause 0x1
-; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
-; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
-; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2
-; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[0:1]
-; GFX11-FMA-NEXT: s_endpgm
+; GFX11-LABEL: test_f32_mul_y_add_x_one:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2) {
%x = load volatile float, ptr addrspace(1) %in1
%y = load volatile float, ptr addrspace(1) %in2
%a = fadd contract float %x, 1.0
- %m = fmul contract float %a, %y
+ %m = fmul contract float %y, %a
store float %m, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out,
-; SI-NOFMA-LABEL: test_f32_mul_y_add_x_one:
+define amdgpu_kernel void @test_f32_mul_y_add_x_one_ninf(ptr addrspace(1) %out,
+; SI-NOFMA-LABEL: test_f32_mul_y_add_x_one_ninf:
; SI-NOFMA: ; %bb.0:
; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
@@ -1047,12 +1131,11 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out,
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT: s_mov_b32 s4, s0
; SI-NOFMA-NEXT: s_mov_b32 s5, s1
-; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0
-; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0
-; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NOFMA-NEXT: v_mac_f32_e32 v1, v0, v1
+; SI-NOFMA-NEXT: buffer_store_dword v1, off, s[4:7], 0
; SI-NOFMA-NEXT: s_endpgm
;
-; SI-FMA-LABEL: test_f32_mul_y_add_x_one:
+; SI-FMA-LABEL: test_f32_mul_y_add_x_one_ninf:
; SI-FMA: ; %bb.0:
; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
@@ -1075,126 +1158,72 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out,
; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-FMA-NEXT: s_endpgm
;
-; GFX11-NOFMA-LABEL: test_f32_mul_y_add_x_one:
-; GFX11-NOFMA: ; %bb.0:
-; GFX11-NOFMA-NEXT: s_clause 0x1
-; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1
-; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1
-; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NOFMA-NEXT: s_endpgm
-;
-; GFX11-FMA-LABEL: test_f32_mul_y_add_x_one:
-; GFX11-FMA: ; %bb.0:
-; GFX11-FMA-NEXT: s_clause 0x1
-; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
-; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
-; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2
-; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[0:1]
-; GFX11-FMA-NEXT: s_endpgm
+; GFX11-LABEL: test_f32_mul_y_add_x_one_ninf:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_fmac_f32_e32 v2, v1, v2
+; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2) {
%x = load volatile float, ptr addrspace(1) %in1
%y = load volatile float, ptr addrspace(1) %in2
- %a = fadd contract float %x, 1.0
- %m = fmul contract float %y, %a
+ %a = fadd contract ninf float %x, 1.0
+ %m = fmul contract ninf float %y, %a
store float %m, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out,
-; SI-NOFMA-LABEL: test_f32_mul_add_x_negone_y:
-; SI-NOFMA: ; %bb.0:
-; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
-; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000
-; SI-NOFMA-NEXT: s_mov_b32 s6, -1
-; SI-NOFMA-NEXT: s_mov_b32 s14, s6
-; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NOFMA-NEXT: s_mov_b32 s12, s2
-; SI-NOFMA-NEXT: s_mov_b32 s13, s3
-; SI-NOFMA-NEXT: s_mov_b32 s15, s7
-; SI-NOFMA-NEXT: s_mov_b32 s10, s6
-; SI-NOFMA-NEXT: s_mov_b32 s11, s7
-; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
-; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
-; SI-NOFMA-NEXT: s_mov_b32 s4, s0
-; SI-NOFMA-NEXT: s_mov_b32 s5, s1
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
-; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1
-; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; SI-NOFMA-NEXT: s_endpgm
-;
-; SI-FMA-LABEL: test_f32_mul_add_x_negone_y:
-; SI-FMA: ; %bb.0:
-; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
-; SI-FMA-NEXT: s_mov_b32 s7, 0xf000
-; SI-FMA-NEXT: s_mov_b32 s6, -1
-; SI-FMA-NEXT: s_mov_b32 s14, s6
-; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
-; SI-FMA-NEXT: s_mov_b32 s12, s2
-; SI-FMA-NEXT: s_mov_b32 s13, s3
-; SI-FMA-NEXT: s_mov_b32 s15, s7
-; SI-FMA-NEXT: s_mov_b32 s10, s6
-; SI-FMA-NEXT: s_mov_b32 s11, s7
-; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
-; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
-; SI-FMA-NEXT: s_mov_b32 s4, s0
-; SI-FMA-NEXT: s_mov_b32 s5, s1
-; SI-FMA-NEXT: s_waitcnt vmcnt(0)
-; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1
-; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; SI-FMA-NEXT: s_endpgm
-;
-; GFX11-NOFMA-LABEL: test_f32_mul_add_x_negone_y:
-; GFX11-NOFMA: ; %bb.0:
-; GFX11-NOFMA-NEXT: s_clause 0x1
-; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOFMA-NEXT: s_clause 0x1
-; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2
-; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NOFMA-NEXT: s_endpgm
+; SI-LABEL: test_f32_mul_add_x_negone_y:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s14, s6
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
+; SI-NEXT: s_mov_b32 s15, s7
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_add_f32_e32 v0, -1.0, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
;
-; GFX11-FMA-LABEL: test_f32_mul_add_x_negone_y:
-; GFX11-FMA: ; %bb.0:
-; GFX11-FMA-NEXT: s_clause 0x1
-; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FMA-NEXT: s_clause 0x1
-; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2
-; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-FMA-NEXT: s_endpgm
+; GFX11-LABEL: test_f32_mul_add_x_negone_y:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_add_f32_e32 v1, -1.0, v1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
@@ -1205,8 +1234,8 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out,
ret void
}
-define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out,
-; SI-NOFMA-LABEL: test_f32_mul_y_add_x_negone:
+define amdgpu_kernel void @test_f32_mul_add_x_negone_y_ninf(ptr addrspace(1) %out,
+; SI-NOFMA-LABEL: test_f32_mul_add_x_negone_y_ninf:
; SI-NOFMA: ; %bb.0:
; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
@@ -1223,14 +1252,12 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out,
; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
; SI-NOFMA-NEXT: s_mov_b32 s4, s0
; SI-NOFMA-NEXT: s_mov_b32 s5, s1
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
-; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0
+; SI-NOFMA-NEXT: v_mad_f32 v0, v0, v1, -v1
; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NOFMA-NEXT: s_endpgm
;
-; SI-FMA-LABEL: test_f32_mul_y_add_x_negone:
+; SI-FMA-LABEL: test_f32_mul_add_x_negone_y_ninf:
; SI-FMA: ; %bb.0:
; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
@@ -1252,38 +1279,72 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out,
; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-FMA-NEXT: s_endpgm
;
-; GFX11-NOFMA-LABEL: test_f32_mul_y_add_x_negone:
-; GFX11-NOFMA: ; %bb.0:
-; GFX11-NOFMA-NEXT: s_clause 0x1
-; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOFMA-NEXT: s_clause 0x1
-; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1
-; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NOFMA-NEXT: s_endpgm
+; GFX11-LABEL: test_f32_mul_add_x_negone_y_ninf:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_fma_f32 v1, v1, v2, -v2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
+ ptr addrspace(1) %in1,
+ ptr addrspace(1) %in2) {
+ %x = load float, ptr addrspace(1) %in1
+ %y = load float, ptr addrspace(1) %in2
+ %a = fadd contract ninf float %x, -1.0
+ %m = fmul contract ninf float %a, %y
+ store float %m, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out,
+; SI-LABEL: test_f32_mul_y_add_x_negone:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s14, s6
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
+; SI-NEXT: s_mov_b32 s15, s7
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_add_f32_e32 v0, -1.0, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, v1, v0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
;
-; GFX11-FMA-LABEL: test_f32_mul_y_add_x_negone:
-; GFX11-FMA: ; %bb.0:
-; GFX11-FMA-NEXT: s_clause 0x1
-; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FMA-NEXT: s_clause 0x1
-; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2
-; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-FMA-NEXT: s_endpgm
+; GFX11-LABEL: test_f32_mul_y_add_x_negone:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_add_f32_e32 v1, -1.0, v1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
@@ -1294,8 +1355,8 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out,
ret void
}
-define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out,
-; SI-NOFMA-LABEL: test_f32_mul_sub_one_x_y:
+define amdgpu_kernel void @test_f32_mul_y_add_x_negone_ninf(ptr addrspace(1) %out,
+; SI-NOFMA-LABEL: test_f32_mul_y_add_x_negone_ninf:
; SI-NOFMA: ; %bb.0:
; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
@@ -1312,14 +1373,12 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out,
; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
; SI-NOFMA-NEXT: s_mov_b32 s4, s0
; SI-NOFMA-NEXT: s_mov_b32 s5, s1
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
-; SI-NOFMA-NEXT: v_sub_f32_e32 v0, 1.0, v0
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-NOFMA-NEXT: v_mad_f32 v0, v0, v1, -v1
; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NOFMA-NEXT: s_endpgm
;
-; SI-FMA-LABEL: test_f32_mul_sub_one_x_y:
+; SI-FMA-LABEL: test_f32_mul_y_add_x_negone_ninf:
; SI-FMA: ; %bb.0:
; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
@@ -1337,42 +1396,76 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out,
; SI-FMA-NEXT: s_mov_b32 s4, s0
; SI-FMA-NEXT: s_mov_b32 s5, s1
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
-; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, v1
+; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1
; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-FMA-NEXT: s_endpgm
;
-; GFX11-NOFMA-LABEL: test_f32_mul_sub_one_x_y:
-; GFX11-NOFMA: ; %bb.0:
-; GFX11-NOFMA-NEXT: s_clause 0x1
-; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOFMA-NEXT: s_clause 0x1
-; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, 1.0, v1
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2
-; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NOFMA-NEXT: s_endpgm
+; GFX11-LABEL: test_f32_mul_y_add_x_negone_ninf:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_fma_f32 v1, v1, v2, -v2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
+ ptr addrspace(1) %in1,
+ ptr addrspace(1) %in2) {
+ %x = load float, ptr addrspace(1) %in1
+ %y = load float, ptr addrspace(1) %in2
+ %a = fadd contract ninf float %x, -1.0
+ %m = fmul contract ninf float %y, %a
+ store float %m, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out,
+; SI-LABEL: test_f32_mul_sub_one_x_y:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s14, s6
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
+; SI-NEXT: s_mov_b32 s15, s7
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_sub_f32_e32 v0, 1.0, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
;
-; GFX11-FMA-LABEL: test_f32_mul_sub_one_x_y:
-; GFX11-FMA: ; %bb.0:
-; GFX11-FMA-NEXT: s_clause 0x1
-; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FMA-NEXT: s_clause 0x1
-; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, v2
-; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-FMA-NEXT: s_endpgm
+; GFX11-LABEL: test_f32_mul_sub_one_x_y:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_sub_f32_e32 v1, 1.0, v1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
@@ -1383,85 +1476,100 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out,
ret void
}
-define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out,
-; SI-NOFMA-LABEL: test_f32_mul_y_sub_one_x:
-; SI-NOFMA: ; %bb.0:
-; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
-; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000
-; SI-NOFMA-NEXT: s_mov_b32 s6, -1
-; SI-NOFMA-NEXT: s_mov_b32 s14, s6
-; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NOFMA-NEXT: s_mov_b32 s12, s2
-; SI-NOFMA-NEXT: s_mov_b32 s13, s3
-; SI-NOFMA-NEXT: s_mov_b32 s15, s7
-; SI-NOFMA-NEXT: s_mov_b32 s10, s6
-; SI-NOFMA-NEXT: s_mov_b32 s11, s7
-; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
-; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
-; SI-NOFMA-NEXT: s_mov_b32 s4, s0
-; SI-NOFMA-NEXT: s_mov_b32 s5, s1
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
-; SI-NOFMA-NEXT: v_sub_f32_e32 v0, 1.0, v0
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0
-; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; SI-NOFMA-NEXT: s_endpgm
-;
-; SI-FMA-LABEL: test_f32_mul_y_sub_one_x:
-; SI-FMA: ; %bb.0:
-; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
-; SI-FMA-NEXT: s_mov_b32 s7, 0xf000
-; SI-FMA-NEXT: s_mov_b32 s6, -1
-; SI-FMA-NEXT: s_mov_b32 s14, s6
-; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
-; SI-FMA-NEXT: s_mov_b32 s12, s2
-; SI-FMA-NEXT: s_mov_b32 s13, s3
-; SI-FMA-NEXT: s_mov_b32 s15, s7
-; SI-FMA-NEXT: s_mov_b32 s10, s6
-; SI-FMA-NEXT: s_mov_b32 s11, s7
-; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
-; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
-; SI-FMA-NEXT: s_mov_b32 s4, s0
-; SI-FMA-NEXT: s_mov_b32 s5, s1
-; SI-FMA-NEXT: s_waitcnt vmcnt(0)
-; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, v1
-; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; SI-FMA-NEXT: s_endpgm
+define amdgpu_kernel void @test_f32_mul_sub_one_x_y_ninf(ptr addrspace(1) %out,
+; SI-LABEL: test_f32_mul_sub_one_x_y_ninf:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s14, s6
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
+; SI-NEXT: s_mov_b32 s15, s7
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_sub_f32_e32 v0, 1.0, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
;
-; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_one_x:
-; GFX11-NOFMA: ; %bb.0:
-; GFX11-NOFMA-NEXT: s_clause 0x1
-; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOFMA-NEXT: s_clause 0x1
-; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, 1.0, v1
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1
-; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NOFMA-NEXT: s_endpgm
+; GFX11-LABEL: test_f32_mul_sub_one_x_y_ninf:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_sub_f32_e32 v1, 1.0, v1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
+ ptr addrspace(1) %in1,
+ ptr addrspace(1) %in2) {
+ %x = load float, ptr addrspace(1) %in1
+ %y = load float, ptr addrspace(1) %in2
+ %s = fsub contract ninf float 1.0, %x
+ %m = fmul contract ninf float %s, %y
+ store float %m, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out,
+; SI-LABEL: test_f32_mul_y_sub_one_x:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s14, s6
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
+; SI-NEXT: s_mov_b32 s15, s7
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_sub_f32_e32 v0, 1.0, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, v1, v0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
;
-; GFX11-FMA-LABEL: test_f32_mul_y_sub_one_x:
-; GFX11-FMA: ; %bb.0:
-; GFX11-FMA-NEXT: s_clause 0x1
-; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FMA-NEXT: s_clause 0x1
-; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, v2
-; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-FMA-NEXT: s_endpgm
+; GFX11-LABEL: test_f32_mul_y_sub_one_x:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_sub_f32_e32 v1, 1.0, v1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
@@ -1472,8 +1580,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out,
ret void
}
-define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out,
-; SI-NOFMA-LABEL: test_f32_mul_sub_negone_x_y:
+define amdgpu_kernel void @test_f32_mul_y_sub_one_x_ninf(ptr addrspace(1) %out,
+; SI-NOFMA-LABEL: test_f32_mul_y_sub_one_x_ninf:
; SI-NOFMA: ; %bb.0:
; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
@@ -1490,14 +1598,12 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out,
; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
; SI-NOFMA-NEXT: s_mov_b32 s4, s0
; SI-NOFMA-NEXT: s_mov_b32 s5, s1
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
-; SI-NOFMA-NEXT: v_sub_f32_e32 v0, -1.0, v0
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-NOFMA-NEXT: v_mad_f32 v0, -v0, v1, v1
; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NOFMA-NEXT: s_endpgm
;
-; SI-FMA-LABEL: test_f32_mul_sub_negone_x_y:
+; SI-FMA-LABEL: test_f32_mul_y_sub_one_x_ninf:
; SI-FMA: ; %bb.0:
; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
@@ -1515,42 +1621,76 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out,
; SI-FMA-NEXT: s_mov_b32 s4, s0
; SI-FMA-NEXT: s_mov_b32 s5, s1
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
-; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, -v1
+; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, v1
; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-FMA-NEXT: s_endpgm
;
-; GFX11-NOFMA-LABEL: test_f32_mul_sub_negone_x_y:
-; GFX11-NOFMA: ; %bb.0:
-; GFX11-NOFMA-NEXT: s_clause 0x1
-; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOFMA-NEXT: s_clause 0x1
-; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, -1.0, v1
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2
-; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NOFMA-NEXT: s_endpgm
+; GFX11-LABEL: test_f32_mul_y_sub_one_x_ninf:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_fma_f32 v1, -v1, v2, v2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
+ ptr addrspace(1) %in1,
+ ptr addrspace(1) %in2) {
+ %x = load float, ptr addrspace(1) %in1
+ %y = load float, ptr addrspace(1) %in2
+ %s = fsub contract ninf float 1.0, %x
+ %m = fmul contract ninf float %y, %s
+ store float %m, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out,
+; SI-LABEL: test_f32_mul_sub_negone_x_y:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s14, s6
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
+; SI-NEXT: s_mov_b32 s15, s7
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_sub_f32_e32 v0, -1.0, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
;
-; GFX11-FMA-LABEL: test_f32_mul_sub_negone_x_y:
-; GFX11-FMA: ; %bb.0:
-; GFX11-FMA-NEXT: s_clause 0x1
-; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FMA-NEXT: s_clause 0x1
-; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, -v2
-; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-FMA-NEXT: s_endpgm
+; GFX11-LABEL: test_f32_mul_sub_negone_x_y:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_sub_f32_e32 v1, -1.0, v1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
@@ -1561,8 +1701,112 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out,
ret void
}
+define amdgpu_kernel void @test_f32_mul_sub_negone_x_y_ninf(ptr addrspace(1) %out,
+; SI-LABEL: test_f32_mul_sub_negone_x_y_ninf:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s14, s6
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
+; SI-NEXT: s_mov_b32 s15, s7
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_sub_f32_e32 v0, -1.0, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_f32_mul_sub_negone_x_y_ninf:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_sub_f32_e32 v1, -1.0, v1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
+ ptr addrspace(1) %in1,
+ ptr addrspace(1) %in2) {
+ %x = load float, ptr addrspace(1) %in1
+ %y = load float, ptr addrspace(1) %in2
+ %s = fsub contract ninf float -1.0, %x
+ %m = fmul contract ninf float %s, %y
+ store float %m, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out,
-; SI-NOFMA-LABEL: test_f32_mul_y_sub_negone_x:
+; SI-LABEL: test_f32_mul_y_sub_negone_x:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s14, s6
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
+; SI-NEXT: s_mov_b32 s15, s7
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_sub_f32_e32 v0, -1.0, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, v1, v0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_f32_mul_y_sub_negone_x:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_sub_f32_e32 v1, -1.0, v1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
+ ptr addrspace(1) %in1,
+ ptr addrspace(1) %in2) {
+ %x = load float, ptr addrspace(1) %in1
+ %y = load float, ptr addrspace(1) %in2
+ %s = fsub contract float -1.0, %x
+ %m = fmul contract float %y, %s
+ store float %m, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_f32_mul_y_sub_negone_x_ninf(ptr addrspace(1) %out,
+; SI-NOFMA-LABEL: test_f32_mul_y_sub_negone_x_ninf:
; SI-NOFMA: ; %bb.0:
; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
@@ -1579,14 +1823,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out,
; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
; SI-NOFMA-NEXT: s_mov_b32 s4, s0
; SI-NOFMA-NEXT: s_mov_b32 s5, s1
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
-; SI-NOFMA-NEXT: v_sub_f32_e32 v0, -1.0, v0
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0
+; SI-NOFMA-NEXT: v_mad_f32 v0, -v0, v1, -v1
; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NOFMA-NEXT: s_endpgm
;
-; SI-FMA-LABEL: test_f32_mul_y_sub_negone_x:
+; SI-FMA-LABEL: test_f32_mul_y_sub_negone_x_ninf:
; SI-FMA: ; %bb.0:
; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
@@ -1608,50 +1850,84 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out,
; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-FMA-NEXT: s_endpgm
;
-; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_negone_x:
-; GFX11-NOFMA: ; %bb.0:
-; GFX11-NOFMA-NEXT: s_clause 0x1
-; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOFMA-NEXT: s_clause 0x1
-; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, -1.0, v1
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1
-; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NOFMA-NEXT: s_endpgm
-;
-; GFX11-FMA-LABEL: test_f32_mul_y_sub_negone_x:
-; GFX11-FMA: ; %bb.0:
-; GFX11-FMA-NEXT: s_clause 0x1
-; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FMA-NEXT: s_clause 0x1
-; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, -v2
-; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-FMA-NEXT: s_endpgm
+; GFX11-LABEL: test_f32_mul_y_sub_negone_x_ninf:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_fma_f32 v1, -v1, v2, -v2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
%y = load float, ptr addrspace(1) %in2
- %s = fsub contract float -1.0, %x
- %m = fmul contract float %y, %s
+ %s = fsub contract ninf float -1.0, %x
+ %m = fmul contract ninf float %y, %s
store float %m, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out,
-; SI-NOFMA-LABEL: test_f32_mul_sub_x_one_y:
+; SI-LABEL: test_f32_mul_sub_x_one_y:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s14, s6
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
+; SI-NEXT: s_mov_b32 s15, s7
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_add_f32_e32 v0, -1.0, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_f32_mul_sub_x_one_y:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_add_f32_e32 v1, -1.0, v1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
+ ptr addrspace(1) %in1,
+ ptr addrspace(1) %in2) {
+ %x = load float, ptr addrspace(1) %in1
+ %y = load float, ptr addrspace(1) %in2
+ %s = fsub contract float %x, 1.0
+ %m = fmul contract float %s, %y
+ store float %m, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_f32_mul_sub_x_one_y_ninf(ptr addrspace(1) %out,
+; SI-NOFMA-LABEL: test_f32_mul_sub_x_one_y_ninf:
; SI-NOFMA: ; %bb.0:
; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
@@ -1668,14 +1944,12 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out,
; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
; SI-NOFMA-NEXT: s_mov_b32 s4, s0
; SI-NOFMA-NEXT: s_mov_b32 s5, s1
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
-; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-NOFMA-NEXT: v_mad_f32 v0, v0, v1, -v1
; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NOFMA-NEXT: s_endpgm
;
-; SI-FMA-LABEL: test_f32_mul_sub_x_one_y:
+; SI-FMA-LABEL: test_f32_mul_sub_x_one_y_ninf:
; SI-FMA: ; %bb.0:
; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
@@ -1697,50 +1971,84 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out,
; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-FMA-NEXT: s_endpgm
;
-; GFX11-NOFMA-LABEL: test_f32_mul_sub_x_one_y:
-; GFX11-NOFMA: ; %bb.0:
-; GFX11-NOFMA-NEXT: s_clause 0x1
-; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOFMA-NEXT: s_clause 0x1
-; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2
-; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NOFMA-NEXT: s_endpgm
-;
-; GFX11-FMA-LABEL: test_f32_mul_sub_x_one_y:
-; GFX11-FMA: ; %bb.0:
-; GFX11-FMA-NEXT: s_clause 0x1
-; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FMA-NEXT: s_clause 0x1
-; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2
-; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-FMA-NEXT: s_endpgm
+; GFX11-LABEL: test_f32_mul_sub_x_one_y_ninf:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_fma_f32 v1, v1, v2, -v2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
%y = load float, ptr addrspace(1) %in2
- %s = fsub contract float %x, 1.0
- %m = fmul contract float %s, %y
+ %s = fsub contract ninf float %x, 1.0
+ %m = fmul contract ninf float %s, %y
store float %m, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out,
-; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_one:
+; SI-LABEL: test_f32_mul_y_sub_x_one:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s14, s6
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
+; SI-NEXT: s_mov_b32 s15, s7
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_add_f32_e32 v0, -1.0, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, v1, v0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_f32_mul_y_sub_x_one:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_add_f32_e32 v1, -1.0, v1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
+ ptr addrspace(1) %in1,
+ ptr addrspace(1) %in2) {
+ %x = load float, ptr addrspace(1) %in1
+ %y = load float, ptr addrspace(1) %in2
+ %s = fsub contract float %x, 1.0
+ %m = fmul contract float %y, %s
+ store float %m, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_f32_mul_y_sub_x_one_ninf(ptr addrspace(1) %out,
+; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_one_ninf:
; SI-NOFMA: ; %bb.0:
; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
@@ -1757,14 +2065,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out,
; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
; SI-NOFMA-NEXT: s_mov_b32 s4, s0
; SI-NOFMA-NEXT: s_mov_b32 s5, s1
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
-; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0
+; SI-NOFMA-NEXT: v_mad_f32 v0, v0, v1, -v1
; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NOFMA-NEXT: s_endpgm
;
-; SI-FMA-LABEL: test_f32_mul_y_sub_x_one:
+; SI-FMA-LABEL: test_f32_mul_y_sub_x_one_ninf:
; SI-FMA: ; %bb.0:
; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
@@ -1786,50 +2092,84 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out,
; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-FMA-NEXT: s_endpgm
;
-; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_x_one:
-; GFX11-NOFMA: ; %bb.0:
-; GFX11-NOFMA-NEXT: s_clause 0x1
-; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOFMA-NEXT: s_clause 0x1
-; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1
-; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NOFMA-NEXT: s_endpgm
-;
-; GFX11-FMA-LABEL: test_f32_mul_y_sub_x_one:
-; GFX11-FMA: ; %bb.0:
-; GFX11-FMA-NEXT: s_clause 0x1
-; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FMA-NEXT: s_clause 0x1
-; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2
-; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-FMA-NEXT: s_endpgm
+; GFX11-LABEL: test_f32_mul_y_sub_x_one_ninf:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_fma_f32 v1, v1, v2, -v2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
%y = load float, ptr addrspace(1) %in2
- %s = fsub contract float %x, 1.0
- %m = fmul contract float %y, %s
+ %s = fsub contract ninf float %x, 1.0
+ %m = fmul contract ninf float %y, %s
store float %m, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out,
-; SI-NOFMA-LABEL: test_f32_mul_sub_x_negone_y:
+; SI-LABEL: test_f32_mul_sub_x_negone_y:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s14, s6
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
+; SI-NEXT: s_mov_b32 s15, s7
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_f32_mul_sub_x_negone_y:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
+ ptr addrspace(1) %in1,
+ ptr addrspace(1) %in2) {
+ %x = load float, ptr addrspace(1) %in1
+ %y = load float, ptr addrspace(1) %in2
+ %s = fsub contract float %x, -1.0
+ %m = fmul contract float %s, %y
+ store float %m, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_f32_mul_sub_x_negone_y_ninf(ptr addrspace(1) %out,
+; SI-NOFMA-LABEL: test_f32_mul_sub_x_negone_y_ninf:
; SI-NOFMA: ; %bb.0:
; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
@@ -1846,14 +2186,12 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out,
; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
; SI-NOFMA-NEXT: s_mov_b32 s4, s0
; SI-NOFMA-NEXT: s_mov_b32 s5, s1
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
-; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1
-; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NOFMA-NEXT: v_mac_f32_e32 v1, v0, v1
+; SI-NOFMA-NEXT: buffer_store_dword v1, off, s[4:7], 0
; SI-NOFMA-NEXT: s_endpgm
;
-; SI-FMA-LABEL: test_f32_mul_sub_x_negone_y:
+; SI-FMA-LABEL: test_f32_mul_sub_x_negone_y_ninf:
; SI-FMA: ; %bb.0:
; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
@@ -1875,50 +2213,84 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out,
; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-FMA-NEXT: s_endpgm
;
-; GFX11-NOFMA-LABEL: test_f32_mul_sub_x_negone_y:
-; GFX11-NOFMA: ; %bb.0:
-; GFX11-NOFMA-NEXT: s_clause 0x1
-; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOFMA-NEXT: s_clause 0x1
-; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2
-; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NOFMA-NEXT: s_endpgm
+; GFX11-LABEL: test_f32_mul_sub_x_negone_y_ninf:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_fmac_f32_e32 v2, v1, v2
+; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NEXT: s_endpgm
+ ptr addrspace(1) %in1,
+ ptr addrspace(1) %in2) {
+ %x = load float, ptr addrspace(1) %in1
+ %y = load float, ptr addrspace(1) %in2
+ %s = fsub contract ninf float %x, -1.0
+ %m = fmul contract ninf float %s, %y
+ store float %m, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out,
+; SI-LABEL: test_f32_mul_y_sub_x_negone:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s14, s6
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
+; SI-NEXT: s_mov_b32 s15, s7
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, v1, v0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
;
-; GFX11-FMA-LABEL: test_f32_mul_sub_x_negone_y:
-; GFX11-FMA: ; %bb.0:
-; GFX11-FMA-NEXT: s_clause 0x1
-; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FMA-NEXT: s_clause 0x1
-; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2
-; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[0:1]
-; GFX11-FMA-NEXT: s_endpgm
+; GFX11-LABEL: test_f32_mul_y_sub_x_negone:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
%y = load float, ptr addrspace(1) %in2
%s = fsub contract float %x, -1.0
- %m = fmul contract float %s, %y
+ %m = fmul contract float %y, %s
store float %m, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out,
-; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_negone:
+define amdgpu_kernel void @test_f32_mul_y_sub_x_negone_ninf(ptr addrspace(1) %out,
+; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_negone_ninf:
; SI-NOFMA: ; %bb.0:
; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
@@ -1935,14 +2307,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out,
; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
; SI-NOFMA-NEXT: s_mov_b32 s4, s0
; SI-NOFMA-NEXT: s_mov_b32 s5, s1
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
-; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0
-; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NOFMA-NEXT: v_mac_f32_e32 v1, v0, v1
+; SI-NOFMA-NEXT: buffer_store_dword v1, off, s[4:7], 0
; SI-NOFMA-NEXT: s_endpgm
;
-; SI-FMA-LABEL: test_f32_mul_y_sub_x_negone:
+; SI-FMA-LABEL: test_f32_mul_y_sub_x_negone_ninf:
; SI-FMA: ; %bb.0:
; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
@@ -1964,44 +2334,26 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out,
; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-FMA-NEXT: s_endpgm
;
-; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_x_negone:
-; GFX11-NOFMA: ; %bb.0:
-; GFX11-NOFMA-NEXT: s_clause 0x1
-; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOFMA-NEXT: s_clause 0x1
-; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1
-; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NOFMA-NEXT: s_endpgm
-;
-; GFX11-FMA-LABEL: test_f32_mul_y_sub_x_negone:
-; GFX11-FMA: ; %bb.0:
-; GFX11-FMA-NEXT: s_clause 0x1
-; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FMA-NEXT: s_clause 0x1
-; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2
-; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[0:1]
-; GFX11-FMA-NEXT: s_endpgm
+; GFX11-LABEL: test_f32_mul_y_sub_x_negone_ninf:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_fmac_f32_e32 v2, v1, v2
+; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
%y = load float, ptr addrspace(1) %in2
- %s = fsub contract float %x, -1.0
- %m = fmul contract float %y, %s
+ %s = fsub contract ninf float %x, -1.0
+ %m = fmul contract ninf float %y, %s
store float %m, ptr addrspace(1) %out
ret void
}
@@ -2048,66 +2400,51 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out,
; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-FMA-NEXT: s_mov_b32 s11, 0xf000
; SI-FMA-NEXT: s_mov_b32 s10, -1
-; SI-FMA-NEXT: s_mov_b32 s18, s10
-; SI-FMA-NEXT: s_mov_b32 s19, s11
+; SI-FMA-NEXT: s_mov_b32 s14, s10
+; SI-FMA-NEXT: s_mov_b32 s15, s11
; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
; SI-FMA-NEXT: s_mov_b32 s16, s4
; SI-FMA-NEXT: s_mov_b32 s17, s5
-; SI-FMA-NEXT: s_mov_b32 s14, s10
-; SI-FMA-NEXT: s_mov_b32 s12, s2
-; SI-FMA-NEXT: s_mov_b32 s13, s3
-; SI-FMA-NEXT: s_mov_b32 s15, s11
; SI-FMA-NEXT: s_mov_b32 s4, s6
; SI-FMA-NEXT: s_mov_b32 s5, s7
; SI-FMA-NEXT: s_mov_b32 s6, s10
; SI-FMA-NEXT: s_mov_b32 s7, s11
-; SI-FMA-NEXT: buffer_load_dword v0, off, s[16:19], 0
-; SI-FMA-NEXT: buffer_load_dword v1, off, s[4:7], 0
+; SI-FMA-NEXT: s_mov_b32 s12, s2
+; SI-FMA-NEXT: s_mov_b32 s13, s3
+; SI-FMA-NEXT: s_mov_b32 s18, s10
+; SI-FMA-NEXT: s_mov_b32 s19, s11
+; SI-FMA-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; SI-FMA-NEXT: buffer_load_dword v1, off, s[16:19], 0
; SI-FMA-NEXT: buffer_load_dword v2, off, s[12:15], 0
; SI-FMA-NEXT: s_mov_b32 s8, s0
; SI-FMA-NEXT: s_mov_b32 s9, s1
+; SI-FMA-NEXT: s_waitcnt vmcnt(2)
+; SI-FMA-NEXT: v_sub_f32_e32 v3, 1.0, v0
; SI-FMA-NEXT: s_waitcnt vmcnt(1)
-; SI-FMA-NEXT: v_fma_f32 v0, -v1, v0, v0
+; SI-FMA-NEXT: v_mul_f32_e32 v1, v1, v3
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
-; SI-FMA-NEXT: v_fma_f32 v0, v2, v1, v0
+; SI-FMA-NEXT: v_fma_f32 v0, v2, v0, v1
; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-FMA-NEXT: s_endpgm
;
-; GFX11-NOFMA-LABEL: test_f32_interp:
-; GFX11-NOFMA: ; %bb.0:
-; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOFMA-NEXT: s_clause 0x2
-; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-NOFMA-NEXT: global_load_b32 v3, v0, s[2:3]
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NOFMA-NEXT: v_sub_f32_e32 v4, 1.0, v1
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NOFMA-NEXT: v_mul_f32_e32 v2, v2, v4
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: v_fmac_f32_e32 v2, v3, v1
-; GFX11-NOFMA-NEXT: global_store_b32 v0, v2, s[0:1]
-; GFX11-NOFMA-NEXT: s_endpgm
-;
-; GFX11-FMA-LABEL: test_f32_interp:
-; GFX11-FMA: ; %bb.0:
-; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FMA-NEXT: s_clause 0x2
-; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[4:5]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v3, v0, s[2:3]
-; GFX11-FMA-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FMA-NEXT: v_fma_f32 v1, -v2, v1, v1
-; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FMA-NEXT: v_fmac_f32_e32 v1, v3, v2
-; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-FMA-NEXT: s_endpgm
+; GFX11-LABEL: test_f32_interp:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v3, v0, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(2)
+; GFX11-NEXT: v_sub_f32_e32 v4, 1.0, v1
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v1
+; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2,
ptr addrspace(1) %in3) {
@@ -2123,6 +2460,66 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out,
}
define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out,
+; TAHITI-LABEL: test_f64_interp:
+; TAHITI: ; %bb.0:
+; TAHITI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; TAHITI-NEXT: s_mov_b32 s11, 0xf000
+; TAHITI-NEXT: s_mov_b32 s10, -1
+; TAHITI-NEXT: s_mov_b32 s18, s10
+; TAHITI-NEXT: s_mov_b32 s19, s11
+; TAHITI-NEXT: s_waitcnt lgkmcnt(0)
+; TAHITI-NEXT: s_mov_b32 s16, s4
+; TAHITI-NEXT: s_mov_b32 s17, s5
+; TAHITI-NEXT: s_mov_b32 s4, s6
+; TAHITI-NEXT: s_mov_b32 s5, s7
+; TAHITI-NEXT: s_mov_b32 s6, s10
+; TAHITI-NEXT: s_mov_b32 s7, s11
+; TAHITI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; TAHITI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0
+; TAHITI-NEXT: s_mov_b32 s14, s10
+; TAHITI-NEXT: s_mov_b32 s12, s2
+; TAHITI-NEXT: s_mov_b32 s13, s3
+; TAHITI-NEXT: s_mov_b32 s15, s11
+; TAHITI-NEXT: buffer_load_dwordx2 v[4:5], off, s[12:15], 0
+; TAHITI-NEXT: s_mov_b32 s8, s0
+; TAHITI-NEXT: s_mov_b32 s9, s1
+; TAHITI-NEXT: s_waitcnt vmcnt(2)
+; TAHITI-NEXT: v_add_f64 v[6:7], -v[0:1], 1.0
+; TAHITI-NEXT: s_waitcnt vmcnt(1)
+; TAHITI-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7]
+; TAHITI-NEXT: s_waitcnt vmcnt(0)
+; TAHITI-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
+; TAHITI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; TAHITI-NEXT: s_endpgm
+;
+; VERDE-LABEL: test_f64_interp:
+; VERDE: ; %bb.0:
+; VERDE-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; VERDE-NEXT: s_mov_b32 s11, 0xf000
+; VERDE-NEXT: s_mov_b32 s10, -1
+; VERDE-NEXT: s_mov_b32 s14, s10
+; VERDE-NEXT: s_mov_b32 s15, s11
+; VERDE-NEXT: s_waitcnt lgkmcnt(0)
+; VERDE-NEXT: s_mov_b32 s12, s6
+; VERDE-NEXT: s_mov_b32 s13, s7
+; VERDE-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; VERDE-NEXT: s_mov_b32 s6, s10
+; VERDE-NEXT: s_mov_b32 s7, s11
+; VERDE-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; VERDE-NEXT: s_mov_b32 s4, s2
+; VERDE-NEXT: s_mov_b32 s5, s3
+; VERDE-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0
+; VERDE-NEXT: s_mov_b32 s8, s0
+; VERDE-NEXT: s_mov_b32 s9, s1
+; VERDE-NEXT: s_waitcnt vmcnt(2)
+; VERDE-NEXT: v_add_f64 v[6:7], -v[0:1], 1.0
+; VERDE-NEXT: s_waitcnt vmcnt(1)
+; VERDE-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7]
+; VERDE-NEXT: s_waitcnt vmcnt(0)
+; VERDE-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
+; VERDE-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; VERDE-NEXT: s_endpgm
+;
; SI-FMA-LABEL: test_f64_interp:
; SI-FMA: ; %bb.0:
; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -2137,8 +2534,8 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out,
; SI-FMA-NEXT: s_mov_b32 s5, s7
; SI-FMA-NEXT: s_mov_b32 s6, s10
; SI-FMA-NEXT: s_mov_b32 s7, s11
-; SI-FMA-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
-; SI-FMA-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; SI-FMA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-FMA-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0
; SI-FMA-NEXT: s_mov_b32 s14, s10
; SI-FMA-NEXT: s_mov_b32 s12, s2
; SI-FMA-NEXT: s_mov_b32 s13, s3
@@ -2146,48 +2543,33 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out,
; SI-FMA-NEXT: buffer_load_dwordx2 v[4:5], off, s[12:15], 0
; SI-FMA-NEXT: s_mov_b32 s8, s0
; SI-FMA-NEXT: s_mov_b32 s9, s1
+; SI-FMA-NEXT: s_waitcnt vmcnt(2)
+; SI-FMA-NEXT: v_add_f64 v[6:7], -v[0:1], 1.0
; SI-FMA-NEXT: s_waitcnt vmcnt(1)
-; SI-FMA-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], v[0:1]
+; SI-FMA-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7]
; SI-FMA-NEXT: s_waitcnt vmcnt(0)
-; SI-FMA-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; SI-FMA-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
; SI-FMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-FMA-NEXT: s_endpgm
;
-; GFX11-NOFMA-LABEL: test_f64_interp:
-; GFX11-NOFMA: ; %bb.0:
-; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NOFMA-NEXT: v_mov_b32_e32 v8, 0
-; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOFMA-NEXT: s_clause 0x2
-; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v8, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v8, s[4:5]
-; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v8, s[2:3]
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NOFMA-NEXT: v_add_f64 v[6:7], -v[0:1], 1.0
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NOFMA-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7]
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
-; GFX11-NOFMA-NEXT: global_store_b64 v8, v[0:1], s[0:1]
-; GFX11-NOFMA-NEXT: s_endpgm
-;
-; GFX11-FMA-LABEL: test_f64_interp:
-; GFX11-FMA: ; %bb.0:
-; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-FMA-NEXT: v_mov_b32_e32 v6, 0
-; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FMA-NEXT: s_clause 0x2
-; GFX11-FMA-NEXT: global_load_b64 v[0:1], v6, s[4:5]
-; GFX11-FMA-NEXT: global_load_b64 v[2:3], v6, s[6:7]
-; GFX11-FMA-NEXT: global_load_b64 v[4:5], v6, s[2:3]
-; GFX11-FMA-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FMA-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], v[0:1]
-; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FMA-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1]
-; GFX11-FMA-NEXT: global_store_b64 v6, v[0:1], s[0:1]
-; GFX11-FMA-NEXT: s_endpgm
+; GFX11-LABEL: test_f64_interp:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v8, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: global_load_b64 v[0:1], v8, s[6:7]
+; GFX11-NEXT: global_load_b64 v[2:3], v8, s[4:5]
+; GFX11-NEXT: global_load_b64 v[4:5], v8, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(2)
+; GFX11-NEXT: v_add_f64 v[6:7], -v[0:1], 1.0
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
+; GFX11-NEXT: global_store_b64 v8, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2,
ptr addrspace(1) %in3) {
@@ -2356,3 +2738,6 @@ define amdgpu_kernel void @fma_neg_b_c_v4f32(ptr addrspace(1) %out, ptr addrspac
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #2 = { nounwind "no-signed-zeros-fp-math"="true" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11-FMA: {{.*}}
+; GFX11-NOFMA: {{.*}}