; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-REAL16 %s

; Test that tree-structured min/max reductions form min3/max3 efficiently.
; The key pattern is op(op(a,b), op(c,d)) which should become
; op(op3(a,b,c), d) to enable further combining at higher tree levels.
;
; Every function below is compiled three times: once for gfx900 and twice for
; gfx1250 (with and without real-true16). The two gfx1250 runs share one set of
; checks unless a function's output differs between them.

; Basic 4-value tree: maxnum f32
; Expected on both targets: one three-operand max over (a, b, c), then a
; two-operand max that brings in d. d is first passed through a self-max
; (max d, d), presumably to canonicalize/quiet it.
define float @v_max3_maxnum_tree4_f32(float %a, float %b, float %c, float %d) {
; GFX9-LABEL: v_max3_maxnum_tree4_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2
; GFX9-NEXT: v_max_f32_e32 v1, v3, v3
; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_max3_maxnum_tree4_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v2
; GFX1250-NEXT: v_max_num_f32_e32 v1, v3, v3
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v1
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
  %max.ab = call float @llvm.maxnum.f32(float %a, float %b)
  %max.cd = call float @llvm.maxnum.f32(float %c, float %d)
  %result = call float @llvm.maxnum.f32(float %max.ab, float %max.cd)
  ret float %result
}

; 8-value tree: maxnum f32
; Expected: max3(a, b, c) plus a max with d for the left half, max3(e, f, g)
; for the right half, then a final max3 joining both halves with h.
define float @v_max3_maxnum_tree8_f32(float %a, float %b, float %c, float %d,
; GFX9-LABEL: v_max3_maxnum_tree8_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2
; GFX9-NEXT: v_max_f32_e32 v1, v3, v3
; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
; GFX9-NEXT: v_max3_f32 v1, v4, v5, v6
; GFX9-NEXT: v_max3_f32 v0, v0, v1, v7
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_max3_maxnum_tree8_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v2
; GFX1250-NEXT: v_max_num_f32_e32 v1, v3, v3
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v1
; GFX1250-NEXT: v_max3_num_f32 v1, v4, v5, v6
; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v7
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
                                      float %e, float %f, float %g, float %h) {
  ; Level 1: four pairwise maxima.
  %ab = call float @llvm.maxnum.f32(float %a, float %b)
  %cd = call float @llvm.maxnum.f32(float %c, float %d)
  %ef = call float @llvm.maxnum.f32(float %e, float %f)
  %gh = call float @llvm.maxnum.f32(float %g, float %h)
  ; Level 2 and root.
  %abcd = call float @llvm.maxnum.f32(float %ab, float %cd)
  %efgh = call float @llvm.maxnum.f32(float %ef, float %gh)
  %result = call float @llvm.maxnum.f32(float %abcd, float %efgh)
  ret float %result
}

; Basic 4-value tree: maximum f32 (IEEE 2019)
; The gfx900 checks contain no three-operand form: each llvm.maximum becomes a
; v_max plus an ordered compare and a select against the constant 0x7fc00000.
; The gfx1250 checks use v_maximum3_f32 on (a, b, c) followed by v_maximum_f32
; with d.
define float @v_maximum3_tree4_f32(float %a, float %b, float %c, float %d) {
; GFX9-LABEL: v_maximum3_tree4_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v4, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_maximum3_tree4_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_maximum3_f32 v0, v0, v1, v2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_maximum_f32 v0, v0, v3
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
  %max.ab = call float @llvm.maximum.f32(float %a, float %b)
  %max.cd = call float @llvm.maximum.f32(float %c, float %d)
  %result = call float @llvm.maximum.f32(float %max.ab, float %max.cd)
  ret float %result
}

; 8-value tree: maximum f32 (IEEE 2019)
; gfx1250: maximum3(a, b, c), maximum3(e, f, g), a maximum with d, then a final
; maximum3 joining the two halves with h. gfx900 keeps seven separate
; max/compare/select groups, one per llvm.maximum call.
define float @v_maximum3_tree8_f32(float %a, float %b, float %c, float %d,
; GFX9-LABEL: v_maximum3_tree8_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v8, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fc00000
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
; GFX9-NEXT: v_max_f32_e32 v2, v4, v5
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v5
; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
; GFX9-NEXT: v_max_f32_e32 v3, v6, v7
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v7
; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
; GFX9-NEXT: v_max_f32_e32 v4, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v4, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_maximum3_tree8_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_maximum3_f32 v0, v0, v1, v2
; GFX1250-NEXT: v_maximum3_f32 v1, v4, v5, v6
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_maximum_f32 v0, v0, v3
; GFX1250-NEXT: v_maximum3_f32 v0, v0, v1, v7
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
                                   float %e, float %f, float %g, float %h) {
  ; Level 1: four pairwise maxima.
  %ab = call float @llvm.maximum.f32(float %a, float %b)
  %cd = call float @llvm.maximum.f32(float %c, float %d)
  %ef = call float @llvm.maximum.f32(float %e, float %f)
  %gh = call float @llvm.maximum.f32(float %g, float %h)
  ; Level 2 and root.
  %abcd = call float @llvm.maximum.f32(float %ab, float %cd)
  %efgh = call float @llvm.maximum.f32(float %ef, float %gh)
  %result = call float @llvm.maximum.f32(float %abcd, float %efgh)
  ret float %result
}

; Basic 4-value tree: minimum f32 (IEEE 2019)
; Mirror of the maximum 4-value test: gfx900 expands each call to
; min/compare/select, gfx1250 uses v_minimum3_f32 followed by v_minimum_f32.
define float @v_minimum3_tree4_f32(float %a, float %b, float %c, float %d) {
; GFX9-LABEL: v_minimum3_tree4_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_min_f32_e32 v4, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
; GFX9-NEXT: v_min_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX9-NEXT: v_min_f32_e32 v2, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_minimum3_tree4_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_minimum3_f32 v0, v0, v1, v2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_minimum_f32 v0, v0, v3
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
  %min.ab = call float @llvm.minimum.f32(float %a, float %b)
  %min.cd = call float @llvm.minimum.f32(float %c, float %d)
  %result = call float @llvm.minimum.f32(float %min.ab, float %min.cd)
  ret float %result
}

; Basic 4-value tree: minnum f32
; Same shape as the maxnum 4-value test with min3/min. Note that d is still
; passed through a self-max (not a min) before the final min.
define float @v_min3_minnum_tree4_f32(float %a, float %b, float %c, float %d) {
; GFX9-LABEL: v_min3_minnum_tree4_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_min3_f32 v0, v0, v1, v2
; GFX9-NEXT: v_max_f32_e32 v1, v3, v3
; GFX9-NEXT: v_min_f32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_min3_minnum_tree4_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_min3_num_f32 v0, v0, v1, v2
; GFX1250-NEXT: v_max_num_f32_e32 v1, v3, v3
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_min_num_f32_e32 v0, v0, v1
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
  %min.ab = call float @llvm.minnum.f32(float %a, float %b)
  %min.cd = call float @llvm.minnum.f32(float %c, float %d)
  %result = call float @llvm.minnum.f32(float %min.ab, float %min.cd)
  ret float %result
}

; 16-value tree: maximum f32, tests 3 levels of deferral
; The gfx1250 checks cover the fifteen llvm.maximum calls with six
; v_maximum3_f32 and three v_maximum_f32 instructions. gfx900 keeps one
; max/compare/select group per call.
define float @v_maximum3_tree16_f32(float %a, float %b, float %c, float %d,
; GFX9-LABEL: v_maximum3_tree16_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v16, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v17, 0x7fc00000
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
; GFX9-NEXT: v_max_f32_e32 v2, v4, v5
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v5
; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v2, vcc
; GFX9-NEXT: v_max_f32_e32 v3, v6, v7
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v7
; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc
; GFX9-NEXT: v_max_f32_e32 v4, v8, v9
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v9
; GFX9-NEXT: v_cndmask_b32_e32 v4, v17, v4, vcc
; GFX9-NEXT: v_max_f32_e32 v5, v10, v11
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v10, v11
; GFX9-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc
; GFX9-NEXT: v_max_f32_e32 v6, v12, v13
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v12, v13
; GFX9-NEXT: v_cndmask_b32_e32 v6, v17, v6, vcc
; GFX9-NEXT: v_max_f32_e32 v7, v14, v15
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v14, v15
; GFX9-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc
; GFX9-NEXT: v_max_f32_e32 v8, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v8, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
; GFX9-NEXT: v_max_f32_e32 v2, v4, v5
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v5
; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v2, vcc
; GFX9-NEXT: v_max_f32_e32 v3, v6, v7
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v7
; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc
; GFX9-NEXT: v_max_f32_e32 v4, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v4, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v2, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_maximum3_tree16_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_maximum3_f32 v0, v0, v1, v2
; GFX1250-NEXT: v_maximum3_f32 v1, v8, v9, v10
; GFX1250-NEXT: v_maximum3_f32 v2, v4, v5, v6
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_maximum_f32 v0, v0, v3
; GFX1250-NEXT: v_maximum_f32 v1, v1, v11
; GFX1250-NEXT: v_maximum3_f32 v3, v12, v13, v14
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_maximum3_f32 v0, v0, v2, v7
; GFX1250-NEXT: v_maximum3_f32 v1, v1, v3, v15
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_maximum_f32 v0, v0, v1
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
                                    float %e, float %f, float %g, float %h,
                                    float %i, float %j, float %k, float %l,
                                    float %m, float %n, float %o, float %p) {
  ; Level 1: eight pairwise maxima.
  %ab = call float @llvm.maximum.f32(float %a, float %b)
  %cd = call float @llvm.maximum.f32(float %c, float %d)
  %ef = call float @llvm.maximum.f32(float %e, float %f)
  %gh = call float @llvm.maximum.f32(float %g, float %h)
  %ij = call float @llvm.maximum.f32(float %i, float %j)
  %kl = call float @llvm.maximum.f32(float %k, float %l)
  %mn = call float @llvm.maximum.f32(float %m, float %n)
  %op = call float @llvm.maximum.f32(float %o, float %p)
  ; Level 2.
  %abcd = call float @llvm.maximum.f32(float %ab, float %cd)
  %efgh = call float @llvm.maximum.f32(float %ef, float %gh)
  %ijkl = call float @llvm.maximum.f32(float %ij, float %kl)
  %mnop = call float @llvm.maximum.f32(float %mn, float %op)
  ; Level 3 and root.
  %abcdefgh = call float @llvm.maximum.f32(float %abcd, float %efgh)
  %ijklmnop = call float @llvm.maximum.f32(float %ijkl, float %mnop)
  %result = call float @llvm.maximum.f32(float %abcdefgh, float %ijklmnop)
  ret float %result
}

; Unbalanced tree: left side is tree, right side is leaf
; gfx1250: maximum3(a, b, c) followed by a second maximum3 that takes that
; result together with d and e, i.e. two instructions for the four calls.
define float @v_maximum3_tree_unbalanced_f32(float %a, float %b, float %c, float %d, float %e) {
; GFX9-LABEL: v_maximum3_tree_unbalanced_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v5, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v6, 0x7fc00000
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v0, v4
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4
; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_maximum3_tree_unbalanced_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_maximum3_f32 v0, v0, v1, v2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_maximum3_f32 v0, v0, v3, v4
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
  %ab = call float @llvm.maximum.f32(float %a, float %b)
  %cd = call float @llvm.maximum.f32(float %c, float %d)
  %abcd = call float @llvm.maximum.f32(float %ab, float %cd)
  ; The root combines the 4-value subtree with the single leaf %e.
  %result = call float @llvm.maximum.f32(float %abcd, float %e)
  ret float %result
}

; Multi-use: one side has multiple uses, should NOT trigger tree combine
; %max.cd feeds both the final maxnum and the store to %out, so it has to stay
; available as its own value. The expected code computes max(c, d) once, uses
; it as the third operand of a max3 with a and b, and stores that same register.
define float @v_max3_maxnum_tree4_multi_use(float %a, float %b, float %c, float %d, ptr addrspace(1) %out) {
; GFX9-LABEL: v_max3_maxnum_tree4_multi_use:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: v_max_f32_e32 v2, v2, v3
; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2
; GFX9-NEXT: global_store_dword v[4:5], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_max3_maxnum_tree4_multi_use:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v2, v2, v2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_max_num_f32_e32 v2, v2, v3
; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v2
; GFX1250-NEXT: global_store_b32 v[4:5], v2, off
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
  %max.ab = call float @llvm.maxnum.f32(float %a, float %b)
  %max.cd = call float @llvm.maxnum.f32(float %c, float %d)
  %result = call float @llvm.maxnum.f32(float %max.ab, float %max.cd)
  ; Second use of %max.cd: this is what makes the right-hand side multi-use.
  store float %max.cd, ptr addrspace(1) %out
  ret float %result
}

; 8-value tree: left subtree single-use, right subtree multi-use.
; Left subtree should be tree-combined. Right subtree can't (multi-use),
; so existing combine absorbs it. Tests asymmetric deferral behavior.
; %efgh is stored to %out in addition to feeding the root maximum. In the
; gfx1250 checks it is materialized in its own register (maximum3 of e, f, g,
; then maximum with h), and that register is both the store source and the
; third operand of the final maximum3.
define float @v_maximum3_tree8_asymmetric_use(float %a, float %b, float %c, float %d,
; GFX9-LABEL: v_maximum3_tree8_asymmetric_use:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v10, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v11, 0x7fc00000
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v11, v10, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc
; GFX9-NEXT: v_max_f32_e32 v2, v4, v5
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v5
; GFX9-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
; GFX9-NEXT: v_max_f32_e32 v3, v6, v7
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v7
; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
; GFX9-NEXT: v_max_f32_e32 v4, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v11, v4, vcc
; GFX9-NEXT: v_max_f32_e32 v1, v2, v3
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc
; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v11, v2, vcc
; GFX9-NEXT: global_store_dword v[8:9], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_maximum3_tree8_asymmetric_use:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_maximum3_f32 v4, v4, v5, v6
; GFX1250-NEXT: v_maximum3_f32 v0, v0, v1, v2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_maximum_f32 v1, v4, v7
; GFX1250-NEXT: v_maximum3_f32 v0, v0, v3, v1
; GFX1250-NEXT: global_store_b32 v[8:9], v1, off
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
                                              float %e, float %f, float %g, float %h,
                                              ptr addrspace(1) %out) {
  ; Level 1: four pairwise maxima.
  %ab = call float @llvm.maximum.f32(float %a, float %b)
  %cd = call float @llvm.maximum.f32(float %c, float %d)
  %ef = call float @llvm.maximum.f32(float %e, float %f)
  %gh = call float @llvm.maximum.f32(float %g, float %h)
  ; Level 2 and root.
  %abcd = call float @llvm.maximum.f32(float %ab, float %cd)
  %efgh = call float @llvm.maximum.f32(float %ef, float %gh)
  %result = call float @llvm.maximum.f32(float %abcd, float %efgh)
  ; Second use of %efgh: only the right subtree is multi-use.
  store float %efgh, ptr addrspace(1) %out
  ret float %result
}

; Basic 4-value tree: maxnum f16
; All three configurations select a three-operand f16 max over (a, b, c)
; followed by a two-operand max with d. The two gfx1250 runs need separate
; checks here: with real-true16 the operands are 16-bit register halves
; (v0.l / v0.h), otherwise whole registers are named.
define half @v_max3_maxnum_tree4_f16(half %a, half %b, half %c, half %d) {
; GFX9-LABEL: v_max3_maxnum_tree4_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max3_f16 v0, v0, v1, v2
; GFX9-NEXT: v_max_f16_e32 v1, v3, v3
; GFX9-NEXT: v_max_f16_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-FAKE16-LABEL: v_max3_maxnum_tree4_f16:
; GFX1250-FAKE16: ; %bb.0:
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-FAKE16-NEXT: v_max3_num_f16 v0, v0, v1, v2
; GFX1250-FAKE16-NEXT: v_max_num_f16_e32 v1, v3, v3
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v1
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
;
; GFX1250-REAL16-LABEL: v_max3_maxnum_tree4_f16:
; GFX1250-REAL16: ; %bb.0:
; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0
; GFX1250-REAL16-NEXT: v_max3_num_f16 v0.l, v0.l, v1.l, v2.l
; GFX1250-REAL16-NEXT: v_max_num_f16_e32 v0.h, v3.l, v3.l
; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-REAL16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.h
; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31]
  %max.ab = call half @llvm.maxnum.f16(half %a, half %b)
  %max.cd = call half @llvm.maxnum.f16(half %c, half %d)
  %result = call half @llvm.maxnum.f16(half %max.ab, half %max.cd)
  ret half %result
}

; Negative test: f64 has no max3/min3 on any target yet, tree combine must not fire
; The checks contain only two-operand f64 max instructions: a self-max on each
; of the four inputs followed by the three pairwise maxima of the original tree.
define double @v_no_max3_maxnum_tree4_f64(double %a, double %b, double %c, double %d) {
; GFX9-LABEL: v_no_max3_maxnum_tree4_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX9-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[6:7]
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_no_max3_maxnum_tree4_f64:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX1250-NEXT: v_max_num_f64_e32 v[6:7], v[6:7], v[6:7]
; GFX1250-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[4:5]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[6:7]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
  %max.ab = call double @llvm.maxnum.f64(double %a, double %b)
  %max.cd = call double @llvm.maxnum.f64(double %c, double %d)
  %result = call double @llvm.maxnum.f64(double %max.ab, double %max.cd)
  ret double %result
}

; Negative test: bf16 is promoted to f32 with conversions, tree combine cannot apply
; In the checks every maxnum level is an f32 max on operands shifted up by 16
; bits, with a conversion back toward bf16 between levels (what appears to be a
; manual rounding sequence on gfx900, v_cvt_pk_bf16_f32 on gfx1250).
define bfloat @v_no_max3_maxnum_tree4_bf16(bfloat %a, bfloat %b, bfloat %c, bfloat %d) {
; GFX9-LABEL: v_no_max3_maxnum_tree4_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_max_f32_e32 v1, v2, v1
; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_no_max3_maxnum_tree4_bf16:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v3, 16, v3
; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v2 :: v_dual_lshlrev_b32 v0, 16, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_max_num_f32 v2, v2, v3 :: v_dual_max_num_f32 v0, v0, v1
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, s0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v1
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
  %max.ab = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b)
  %max.cd = call bfloat @llvm.maxnum.bf16(bfloat %c, bfloat %d)
  %result = call bfloat @llvm.maxnum.bf16(bfloat %max.ab, bfloat %max.cd)
  ret bfloat %result
}

; Declarations of the min/max intrinsics called by the test functions in this file.
declare float @llvm.maxnum.f32(float, float)
declare float @llvm.minnum.f32(float, float)
declare float @llvm.maximum.f32(float, float)
declare float @llvm.minimum.f32(float, float)
declare half @llvm.maxnum.f16(half, half)
declare double @llvm.maxnum.f64(double, double)
declare bfloat @llvm.maxnum.bf16(bfloat, bfloat)