diff options
author | Stanislav Mekhanoshin <rampitec@users.noreply.github.com> | 2024-05-07 14:20:13 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-05-07 14:20:13 -0700 |
commit | 2a3903fa0e88d7149df11aa37d4ba87c5e5f0913 (patch) | |
tree | 30e6a6622cf18c2e447f3262c8b480a407a7cd56 | |
parent | bc8a42762057d7036f6871211e62b1c3efb2738a (diff) | |
download | llvm-2a3903fa0e88d7149df11aa37d4ba87c5e5f0913.zip llvm-2a3903fa0e88d7149df11aa37d4ba87c5e5f0913.tar.gz llvm-2a3903fa0e88d7149df11aa37d4ba87c5e5f0913.tar.bz2 |
[AMDGPU] Prevent FMINIMUM and FMAXIMUM beeing fully scalarized (#91378)
This is the same logic as with FMINNUM_IEEE/FMAXNUM_IEEE.
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 8 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/fmaximum.ll | 39 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/fminimum.ll | 39 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 8 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 8 |
5 files changed, 63 insertions, 39 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index ed41c10..33bdd61 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -854,9 +854,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->hasPrefetch()) setOperationAction(ISD::PREFETCH, MVT::Other, Custom); - if (Subtarget->hasIEEEMinMax()) + if (Subtarget->hasIEEEMinMax()) { setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal); + setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM}, + {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}, + Custom); + } setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16, @@ -5821,6 +5825,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FMUL: case ISD::FMINNUM_IEEE: case ISD::FMAXNUM_IEEE: + case ISD::FMINIMUM: + case ISD::FMAXIMUM: case ISD::UADDSAT: case ISD::USUBSAT: case ISD::SADDSAT: diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll index dd685a6..87ac95a 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll @@ -148,23 +148,35 @@ define amdgpu_ps <2 x half> @test_fmaximum_v2f16_ss(<2 x half> inreg %a, <2 x ha } define amdgpu_ps <3 x half> @test_fmaximum_v3f16_vv(<3 x half> %a, <3 x half> %b) { -; GCN-LABEL: test_fmaximum_v3f16_vv: -; GCN: ; %bb.0: -; GCN-NEXT: v_pk_maximum_f16 v0, v0, v2 -; GCN-NEXT: v_maximum_f16 v1, v1, v3 -; GCN-NEXT: ; return to shader part epilog +; GFX12-SDAG-LABEL: test_fmaximum_v3f16_vv: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_pk_maximum_f16 v0, v0, v2 +; GFX12-SDAG-NEXT: v_pk_maximum_f16 v1, v1, v3 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: test_fmaximum_v3f16_vv: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_pk_maximum_f16 v0, v0, v2 +; GFX12-GISEL-NEXT: v_maximum_f16 v1, v1, v3 +; GFX12-GISEL-NEXT: ; return to shader part epilog %val = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b) ret <3 x half> %val } define amdgpu_ps <3 x half> @test_fmaximum_v3f16_ss(<3 x half> inreg %a, <3 x half> inreg %b) { -; GCN-LABEL: test_fmaximum_v3f16_ss: -; GCN: ; %bb.0: -; GCN-NEXT: v_pk_maximum_f16 v0, s0, s2 -; GCN-NEXT: s_maximum_f16 s0, s1, s3 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3) -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: ; return to shader part epilog +; GFX12-SDAG-LABEL: test_fmaximum_v3f16_ss: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_pk_maximum_f16 v0, s0, s2 +; GFX12-SDAG-NEXT: v_pk_maximum_f16 v1, s1, s3 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: test_fmaximum_v3f16_ss: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_pk_maximum_f16 v0, s0, s2 +; GFX12-GISEL-NEXT: s_maximum_f16 s0, s1, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-GISEL-NEXT: ; return to shader part epilog %val = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b) ret <3 x half> %val } @@ -306,6 +318,3 @@ declare <4 x half> @llvm.maximum.v4f16(<4 x half>, <4 x half>) declare double @llvm.maximum.f64(double, double) declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>) declare <4 x double> @llvm.maximum.v4f64(<4 x double>, <4 x double>) -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX12-GISEL: {{.*}} -; GFX12-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll index 2b3cc4f..45f6bff 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll @@ -148,23 +148,35 @@ define amdgpu_ps <2 x half> @test_fminimum_v2f16_ss(<2 x half> inreg %a, <2 x ha } define amdgpu_ps <3 x half> @test_fminimum_v3f16_vv(<3 x half> %a, <3 x half> %b) { -; GCN-LABEL: test_fminimum_v3f16_vv: -; GCN: ; %bb.0: -; GCN-NEXT: v_pk_minimum_f16 v0, v0, v2 -; GCN-NEXT: v_minimum_f16 v1, v1, v3 -; GCN-NEXT: ; return to shader part epilog +; GFX12-SDAG-LABEL: test_fminimum_v3f16_vv: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_pk_minimum_f16 v0, v0, v2 +; GFX12-SDAG-NEXT: v_pk_minimum_f16 v1, v1, v3 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: test_fminimum_v3f16_vv: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_pk_minimum_f16 v0, v0, v2 +; GFX12-GISEL-NEXT: v_minimum_f16 v1, v1, v3 +; GFX12-GISEL-NEXT: ; return to shader part epilog %val = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b) ret <3 x half> %val } define amdgpu_ps <3 x half> @test_fminimum_v3f16_ss(<3 x half> inreg %a, <3 x half> inreg %b) { -; GCN-LABEL: test_fminimum_v3f16_ss: -; GCN: ; %bb.0: -; GCN-NEXT: v_pk_minimum_f16 v0, s0, s2 -; GCN-NEXT: s_minimum_f16 s0, s1, s3 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3) -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: ; return to shader part epilog +; GFX12-SDAG-LABEL: test_fminimum_v3f16_ss: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_pk_minimum_f16 v0, s0, s2 +; GFX12-SDAG-NEXT: v_pk_minimum_f16 v1, s1, s3 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: test_fminimum_v3f16_ss: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_pk_minimum_f16 v0, s0, s2 +; GFX12-GISEL-NEXT: s_minimum_f16 s0, s1, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-GISEL-NEXT: ; return to shader part epilog %val = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b) ret <3 x half> %val } @@ -306,6 +318,3 @@ declare <4 x half> @llvm.minimum.v4f16(<4 x half>, <4 x half>) declare double @llvm.minimum.f64(double, double) declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>) declare <4 x double> @llvm.minimum.v4f64(<4 x double>, <4 x double>) -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX12-GISEL: {{.*}} -; GFX12-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index c49e6a9a..c476208 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -1794,7 +1794,7 @@ define <3 x half> @v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 -; GFX12-NEXT: v_maximum_f16 v1, v1, v3 +; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] %op = call <3 x half> @llvm.maximum.v3f16(<3 x half> %src0, <3 x half> %src1) ret <3 x half> %op @@ -2013,7 +2013,7 @@ define <3 x half> @v_maximum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 -; GFX12-NEXT: v_maximum_f16 v1, v1, v3 +; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] %op = call nnan <3 x half> @llvm.maximum.v3f16(<3 x half> %src0, <3 x half> %src1) ret <3 x half> %op @@ -2163,7 +2163,7 @@ define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 -; GFX12-NEXT: v_maximum_f16 v1, v1, v3 +; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] %op = call nsz <3 x half> @llvm.maximum.v3f16(<3 x half> %src0, <3 x half> %src1) ret <3 x half> %op @@ -2260,7 +2260,7 @@ define <3 x half> @v_maximum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 -; GFX12-NEXT: v_maximum_f16 v1, v1, v3 +; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] %op = call nnan nsz <3 x half> @llvm.maximum.v3f16(<3 x half> %src0, <3 x half> %src1) ret <3 x half> %op diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index 7281c3f..66f3a48 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -1461,7 +1461,7 @@ define <3 x half> @v_minimum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 -; GFX12-NEXT: v_minimum_f16 v1, v1, v3 +; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] %op = call <3 x half> @llvm.minimum.v3f16(<3 x half> %src0, <3 x half> %src1) ret <3 x half> %op @@ -1635,7 +1635,7 @@ define <3 x half> @v_minimum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 -; GFX12-NEXT: v_minimum_f16 v1, v1, v3 +; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] %op = call nnan <3 x half> @llvm.minimum.v3f16(<3 x half> %src0, <3 x half> %src1) ret <3 x half> %op @@ -1740,7 +1740,7 @@ define <3 x half> @v_minimum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 -; GFX12-NEXT: v_minimum_f16 v1, v1, v3 +; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] %op = call nsz <3 x half> @llvm.minimum.v3f16(<3 x half> %src0, <3 x half> %src1) ret <3 x half> %op @@ -1792,7 +1792,7 @@ define <3 x half> @v_minimum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 -; GFX12-NEXT: v_minimum_f16 v1, v1, v3 +; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] %op = call nnan nsz <3 x half> @llvm.minimum.v3f16(<3 x half> %src0, <3 x half> %src1) ret <3 x half> %op |