aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStanislav Mekhanoshin <rampitec@users.noreply.github.com>2024-05-07 14:20:13 -0700
committerGitHub <noreply@github.com>2024-05-07 14:20:13 -0700
commit2a3903fa0e88d7149df11aa37d4ba87c5e5f0913 (patch)
tree30e6a6622cf18c2e447f3262c8b480a407a7cd56
parentbc8a42762057d7036f6871211e62b1c3efb2738a (diff)
downloadllvm-2a3903fa0e88d7149df11aa37d4ba87c5e5f0913.zip
llvm-2a3903fa0e88d7149df11aa37d4ba87c5e5f0913.tar.gz
llvm-2a3903fa0e88d7149df11aa37d4ba87c5e5f0913.tar.bz2
[AMDGPU] Prevent FMINIMUM and FMAXIMUM beeing fully scalarized (#91378)
This is the same logic as with FMINNUM_IEEE/FMAXNUM_IEEE.
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp8
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmaximum.ll39
-rw-r--r--llvm/test/CodeGen/AMDGPU/fminimum.ll39
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll8
5 files changed, 63 insertions, 39 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ed41c10..33bdd61 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -854,9 +854,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->hasPrefetch())
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
- if (Subtarget->hasIEEEMinMax())
+ if (Subtarget->hasIEEEMinMax()) {
setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
{MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
+ setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
+ {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
+ Custom);
+ }
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
{MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
@@ -5821,6 +5825,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FMUL:
case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:
+ case ISD::FMINIMUM:
+ case ISD::FMAXIMUM:
case ISD::UADDSAT:
case ISD::USUBSAT:
case ISD::SADDSAT:
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
index dd685a6..87ac95a 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
@@ -148,23 +148,35 @@ define amdgpu_ps <2 x half> @test_fmaximum_v2f16_ss(<2 x half> inreg %a, <2 x ha
}
define amdgpu_ps <3 x half> @test_fmaximum_v3f16_vv(<3 x half> %a, <3 x half> %b) {
-; GCN-LABEL: test_fmaximum_v3f16_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_maximum_f16 v0, v0, v2
-; GCN-NEXT: v_maximum_f16 v1, v1, v3
-; GCN-NEXT: ; return to shader part epilog
+; GFX12-SDAG-LABEL: test_fmaximum_v3f16_vv:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-SDAG-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: test_fmaximum_v3f16_vv:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-GISEL-NEXT: v_maximum_f16 v1, v1, v3
+; GFX12-GISEL-NEXT: ; return to shader part epilog
%val = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
ret <3 x half> %val
}
define amdgpu_ps <3 x half> @test_fmaximum_v3f16_ss(<3 x half> inreg %a, <3 x half> inreg %b) {
-; GCN-LABEL: test_fmaximum_v3f16_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_maximum_f16 v0, s0, s2
-; GCN-NEXT: s_maximum_f16 s0, s1, s3
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT: v_mov_b32_e32 v1, s0
-; GCN-NEXT: ; return to shader part epilog
+; GFX12-SDAG-LABEL: test_fmaximum_v3f16_ss:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_pk_maximum_f16 v0, s0, s2
+; GFX12-SDAG-NEXT: v_pk_maximum_f16 v1, s1, s3
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: test_fmaximum_v3f16_ss:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_pk_maximum_f16 v0, s0, s2
+; GFX12-GISEL-NEXT: s_maximum_f16 s0, s1, s3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-GISEL-NEXT: ; return to shader part epilog
%val = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
ret <3 x half> %val
}
@@ -306,6 +318,3 @@ declare <4 x half> @llvm.maximum.v4f16(<4 x half>, <4 x half>)
declare double @llvm.maximum.f64(double, double)
declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
declare <4 x double> @llvm.maximum.v4f64(<4 x double>, <4 x double>)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX12-GISEL: {{.*}}
-; GFX12-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll
index 2b3cc4f..45f6bff 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll
@@ -148,23 +148,35 @@ define amdgpu_ps <2 x half> @test_fminimum_v2f16_ss(<2 x half> inreg %a, <2 x ha
}
define amdgpu_ps <3 x half> @test_fminimum_v3f16_vv(<3 x half> %a, <3 x half> %b) {
-; GCN-LABEL: test_fminimum_v3f16_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_minimum_f16 v0, v0, v2
-; GCN-NEXT: v_minimum_f16 v1, v1, v3
-; GCN-NEXT: ; return to shader part epilog
+; GFX12-SDAG-LABEL: test_fminimum_v3f16_vv:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-SDAG-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: test_fminimum_v3f16_vv:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-GISEL-NEXT: v_minimum_f16 v1, v1, v3
+; GFX12-GISEL-NEXT: ; return to shader part epilog
%val = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
ret <3 x half> %val
}
define amdgpu_ps <3 x half> @test_fminimum_v3f16_ss(<3 x half> inreg %a, <3 x half> inreg %b) {
-; GCN-LABEL: test_fminimum_v3f16_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_minimum_f16 v0, s0, s2
-; GCN-NEXT: s_minimum_f16 s0, s1, s3
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT: v_mov_b32_e32 v1, s0
-; GCN-NEXT: ; return to shader part epilog
+; GFX12-SDAG-LABEL: test_fminimum_v3f16_ss:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_pk_minimum_f16 v0, s0, s2
+; GFX12-SDAG-NEXT: v_pk_minimum_f16 v1, s1, s3
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: test_fminimum_v3f16_ss:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_pk_minimum_f16 v0, s0, s2
+; GFX12-GISEL-NEXT: s_minimum_f16 s0, s1, s3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-GISEL-NEXT: ; return to shader part epilog
%val = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
ret <3 x half> %val
}
@@ -306,6 +318,3 @@ declare <4 x half> @llvm.minimum.v4f16(<4 x half>, <4 x half>)
declare double @llvm.minimum.f64(double, double)
declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)
declare <4 x double> @llvm.minimum.v4f64(<4 x double>, <4 x double>)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX12-GISEL: {{.*}}
-; GFX12-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
index c49e6a9a..c476208 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
@@ -1794,7 +1794,7 @@ define <3 x half> @v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
-; GFX12-NEXT: v_maximum_f16 v1, v1, v3
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x half> @llvm.maximum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
@@ -2013,7 +2013,7 @@ define <3 x half> @v_maximum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
-; GFX12-NEXT: v_maximum_f16 v1, v1, v3
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <3 x half> @llvm.maximum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
@@ -2163,7 +2163,7 @@ define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
-; GFX12-NEXT: v_maximum_f16 v1, v1, v3
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <3 x half> @llvm.maximum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
@@ -2260,7 +2260,7 @@ define <3 x half> @v_maximum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1)
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
-; GFX12-NEXT: v_maximum_f16 v1, v1, v3
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <3 x half> @llvm.maximum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
index 7281c3f..66f3a48 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
@@ -1461,7 +1461,7 @@ define <3 x half> @v_minimum_v3f16(<3 x half> %src0, <3 x half> %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
-; GFX12-NEXT: v_minimum_f16 v1, v1, v3
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x half> @llvm.minimum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
@@ -1635,7 +1635,7 @@ define <3 x half> @v_minimum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
-; GFX12-NEXT: v_minimum_f16 v1, v1, v3
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <3 x half> @llvm.minimum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
@@ -1740,7 +1740,7 @@ define <3 x half> @v_minimum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
-; GFX12-NEXT: v_minimum_f16 v1, v1, v3
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <3 x half> @llvm.minimum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
@@ -1792,7 +1792,7 @@ define <3 x half> @v_minimum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1)
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
-; GFX12-NEXT: v_minimum_f16 v1, v1, v3
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <3 x half> @llvm.minimum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op