diff options
author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2018-08-06 21:38:27 +0000 |
---|---|---|
committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2018-08-06 21:38:27 +0000 |
commit | f8768bfc846cc3e007f096c530f4f5c17f85eb58 (patch) | |
tree | 51aa6bbfda978694655e9ed58e60e9b6f4651e08 | |
parent | e262bb1afb391eb705cf2fa733e04cca6d51bf47 (diff) | |
download | llvm-f8768bfc846cc3e007f096c530f4f5c17f85eb58.zip llvm-f8768bfc846cc3e007f096c530f4f5c17f85eb58.tar.gz llvm-f8768bfc846cc3e007f096c530f4f5c17f85eb58.tar.bz2 |
AMDGPU: Fix implementation of isCanonicalized
If denormals are enabled, denormals are canonical.
Also fix a few other issues. minnum/maxnum are supposed
to canonicalize. Temporarily improve workaround for the
instruction behavior change in gfx9.
Handle selects and fcopysign.
The tests were also largely broken, since they were
checking for a flush used on some targets after the
store of the result.
llvm-svn: 339061
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 119 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.h | 4 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll | 286 |
3 files changed, 315 insertions, 94 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 9f4846f8..22d33f5 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6745,70 +6745,99 @@ SDValue SITargetLowering::performRcpCombine(SDNode *N, return AMDGPUTargetLowering::performRcpCombine(N, DCI); } -static bool isCanonicalized(SelectionDAG &DAG, SDValue Op, - const GCNSubtarget *ST, unsigned MaxDepth=5) { +bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, + unsigned MaxDepth) const { + unsigned Opcode = Op.getOpcode(); + if (Opcode == ISD::FCANONICALIZE) + return true; + + if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) { + auto F = CFP->getValueAPF(); + if (F.isNaN() && F.isSignaling()) + return false; + return !F.isDenormal() || denormalsEnabledForType(Op.getValueType()); + } + // If source is a result of another standard FP operation it is already in // canonical form. + if (MaxDepth == 0) + return false; - switch (Op.getOpcode()) { - default: - break; - + switch (Opcode) { // These will flush denorms if required. case ISD::FADD: case ISD::FSUB: case ISD::FMUL: - case ISD::FSQRT: case ISD::FCEIL: case ISD::FFLOOR: case ISD::FMA: case ISD::FMAD: - - case ISD::FCANONICALIZE: + case ISD::FSQRT: + case ISD::FDIV: + case ISD::FREM: + case AMDGPUISD::FMUL_LEGACY: + case AMDGPUISD::FMAD_FTZ: return true; - case ISD::FP_ROUND: return Op.getValueType().getScalarType() != MVT::f16 || - ST->hasFP16Denormals(); + Subtarget->hasFP16Denormals(); case ISD::FP_EXTEND: return Op.getOperand(0).getValueType().getScalarType() != MVT::f16 || - ST->hasFP16Denormals(); + Subtarget->hasFP16Denormals(); // It can/will be lowered or combined as a bit operation. // Need to check their input recursively to handle. case ISD::FNEG: case ISD::FABS: - return (MaxDepth > 0) && - isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1); + case ISD::FCOPYSIGN: + return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1); case ISD::FSIN: case ISD::FCOS: case ISD::FSINCOS: return Op.getValueType().getScalarType() != MVT::f16; - // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. - // For such targets need to check their input recursively. case ISD::FMINNUM: - case ISD::FMAXNUM: - case ISD::FMINNAN: - case ISD::FMAXNAN: + case ISD::FMAXNUM: { + // FIXME: Shouldn't treat the generic operations different based these. + bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction()); + if (IsIEEEMode) { + // snans will be quieted, so we only need to worry about denormals. + if (Subtarget->supportsMinMaxDenormModes() || + denormalsEnabledForType(Op.getValueType())) + return true; - if (ST->supportsMinMaxDenormModes() && - DAG.isKnownNeverNaN(Op.getOperand(0)) && - DAG.isKnownNeverNaN(Op.getOperand(1))) - return true; + // Flushing may be required. + // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such + // targets need to check their input recursively. + return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) && + isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1); + } - return (MaxDepth > 0) && - isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1) && - isCanonicalized(DAG, Op.getOperand(1), ST, MaxDepth - 1); + if (Subtarget->supportsMinMaxDenormModes() || + denormalsEnabledForType(Op.getValueType())) { + // Only quieting may be necessary. + return DAG.isKnownNeverSNaN(Op.getOperand(0)) && + DAG.isKnownNeverSNaN(Op.getOperand(1)); + } - case ISD::ConstantFP: { - auto F = cast<ConstantFPSDNode>(Op)->getValueAPF(); - return !F.isDenormal() && !(F.isNaN() && F.isSignaling()); + // Flushing and quieting may be necessary + // With ieee_mode off, the nan is returned as-is, so if it is an sNaN it + // needs to be quieted. + return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) && + isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1); } + case ISD::SELECT: { + return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) && + isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1); } - return false; + default: + return denormalsEnabledForType(Op.getValueType()) && + DAG.isKnownNeverSNaN(Op); + } + + llvm_unreachable("invalid operation"); } // Constant fold canonicalize. @@ -6828,22 +6857,7 @@ SDValue SITargetLowering::performFCanonicalizeCombine( ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0); if (!CFP) { SDValue N0 = N->getOperand(0); - EVT VT = N0.getValueType().getScalarType(); - auto ST = getSubtarget(); - - if (((VT == MVT::f32 && ST->hasFP32Denormals()) || - (VT == MVT::f64 && ST->hasFP64Denormals()) || - (VT == MVT::f16 && ST->hasFP16Denormals())) && - DAG.isKnownNeverNaN(N0)) - return N0; - - bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction()); - - if ((IsIEEEMode || DAG.isKnownNeverSNaN(N0)) && - isCanonicalized(DAG, N0, ST)) - return N0; - - return SDValue(); + return isCanonicalized(DAG, N0) ? N0 : SDValue(); } const APFloat &C = CFP->getValueAPF(); @@ -8462,3 +8476,16 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N, } return false; } + +bool SITargetLowering::denormalsEnabledForType(EVT VT) const { + switch (VT.getScalarType().getSimpleVT().SimpleTy) { + case MVT::f32: + return Subtarget->hasFP32Denormals(); + case MVT::f64: + return Subtarget->hasFP64Denormals(); + case MVT::f16: + return Subtarget->hasFP16Denormals(); + default: + return false; + } +} diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 827c369..be0f5a8 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -323,6 +323,10 @@ public: bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, DivergenceAnalysis *DA) const override; + + bool isCanonicalized(SelectionDAG &DAG, SDValue Op, + unsigned MaxDepth = 5) const; + bool denormalsEnabledForType(EVT VT) const; }; } // End namespace llvm diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll index f281974..4005c4d 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll @@ -1,7 +1,8 @@ -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-FLUSH %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=-fp32-denormals,+fp-exceptions < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-EXCEPT,VI,GCN-FLUSH %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-DENORM,GCN-DENORM %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLUSH,GCN-FLUSH %s +; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-FLUSH,GCN-FLUSH,GCN-NOEXCEPT %s +; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -mattr=-fp32-denormals,+fp-exceptions < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-EXCEPT,VI,VI-FLUSH,GCN-FLUSH %s +; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -mattr=+fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-DENORM,GCN-DENORM,GCN-NOEXCEPT %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-DENORM,GCN-DENORM,GCN-NOEXCEPT %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLUSH,GCN-FLUSH,GCN-NOEXCEPT %s ; GCN-LABEL: {{^}}test_no_fold_canonicalize_loaded_value_f32: ; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} @@ -29,10 +30,26 @@ define amdgpu_kernel void @test_fold_canonicalize_fmul_value_f32(float addrspace ret void } +; GCN-LABEL: {{^}}test_fold_canonicalize_fmul_legacy_value_f32: +; GCN: v_mul_legacy_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}} +; GCN-NOT: v_mul +; GCN-NOT: v_max +; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] +define amdgpu_kernel void @test_fold_canonicalize_fmul_legacy_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = call float @llvm.amdgcn.fmul.legacy(float %load, float 15.0) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + ; GCN-LABEL: {{^}}test_fold_canonicalize_sub_value_f32: ; GCN: v_sub_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}} +; GCN-NOT: v_mul +; GCN-NOT: v_max ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] -; GCN-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_sub_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -45,8 +62,9 @@ define amdgpu_kernel void @test_fold_canonicalize_sub_value_f32(float addrspace( ; GCN-LABEL: {{^}}test_fold_canonicalize_add_value_f32: ; GCN: v_add_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}} +; GCN-NOT: v_mul +; GCN-NOT: v_max ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] -; GCN-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_add_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -59,8 +77,9 @@ define amdgpu_kernel void @test_fold_canonicalize_add_value_f32(float addrspace( ; GCN-LABEL: {{^}}test_fold_canonicalize_sqrt_value_f32: ; GCN: v_sqrt_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} +; GCN-NOT: v_mul +; GCN-NOT: v_max ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] -; GCN-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_sqrt_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -73,8 +92,9 @@ define amdgpu_kernel void @test_fold_canonicalize_sqrt_value_f32(float addrspace ; GCN-LABEL: test_fold_canonicalize_fceil_value_f32: ; GCN: v_ceil_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} +; GCN-NOT: v_mul +; GCN-NOT: v_max ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] -; GCN-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_fceil_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -87,8 +107,9 @@ define amdgpu_kernel void @test_fold_canonicalize_fceil_value_f32(float addrspac ; GCN-LABEL: test_fold_canonicalize_floor_value_f32: ; GCN: v_floor_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} +; GCN-NOT: v_mul +; GCN-NOT: v_max ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] -; GCN-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_floor_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -101,8 +122,9 @@ define amdgpu_kernel void @test_fold_canonicalize_floor_value_f32(float addrspac ; GCN-LABEL: test_fold_canonicalize_fma_value_f32: ; GCN: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN-NOT: v_mul +; GCN-NOT: v_max ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] -; GCN-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -113,11 +135,27 @@ define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(float addrspace( ret void } +; GCN-LABEL: test_fold_canonicalize_fmad_ftz_value_f32: +; GCN: v_mad_f32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN-NOT: v_mul +; GCN-NOT: v_max +; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] +define amdgpu_kernel void @test_fold_canonicalize_fmad_ftz_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = call float @llvm.amdgcn.fmad.ftz.f32(float %load, float 15.0, float 15.0) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + ; GCN-LABEL: test_fold_canonicalize_fmuladd_value_f32: ; GCN-FLUSH: v_mac_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} ; GFX9-DENORM: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] -; GCN-NOT: 1.0 +; GCN-NOT: v_mul +; GCN-NOT: v_max +; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], define amdgpu_kernel void @test_fold_canonicalize_fmuladd_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -132,8 +170,9 @@ define amdgpu_kernel void @test_fold_canonicalize_fmuladd_value_f32(float addrsp ; GCN: {{flat|global}}_load_dword [[LOAD:v[0-9]+]], ; GCN-FLUSH: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[LOAD]] ; GCN-DENORM: v_max_f32_e32 [[V:v[0-9]+]], [[LOAD]], [[LOAD]] +; GCN-NOT: v_mul +; GCN-NOT: v_max ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] -; GCN-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_canonicalize_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -146,8 +185,9 @@ define amdgpu_kernel void @test_fold_canonicalize_canonicalize_value_f32(float a ; GCN-LABEL: test_fold_canonicalize_fpextend_value_f64_f32: ; GCN: v_cvt_f64_f32_e32 [[V:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} +; GCN-NOT: v_mul +; GCN-NOT: v_max ; GCN: {{flat|global}}_store_dwordx2 v[{{[0-9:]+}}], [[V]] -; GCN-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f64_f32(float addrspace(1)* %arg, double addrspace(1)* %out) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -161,8 +201,9 @@ define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f64_f32(float a ; GCN-LABEL: test_fold_canonicalize_fpextend_value_f32_f16: ; GCN: v_cvt_f32_f16_e32 [[V:v[0-9]+]], v{{[0-9]+}} +; GCN-NOT: v_mul +; GCN-NOT: v_max ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] -; GCN-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16(half addrspace(1)* %arg, float addrspace(1)* %out) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id @@ -176,8 +217,9 @@ define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16(half ad ; GCN-LABEL: test_fold_canonicalize_fpround_value_f32_f64: ; GCN: v_cvt_f32_f64_e32 [[V:v[0-9]+]], v[{{[0-9:]+}}] +; GCN-NOT: v_mul +; GCN-NOT: v_max ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] -; GCN-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f32_f64(double addrspace(1)* %arg, float addrspace(1)* %out) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id @@ -211,8 +253,9 @@ define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32(float ad ; GFX9: v_cvt_f16_f32_e32 [[V1:v[0-9]+]], v{{[0-9]+}} ; GFX9: v_and_b32_e32 [[V0_16:v[0-9]+]], 0xffff, [[V0]] ; GFX9: v_lshl_or_b32 [[V:v[0-9]+]], [[V1]], 16, [[V0_16]] +; GCN-NOT: v_mul +; GCN-NOT: v_max ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] -; GCN-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(<2 x float> addrspace(1)* %arg, <2 x half> addrspace(1)* %out) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %arg, i32 %id @@ -239,8 +282,9 @@ define amdgpu_kernel void @test_no_fold_canonicalize_fneg_value_f32(float addrsp ; GCN-LABEL: test_fold_canonicalize_fneg_value_f32: ; GCN: v_xor_b32_e32 [[V:v[0-9]+]], 0x80000000, v{{[0-9]+}} +; GCN-NOT: v_mul +; GCN-NOT: v_max ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] -; GCN-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -265,10 +309,28 @@ define amdgpu_kernel void @test_no_fold_canonicalize_fabs_value_f32(float addrsp ret void } +; GCN-LABEL: test_no_fold_canonicalize_fcopysign_value_f32: +; GCN-FLUSH: v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}| +; GCN-DENORM: v_max_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}| +; GCN-NOT: v_mul_ +; GCN-NOT: v_max_ +define amdgpu_kernel void @test_no_fold_canonicalize_fcopysign_value_f32(float addrspace(1)* %arg, float %sign) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %canon.load = tail call float @llvm.canonicalize.f32(float %load) + %copysign = call float @llvm.copysign.f32(float %canon.load, float %sign) + %v = tail call float @llvm.fabs.f32(float %load) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + ; GCN-LABEL: test_fold_canonicalize_fabs_value_f32: ; GCN: v_and_b32_e32 [[V:v[0-9]+]], 0x7fffffff, v{{[0-9]+}} +; GCN-NOT: v_mul +; GCN-NOT: v_max ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] -; GCN-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -282,8 +344,9 @@ define amdgpu_kernel void @test_fold_canonicalize_fabs_value_f32(float addrspace ; GCN-LABEL: test_fold_canonicalize_sin_value_f32: ; GCN: v_sin_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} +; GCN-NOT: v_mul +; GCN-NOT: v_max ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] -; GCN-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_sin_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -296,8 +359,9 @@ define amdgpu_kernel void @test_fold_canonicalize_sin_value_f32(float addrspace( ; GCN-LABEL: test_fold_canonicalize_cos_value_f32: ; GCN: v_cos_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} +; GCN-NOT: v_mul +; GCN-NOT: v_max ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] -; GCN-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_cos_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -311,8 +375,9 @@ define amdgpu_kernel void @test_fold_canonicalize_cos_value_f32(float addrspace( ; GCN-LABEL: test_fold_canonicalize_sin_value_f16: ; GCN: v_sin_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}} ; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]] +; GCN-NOT: v_mul +; GCN-NOT: v_max ; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]] -; GCN-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_sin_value_f16(half addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id @@ -326,8 +391,9 @@ define amdgpu_kernel void @test_fold_canonicalize_sin_value_f16(half addrspace(1 ; GCN-LABEL: test_fold_canonicalize_cos_value_f16: ; GCN: v_cos_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}} ; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]] +; GCN-NOT: v_mul +; GCN-NOT: v_max ; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]] -; GCN-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_cos_value_f16(half addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id @@ -340,8 +406,9 @@ define amdgpu_kernel void @test_fold_canonicalize_cos_value_f16(half addrspace(1 ; GCN-LABEL: test_fold_canonicalize_qNaN_value_f32: ; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x7fc00000 +; GCN-NOT: v_mul +; GCN-NOT: v_max ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] -; GCN-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -350,11 +417,40 @@ define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(float addrspace ret void } -; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32: -; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} -; GFX9: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}} +; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode: +; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}} +; GFX9-NOT: v_max +; GFX9-NOT: v_mul + +; VI-DENORM-NOT: v_max_f32 +; VI-DENORM-NOT: v_mul_f32 + +; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} + ; GFX9: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] -define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32(float addrspace(1)* %arg) { +define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = tail call float @llvm.minnum.f32(float %load, float 0.0) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode: +; GCN: v_min_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} + +; GFX9-NOT: v_max +; GFX9-NOT: v_mul + + +; VI-DENORM-NOT: v_max +; VI-DENORM-NOT: v_mul +; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} + +; GFX9: {{flat|global}}_store_dword v[{{[0-9:]+}}] +define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode(float addrspace(1)* %arg) #1 { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id %load = load float, float addrspace(1)* %gep, align 4 @@ -366,8 +462,9 @@ define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32(flo ; GCN-LABEL: test_fold_canonicalize_minnum_value_f32: ; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}} +; GCN-NOT: v_mul +; GCN-NOT: v_max ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] -; GCN-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_minnum_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -382,9 +479,9 @@ define amdgpu_kernel void @test_fold_canonicalize_minnum_value_f32(float addrspa ; FIXME: Should there be more checks here? minnum with NaN operand is simplified away. ; GCN-LABEL: test_fold_canonicalize_sNaN_value_f32: -; VI: v_add_u32_e32 v{{[0-9]+}} -; GFX9: v_add_co_u32_e32 v{{[0-9]+}} -; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}] +; GCN: {{flat|global}}_load_dword [[LOAD:v[0-9]+]] +; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[LOAD]] +; GCN-DENORM: v_max_f32_e32 v{{[0-9]+}}, [[LOAD]], [[LOAD]] define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -397,10 +494,16 @@ define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace ; GCN-LABEL: test_fold_canonicalize_denorm_value_f32: ; GFX9: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, v{{[0-9]+}} -; VI: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}} -; VI: v_mul_f32_e32 [[RESULT:v[0-9]+]], 1.0, [[V0]] + +; VI-FLUSH: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}} +; VI-FLUSH: v_mul_f32_e32 [[RESULT:v[0-9]+]], 1.0, [[V0]] + +; VI-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, v{{[0-9]+}} + + +; GCN-NOT: v_mul +; GCN-NOT: v_max ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[RESULT]] -; GFX9-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -411,13 +514,17 @@ define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspa ret void } -; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32: +; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode: ; GFX9: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, v{{[0-9]+}} -; VI: v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}} -; VI: v_mul_f32_e32 [[RESULT:v[0-9]+]], 1.0, [[V0]] +; VI-FLUSH: v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}} +; VI-FLUSH: v_mul_f32_e32 [[RESULT:v[0-9]+]], 1.0, [[V0]] + +; VI-DENORM: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, v{{[0-9]+}} + +; GCN-NOT: v_mul +; GCN-NOT: v_max ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[RESULT]] -; GFX9-NOT: 1.0 -define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32(float addrspace(1)* %arg) { +define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id %load = load float, float addrspace(1)* %gep, align 4 @@ -429,8 +536,9 @@ define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32(flo ; GCN-LABEL: test_fold_canonicalize_maxnum_value_f32: ; GCN: v_max_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}} +; GCN-NOT: v_max +; GCN-NOT: v_mul ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] -; GCN-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -444,8 +552,9 @@ define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f32(float addrspa ; GCN-LABEL: test_fold_canonicalize_maxnum_value_f64: ; GCN: v_max_f64 [[V:v\[[0-9]+:[0-9]+\]]], v[{{[0-9:]+}}], 0 +; GCN-NOT: v_mul +; GCN-NOT: v_max ; GCN: {{flat|global}}_store_dwordx2 v[{{[0-9:]+}}], [[V]] -; GCN-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f64(double addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id @@ -457,12 +566,12 @@ define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f64(double addrsp ret void } -; GCN-LABEL: test_no_fold_canonicalize_fmul_value_f32_no_ieee: +; GCN-LABEL: test_fold_canonicalize_fmul_value_f32_no_ieee: ; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}} ; GCN-NOT: v_mul ; GCN-NOT: v_max ; GCN-NEXT: ; return -define amdgpu_ps float @test_no_fold_canonicalize_fmul_value_f32_no_ieee(float %arg) { +define amdgpu_ps float @test_fold_canonicalize_fmul_value_f32_no_ieee(float %arg) { entry: %v = fmul float %arg, 15.0 %canonicalized = tail call float @llvm.canonicalize.f32(float %v) @@ -471,8 +580,9 @@ entry: ; GCN-LABEL: test_fold_canonicalize_fmul_nnan_value_f32_no_ieee: ; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}} +; GCN-NOT: v_mul +; GCN-NOT: v_max ; GCN-NEXT: ; return -; GCN-NOT: 1.0 define amdgpu_ps float @test_fold_canonicalize_fmul_nnan_value_f32_no_ieee(float %arg) { entry: %v = fmul nnan float %arg, 15.0 @@ -480,6 +590,18 @@ entry: ret float %canonicalized } +; GCN-LABEL: {{^}}test_fold_canonicalize_fdiv_value_f32_no_ieee: +; GCN: v_div_fixup_f32 +; GCN-NOT: v_max +; GCN-NOT: v_mul +; GCN: ; return +define amdgpu_ps float @test_fold_canonicalize_fdiv_value_f32_no_ieee(float %arg0) { +entry: + %v = fdiv float 15.0, %arg0 + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + ret float %canonicalized +} + ; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f32 ; GFX9-DENORM: global_load_dword [[V:v[0-9]+]], ; GFX9-DENORM: global_store_dword v[{{[0-9:]+}}], [[V]] @@ -498,7 +620,8 @@ define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f32(float addr ; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f64 ; GCN: {{flat|global}}_load_dwordx2 [[V:v\[[0-9:]+\]]], ; GCN: {{flat|global}}_store_dwordx2 v[{{[0-9:]+}}], [[V]] -; GCN-NOT: 1.0 +; GCN-NOT: v_mul_ +; GCN-NOT: v_max_ define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(double addrspace(1)* %arg, double addrspace(1)* %out) #1 { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id @@ -511,8 +634,9 @@ define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(double add ; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f16 ; GCN: {{flat|global}}_load_ushort [[V:v[0-9]+]], -; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]] -; GCN-NOT: 1.0 +; GCN-NOT: v_mul +; GCN-NOT: v_max +; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(half addrspace(1)* %arg, half addrspace(1)* %out) #1 { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id @@ -523,11 +647,77 @@ define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(half addrs ret void } +; GCN-LABEL: {{^}}test_fold_canonicalize_select_value_f32: +; GCN: v_add_f32 +; GCN: v_add_f32 +; GCN: v_cndmask_b32 +; GCN-NOT: v_mul_ +; GCN-NOT: v_max_ +define amdgpu_kernel void @test_fold_canonicalize_select_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load0 = load volatile float, float addrspace(1)* %gep, align 4 + %load1 = load volatile float, float addrspace(1)* %gep, align 4 + %load2 = load volatile i32, i32 addrspace(1)* undef, align 4 + %v0 = fadd float %load0, 15.0 + %v1 = fadd float %load1, 32.0 + %cond = icmp eq i32 %load2, 0 + %select = select i1 %cond, float %v0, float %v1 + %canonicalized = tail call float @llvm.canonicalize.f32(float %select) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; Need to quiet the nan with a separate instruction since it will be +; passed through the minnum. + +; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_no_ieee_mode: +; GFX9: v_min_f32_e32 v0, v0, v1 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX9-DENORM-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: ; return to shader + +; VI: v_min_f32_e32 v0, v0, v1 +; VI-FLUSH: v_mul_f32_e32 v0, 1.0, v0 +; VI-DENORM: v_max_f32_e32 v0, v0, v0 +define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode(float %arg0, float %arg1) { + %v = tail call float @llvm.minnum.f32(float %arg0, float %arg1) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + ret float %canonicalized +} + +; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_ieee_mode: +; GFX9: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 + +; VI: v_min_f32_e32 v0, v0, v1 +; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; VI-NEXT: s_setpc_b64 +define float @test_fold_canonicalize_minnum_value_ieee_mode(float %arg0, float %arg1) { + %v = tail call float @llvm.minnum.f32(float %arg0, float %arg1) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + ret float %canonicalized +} + +; Canonicalizing flush necessary pre-gfx9 +; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_no_ieee_mode_nnan: +; GCN: v_min_f32_e32 v0, v0, v1 +; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: ; return +define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode_nnan(float %arg0, float %arg1) #1 { + %v = tail call float @llvm.minnum.f32(float %arg0, float %arg1) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + ret float %canonicalized +} + ; Avoid failing the test on FreeBSD11.0 which will match the GCN-NOT: 1.0 ; in the .amd_amdgpu_isa "amdgcn-unknown-freebsd11.0--gfx802" directive ; CHECK: .amd_amdgpu_isa declare float @llvm.canonicalize.f32(float) #0 +declare float @llvm.copysign.f32(float, float) #0 +declare float @llvm.amdgcn.fmul.legacy(float, float) #0 +declare float @llvm.amdgcn.fmad.ftz.f32(float, float, float) #0 declare double @llvm.canonicalize.f64(double) #0 declare half @llvm.canonicalize.f16(half) #0 declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0 |