Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIISelLowering.cpp')
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 53 +++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 43 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0c76ff2..bc0fd8d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -618,6 +618,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::FSIN, ISD::FROUND},
MVT::f16, Custom);
+ // BF16 - VOP1 Actions.
+ if (Subtarget->hasBF16TransInsts())
+ setOperationAction({ISD::FCOS, ISD::FSIN, ISD::FDIV}, MVT::bf16, Custom);
+
setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote);
setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::bf16, Promote);
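The guard above presumably corresponds to subtargets with native bf16 transcendental instructions, which is why FCOS, FSIN and FDIV can now be custom-lowered for bf16 the same way they already are for f16 (see the LowerFDIV16 and lowerFastUnsafeFDIV hunks further down). For background, bf16 is simply the upper half of an IEEE-754 binary32 value; a minimal standalone sketch of that relationship (not LLVM code, and the rounding choice is only illustrative):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // bf16 keeps the sign bit, the full 8-bit f32 exponent and the top 7 mantissa
    // bits; f32 -> bf16 just rounds away the low 16 bits (NaN handling omitted).
    static uint16_t f32_to_bf16(float f) {
      uint32_t bits;
      std::memcpy(&bits, &f, sizeof(bits));
      bits += 0x7FFF + ((bits >> 16) & 1); // round to nearest, ties to even
      return static_cast<uint16_t>(bits >> 16);
    }

    static float bf16_to_f32(uint16_t h) {
      uint32_t bits = static_cast<uint32_t>(h) << 16; // re-extend with zero low bits
      float f;
      std::memcpy(&f, &bits, sizeof(f));
      return f;
    }

    int main() {
      float x = 3.14159265f;
      uint16_t h = f32_to_bf16(x);
      std::printf("%f -> 0x%04x -> %f\n", x, (unsigned)h, bf16_to_f32(h));
    }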
@@ -2260,7 +2264,8 @@ SDValue SITargetLowering::getPreloadedValue(
const ArgDescriptor WorkGroupIDZ =
ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
if (Subtarget->hasArchitectedSGPRs() &&
- (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
+ (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx ||
+ CC == CallingConv::AMDGPU_Gfx_WholeWave)) {
switch (PVID) {
case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
Reg = &WorkGroupIDX;
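With architected SGPRs the workgroup IDs arrive preloaded in trap-temporary registers rather than as user SGPRs, and the ArgDescriptor above encodes that as a register plus a contiguous bit mask (workgroup ID Z in the high 16 bits of TTMP7). A standalone sketch of reading a field described by such a (register, mask) pair; the second field in the low half is an assumption for illustration, not something this hunk shows:

    #include <cstdint>
    #include <cstdio>

    // Extract a packed field from a 32-bit register value, given the kind of
    // contiguous, non-zero bit mask that ArgDescriptor carries (e.g. 0xFFFF0000u).
    static uint32_t extractField(uint32_t regValue, uint32_t mask) {
      unsigned shift = 0;
      while (((mask >> shift) & 1) == 0) // locate the mask's lowest set bit
        ++shift;
      return (regValue & mask) >> shift;
    }

    int main() {
      uint32_t ttmp7 = 0x002A0007; // sample value: 42 in bits [31:16], 7 in bits [15:0]
      std::printf("high field = %u\n", extractField(ttmp7, 0xFFFF0000u)); // 42
      std::printf("low  field = %u\n", extractField(ttmp7, 0x0000FFFFu)); // 7
    }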
@@ -2942,12 +2947,15 @@ SDValue SITargetLowering::LowerFormalArguments(
if (!Subtarget->enableFlatScratch())
assert(!UserSGPRInfo.hasFlatScratchInit());
if ((CallConv != CallingConv::AMDGPU_CS &&
- CallConv != CallingConv::AMDGPU_Gfx) ||
+ CallConv != CallingConv::AMDGPU_Gfx &&
+ CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
!Subtarget->hasArchitectedSGPRs())
assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
!Info->hasWorkGroupIDZ());
}
+ bool IsWholeWaveFunc = Info->isWholeWaveFunction();
+
if (CallConv == CallingConv::AMDGPU_PS) {
processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
@@ -2988,7 +2996,8 @@ SDValue SITargetLowering::LowerFormalArguments(
} else if (IsKernel) {
assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
} else {
- Splits.append(Ins.begin(), Ins.end());
+ Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
+ Ins.end());
}
if (IsKernel)
@@ -3019,6 +3028,13 @@ SDValue SITargetLowering::LowerFormalArguments(
SmallVector<SDValue, 16> Chains;
+ if (IsWholeWaveFunc) {
+ SDValue Setup = DAG.getNode(AMDGPUISD::WHOLE_WAVE_SETUP, DL,
+ {MVT::i1, MVT::Other}, Chain);
+ InVals.push_back(Setup.getValue(0));
+ Chains.push_back(Setup.getValue(1));
+ }
+
// FIXME: This is the minimum kernel argument alignment. We should improve
// this to the maximum alignment of the arguments.
//
@@ -3026,7 +3042,8 @@ SDValue SITargetLowering::LowerFormalArguments(
// kern arg offset.
const Align KernelArgBaseAlign = Align(16);
- for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
+ for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
+ ++i) {
const ISD::InputArg &Arg = Ins[i];
if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
InVals.push_back(DAG.getPOISON(Arg.VT));
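The Align(16) above is the alignment assumed for the kernel argument base, and the alignment that can be claimed for an individual argument load is bounded by both that base alignment and the argument's byte offset. A standalone sketch of that deduction (illustrative only, not the in-tree helper):

    #include <cstdint>
    #include <cstdio>

    // Alignment provable for a load at (base + offset) when the base is known to
    // be baseAlign-aligned: limited by the largest power of two dividing offset.
    static uint64_t knownAlign(uint64_t baseAlign, uint64_t offset) {
      if (offset == 0)
        return baseAlign;
      uint64_t offsetAlign = offset & ~(offset - 1);
      return offsetAlign < baseAlign ? offsetAlign : baseAlign;
    }

    int main() {
      const uint64_t KernelArgBaseAlign = 16; // matches the Align(16) above
      std::printf("offset  0 -> align %llu\n", (unsigned long long)knownAlign(KernelArgBaseAlign, 0));
      std::printf("offset  4 -> align %llu\n", (unsigned long long)knownAlign(KernelArgBaseAlign, 4));
      std::printf("offset 24 -> align %llu\n", (unsigned long long)knownAlign(KernelArgBaseAlign, 24));
    }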
@@ -3374,7 +3391,9 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
unsigned Opc = AMDGPUISD::ENDPGM;
if (!IsWaveEnd)
- Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_GLUE;
+ Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
+ : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
+ : AMDGPUISD::RET_GLUE;
return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
@@ -3876,7 +3895,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
- if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
+ if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
+ CallConv != CallingConv::AMDGPU_Gfx_WholeWave) {
// With a fixed ABI, allocate fixed registers before user arguments.
passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
}
@@ -5890,6 +5910,18 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.eraseFromParent();
return SplitBB;
}
+ case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
+ assert(MFI->isWholeWaveFunction());
+
+ // During ISel, it's difficult to propagate the original EXEC mask to use as
+ // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
+ MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
+ assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
+ Register OriginalExec = Setup->getOperand(0).getReg();
+ MF->getRegInfo().clearKillFlags(OriginalExec);
+ MI.getOperand(0).setReg(OriginalExec);
+ return BB;
+ }
default:
if (TII->isImage(MI) || TII->isMUBUF(MI)) {
if (!MI.mayStore())
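The setup/return pair implements the whole-wave calling convention: on entry the original EXEC mask is saved and all lanes are enabled, and on return that saved mask is written back, which is exactly the register the custom inserter above wires into SI_WHOLE_WAVE_FUNC_RETURN. A standalone toy model of that handshake (not GPU code; a 32-lane wave is assumed for illustration):

    #include <cstdint>
    #include <cstdio>

    // Toy model of a 32-lane wave: 'exec' decides which lanes execute.
    struct Wave {
      uint32_t exec;
    };

    // Whole-wave prologue: remember which lanes were active, then enable all of
    // them. The saved mask doubles as the per-lane "was I originally active?" i1.
    static uint32_t wholeWaveSetup(Wave &w) {
      uint32_t originalExec = w.exec;
      w.exec = 0xFFFFFFFFu;
      return originalExec;
    }

    // Whole-wave epilogue: restore the mask the caller was running under.
    static void wholeWaveReturn(Wave &w, uint32_t originalExec) {
      w.exec = originalExec;
    }

    int main() {
      Wave w{0x0000000Fu};                // only lanes 0-3 active in the caller
      uint32_t saved = wholeWaveSetup(w); // the body now runs with all 32 lanes on
      std::printf("in body: exec=%08x originally_active=%08x\n", (unsigned)w.exec, (unsigned)saved);
      wholeWaveReturn(w, saved);
      std::printf("after return: exec=%08x\n", (unsigned)w.exec);
    }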
@@ -11172,7 +11204,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
// Without !fpmath accuracy information, we can't do more because we don't
// know exactly whether rcp is accurate enough to meet !fpmath requirement.
// f16 is always accurate enough
- if (!AllowInaccurateRcp && VT != MVT::f16)
+ if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
return SDValue();
if (CLHS->isExactlyValue(1.0)) {
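The relaxed check lets bf16 take the same shortcut as f16: without !fpmath accuracy metadata, a/b is rewritten as a * rcp(b) only for types narrow enough that the reciprocal's error disappears in the final rounding. A standalone numeric sketch of that idea, with rcp modeled as an f32 division (an assumption; the hardware instruction has its own, somewhat larger, error bound):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    static uint16_t toBF16(float f) { // round-to-nearest-even truncation to bf16
      uint32_t b;
      std::memcpy(&b, &f, sizeof(b));
      b += 0x7FFF + ((b >> 16) & 1);
      return static_cast<uint16_t>(b >> 16);
    }

    int main() {
      int mismatches = 0, total = 0;
      for (float a = 0.5f; a < 4.0f; a += 0.173f) {
        for (float b = 0.25f; b < 8.0f; b += 0.377f) {
          float exact = a / b;           // what a real fdiv would produce
          float viaRcp = a * (1.0f / b); // the a * rcp(b) rewrite
          ++total;
          if (toBF16(exact) != toBF16(viaRcp))
            ++mismatches;
        }
      }
      std::printf("bf16 mismatches after the rcp rewrite: %d of %d\n", mismatches, total);
    }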
@@ -11199,9 +11231,10 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
}
}
- // For f16 require afn or arcp.
+ // For f16 and bf16 require afn or arcp.
// For f32 require afn.
- if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
+ if (!AllowInaccurateRcp &&
+ ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
return SDValue();
// Turn into multiply by the reciprocal.
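For the general (non-constant-numerator) path the combined rule is therefore: any type may use the reciprocal rewrite under unsafe-fp-math or the afn flag, while f16 and bf16 additionally accept the arcp flag on its own; other types fall back to the precise lowerings. A small standalone predicate restating that decision (names are illustrative, not the in-tree ones):

    #include <cstdio>

    enum class FpType { F16, BF16, F32, F64 };

    // Illustrative restatement of the gating above: may fdiv be turned into a
    // multiply by the reciprocal on the general (non-constant-numerator) path?
    static bool mayUseRcpRewrite(FpType vt, bool afnOrUnsafe, bool arcp) {
      if (afnOrUnsafe)
        return true;                              // afn / unsafe-fp-math: any type
      if (vt == FpType::F16 || vt == FpType::BF16)
        return arcp;                              // narrow types also accept arcp alone
      return false;                               // everything else stays precise
    }

    int main() {
      std::printf("bf16+arcp: %d  f32+arcp: %d  f32+afn: %d\n",
                  mayUseRcpRewrite(FpType::BF16, false, true),
                  mayUseRcpRewrite(FpType::F32, false, true),
                  mayUseRcpRewrite(FpType::F32, true, false));
    }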
@@ -11592,7 +11625,7 @@ SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
if (VT == MVT::f64)
return LowerFDIV64(Op, DAG);
- if (VT == MVT::f16)
+ if (VT == MVT::f16 || VT == MVT::bf16)
return LowerFDIV16(Op, DAG);
llvm_unreachable("Unexpected type for fdiv");