author | Samuel Antao <sfantao@us.ibm.com> | 2016-07-15 23:13:27 +0000
committer | Samuel Antao <sfantao@us.ibm.com> | 2016-07-15 23:13:27 +0000
commit | d06239d359df71ee594c1b305fe71af59793abc2
tree | dddb83be3fd986337b5870fcad6b90b5e0ac9dc5 /clang/lib/Driver/Tools.cpp
parent | 4953a01461a2363ee94165fde470c929539e9376
[CUDA][OpenMP] Create generic offload action
Summary:
This patch replaces the CUDA-specific action with a generic offload action. The offload action may have multiple dependences, classified as "host" and "device" dependences. The way this generic offloading action is used is very similar to what the CUDA implementation does today: it is used to set a specific toolchain and architecture on its dependences during the generation of jobs.
This patch also proposes propagating the offloading information through the action graph, so that it can easily be retrieved at any time during the generation of commands. This allows, e.g., the clang tool to evaluate whether CUDA should be supported for the device or the host, and ptxas to easily retrieve the target architecture.
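The sketch below is a rough, self-contained model of that annotation (the `ModelAction` type and `main` are hypothetical; the real API lives on clang's `Action`/`JobAction` classes, whose `isHostOffloading`/`isDeviceOffloading`/`getOffloadingArch` names the diff below introduces). Each action node carries an offload-kind mask plus an optional device architecture, so tools can query the compilation mode instead of deducing it from toolchain pointers:

```cpp
#include <cassert>
#include <cstdint>
#include <string>

// Hypothetical model of the offloading annotation this patch threads
// through the action graph; names mirror the patch, not the clang API.
enum OffloadKind : std::uint32_t {
  OFK_None = 0x00,
  OFK_Host = 0x01,
  OFK_Cuda = 0x02, // further programming models can be OR'd in later
};

struct ModelAction {
  std::uint32_t ActiveOffloadKinds = OFK_None; // host job: mask of device kinds
  OffloadKind DeviceKind = OFK_None;           // device job: its single kind
  std::string OffloadingArch;                  // e.g. "sm_35" for a CUDA device

  bool isHostOffloading(OffloadKind K) const {
    return ActiveOffloadKinds & K;
  }
  bool isDeviceOffloading(OffloadKind K) const { return DeviceKind == K; }
  bool isOffloading(OffloadKind K) const {
    return isHostOffloading(K) || isDeviceOffloading(K);
  }
};

int main() {
  ModelAction DeviceCompile;
  DeviceCompile.DeviceKind = OFK_Cuda;
  DeviceCompile.OffloadingArch = "sm_35";

  // A tool such as ptxas can read the architecture off the action itself
  // instead of re-parsing it out of the -march= argument list.
  assert(DeviceCompile.isDeviceOffloading(OFK_Cuda));
  assert(DeviceCompile.OffloadingArch == "sm_35");
  return 0;
}
```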
This is an example of what the action graph looks like when compiling a single CUDA file for two GPU architectures:
```
0: input, "cudatests.cu", cuda, (host-cuda)
1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
2: compiler, {1}, ir, (host-cuda)
3: input, "cudatests.cu", cuda, (device-cuda, sm_35)
4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_35)
5: compiler, {4}, ir, (device-cuda, sm_35)
6: backend, {5}, assembler, (device-cuda, sm_35)
7: assembler, {6}, object, (device-cuda, sm_35)
8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {7}, object
9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {6}, assembler
10: input, "cudatests.cu", cuda, (device-cuda, sm_37)
11: preprocessor, {10}, cuda-cpp-output, (device-cuda, sm_37)
12: compiler, {11}, ir, (device-cuda, sm_37)
13: backend, {12}, assembler, (device-cuda, sm_37)
14: assembler, {13}, object, (device-cuda, sm_37)
15: offload, "device-cuda (nvptx64-nvidia-cuda:sm_37)" {14}, object
16: offload, "device-cuda (nvptx64-nvidia-cuda:sm_37)" {13}, assembler
17: linker, {8, 9, 15, 16}, cuda-fatbin, (device-cuda)
18: offload, "host-cuda (powerpc64le-unknown-linux-gnu)" {2}, "device-cuda (nvptx64-nvidia-cuda)" {17}, ir
19: backend, {18}, assembler
20: assembler, {19}, object
21: input, "cuda", object
22: input, "cudart", object
23: linker, {20, 21, 22}, image
```
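For reference, phase graphs in this format are printed by the driver itself; assuming a clang built with this patch, the graph above corresponds to an invocation along the lines of `clang -ccc-print-phases --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_37 cudatests.cu`.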
The changes in this patch pass the existing regression tests (the existing functionality is preserved), and the resulting binaries execute correctly on a Power8+K40 machine.
Reviewers: echristo, hfinkel, jlebar, ABataev, tra
Subscribers: guansong, andreybokhanko, tcramer, mkuron, cfe-commits, arpith-jacob, carlo.bertolli, caomhin
Differential Revision: https://reviews.llvm.org/D18171
llvm-svn: 275645
Diffstat (limited to 'clang/lib/Driver/Tools.cpp')
-rw-r--r-- | clang/lib/Driver/Tools.cpp | 110
1 file changed, 72 insertions(+), 38 deletions(-)
```diff
diff --git a/clang/lib/Driver/Tools.cpp b/clang/lib/Driver/Tools.cpp
index 63284bc..df4a996 100644
--- a/clang/lib/Driver/Tools.cpp
+++ b/clang/lib/Driver/Tools.cpp
@@ -296,12 +296,45 @@ static bool forwardToGCC(const Option &O) {
          !O.hasFlag(options::DriverOption) && !O.hasFlag(options::LinkerInput);
 }
 
+/// Add the C++ include args of other offloading toolchains. If this is a host
+/// job, the device toolchains are added. If this is a device job, the host
+/// toolchains will be added.
+static void addExtraOffloadCXXStdlibIncludeArgs(Compilation &C,
+                                                const JobAction &JA,
+                                                const ArgList &Args,
+                                                ArgStringList &CmdArgs) {
+
+  if (JA.isHostOffloading(Action::OFK_Cuda))
+    C.getSingleOffloadToolChain<Action::OFK_Cuda>()
+        ->AddClangCXXStdlibIncludeArgs(Args, CmdArgs);
+  else if (JA.isDeviceOffloading(Action::OFK_Cuda))
+    C.getSingleOffloadToolChain<Action::OFK_Host>()
+        ->AddClangCXXStdlibIncludeArgs(Args, CmdArgs);
+
+  // TODO: Add support for other programming models here.
+}
+
+/// Add the include args that are specific of each offloading programming model.
+static void addExtraOffloadSpecificIncludeArgs(Compilation &C,
+                                               const JobAction &JA,
+                                               const ArgList &Args,
+                                               ArgStringList &CmdArgs) {
+
+  if (JA.isHostOffloading(Action::OFK_Cuda))
+    C.getSingleOffloadToolChain<Action::OFK_Host>()->AddCudaIncludeArgs(
+        Args, CmdArgs);
+  else if (JA.isDeviceOffloading(Action::OFK_Cuda))
+    C.getSingleOffloadToolChain<Action::OFK_Cuda>()->AddCudaIncludeArgs(
+        Args, CmdArgs);
+
+  // TODO: Add support for other programming models here.
+}
+
 void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
                                     const Driver &D, const ArgList &Args,
                                     ArgStringList &CmdArgs,
                                     const InputInfo &Output,
-                                    const InputInfoList &Inputs,
-                                    const ToolChain *AuxToolChain) const {
+                                    const InputInfoList &Inputs) const {
   Arg *A;
   const bool IsIAMCU = getToolChain().getTriple().isOSIAMCU();
 
@@ -566,31 +599,27 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
   // OBJCPLUS_INCLUDE_PATH - system includes enabled when compiling ObjC++.
   addDirectoryList(Args, CmdArgs, "-objcxx-isystem", "OBJCPLUS_INCLUDE_PATH");
 
-  // Optional AuxToolChain indicates that we need to include headers
-  // for more than one target. If that's the case, add include paths
-  // from AuxToolChain right after include paths of the same kind for
-  // the current target.
+  // While adding the include arguments, we also attempt to retrieve the
+  // arguments of related offloading toolchains or arguments that are specific
+  // of an offloading programming model.
 
   // Add C++ include arguments, if needed.
   if (types::isCXX(Inputs[0].getType())) {
     getToolChain().AddClangCXXStdlibIncludeArgs(Args, CmdArgs);
-    if (AuxToolChain)
-      AuxToolChain->AddClangCXXStdlibIncludeArgs(Args, CmdArgs);
+    addExtraOffloadCXXStdlibIncludeArgs(C, JA, Args, CmdArgs);
   }
 
   // Add system include arguments for all targets but IAMCU.
   if (!IsIAMCU) {
     getToolChain().AddClangSystemIncludeArgs(Args, CmdArgs);
-    if (AuxToolChain)
-      AuxToolChain->AddClangCXXStdlibIncludeArgs(Args, CmdArgs);
+    addExtraOffloadCXXStdlibIncludeArgs(C, JA, Args, CmdArgs);
   } else {
     // For IAMCU add special include arguments.
     getToolChain().AddIAMCUIncludeArgs(Args, CmdArgs);
   }
 
-  // Add CUDA include arguments, if needed.
-  if (types::isCuda(Inputs[0].getType()))
-    getToolChain().AddCudaIncludeArgs(Args, CmdArgs);
+  // Add offload include arguments, if needed.
+  addExtraOffloadSpecificIncludeArgs(C, JA, Args, CmdArgs);
 }
 
 // FIXME: Move to target hook.
@@ -3799,7 +3828,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   // CUDA compilation may have multiple inputs (source file + results of
   // device-side compilations). All other jobs are expected to have exactly one
   // input.
-  bool IsCuda = types::isCuda(Input.getType());
+  bool IsCuda = JA.isOffloading(Action::OFK_Cuda);
   assert((IsCuda || Inputs.size() == 1) && "Unable to handle multiple inputs.");
 
   // C++ is not supported for IAMCU.
@@ -3815,21 +3844,21 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   CmdArgs.push_back("-triple");
   CmdArgs.push_back(Args.MakeArgString(TripleStr));
 
-  const ToolChain *AuxToolChain = nullptr;
   if (IsCuda) {
-    // FIXME: We need a (better) way to pass information about
-    // particular compilation pass we're constructing here. For now we
-    // can check which toolchain we're using and pick the other one to
-    // extract the triple.
-    if (&getToolChain() == C.getSingleOffloadToolChain<Action::OFK_Cuda>())
-      AuxToolChain = C.getOffloadingHostToolChain();
-    else if (&getToolChain() == C.getOffloadingHostToolChain())
-      AuxToolChain = C.getSingleOffloadToolChain<Action::OFK_Cuda>();
+    // We have to pass the triple of the host if compiling for a CUDA device and
+    // vice-versa.
+    StringRef NormalizedTriple;
+    if (JA.isDeviceOffloading(Action::OFK_Cuda))
+      NormalizedTriple = C.getSingleOffloadToolChain<Action::OFK_Host>()
+                             ->getTriple()
+                             .normalize();
     else
-      llvm_unreachable("Can't figure out CUDA compilation mode.");
-    assert(AuxToolChain != nullptr && "No aux toolchain.");
+      NormalizedTriple = C.getSingleOffloadToolChain<Action::OFK_Cuda>()
+                             ->getTriple()
+                             .normalize();
+
     CmdArgs.push_back("-aux-triple");
-    CmdArgs.push_back(Args.MakeArgString(AuxToolChain->getTriple().str()));
+    CmdArgs.push_back(Args.MakeArgString(NormalizedTriple));
   }
 
   if (Triple.isOSWindows() && (Triple.getArch() == llvm::Triple::arm ||
@@ -4718,8 +4747,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   //
   // FIXME: Support -fpreprocessed
   if (types::getPreprocessedType(InputType) != types::TY_INVALID)
-    AddPreprocessingOptions(C, JA, D, Args, CmdArgs, Output, Inputs,
-                            AuxToolChain);
+    AddPreprocessingOptions(C, JA, D, Args, CmdArgs, Output, Inputs);
 
   // Don't warn about "clang -c -DPIC -fPIC test.i" because libtool.m4 assumes
   // that "The compiler can only warn and ignore the option if not recognized".
@@ -11193,15 +11221,14 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
       static_cast<const toolchains::CudaToolChain &>(getToolChain());
   assert(TC.getTriple().isNVPTX() && "Wrong platform");
 
-  std::vector<std::string> gpu_archs =
-      Args.getAllArgValues(options::OPT_march_EQ);
-  assert(gpu_archs.size() == 1 && "Exactly one GPU Arch required for ptxas.");
-  const std::string& gpu_arch = gpu_archs[0];
+  // Obtain architecture from the action.
+  CudaArch gpu_arch = StringToCudaArch(JA.getOffloadingArch());
+  assert(gpu_arch != CudaArch::UNKNOWN &&
+         "Device action expected to have an architecture.");
 
   // Check that our installation's ptxas supports gpu_arch.
   if (!Args.hasArg(options::OPT_no_cuda_version_check)) {
-    TC.cudaInstallation().CheckCudaVersionSupportsArch(
-        StringToCudaArch(gpu_arch));
+    TC.cudaInstallation().CheckCudaVersionSupportsArch(gpu_arch);
   }
 
   ArgStringList CmdArgs;
@@ -11245,7 +11272,7 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
   }
 
   CmdArgs.push_back("--gpu-name");
-  CmdArgs.push_back(Args.MakeArgString(gpu_arch));
+  CmdArgs.push_back(Args.MakeArgString(CudaArchToString(gpu_arch)));
   CmdArgs.push_back("--output-file");
   CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
   for (const auto& II : Inputs)
@@ -11277,13 +11304,20 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
 
   for (const auto& II : Inputs) {
-    auto* A = cast<const CudaDeviceAction>(II.getAction());
+    auto *A = II.getAction();
+    assert(A->getInputs().size() == 1 &&
+           "Device offload action is expected to have a single input");
+    const char *gpu_arch_str = A->getOffloadingArch();
+    assert(gpu_arch_str &&
+           "Device action expected to have associated a GPU architecture!");
+    CudaArch gpu_arch = StringToCudaArch(gpu_arch_str);
+
     // We need to pass an Arch of the form "sm_XX" for cubin files and
     // "compute_XX" for ptx.
     const char *Arch = (II.getType() == types::TY_PP_Asm)
-                           ? CudaVirtualArchToString(VirtualArchForCudaArch(A->getGpuArch()))
-                           : CudaArchToString(A->getGpuArch());
+                           ? CudaVirtualArchToString(VirtualArchForCudaArch(gpu_arch))
+                           : gpu_arch_str;
     CmdArgs.push_back(Args.MakeArgString(llvm::Twine("--image=profile=") +
                                          Arch + ",file=" + II.getFilename()));
   }
```
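To make the profile-naming rule in the last hunk concrete, here is a minimal standalone sketch of the sm_XX/compute_XX mapping the linker relies on when it assembles fatbinary's `--image` arguments (the `profileFor` helper and the file names are hypothetical illustrations, not clang's `CudaArchToString`/`VirtualArchForCudaArch` machinery):

```cpp
#include <iostream>
#include <string>

// Cubin (object) inputs are registered under the real architecture "sm_XX",
// while PTX (assembler) inputs use the matching virtual architecture
// "compute_XX".
static std::string profileFor(const std::string &GpuArch, bool IsPtx) {
  if (IsPtx) // "sm_35" -> "compute_35"
    return "compute_" + GpuArch.substr(GpuArch.find('_') + 1);
  return GpuArch;
}

int main() {
  // fatbinary receives one --image argument per device input file.
  std::cout << "--image=profile=" << profileFor("sm_35", /*IsPtx=*/false)
            << ",file=cudatests-sm_35.cubin\n"
            << "--image=profile=" << profileFor("sm_35", /*IsPtx=*/true)
            << ",file=cudatests-sm_35.s\n";
  return 0;
}
```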