Merge pull request #4972 from obilaniu/cudafixes

CUDA fixes
author: Jussi Pakkanen <jpakkane@gmail.com> 2019-02-24 23:48:28 +0200
committer: GitHub <noreply@github.com> 2019-02-24 23:48:28 +0200
commit: 41fb5c2960b678ec7722d5f9b3c555757ba8a6bd (patch)
tree: 7643436b63bdf4773a3e0b3a9d012802ec01aaac
parent: 5b53335724c11e4443ac62bfbaea90881323d2b1 (diff)
parent: 104397a4293f78d3cbdd84f380cefb84ca54ec99 (diff)
download: meson-41fb5c2960b678ec7722d5f9b3c555757ba8a6bd.zip
meson-41fb5c2960b678ec7722d5f9b3c555757ba8a6bd.tar.gz
meson-41fb5c2960b678ec7722d5f9b3c555757ba8a6bd.tar.bz2
3 files changed, 106 insertions, 76 deletions
diff --git a/docs/markdown/Cuda-module.md b/docs/markdown/Cuda-module.md
index caa1756..f161eac 100644
--- a/docs/markdown/Cuda-module.md
+++ b/docs/markdown/Cuda-module.md
@@ -71,6 +71,14 @@ mixed with architecture names or compute capabilities. Their interpretation is:
 | `'Common'`        | Relatively common CCs supported by given NVCC compiler. Generally excludes Tegra and Tesla devices. |
 | `'Auto'`          | The CCs provided by the `detected:` keyword, filtered for support by given NVCC compiler. |
 
+As a special case, when `nvcc_arch_flags()` is invoked with
+
+- an NVCC `compiler` object `nvcc`,
+- `'Auto'` mode and
+- no `detected:` keyword,
+
+Meson uses `nvcc`'s architecture auto-detection results.
+
 The supported architecture names and their corresponding compute capabilities
 are:
 
@@ -85,7 +93,7 @@ are:
 | `'Pascal'`        | 6.0, 6.1           |
 | `'Pascal+Tegra'`  | 6.2                |
 | `'Volta'`         | 7.0                |
-| `'Volta+Tegra'`   | 7.2                |
+| `'Xavier'`        | 7.2                |
 | `'Turing'`        | 7.5                |
 
 
diff --git a/mesonbuild/compilers/cuda.py b/mesonbuild/compilers/cuda.py
index 66dcf33..d1964fd 100644
--- a/mesonbuild/compilers/cuda.py
+++ b/mesonbuild/compilers/cuda.py
@@ -47,35 +47,97 @@ class CudaCompiler(Compiler):
         return []
 
     def sanity_check(self, work_dir, environment):
-        source_name = os.path.join(work_dir, 'sanitycheckcuda.cu')
-        binary_name = os.path.join(work_dir, 'sanitycheckcuda')
-        extra_flags = self.get_cross_extra_flags(environment, link=False)
-        if self.is_cross:
-            extra_flags += self.get_compile_only_args()
-
-        code = '''
-__global__ void kernel (void) {
-
-}
+        mlog.debug('Sanity testing ' + self.get_display_language() + ' compiler:', ' '.join(self.exelist))
+        mlog.debug('Is cross compiler: %s.' % str(self.is_cross))
 
-        int main(int argc,char** argv){
+        sname = 'sanitycheckcuda.cu'
+        code = r'''
+        #include <cuda_runtime.h>
+        #include <stdio.h>
+
+        __global__ void kernel (void) {}
+
+        int main(void){
+            struct cudaDeviceProp prop;
+            int count, i;
+            cudaError_t ret = cudaGetDeviceCount(&count);
+            if(ret != cudaSuccess){
+                fprintf(stderr, "%d\n", (int)ret);
+            }else{
+                for(i=0;i<count;i++){
+                    if(cudaGetDeviceProperties(&prop, i) == cudaSuccess){
+                        fprintf(stdout, "%d.%d\n", prop.major, prop.minor);
+                    }
+                }
+            }
+            fflush(stderr);
+            fflush(stdout);
             return 0;
         }
         '''
-
+        binname = sname.rsplit('.', 1)[0]
+        binname += '_cross' if self.is_cross else ''
+        source_name = os.path.join(work_dir, sname)
+        binary_name = os.path.join(work_dir, binname + '.exe')
         with open(source_name, 'w') as ofile:
             ofile.write(code)
-        pc = subprocess.Popen(self.exelist + extra_flags + [source_name, '-o', binary_name])
-        pc.wait()
+
+        # The Sanity Test for CUDA language will serve as both a sanity test
+        # and a native-build GPU architecture detection test, useful later.
+        #
+        # For this second purpose, NVCC has very handy flags, --run and
+        # --run-args, that allow one to run an application with the
+        # environment set up properly. Of course, this only works for native
+        # builds; for cross builds we must still use the exe_wrapper (if any).
+        self.detected_cc = ''
+        flags = ['-w', '-cudart', 'static', source_name]
+        if self.is_cross and self.exe_wrapper is None:
+            # Linking cross built apps is painful. You can't really
+            # tell if you should use -nostdlib or not and for example
+            # on OSX the compiler binary is the same but you need
+            # a ton of compiler flags to differentiate between
+            # arm and x86_64. So just compile.
+            flags += self.get_compile_only_args()
+        flags += self.get_output_args(binary_name)
+
+        # Compile sanity check
+        cmdlist = self.exelist + flags
+        mlog.debug('Sanity check compiler command line: ', ' '.join(cmdlist))
+        pc, stdo, stde = Popen_safe(cmdlist, cwd=work_dir)
+        mlog.debug('Sanity check compile stdout: ')
+        mlog.debug(stdo)
+        mlog.debug('-----\nSanity check compile stderr:')
+        mlog.debug(stde)
+        mlog.debug('-----')
         if pc.returncode != 0:
-            raise EnvironmentException('Cuda compiler %s can not compile programs.' % self.name_string())
+            raise EnvironmentException('Compiler {0} can not compile programs.'.format(self.name_string()))
+
+        # Run sanity check (if possible)
         if self.is_cross:
-            # Can't check if the binaries run so we have to assume they do
-            return
-        pe = subprocess.Popen(binary_name)
+            if self.exe_wrapper is None:
+                return
+            else:
+                cmdlist = self.exe_wrapper + [binary_name]
+        else:
+            cmdlist = self.exelist + ['--run', '"' + binary_name + '"']
+        mlog.debug('Sanity check run command line: ', ' '.join(cmdlist))
+        pe, stdo, stde = Popen_safe(cmdlist, cwd=work_dir)
+        mlog.debug('Sanity check run stdout: ')
+        mlog.debug(stdo)
+        mlog.debug('-----\nSanity check run stderr:')
+        mlog.debug(stde)
+        mlog.debug('-----')
         pe.wait()
         if pe.returncode != 0:
-            raise EnvironmentException('Executables created by Cuda compiler %s are not runnable.' % self.name_string())
+            raise EnvironmentException('Executables created by {0} compiler {1} are not runnable.'.format(self.language, self.name_string()))
+
+        # Interpret the result of the sanity test.
+        # As mentioned above, it is not only a sanity test but also a GPU
+        # architecture detection test.
+        if stde == '':
+            self.detected_cc = stdo
+        else:
+            mlog.debug('cudaGetDeviceCount() returned ' + stde)
 
     def get_compiler_check_args(self):
         return super().get_compiler_check_args() + []
@@ -92,56 +154,6 @@ __global__ void kernel (void) {
         int main () {{ return 0; }}'''
         return self.compiles(t.format(**fargs), env, extra_args, dependencies)
 
-    def sanity_check_impl(self, work_dir, environment, sname, code):
-        mlog.debug('Sanity testing ' + self.get_display_language() + ' compiler:', ' '.join(self.exelist))
-        mlog.debug('Is cross compiler: %s.' % str(self.is_cross))
-
-        extra_flags = []
-        source_name = os.path.join(work_dir, sname)
-        binname = sname.rsplit('.', 1)[0]
-        if self.is_cross:
-            binname += '_cross'
-            if self.exe_wrapper is None:
-                # Linking cross built apps is painful. You can't really
-                # tell if you should use -nostdlib or not and for example
-                # on OSX the compiler binary is the same but you need
-                # a ton of compiler flags to differentiate between
-                # arm and x86_64. So just compile.
-                extra_flags += self.get_cross_extra_flags(environment, link=False)
-                extra_flags += self.get_compile_only_args()
-            else:
-                extra_flags += self.get_cross_extra_flags(environment, link=True)
-        # Is a valid executable output for all toolchains and platforms
-        binname += '.exe'
-        # Write binary check source
-        binary_name = os.path.join(work_dir, binname)
-        with open(source_name, 'w') as ofile:
-            ofile.write(code)
-        # Compile sanity check
-        cmdlist = self.exelist + extra_flags + [source_name] + self.get_output_args(binary_name)
-        pc, stdo, stde = Popen_safe(cmdlist, cwd=work_dir)
-        mlog.debug('Sanity check compiler command line:', ' '.join(cmdlist))
-        mlog.debug('Sanity check compile stdout:')
-        mlog.debug(stdo)
-        mlog.debug('-----\nSanity check compile stderr:')
-        mlog.debug(stde)
-        mlog.debug('-----')
-        if pc.returncode != 0:
-            raise EnvironmentException('Compiler {0} can not compile programs.'.format(self.name_string()))
-        # Run sanity check
-        if self.is_cross:
-            if self.exe_wrapper is None:
-                # Can't check if the binaries run so we have to assume they do
-                return
-            cmdlist = self.exe_wrapper + [binary_name]
-        else:
-            cmdlist = [binary_name]
-        mlog.debug('Running test binary command: ' + ' '.join(cmdlist))
-        pe = subprocess.Popen(cmdlist)
-        pe.wait()
-        if pe.returncode != 0:
-            raise EnvironmentException('Executables created by {0} compiler {1} are not runnable.'.format(self.language, self.name_string()))
-
     @staticmethod
     def _cook_link_args(args):
         """
@@ -176,7 +188,7 @@ __global__ void kernel (void) {
         return cuda_debug_args[is_debug]
 
     def get_werror_args(self):
-        return ['-Werror']
+        return ['-Werror=cross-execution-space-call,deprecated-declarations,reorder']
 
     def get_linker_exelist(self):
         return self.exelist[:]
diff --git a/mesonbuild/modules/unstable_cuda.py b/mesonbuild/modules/unstable_cuda.py
index 941b15a..1a74973 100644
--- a/mesonbuild/modules/unstable_cuda.py
+++ b/mesonbuild/modules/unstable_cuda.py
@@ -77,11 +77,19 @@ class CudaModule(ExtensionModule):
 
     @staticmethod
     def _break_arch_string(s):
-        s = re.sub('[ \t,;]+', ';', s)
+        s = re.sub('[ \t\r\n,;]+', ';', s)
         s = s.strip(';').split(';')
         return s
 
     @staticmethod
+    def _detected_cc_from_compiler(c):
+        if isinstance(c, CompilerHolder):
+            c = c.compiler
+        if isinstance(c, CudaCompiler):
+            return c.detected_cc
+        return ''
+
+    @staticmethod
     def _version_from_compiler(c):
         if isinstance(c, CompilerHolder):
             c = c.compiler
@@ -97,7 +105,8 @@ class CudaModule(ExtensionModule):
         if len(args) < 1:
             raise argerror
         else:
-            cuda_version = self._version_from_compiler(args[0])
+            compiler = args[0]
+            cuda_version = self._version_from_compiler(compiler)
             if cuda_version == 'unknown':
                 raise argerror
 
@@ -108,7 +117,8 @@ class CudaModule(ExtensionModule):
             raise InvalidArguments('''The special architectures 'All', 'Common' and 'Auto' must appear alone, as a positional argument!''')
         arch_list = arch_list[0] if len(arch_list) == 1 else arch_list
 
-        detected = flatten([kwargs.get('detected', [])])
+        detected = kwargs.get('detected', self._detected_cc_from_compiler(compiler))
+        detected = flatten([detected])
         detected = [self._break_arch_string(a) for a in detected]
         detected = flatten(detected)
         if not set(detected).isdisjoint({'All', 'Common', 'Auto'}):
@@ -148,7 +158,7 @@ class CudaModule(ExtensionModule):
                 cuda_limit_gpu_architecture    = '7.0'        # noqa: E221
 
         if version_compare(cuda_version, '>=9.0'):
-            cuda_known_gpu_architectures  += ['Volta', 'Volta+Tegra']              # noqa: E221
+            cuda_known_gpu_architectures  += ['Volta', 'Xavier']                   # noqa: E221
             cuda_common_gpu_architectures += ['7.0', '7.0+PTX']                    # noqa: E221
             cuda_all_gpu_architectures    += ['7.0', '7.0+PTX', '7.2', '7.2+PTX']  # noqa: E221
 
@@ -215,7 +225,7 @@ class CudaModule(ExtensionModule):
                     'Pascal':        (['6.0', '6.1'],      ['6.1']),
                     'Pascal+Tegra':  (['6.2'],             []),
                     'Volta':         (['7.0'],             ['7.0']),
-                    'Volta+Tegra':   (['7.2'],             []),
+                    'Xavier':        (['7.2'],             []),
                     'Turing':        (['7.5'],             ['7.5']),
                 }.get(arch_name, (None, None))
author	Jussi Pakkanen <jpakkane@gmail.com>	2019-02-24 23:48:28 +0200
committer	GitHub <noreply@github.com>	2019-02-24 23:48:28 +0200
commit	41fb5c2960b678ec7722d5f9b3c555757ba8a6bd (patch)
tree	7643436b63bdf4773a3e0b3a9d012802ec01aaac
parent	5b53335724c11e4443ac62bfbaea90881323d2b1 (diff)
parent	104397a4293f78d3cbdd84f380cefb84ca54ec99 (diff)
download	meson-41fb5c2960b678ec7722d5f9b3c555757ba8a6bd.zip meson-41fb5c2960b678ec7722d5f9b3c555757ba8a6bd.tar.gz meson-41fb5c2960b678ec7722d5f9b3c555757ba8a6bd.tar.bz2