aboutsummaryrefslogtreecommitdiff
path: root/mesonbuild/compilers/cuda.py
blob: 78a58c764d15d212ff70e42fa7d1d60511d95880 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
# Copyright 2012-2017 The Meson development team

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import enum
import os.path
import string
import typing as T

from .. import coredata
from .. import mlog
from ..mesonlib import (
    EnvironmentException, MachineChoice, Popen_safe, OptionOverrideProxy,
    is_windows, LibType, OptionKey,
)
from .compilers import (Compiler, cuda_buildtype_args, cuda_optimization_args,
                        cuda_debug_args, CompileCheckMode)

if T.TYPE_CHECKING:
    from ..build import BuildTarget
    from ..coredata import MutableKeyedOptionDictType, KeyedOptionDictType
    from ..dependencies import Dependency
    from ..environment import Environment  # noqa: F401
    from ..envconfig import MachineInfo
    from ..linkers import DynamicLinker
    from ..programs import ExternalProgram


class _Phase(enum.Enum):

    COMPILER = 'compiler'
    LINKER = 'linker'


class CudaCompiler(Compiler):

    LINKER_PREFIX = '-Xlinker='
    language = 'cuda'

    # NVCC flags taking no arguments.
    _FLAG_PASSTHRU_NOARGS = {
        # NVCC --long-option,                   NVCC -short-option              CUDA Toolkit 11.2.1 Reference
        '--objdir-as-tempdir',                  '-objtemp',                     # 4.2.1.2
        '--generate-dependency-targets',        '-MP',                          # 4.2.1.12
        '--allow-unsupported-compiler',         '-allow-unsupported-compiler',  # 4.2.1.14
        '--link',                                                               # 4.2.2.1
        '--lib',                                '-lib',                         # 4.2.2.2
        '--device-link',                        '-dlink',                       # 4.2.2.3
        '--device-c',                           '-dc',                          # 4.2.2.4
        '--device-w',                           '-dw',                          # 4.2.2.5
        '--cuda',                               '-cuda',                        # 4.2.2.6
        '--compile',                            '-c',                           # 4.2.2.7
        '--fatbin',                             '-fatbin',                      # 4.2.2.8
        '--cubin',                              '-cubin',                       # 4.2.2.9
        '--ptx',                                '-ptx',                         # 4.2.2.10
        '--preprocess',                         '-E',                           # 4.2.2.11
        '--generate-dependencies',              '-M',                           # 4.2.2.12
        '--generate-nonsystem-dependencies',    '-MM',                          # 4.2.2.13
        '--generate-dependencies-with-compile', '-MD',                          # 4.2.2.14
        '--generate-nonsystem-dependencies-with-compile', '-MMD',               # 4.2.2.15
        '--run',                                                                # 4.2.2.16
        '--profile',                            '-pg',                          # 4.2.3.1
        '--debug',                              '-g',                           # 4.2.3.2
        '--device-debug',                       '-G',                           # 4.2.3.3
        '--extensible-whole-program',           '-ewp',                         # 4.2.3.4
        '--generate-line-info',                 '-lineinfo',                    # 4.2.3.5
        '--dlink-time-opt',                     '-dlto',                        # 4.2.3.8
        '--no-exceptions',                      '-noeh',                        # 4.2.3.11
        '--shared',                             '-shared',                      # 4.2.3.12
        '--no-host-device-initializer-list',    '-nohdinitlist',                # 4.2.3.15
        '--expt-relaxed-constexpr',             '-expt-relaxed-constexpr',      # 4.2.3.16
        '--extended-lambda',                    '-extended-lambda',             # 4.2.3.17
        '--expt-extended-lambda',               '-expt-extended-lambda',        # 4.2.3.18
        '--m32',                                '-m32',                         # 4.2.3.20
        '--m64',                                '-m64',                         # 4.2.3.21
        '--forward-unknown-to-host-compiler',   '-forward-unknown-to-host-compiler', # 4.2.5.1
        '--forward-unknown-to-host-linker',     '-forward-unknown-to-host-linker',   # 4.2.5.2
        '--dont-use-profile',                   '-noprof',                      # 4.2.5.3
        '--dryrun',                             '-dryrun',                      # 4.2.5.5
        '--verbose',                            '-v',                           # 4.2.5.6
        '--keep',                               '-keep',                        # 4.2.5.7
        '--save-temps',                         '-save-temps',                  # 4.2.5.9
        '--clean-targets',                      '-clean',                       # 4.2.5.10
        '--no-align-double',                                                    # 4.2.5.16
        '--no-device-link',                     '-nodlink',                     # 4.2.5.17
        '--allow-unsupported-compiler',         '-allow-unsupported-compiler',  # 4.2.5.18
        '--use_fast_math',                      '-use_fast_math',               # 4.2.7.7
        '--extra-device-vectorization',         '-extra-device-vectorization',  # 4.2.7.12
        '--compile-as-tools-patch',             '-astoolspatch',                # 4.2.7.13
        '--keep-device-functions',              '-keep-device-functions',       # 4.2.7.14
        '--disable-warnings',                   '-w',                           # 4.2.8.1
        '--source-in-ptx',                      '-src-in-ptx',                  # 4.2.8.2
        '--restrict',                           '-restrict',                    # 4.2.8.3
        '--Wno-deprecated-gpu-targets',         '-Wno-deprecated-gpu-targets',  # 4.2.8.4
        '--Wno-deprecated-declarations',        '-Wno-deprecated-declarations', # 4.2.8.5
        '--Wreorder',                           '-Wreorder',                    # 4.2.8.6
        '--Wdefault-stream-launch',             '-Wdefault-stream-launch',      # 4.2.8.7
        '--Wext-lambda-captures-this',          '-Wext-lambda-captures-this',   # 4.2.8.8
        '--display-error-number',               '-err-no',                      # 4.2.8.10
        '--resource-usage',                     '-res-usage',                   # 4.2.8.14
        '--help',                               '-h',                           # 4.2.8.15
        '--version',                            '-V',                           # 4.2.8.16
        '--list-gpu-code',                      '-code-ls',                     # 4.2.8.20
        '--list-gpu-arch',                      '-arch-ls',                     # 4.2.8.21
    }
    # Dictionary of NVCC flags taking either one argument or a comma-separated list.
    # Maps --long to -short options, because the short options are more GCC-like.
    _FLAG_LONG2SHORT_WITHARGS = {
        '--output-file':                        '-o',                           # 4.2.1.1
        '--pre-include':                        '-include',                     # 4.2.1.3
        '--library':                            '-l',                           # 4.2.1.4
        '--define-macro':                       '-D',                           # 4.2.1.5
        '--undefine-macro':                     '-U',                           # 4.2.1.6
        '--include-path':                       '-I',                           # 4.2.1.7
        '--system-include':                     '-isystem',                     # 4.2.1.8
        '--library-path':                       '-L',                           # 4.2.1.9
        '--output-directory':                   '-odir',                        # 4.2.1.10
        '--dependency-output':                  '-MF',                          # 4.2.1.11
        '--compiler-bindir':                    '-ccbin',                       # 4.2.1.13
        '--archiver-binary':                    '-arbin',                       # 4.2.1.15
        '--cudart':                             '-cudart',                      # 4.2.1.16
        '--cudadevrt':                          '-cudadevrt',                   # 4.2.1.17
        '--libdevice-directory':                '-ldir',                        # 4.2.1.18
        '--target-directory':                   '-target-dir',                  # 4.2.1.19
        '--optimization-info':                  '-opt-info',                    # 4.2.3.6
        '--optimize':                           '-O',                           # 4.2.3.7
        '--ftemplate-backtrace-limit':          '-ftemplate-backtrace-limit',   # 4.2.3.9
        '--ftemplate-depth':                    '-ftemplate-depth',             # 4.2.3.10
        '--x':                                  '-x',                           # 4.2.3.13
        '--std':                                '-std',                         # 4.2.3.14
        '--machine':                            '-m',                           # 4.2.3.19
        '--compiler-options':                   '-Xcompiler',                   # 4.2.4.1
        '--linker-options':                     '-Xlinker',                     # 4.2.4.2
        '--archive-options':                    '-Xarchive',                    # 4.2.4.3
        '--ptxas-options':                      '-Xptxas',                      # 4.2.4.4
        '--nvlink-options':                     '-Xnvlink',                     # 4.2.4.5
        '--threads':                            '-t',                           # 4.2.5.4
        '--keep-dir':                           '-keep-dir',                    # 4.2.5.8
        '--run-args':                           '-run-args',                    # 4.2.5.11
        '--input-drive-prefix':                 '-idp',                         # 4.2.5.12
        '--dependency-drive-prefix':            '-ddp',                         # 4.2.5.13
        '--drive-prefix':                       '-dp',                          # 4.2.5.14
        '--dependency-target-name':             '-MT',                          # 4.2.5.15
        '--default-stream':                     '-default-stream',              # 4.2.6.1
        '--gpu-architecture':                   '-arch',                        # 4.2.7.1
        '--gpu-code':                           '-code',                        # 4.2.7.2
        '--generate-code':                      '-gencode',                     # 4.2.7.3
        '--relocatable-device-code':            '-rdc',                         # 4.2.7.4
        '--entries':                            '-e',                           # 4.2.7.5
        '--maxrregcount':                       '-maxrregcount',                # 4.2.7.6
        '--ftz':                                '-ftz',                         # 4.2.7.8
        '--prec-div':                           '-prec-div',                    # 4.2.7.9
        '--prec-sqrt':                          '-prec-sqrt',                   # 4.2.7.10
        '--fmad':                               '-fmad',                        # 4.2.7.11
        '--Werror':                             '-Werror',                      # 4.2.8.9
        '--diag-error':                         '-diag-error',                  # 4.2.8.11
        '--diag-suppress':                      '-diag-suppress',               # 4.2.8.12
        '--diag-warn':                          '-diag-warn',                   # 4.2.8.13
        '--options-file':                       '-optf',                        # 4.2.8.17
        '--time':                               '-time',                        # 4.2.8.18
        '--qpp-config':                         '-qpp-config',                  # 4.2.8.19
    }
    # Reverse map -short to --long options.
    _FLAG_SHORT2LONG_WITHARGS = {v: k for k, v in _FLAG_LONG2SHORT_WITHARGS.items()}

    id = 'nvcc'

    def __init__(self, exelist: T.List[str], version: str, for_machine: MachineChoice,
                 is_cross: bool, exe_wrapper: T.Optional['ExternalProgram'],
                 host_compiler: Compiler, info: 'MachineInfo',
                 linker: T.Optional['DynamicLinker'] = None,
                 full_version: T.Optional[str] = None):
        super().__init__(exelist, version, for_machine, info, linker=linker, full_version=full_version, is_cross=is_cross)
        self.exe_wrapper = exe_wrapper
        self.host_compiler = host_compiler
        self.base_options = host_compiler.base_options
        self.warn_args = {level: self._to_host_flags(flags) for level, flags in host_compiler.warn_args.items()}

    @classmethod
    def _shield_nvcc_list_arg(cls, arg: str, listmode: bool = True) -> str:
        r"""
        Shield an argument against both splitting by NVCC's list-argument
        parse logic, and interpretation by any shell.

        NVCC seems to consider every comma , that is neither escaped by \ nor inside
        a double-quoted string a split-point. Single-quotes do not provide protection
        against splitting; In fact, after splitting they are \-escaped. Unfortunately,
        double-quotes don't protect against shell expansion. What follows is a
        complex dance to accommodate everybody.
        """

        SQ = "'"
        DQ = '"'
        CM = ","
        BS = "\\"
        DQSQ = DQ+SQ+DQ
        quotable = set(string.whitespace+'"$`\\')

        if CM not in arg or not listmode:
            if SQ not in arg:
                # If any of the special characters "$`\ or whitespace are present, single-quote.
                # Otherwise return bare.
                if set(arg).intersection(quotable):
                    return SQ+arg+SQ
                else:
                    return arg # Easy case: no splits, no quoting.
            else:
                # There are single quotes. Double-quote them, and single-quote the
                # strings between them.
                l = [cls._shield_nvcc_list_arg(s) for s in arg.split(SQ)]
                l = sum([[s, DQSQ] for s in l][:-1], [])  # Interleave l with DQSQs
                return ''.join(l)
        else:
            # A comma is present, and list mode was active.
            # We apply (what we guess is) the (primitive) NVCC splitting rule:
            l = ['']
            instring = False
            argit = iter(arg)
            for c in argit:
                if   c == CM and not instring:
                    l.append('')
                elif c == DQ:
                    l[-1] += c
                    instring = not instring
                elif c == BS:
                    try:
                        l[-1] += next(argit)
                    except StopIteration:
                        break
                else:
                    l[-1] += c

            # Shield individual strings, without listmode, then return them with
            # escaped commas between them.
            l = [cls._shield_nvcc_list_arg(s, listmode=False) for s in l]
            return r'\,'.join(l)

    @classmethod
    def _merge_flags(cls, flags: T.List[str]) -> T.List[str]:
        r"""
        The flags to NVCC gets exceedingly verbose and unreadable when too many of them
        are shielded with -Xcompiler. Merge consecutive -Xcompiler-wrapped arguments
        into one.
        """
        if len(flags) <= 1:
            return flags
        flagit = iter(flags)
        xflags = []

        def is_xcompiler_flag_isolated(flag: str) -> bool:
            return flag == '-Xcompiler'

        def is_xcompiler_flag_glued(flag: str) -> bool:
            return flag.startswith('-Xcompiler=')

        def is_xcompiler_flag(flag: str) -> bool:
            return is_xcompiler_flag_isolated(flag) or is_xcompiler_flag_glued(flag)

        def get_xcompiler_val(flag: str, flagit: T.Iterator[str]) -> str:
            if is_xcompiler_flag_glued(flag):
                return flag[len('-Xcompiler='):]
            else:
                try:
                    return next(flagit)
                except StopIteration:
                    return ""

        ingroup = False
        for flag in flagit:
            if not is_xcompiler_flag(flag):
                ingroup = False
                xflags.append(flag)
            elif ingroup:
                xflags[-1] += ','
                xflags[-1] += get_xcompiler_val(flag, flagit)
            elif is_xcompiler_flag_isolated(flag):
                ingroup = True
                xflags.append(flag)
                xflags.append(get_xcompiler_val(flag, flagit))
            elif is_xcompiler_flag_glued(flag):
                ingroup = True
                xflags.append(flag)
            else:
                raise ValueError("-Xcompiler flag merging failed, unknown argument form!")
        return xflags

    def _to_host_flags(self, flags: T.List[str], phase: _Phase = _Phase.COMPILER) -> T.List[str]:
        """
        Translate generic "GCC-speak" plus particular "NVCC-speak" flags to NVCC flags.

        NVCC's "short" flags have broad similarities to the GCC standard, but have
        gratuitous, irritating differences.
        """

        xflags = []
        flagit = iter(flags)

        for flag in flagit:
            # The CUDA Toolkit Documentation, in 4.1. Command Option Types and Notation,
            # specifies that NVCC does not parse the standard flags as GCC does. It has
            # its own strategy, to wit:
            #
            #     nvcc recognizes three types of command options: boolean options, single
            #     value options, and list options.
            #
            #     Boolean options do not have an argument; they are either specified on a
            #     command line or not. Single value options must be specified at most once,
            #     and list options may be repeated. Examples of each of these option types
            #     are, respectively: --verbose (switch to verbose mode), --output-file
            #     (specify output file), and --include-path (specify include path).
            #
            #     Single value options and list options must have arguments, which must
            #     follow the name of the option itself by either one of more spaces or an
            #     equals character. When a one-character short name such as -I, -l, and -L
            #     is used, the value of the option may also immediately follow the option
            #     itself without being separated by spaces or an equal character. The
            #     individual values of list options may be separated by commas in a single
            #     instance of the option, or the option may be repeated, or any
            #     combination of these two cases.
            #
            # One strange consequence of this choice is that directory and filenames that
            # contain commas (',') cannot be passed to NVCC (at least, not as easily as
            # in GCC). Another strange consequence is that it is legal to supply flags
            # such as
            #
            #     -lpthread,rt,dl,util
            #     -l pthread,rt,dl,util
            #     -l=pthread,rt,dl,util
            #
            # and each of the above alternatives is equivalent to GCC-speak
            #
            #     -lpthread -lrt -ldl -lutil
            #     -l pthread -l rt -l dl -l util
            #     -l=pthread -l=rt -l=dl -l=util
            #
            # *With the exception of commas in the name*, GCC-speak for these list flags
            # is a strict subset of NVCC-speak, so we passthrough those flags.
            #
            # The -D macro-define flag is documented as somehow shielding commas from
            # splitting a definition. Balanced parentheses, braces and single-quotes
            # around the comma are not sufficient, but balanced double-quotes are. The
            # shielding appears to work with -l, -I, -L flags as well, for instance.
            #
            # Since our goal is to replicate GCC-speak as much as possible, we check for
            # commas in all list-arguments and shield them with double-quotes. We make
            # an exception for -D (where this would be value-changing) and -U (because
            # it isn't possible to define a macro with a comma in the name).

            if flag in self._FLAG_PASSTHRU_NOARGS:
                xflags.append(flag)
                continue

            # Handle breakup of flag-values into a flag-part and value-part.
            if   flag[:1] not in '-/':
                # This is not a flag. It's probably a file input. Pass it through.
                xflags.append(flag)
                continue
            elif flag[:1] == '/':
                # This is ambiguously either an MVSC-style /switch or an absolute path
                # to a file. For some magical reason the following works acceptably in
                # both cases.
                wrap = '"' if ',' in flag else ''
                xflags.append(f'-X{phase.value}={wrap}{flag}{wrap}')
                continue
            elif len(flag) >= 2 and flag[0] == '-' and flag[1] in 'IDULlmOxmte':
                # This is a single-letter short option. These options (with the
                # exception of -o) are allowed to receive their argument with neither
                # space nor = sign before them. Detect and separate them in that event.
                if   flag[2:3] == '':            # -I something
                    try:
                        val = next(flagit)
                    except StopIteration:
                        pass
                elif flag[2:3] == '=':           # -I=something
                    val = flag[3:]
                else:                            # -Isomething
                    val = flag[2:]
                flag = flag[:2]                  # -I
            elif flag in self._FLAG_LONG2SHORT_WITHARGS or \
                 flag in self._FLAG_SHORT2LONG_WITHARGS:
                # This is either -o or a multi-letter flag, and it is receiving its
                # value isolated.
                try:
                    val = next(flagit)           # -o something
                except StopIteration:
                    pass
            elif flag.split('=', 1)[0] in self._FLAG_LONG2SHORT_WITHARGS or \
                 flag.split('=', 1)[0] in self._FLAG_SHORT2LONG_WITHARGS:
                # This is either -o or a multi-letter flag, and it is receiving its
                # value after an = sign.
                flag, val = flag.split('=', 1)    # -o=something
            # Some dependencies (e.g., BoostDependency) add unspaced "-isystem/usr/include" arguments
            elif flag.startswith('-isystem'):
                val = flag[8:].strip()
                flag = flag[:8]
            else:
                # This is a flag, and it's foreign to NVCC.
                #
                # We do not know whether this GCC-speak flag takes an isolated
                # argument. Assuming it does not (the vast majority indeed don't),
                # wrap this argument in an -Xcompiler flag and send it down to NVCC.
                if   flag == '-ffast-math':
                    xflags.append('-use_fast_math')
                    xflags.append('-Xcompiler='+flag)
                elif flag == '-fno-fast-math':
                    xflags.append('-ftz=false')
                    xflags.append('-prec-div=true')
                    xflags.append('-prec-sqrt=true')
                    xflags.append('-Xcompiler='+flag)
                elif flag == '-freciprocal-math':
                    xflags.append('-prec-div=false')
                    xflags.append('-Xcompiler='+flag)
                elif flag == '-fno-reciprocal-math':
                    xflags.append('-prec-div=true')
                    xflags.append('-Xcompiler='+flag)
                else:
                    xflags.append('-Xcompiler='+self._shield_nvcc_list_arg(flag))
                    # The above should securely handle GCC's -Wl, -Wa, -Wp, arguments.
                continue

            assert val is not None  # Should only trip if there is a missing argument.

            # Take care of the various NVCC-supported flags that need special handling.
            flag = self._FLAG_LONG2SHORT_WITHARGS.get(flag, flag)

            if   flag in {'-include', '-isystem', '-I', '-L', '-l'}:
                # These flags are known to GCC, but list-valued in NVCC. They potentially
                # require double-quoting to prevent NVCC interpreting the flags as lists
                # when GCC would not have done so.
                #
                # We avoid doing this quoting for -D to avoid redefining macros and for
                # -U because it isn't possible to define a macro with a comma in the name.
                # -U with comma arguments is impossible in GCC-speak (and thus unambiguous
                #in NVCC-speak, albeit unportable).
                if len(flag) == 2:
                    xflags.append(flag+self._shield_nvcc_list_arg(val))
                elif flag == '-isystem' and val in self.host_compiler.get_default_include_dirs():
                    # like GnuLikeCompiler, we have to filter out include directories specified
                    # with -isystem that overlap with the host compiler's search path
                    pass
                else:
                    xflags.append(flag)
                    xflags.append(self._shield_nvcc_list_arg(val))
            elif flag == '-O':
                # Handle optimization levels GCC knows about that NVCC does not.
                if   val == 'fast':
                    xflags.append('-O3')
                    xflags.append('-use_fast_math')
                    xflags.append('-Xcompiler')
                    xflags.append(flag+val)
                elif val in {'s', 'g', 'z'}:
                    xflags.append('-Xcompiler')
                    xflags.append(flag+val)
                else:
                    xflags.append(flag+val)
            elif flag in {'-D', '-U', '-m', '-t'}:
                xflags.append(flag+val)       # For style, keep glued.
            elif flag in {'-std'}:
                xflags.append(flag+'='+val)   # For style, keep glued.
            else:
                xflags.append(flag)
                xflags.append(val)

        return self._merge_flags(xflags)

    def needs_static_linker(self) -> bool:
        return False

    def thread_link_flags(self, environment: 'Environment') -> T.List[str]:
        return self._to_host_flags(self.host_compiler.thread_link_flags(environment), _Phase.LINKER)

    def sanity_check(self, work_dir: str, env: 'Environment') -> None:
        mlog.debug('Sanity testing ' + self.get_display_language() + ' compiler:', ' '.join(self.exelist))
        mlog.debug('Is cross compiler: %s.' % str(self.is_cross))

        sname = 'sanitycheckcuda.cu'
        code = r'''
        #include <cuda_runtime.h>
        #include <stdio.h>

        __global__ void kernel (void) {}

        int main(void){
            struct cudaDeviceProp prop;
            int count, i;
            cudaError_t ret = cudaGetDeviceCount(&count);
            if(ret != cudaSuccess){
                fprintf(stderr, "%d\n", (int)ret);
            }else{
                for(i=0;i<count;i++){
                    if(cudaGetDeviceProperties(&prop, i) == cudaSuccess){
                        fprintf(stdout, "%d.%d\n", prop.major, prop.minor);
                    }
                }
            }
            fflush(stderr);
            fflush(stdout);
            return 0;
        }
        '''
        binname = sname.rsplit('.', 1)[0]
        binname += '_cross' if self.is_cross else ''
        source_name = os.path.join(work_dir, sname)
        binary_name = os.path.join(work_dir, binname + '.exe')
        with open(source_name, 'w', encoding='utf-8') as ofile:
            ofile.write(code)

        # The Sanity Test for CUDA language will serve as both a sanity test
        # and a native-build GPU architecture detection test, useful later.
        #
        # For this second purpose, NVCC has very handy flags, --run and
        # --run-args, that allow one to run an application with the
        # environment set up properly. Of course, this only works for native
        # builds; For cross builds we must still use the exe_wrapper (if any).
        self.detected_cc = ''
        flags = []

        # Disable warnings, compile with statically-linked runtime for minimum
        # reliance on the system.
        flags += ['-w', '-cudart', 'static', source_name]

        # Use the -ccbin option, if available, even during sanity checking.
        # Otherwise, on systems where CUDA does not support the default compiler,
        # NVCC becomes unusable.
        flags += self.get_ccbin_args(env.coredata.options)

        # If cross-compiling, we can't run the sanity check, only compile it.
        if self.is_cross and self.exe_wrapper is None:
            # Linking cross built apps is painful. You can't really
            # tell if you should use -nostdlib or not and for example
            # on OSX the compiler binary is the same but you need
            # a ton of compiler flags to differentiate between
            # arm and x86_64. So just compile.
            flags += self.get_compile_only_args()
        flags += self.get_output_args(binary_name)

        # Compile sanity check
        cmdlist = self.exelist + flags
        mlog.debug('Sanity check compiler command line: ', ' '.join(cmdlist))
        pc, stdo, stde = Popen_safe(cmdlist, cwd=work_dir)
        mlog.debug('Sanity check compile stdout: ')
        mlog.debug(stdo)
        mlog.debug('-----\nSanity check compile stderr:')
        mlog.debug(stde)
        mlog.debug('-----')
        if pc.returncode != 0:
            raise EnvironmentException(f'Compiler {self.name_string()} can not compile programs.')

        # Run sanity check (if possible)
        if self.is_cross:
            if self.exe_wrapper is None:
                return
            else:
                cmdlist = self.exe_wrapper.get_command() + [binary_name]
        else:
            cmdlist = self.exelist + ['--run', '"' + binary_name + '"']
        mlog.debug('Sanity check run command line: ', ' '.join(cmdlist))
        pe, stdo, stde = Popen_safe(cmdlist, cwd=work_dir)
        mlog.debug('Sanity check run stdout: ')
        mlog.debug(stdo)
        mlog.debug('-----\nSanity check run stderr:')
        mlog.debug(stde)
        mlog.debug('-----')
        pe.wait()
        if pe.returncode != 0:
            raise EnvironmentException(f'Executables created by {self.language} compiler {self.name_string()} are not runnable.')

        # Interpret the result of the sanity test.
        # As mentioned above, it is not only a sanity test but also a GPU
        # architecture detection test.
        if stde == '':
            self.detected_cc = stdo
        else:
            mlog.debug('cudaGetDeviceCount() returned ' + stde)

    def has_header_symbol(self, hname: str, symbol: str, prefix: str,
                          env: 'Environment', *,
                          extra_args: T.Union[None, T.List[str], T.Callable[[CompileCheckMode], T.List[str]]] = None,
                          dependencies: T.Optional[T.List['Dependency']] = None) -> T.Tuple[bool, bool]:
        if extra_args is None:
            extra_args = []
        fargs = {'prefix': prefix, 'header': hname, 'symbol': symbol}
        # Check if it's a C-like symbol
        t = '''{prefix}
        #include <{header}>
        int main(void) {{
            /* If it's not defined as a macro, try to use as a symbol */
            #ifndef {symbol}
                {symbol};
            #endif
            return 0;
        }}'''
        found, cached = self.compiles(t.format_map(fargs), env, extra_args=extra_args, dependencies=dependencies)
        if found:
            return True, cached
        # Check if it's a class or a template
        t = '''{prefix}
        #include <{header}>
        using {symbol};
        int main(void) {{
            return 0;
        }}'''
        return self.compiles(t.format_map(fargs), env, extra_args=extra_args, dependencies=dependencies)

    def get_options(self) -> 'MutableKeyedOptionDictType':
        opts = super().get_options()
        std_key      = OptionKey('std',      machine=self.for_machine, lang=self.language)
        ccbindir_key = OptionKey('ccbindir', machine=self.for_machine, lang=self.language)
        opts.update({
            std_key:      coredata.UserComboOption('C++ language standard to use with CUDA',
                                                   ['none', 'c++03', 'c++11', 'c++14', 'c++17'], 'none'),
            ccbindir_key: coredata.UserStringOption('CUDA non-default toolchain directory to use (-ccbin)',
                                                    ''),
        })
        return opts

    def _to_host_compiler_options(self, options: 'KeyedOptionDictType') -> 'KeyedOptionDictType':
        """
        Convert an NVCC Option set to a host compiler's option set.
        """

        # We must strip the -std option from the host compiler option set, as NVCC has
        # its own -std flag that may not agree with the host compiler's.
        host_options = {key: options.get(key, opt) for key, opt in self.host_compiler.get_options().items()}
        std_key = OptionKey('std', machine=self.for_machine, lang=self.host_compiler.language)
        overrides = {std_key: 'none'}
        return OptionOverrideProxy(overrides, host_options)

    def get_option_compile_args(self, options: 'KeyedOptionDictType') -> T.List[str]:
        args = self.get_ccbin_args(options)
        # On Windows, the version of the C++ standard used by nvcc is dictated by
        # the combination of CUDA version and MSVC version; the --std= is thus ignored
        # and attempting to use it will result in a warning: https://stackoverflow.com/a/51272091/741027
        if not is_windows():
            key = OptionKey('std', machine=self.for_machine, lang=self.language)
            std = options[key]
            if std.value != 'none':
                args.append('--std=' + std.value)

        return args + self._to_host_flags(self.host_compiler.get_option_compile_args(self._to_host_compiler_options(options)))

    def get_option_link_args(self, options: 'KeyedOptionDictType') -> T.List[str]:
        args = self.get_ccbin_args(options)
        return args + self._to_host_flags(self.host_compiler.get_option_link_args(self._to_host_compiler_options(options)), _Phase.LINKER)

    def get_soname_args(self, env: 'Environment', prefix: str, shlib_name: str,
                        suffix: str, soversion: str,
                        darwin_versions: T.Tuple[str, str]) -> T.List[str]:
        return self._to_host_flags(self.host_compiler.get_soname_args(
            env, prefix, shlib_name, suffix, soversion, darwin_versions), _Phase.LINKER)

    def get_compile_only_args(self) -> T.List[str]:
        return ['-c']

    def get_no_optimization_args(self) -> T.List[str]:
        return ['-O0']

    def get_optimization_args(self, optimization_level: str) -> T.List[str]:
        # alternatively, consider simply redirecting this to the host compiler, which would
        # give us more control over options like "optimize for space" (which nvcc doesn't support):
        # return self._to_host_flags(self.host_compiler.get_optimization_args(optimization_level))
        return cuda_optimization_args[optimization_level]

    def sanitizer_compile_args(self, value: str) -> T.List[str]:
        return self._to_host_flags(self.host_compiler.sanitizer_compile_args(value))

    def sanitizer_link_args(self, value: str) -> T.List[str]:
        return self._to_host_flags(self.host_compiler.sanitizer_link_args(value))

    def get_debug_args(self, is_debug: bool) -> T.List[str]:
        return cuda_debug_args[is_debug]

    def get_werror_args(self) -> T.List[str]:
        return ['-Werror=cross-execution-space-call,deprecated-declarations,reorder']

    def get_warn_args(self, level: str) -> T.List[str]:
        return self.warn_args[level]

    def get_buildtype_args(self, buildtype: str) -> T.List[str]:
        # nvcc doesn't support msvc's "Edit and Continue" PDB format; "downgrade" to
        # a regular PDB to avoid cl's warning to that effect (D9025 : overriding '/ZI' with '/Zi')
        host_args = ['/Zi' if arg == '/ZI' else arg for arg in self.host_compiler.get_buildtype_args(buildtype)]
        return cuda_buildtype_args[buildtype] + self._to_host_flags(host_args)

    def get_include_args(self, path: str, is_system: bool) -> T.List[str]:
        if path == '':
            path = '.'
        return ['-isystem=' + path] if is_system else ['-I' + path]

    def get_compile_debugfile_args(self, rel_obj: str, pch: bool = False) -> T.List[str]:
        return self._to_host_flags(self.host_compiler.get_compile_debugfile_args(rel_obj, pch))

    def get_link_debugfile_args(self, targetfile: str) -> T.List[str]:
        return self._to_host_flags(self.host_compiler.get_link_debugfile_args(targetfile), _Phase.LINKER)

    def get_depfile_suffix(self) -> str:
        return 'd'

    def get_buildtype_linker_args(self, buildtype: str) -> T.List[str]:
        return self._to_host_flags(self.host_compiler.get_buildtype_linker_args(buildtype), _Phase.LINKER)

    def build_rpath_args(self, env: 'Environment', build_dir: str, from_dir: str,
                         rpath_paths: T.Tuple[str, ...], build_rpath: str,
                         install_rpath: str) -> T.Tuple[T.List[str], T.Set[bytes]]:
        (rpath_args, rpath_dirs_to_remove) = self.host_compiler.build_rpath_args(
            env, build_dir, from_dir, rpath_paths, build_rpath, install_rpath)
        return (self._to_host_flags(rpath_args, _Phase.LINKER), rpath_dirs_to_remove)

    def linker_to_compiler_args(self, args: T.List[str]) -> T.List[str]:
        return args

    def get_pic_args(self) -> T.List[str]:
        return self._to_host_flags(self.host_compiler.get_pic_args())

    def compute_parameters_with_absolute_paths(self, parameter_list: T.List[str],
                                               build_dir: str) -> T.List[str]:
        return []

    def get_output_args(self, target: str) -> T.List[str]:
        return ['-o', target]

    def get_std_exe_link_args(self) -> T.List[str]:
        return self._to_host_flags(self.host_compiler.get_std_exe_link_args(), _Phase.LINKER)

    def find_library(self, libname: str, env: 'Environment', extra_dirs: T.List[str],
                     libtype: LibType = LibType.PREFER_SHARED) -> T.Optional[T.List[str]]:
        return ['-l' + libname] # FIXME

    def get_crt_compile_args(self, crt_val: str, buildtype: str) -> T.List[str]:
        return self._to_host_flags(self.host_compiler.get_crt_compile_args(crt_val, buildtype))

    def get_crt_link_args(self, crt_val: str, buildtype: str) -> T.List[str]:
        # nvcc defaults to static, release version of msvc runtime and provides no
        # native option to override it; override it with /NODEFAULTLIB
        host_link_arg_overrides = []
        host_crt_compile_args = self.host_compiler.get_crt_compile_args(crt_val, buildtype)
        if any(arg in ['/MDd', '/MD', '/MTd'] for arg in host_crt_compile_args):
            host_link_arg_overrides += ['/NODEFAULTLIB:LIBCMT.lib']
        return self._to_host_flags(host_link_arg_overrides + self.host_compiler.get_crt_link_args(crt_val, buildtype), _Phase.LINKER)

    def get_target_link_args(self, target: 'BuildTarget') -> T.List[str]:
        return self._to_host_flags(super().get_target_link_args(target), _Phase.LINKER)

    def get_dependency_compile_args(self, dep: 'Dependency') -> T.List[str]:
        return self._to_host_flags(super().get_dependency_compile_args(dep))

    def get_dependency_link_args(self, dep: 'Dependency') -> T.List[str]:
        return self._to_host_flags(super().get_dependency_link_args(dep), _Phase.LINKER)

    def get_ccbin_args(self, options: 'KeyedOptionDictType') -> T.List[str]:
        key = OptionKey('ccbindir', machine=self.for_machine, lang=self.language)
        ccbindir = options[key].value
        if isinstance(ccbindir, str) and ccbindir != '':
            return [self._shield_nvcc_list_arg('-ccbin='+ccbindir, False)]
        else:
            return []

    def get_profile_generate_args(self) -> T.List[str]:
        return ['-Xcompiler=' + x for x in self.host_compiler.get_profile_generate_args()]

    def get_profile_use_args(self) -> T.List[str]:
        return ['-Xcompiler=' + x for x in self.host_compiler.get_profile_use_args()]

    def get_disable_assert_args(self) -> T.List[str]:
        return self.host_compiler.get_disable_assert_args()