author     Joseph Huber <huberjn@outlook.com>   2024-02-22 15:29:29 -0600
committer  GitHub <noreply@github.com>          2024-02-22 15:29:29 -0600
commit     47b7c91abe7af3133a591aa2e73fffa30826f986 (patch)
tree       43e35c3d4c203b54fb5882f10c655e7d49ceed6c /libc/cmake
parent     45fe67dd61a6ac7df84d3a586e41c36a4767757f (diff)
[libc] Rework the GPU build to be a regular target (#81921)
Summary:
This is a massive patch because it reworks the entire build and everything that depends on it. It is not split up further because various bots would fail otherwise. I will attempt to describe the necessary changes here.

This patch completely reworks how the GPU build is built and targeted. Previously, we used a standard runtimes build and handled both NVPTX and AMDGPU in a single build via multi-targeting. This added a lot of divergence in the build system and prevented us from doing various things, like building for the CPU and GPU at the same time, exporting the startup libraries, or running tests without a full rebuild.

The new approach is to handle the GPU builds as strict cross-compiling runtimes. The first step required https://github.com/llvm/llvm-project/pull/81557 to allow the `LIBC` target to build for the GPU without touching the other targets. This means the GPU uses all the same handling as the other builds in `libc`. The new expected way to build the GPU libc is with `LLVM_LIBC_RUNTIME_TARGETS=amdgcn-amd-amdhsa;nvptx64-nvidia-cuda`.

The second step was reworking how we generate the embedded GPU library by moving it into the library install step. Where we previously had one `libcgpu.a` we now have `libcgpu-amdgpu.a` and `libcgpu-nvptx.a`. This patch includes the necessary clang / OpenMP changes to keep the bots green when this lands.

We unfortunately still require that the NVPTX target has an `internal` target for tests. This is because the NVPTX target needs LTO for the exported version (the offloading toolchain can handle that) but cannot use LTO objects with the native toolchain used to build the tests.

This approach is vastly superior in every way, allowing us to treat the GPU as a standard cross-compiling target. We can now install the GPU utilities to do things like run the offload tests and other fun things. Certain utilities still need to be built with `--target=${LLVM_HOST_TRIPLE}`; I think this is a fine workaround, as we will always assume that the GPU `libc` is a cross-build with a functioning host.

Depends on https://github.com/llvm/llvm-project/pull/81557
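With this patch, the GPU libc is configured like any other cross-compiled runtime. A minimal sketch of a configure invocation follows; the generator, build type, and enabled projects are illustrative assumptions, while LLVM_LIBC_RUNTIME_TARGETS comes from this patch and LLVM_LIBC_FULL_BUILD is required for the GPU (see prepare_libc_gpu_build.cmake below):

  cmake ../llvm -G Ninja \
    -DCMAKE_BUILD_TYPE=Release \
    -DLLVM_ENABLE_PROJECTS="clang;lld" \
    -DLLVM_LIBC_FULL_BUILD=ON \
    -DLLVM_LIBC_RUNTIME_TARGETS="amdgcn-amd-amdhsa;nvptx64-nvidia-cuda"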
Diffstat (limited to 'libc/cmake')
-rw-r--r--  libc/cmake/modules/LLVMLibCArchitectures.cmake        28
-rw-r--r--  libc/cmake/modules/LLVMLibCCheckMPFR.cmake             2
-rw-r--r--  libc/cmake/modules/LLVMLibCCompileOptionRules.cmake   76
-rw-r--r--  libc/cmake/modules/LLVMLibCHeaderRules.cmake           2
-rw-r--r--  libc/cmake/modules/LLVMLibCLibraryRules.cmake        141
-rw-r--r--  libc/cmake/modules/LLVMLibCObjectRules.cmake         348
-rw-r--r--  libc/cmake/modules/LLVMLibCTestRules.cmake            47
-rw-r--r--  libc/cmake/modules/prepare_libc_gpu_build.cmake      108
8 files changed, 297 insertions, 455 deletions
diff --git a/libc/cmake/modules/LLVMLibCArchitectures.cmake b/libc/cmake/modules/LLVMLibCArchitectures.cmake
index 623ed77..0dbc59a 100644
--- a/libc/cmake/modules/LLVMLibCArchitectures.cmake
+++ b/libc/cmake/modules/LLVMLibCArchitectures.cmake
@@ -6,18 +6,6 @@
# platform.
# ------------------------------------------------------------------------------
-if(LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES)
- # We set the generic target and OS to "gpu" here. More specific defintions
- # for the exact target GPU are set up in prepare_libc_gpu_build.cmake.
- set(LIBC_TARGET_OS "gpu")
- set(LIBC_TARGET_ARCHITECTURE_IS_GPU TRUE)
- set(LIBC_TARGET_ARCHITECTURE "gpu")
- if(LIBC_TARGET_TRIPLE)
- message(WARNING "LIBC_TARGET_TRIPLE is ignored as LIBC_GPU_BUILD is on. ")
- endif()
- return()
-endif()
-
if(MSVC)
# If the compiler is visual c++ or equivalent, we will assume a host build.
set(LIBC_TARGET_OS ${CMAKE_HOST_SYSTEM_NAME})
@@ -59,6 +47,10 @@ function(get_arch_and_system_from_triple triple arch_var sys_var)
set(target_arch "riscv32")
elseif(target_arch MATCHES "^riscv64")
set(target_arch "riscv64")
+ elseif(target_arch MATCHES "^amdgcn")
+ set(target_arch "amdgpu")
+ elseif(target_arch MATCHES "^nvptx64")
+ set(target_arch "nvptx")
else()
return()
endif()
@@ -75,6 +67,12 @@ function(get_arch_and_system_from_triple triple arch_var sys_var)
set(target_sys "darwin")
endif()
+ # Setting OS name for GPU architectures.
+ list(GET triple_comps -1 gpu_target_sys)
+ if(gpu_target_sys MATCHES "^amdhsa" OR gpu_target_sys MATCHES "^cuda")
+ set(target_sys "gpu")
+ endif()
+
set(${sys_var} ${target_sys} PARENT_SCOPE)
endfunction(get_arch_and_system_from_triple)
@@ -156,6 +154,10 @@ elseif(LIBC_TARGET_ARCHITECTURE STREQUAL "riscv64")
elseif(LIBC_TARGET_ARCHITECTURE STREQUAL "riscv32")
set(LIBC_TARGET_ARCHITECTURE_IS_RISCV32 TRUE)
set(LIBC_TARGET_ARCHITECTURE "riscv")
+elseif(LIBC_TARGET_ARCHITECTURE STREQUAL "amdgpu")
+ set(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU TRUE)
+elseif(LIBC_TARGET_ARCHITECTURE STREQUAL "nvptx")
+ set(LIBC_TARGET_ARCHITECTURE_IS_NVPTX TRUE)
else()
message(FATAL_ERROR
"Unsupported libc target architecture ${LIBC_TARGET_ARCHITECTURE}")
@@ -178,6 +180,8 @@ elseif(LIBC_TARGET_OS STREQUAL "darwin")
set(LIBC_TARGET_OS_IS_DARWIN TRUE)
elseif(LIBC_TARGET_OS STREQUAL "windows")
set(LIBC_TARGET_OS_IS_WINDOWS TRUE)
+elseif(LIBC_TARGET_OS STREQUAL "gpu")
+ set(LIBC_TARGET_OS_IS_GPU TRUE)
else()
message(FATAL_ERROR
"Unsupported libc target operating system ${LIBC_TARGET_OS}")
diff --git a/libc/cmake/modules/LLVMLibCCheckMPFR.cmake b/libc/cmake/modules/LLVMLibCCheckMPFR.cmake
index 9e361f5..bbaeb9f 100644
--- a/libc/cmake/modules/LLVMLibCCheckMPFR.cmake
+++ b/libc/cmake/modules/LLVMLibCCheckMPFR.cmake
@@ -2,7 +2,7 @@ set(LLVM_LIBC_MPFR_INSTALL_PATH "" CACHE PATH "Path to where MPFR is installed (
if(LLVM_LIBC_MPFR_INSTALL_PATH)
set(LIBC_TESTS_CAN_USE_MPFR TRUE)
-elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+elseif(LIBC_TARGET_OS_IS_GPU)
set(LIBC_TESTS_CAN_USE_MPFR FALSE)
else()
try_compile(
diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
index 140e4d5..33ba5da 100644
--- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
+++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
@@ -82,10 +82,22 @@ function(_get_common_compile_options output_var flags)
list(APPEND compile_options "/EHs-c-")
list(APPEND compile_options "/GR-")
endif()
- if (LIBC_TARGET_ARCHITECTURE_IS_GPU)
+ if (LIBC_TARGET_OS_IS_GPU)
list(APPEND compile_options "-nogpulib")
list(APPEND compile_options "-fvisibility=hidden")
list(APPEND compile_options "-fconvergent-functions")
+ list(APPEND compile_options "-flto")
+
+ if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+ list(APPEND compile_options "-Wno-unknown-cuda-version")
+ list(APPEND compile_options "SHELL:-mllvm -nvptx-emit-init-fini-kernel=false")
+ list(APPEND compile_options "--cuda-feature=+ptx63")
+ if(LIBC_CUDA_ROOT)
+ list(APPEND compile_options "--cuda-path=${LIBC_CUDA_ROOT}")
+ endif()
+ elseif(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
+ list(APPEND compile_options "SHELL:-Xclang -mcode-object-version=none")
+ endif()
# Manually disable all standard include paths and include the resource
# directory to prevent system headers from being included.
@@ -138,73 +150,21 @@ function(_get_common_test_compile_options output_var flags)
set(${output_var} ${compile_options} PARENT_SCOPE)
endfunction()
-# Obtains NVPTX specific arguments for compilation.
-# The PTX feature is primarily based on the CUDA toolchain version. We want to
-# be able to target NVPTX without an existing CUDA installation, so we need to
-# set this manually. This simply sets the PTX feature to the minimum required
-# for the features we wish to use on that target. The minimum PTX features used
-# here roughly corresponds to the CUDA 9.0 release.
-# Adjust as needed for desired PTX features.
-function(get_nvptx_compile_options output_var gpu_arch)
- set(nvptx_options "")
- list(APPEND nvptx_options "-march=${gpu_arch}")
- list(APPEND nvptx_options "-Wno-unknown-cuda-version")
- list(APPEND nvptx_options "SHELL:-mllvm -nvptx-emit-init-fini-kernel=false")
- if(${gpu_arch} STREQUAL "sm_35")
- list(APPEND nvptx_options "--cuda-feature=+ptx63")
- elseif(${gpu_arch} STREQUAL "sm_37")
- list(APPEND nvptx_options "--cuda-feature=+ptx63")
- elseif(${gpu_arch} STREQUAL "sm_50")
- list(APPEND nvptx_options "--cuda-feature=+ptx63")
- elseif(${gpu_arch} STREQUAL "sm_52")
- list(APPEND nvptx_options "--cuda-feature=+ptx63")
- elseif(${gpu_arch} STREQUAL "sm_53")
- list(APPEND nvptx_options "--cuda-feature=+ptx63")
- elseif(${gpu_arch} STREQUAL "sm_60")
- list(APPEND nvptx_options "--cuda-feature=+ptx63")
- elseif(${gpu_arch} STREQUAL "sm_61")
- list(APPEND nvptx_options "--cuda-feature=+ptx63")
- elseif(${gpu_arch} STREQUAL "sm_62")
- list(APPEND nvptx_options "--cuda-feature=+ptx63")
- elseif(${gpu_arch} STREQUAL "sm_70")
- list(APPEND nvptx_options "--cuda-feature=+ptx63")
- elseif(${gpu_arch} STREQUAL "sm_72")
- list(APPEND nvptx_options "--cuda-feature=+ptx63")
- elseif(${gpu_arch} STREQUAL "sm_75")
- list(APPEND nvptx_options "--cuda-feature=+ptx63")
- elseif(${gpu_arch} STREQUAL "sm_80")
- list(APPEND nvptx_options "--cuda-feature=+ptx72")
- elseif(${gpu_arch} STREQUAL "sm_86")
- list(APPEND nvptx_options "--cuda-feature=+ptx72")
- elseif(${gpu_arch} STREQUAL "sm_89")
- list(APPEND nvptx_options "--cuda-feature=+ptx72")
- elseif(${gpu_arch} STREQUAL "sm_90")
- list(APPEND nvptx_options "--cuda-feature=+ptx72")
- else()
- message(FATAL_ERROR "Unknown Nvidia GPU architecture '${gpu_arch}'")
- endif()
-
- if(LIBC_CUDA_ROOT)
- list(APPEND nvptx_options "--cuda-path=${LIBC_CUDA_ROOT}")
- endif()
- set(${output_var} ${nvptx_options} PARENT_SCOPE)
-endfunction()
-
function(_get_hermetic_test_compile_options output_var flags)
_get_compile_options_from_flags(compile_flags ${flags})
list(APPEND compile_options ${LIBC_COMPILE_OPTIONS_DEFAULT} ${compile_flags}
${flags} -fpie -ffreestanding -fno-exceptions -fno-rtti)
# The GPU build requires overriding the default CMake triple and architecture.
- if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
+ if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
list(APPEND compile_options
-nogpulib -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto
- --target=${LIBC_GPU_TARGET_TRIPLE}
-mcode-object-version=${LIBC_GPU_CODE_OBJECT_VERSION})
- elseif(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
- get_nvptx_compile_options(nvptx_options ${LIBC_GPU_TARGET_ARCHITECTURE})
+ elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
list(APPEND compile_options
- -nogpulib ${nvptx_options} -fno-use-cxa-atexit --target=${LIBC_GPU_TARGET_TRIPLE})
+ "SHELL:-mllvm -nvptx-emit-init-fini-kernel=false"
+ --cuda-path=${LIBC_CUDA_ROOT}
+ -nogpulib -march=${LIBC_GPU_TARGET_ARCHITECTURE} -fno-use-cxa-atexit)
endif()
set(${output_var} ${compile_options} PARENT_SCOPE)
endfunction()
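Taken together, a single NVPTX compile under the new common options carries everything the deleted get_nvptx_compile_options() used to compute per architecture. Roughly, in shell form (the source file name and sm_80 value are placeholders, not taken from this patch):

  clang++ --target=nvptx64-nvidia-cuda -march=sm_80 \
    -nogpulib -fvisibility=hidden -fconvergent-functions -flto \
    -Wno-unknown-cuda-version --cuda-feature=+ptx63 \
    -mllvm -nvptx-emit-init-fini-kernel=false \
    -c strlen.cpp -o strlen.o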
diff --git a/libc/cmake/modules/LLVMLibCHeaderRules.cmake b/libc/cmake/modules/LLVMLibCHeaderRules.cmake
index 9e9b598..19515b1 100644
--- a/libc/cmake/modules/LLVMLibCHeaderRules.cmake
+++ b/libc/cmake/modules/LLVMLibCHeaderRules.cmake
@@ -139,7 +139,7 @@ function(add_gen_header target_name)
${hdrgen_deps}
)
- if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+ if(LIBC_TARGET_OS_IS_GPU)
file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/llvm-libc-decls)
set(decl_out_file ${LIBC_INCLUDE_DIR}/llvm-libc-decls/${relative_path})
add_custom_command(
diff --git a/libc/cmake/modules/LLVMLibCLibraryRules.cmake b/libc/cmake/modules/LLVMLibCLibraryRules.cmake
index 81c207e..f15ffd5 100644
--- a/libc/cmake/modules/LLVMLibCLibraryRules.cmake
+++ b/libc/cmake/modules/LLVMLibCLibraryRules.cmake
@@ -50,31 +50,9 @@ function(collect_object_file_deps target result)
endif()
endfunction(collect_object_file_deps)
-# A rule to build a library from a collection of entrypoint objects.
-# Usage:
-# add_entrypoint_library(
-# DEPENDS <list of add_entrypoint_object targets>
-# )
-#
-# NOTE: If one wants an entrypoint to be available in a library, then they will
-# have to list the entrypoint target explicitly in the DEPENDS list. Implicit
-# entrypoint dependencies will not be added to the library.
-function(add_entrypoint_library target_name)
- cmake_parse_arguments(
- "ENTRYPOINT_LIBRARY"
- "" # No optional arguments
- "" # No single value arguments
- "DEPENDS" # Multi-value arguments
- ${ARGN}
- )
- if(NOT ENTRYPOINT_LIBRARY_DEPENDS)
- message(FATAL_ERROR "'add_entrypoint_library' target requires a DEPENDS list "
- "of 'add_entrypoint_object' targets.")
- endif()
-
- get_fq_deps_list(fq_deps_list ${ENTRYPOINT_LIBRARY_DEPENDS})
+function(get_all_object_file_deps result fq_deps_list)
set(all_deps "")
- foreach(dep IN LISTS fq_deps_list)
+ foreach(dep ${fq_deps_list})
get_target_property(dep_type ${dep} "TARGET_TYPE")
if(NOT ((${dep_type} STREQUAL ${ENTRYPOINT_OBJ_TARGET_TYPE}) OR
(${dep_type} STREQUAL ${ENTRYPOINT_EXT_TARGET_TYPE}) OR
@@ -102,6 +80,121 @@ function(add_entrypoint_library target_name)
list(APPEND all_deps ${entrypoint_target})
endforeach(dep)
list(REMOVE_DUPLICATES all_deps)
+ set(${result} ${all_deps} PARENT_SCOPE)
+endfunction()
+
+# A rule to build a library from a collection of entrypoint objects and bundle
+# it into a GPU fatbinary. Usage is the same as 'add_entrypoint_library'.
+# Usage:
+# add_gpu_entrypoint_library(
+# DEPENDS <list of add_entrypoint_object targets>
+# )
+function(add_gpu_entrypoint_library target_name)
+ cmake_parse_arguments(
+ "ENTRYPOINT_LIBRARY"
+ "" # No optional arguments
+ "" # No single value arguments
+ "DEPENDS" # Multi-value arguments
+ ${ARGN}
+ )
+ if(NOT ENTRYPOINT_LIBRARY_DEPENDS)
+ message(FATAL_ERROR "'add_entrypoint_library' target requires a DEPENDS list "
+ "of 'add_entrypoint_object' targets.")
+ endif()
+
+ get_fq_deps_list(fq_deps_list ${ENTRYPOINT_LIBRARY_DEPENDS})
+ get_all_object_file_deps(all_deps "${fq_deps_list}")
+
+ # The GPU 'libc' needs to be exported in a format that can be linked with
+ # offloading languages like OpenMP or CUDA. This wraps every GPU object into a
+ # fat binary and adds them to a static library.
+ set(objects "")
+ foreach(dep IN LISTS all_deps)
+ set(object $<$<STREQUAL:$<TARGET_NAME_IF_EXISTS:${dep}>,${dep}>:$<TARGET_OBJECTS:${dep}>>)
+ string(FIND ${dep} "." last_dot_loc REVERSE)
+ math(EXPR name_loc "${last_dot_loc} + 1")
+ string(SUBSTRING ${dep} ${name_loc} -1 name)
+ if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+ set(prefix --image=arch=generic,triple=nvptx64-nvidia-cuda,feature=+ptx63)
+ elseif(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
+ set(prefix --image=arch=generic,triple=amdgcn-amd-amdhsa)
+ endif()
+
+ # Use the 'clang-offload-packager' to merge these files into a binary blob.
+ add_custom_command(
+ OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin"
+ COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/binary
+ COMMAND ${LIBC_CLANG_OFFLOAD_PACKAGER}
+ "${prefix},file=$<JOIN:${object},,file=>" -o
+ ${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin
+ DEPENDS ${dep}
+ COMMENT "Packaging LLVM offloading binary for '${object}'"
+ )
+ add_custom_target(${dep}.__gpubin__ DEPENDS ${dep}
+ "${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin")
+
+ # CMake does not permit setting the name on object files. In order to have
+ # human readable names we create an empty stub file with the entrypoint
+ # name. This empty file will then have the created binary blob embedded.
+ add_custom_command(
+ OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp"
+ COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/stubs
+ COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp
+ DEPENDS ${dep} ${dep}.__gpubin__
+ )
+ add_custom_target(${dep}.__stub__
+ DEPENDS ${dep}.__gpubin__ "${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp")
+
+ add_library(${dep}.__fatbin__
+ EXCLUDE_FROM_ALL OBJECT
+ "${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp"
+ )
+
+ # This is always compiled for the LLVM host triple instead of the native GPU
+ # triple that is used by default in the build.
+ target_compile_options(${dep}.__fatbin__ BEFORE PRIVATE -nostdlib)
+ target_compile_options(${dep}.__fatbin__ PRIVATE
+ --target=${LLVM_HOST_TRIPLE}
+ "SHELL:-Xclang -fembed-offload-object=${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin")
+ add_dependencies(${dep}.__fatbin__ ${dep} ${dep}.__stub__ ${dep}.__gpubin__)
+
+ # Set the list of newly created fat binaries containing embedded device code.
+ list(APPEND objects $<TARGET_OBJECTS:${dep}.__fatbin__>)
+ endforeach()
+
+ add_library(
+ ${target_name}
+ STATIC
+ ${objects}
+ )
+ set_target_properties(${target_name} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${LIBC_LIBRARY_DIR})
+endfunction(add_gpu_entrypoint_library)
+
+# A rule to build a library from a collection of entrypoint objects.
+# Usage:
+# add_entrypoint_library(
+# DEPENDS <list of add_entrypoint_object targets>
+# )
+#
+# NOTE: If one wants an entrypoint to be available in a library, then they will
+# have to list the entrypoint target explicitly in the DEPENDS list. Implicit
+# entrypoint dependencies will not be added to the library.
+function(add_entrypoint_library target_name)
+ cmake_parse_arguments(
+ "ENTRYPOINT_LIBRARY"
+ "" # No optional arguments
+ "" # No single value arguments
+ "DEPENDS" # Multi-value arguments
+ ${ARGN}
+ )
+ if(NOT ENTRYPOINT_LIBRARY_DEPENDS)
+ message(FATAL_ERROR "'add_entrypoint_library' target requires a DEPENDS list "
+ "of 'add_entrypoint_object' targets.")
+ endif()
+
+ get_fq_deps_list(fq_deps_list ${ENTRYPOINT_LIBRARY_DEPENDS})
+ get_all_object_file_deps(all_deps "${fq_deps_list}")
+
set(objects "")
foreach(dep IN LISTS all_deps)
list(APPEND objects $<$<STREQUAL:$<TARGET_NAME_IF_EXISTS:${dep}>,${dep}>:$<TARGET_OBJECTS:${dep}>>)
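In effect, add_gpu_entrypoint_library() turns each entrypoint into a host object with the device code embedded. For a hypothetical entrypoint object memcpy.o on NVPTX, the generated steps boil down to roughly the following two commands (paths and the host triple are illustrative):

  clang-offload-packager \
    --image=arch=generic,triple=nvptx64-nvidia-cuda,feature=+ptx63,file=memcpy.o \
    -o binary/memcpy.gpubin
  clang++ --target=x86_64-unknown-linux-gnu -nostdlib \
    -Xclang -fembed-offload-object=binary/memcpy.gpubin \
    -c stubs/memcpy.cpp -o memcpy.fatbin.o

The resulting host objects are then archived into the static library (libcgpu-nvptx.a here), which offloading compilations can link like an ordinary library.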
diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake
index 308ba7d..78536f4 100644
--- a/libc/cmake/modules/LLVMLibCObjectRules.cmake
+++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake
@@ -1,175 +1,5 @@
set(OBJECT_LIBRARY_TARGET_TYPE "OBJECT_LIBRARY")
-# Build the object target for a single GPU arch.
-# Usage:
-# _build_gpu_object_for_single_arch(
-# <target_name>
-# <gpu_arch>
-# SRCS <list of .cpp files>
-# HDRS <list of .h files>
-# DEPENDS <list of dependencies>
-# COMPILE_OPTIONS <optional list of special compile options for this target>
-# FLAGS <optional list of flags>
-# )
-function(_build_gpu_object_for_single_arch fq_target_name gpu_arch)
- cmake_parse_arguments(
- "ADD_GPU_OBJ"
- "" # No optional arguments
- "NAME;CXX_STANDARD" # Single value arguments
- "SRCS;HDRS;DEPENDS;COMPILE_OPTIONS;FLAGS" # Multi value arguments
- ${ARGN}
- )
-
- if(NOT ADD_GPU_OBJ_CXX_STANDARD)
- set(ADD_GPU_OBJ_CXX_STANDARD ${CMAKE_CXX_STANDARD})
- endif()
-
- set(compile_options ${ADD_GPU_OBJ_COMPILE_OPTIONS})
- # Derive the triple from the specified architecture.
- if("${gpu_arch}" IN_LIST all_amdgpu_architectures)
- set(gpu_target_triple ${AMDGPU_TARGET_TRIPLE})
- list(APPEND compile_options "-mcpu=${gpu_arch}")
- list(APPEND compile_options "SHELL:-Xclang -mcode-object-version=none")
- list(APPEND compile_options "-emit-llvm")
- elseif("${gpu_arch}" IN_LIST all_nvptx_architectures)
- set(gpu_target_triple ${NVPTX_TARGET_TRIPLE})
- get_nvptx_compile_options(nvptx_options ${gpu_arch})
- list(APPEND compile_options "${nvptx_options}")
- else()
- message(FATAL_ERROR "Unknown GPU architecture '${gpu_arch}'")
- endif()
- list(APPEND compile_options "--target=${gpu_target_triple}")
-
- # Build the library for this target architecture. We always emit LLVM-IR for
- # packaged GPU binaries.
- add_library(${fq_target_name}
- EXCLUDE_FROM_ALL
- OBJECT
- ${ADD_GPU_OBJ_SRCS}
- ${ADD_GPU_OBJ_HDRS}
- )
-
- target_compile_options(${fq_target_name} PRIVATE ${compile_options})
- target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
- target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR})
- set_target_properties(${fq_target_name} PROPERTIES CXX_STANDARD ${ADD_GPU_OBJ_CXX_STANDARD})
- if(ADD_GPU_OBJ_DEPENDS)
- add_dependencies(${fq_target_name} ${ADD_GPU_OBJ_DEPENDS})
- set_target_properties(${fq_target_name} PROPERTIES DEPS "${ADD_GPU_OBJ_DEPENDS}")
- endif()
-endfunction(_build_gpu_object_for_single_arch)
-
-# Build the object target for the GPU.
-# This compiles the target for all supported architectures and embeds it into
-# host binary for installing.
-# Usage:
-# _build_gpu_object_bundle(
-# <target_name>
-# SRCS <list of .cpp files>
-# HDRS <list of .h files>
-# DEPENDS <list of dependencies>
-# COMPILE_OPTIONS <optional list of special compile options for this target>
-# FLAGS <optional list of flags>
-# )
-function(_build_gpu_object_bundle fq_target_name)
- cmake_parse_arguments(
- "ADD_GPU_OBJ"
- "" # No optional arguments
- "NAME;CXX_STANDARD" # Single value arguments
- "SRCS;HDRS;DEPENDS;COMPILE_OPTIONS;FLAGS" # Multi value arguments
- ${ARGN}
- )
-
- if(NOT ADD_GPU_OBJ_CXX_STANDARD)
- set(ADD_GPU_OBJ_CXX_STANDARD ${CMAKE_CXX_STANDARD})
- endif()
-
- foreach(add_gpu_obj_src ${ADD_GPU_OBJ_SRCS})
- # The packaged version will be built for every target GPU architecture. We do
- # this so we can support multiple accelerators on the same machine.
- foreach(gpu_arch ${LIBC_GPU_ARCHITECTURES})
- get_filename_component(src_name ${add_gpu_obj_src} NAME)
- set(gpu_target_name ${fq_target_name}.${src_name}.${gpu_arch})
-
- _build_gpu_object_for_single_arch(
- ${gpu_target_name}
- ${gpu_arch}
- CXX_STANDARD ${ADD_GPU_OBJ_CXX_STANDARD}
- HDRS ${ADD_GPU_OBJ_HDRS}
- SRCS ${add_gpu_obj_src}
- COMPILE_OPTIONS
- ${ADD_GPU_OBJ_COMPILE_OPTIONS}
- "-emit-llvm"
- DEPENDS ${ADD_GPU_OBJ_DEPENDS}
- )
- # Append this target to a list of images to package into a single binary.
- set(input_file $<TARGET_OBJECTS:${gpu_target_name}>)
- if("${gpu_arch}" IN_LIST all_nvptx_architectures)
- get_nvptx_compile_options(nvptx_options ${gpu_arch})
- string(REGEX MATCH "\\+ptx[0-9]+" nvptx_ptx_feature ${nvptx_options})
- list(APPEND packager_images
- --image=file=${input_file},arch=${gpu_arch},triple=${NVPTX_TARGET_TRIPLE},feature=${nvptx_ptx_feature})
- else()
- list(APPEND packager_images
- --image=file=${input_file},arch=${gpu_arch},triple=${AMDGPU_TARGET_TRIPLE})
- endif()
- list(APPEND gpu_target_objects ${input_file})
- endforeach()
-
- # After building the target for the desired GPUs we must package the output
- # into a fatbinary, see https://clang.llvm.org/docs/OffloadingDesign.html for
- # more information.
- set(packaged_target_name ${fq_target_name}.${src_name}.__gpu__)
- set(packaged_output_name ${CMAKE_CURRENT_BINARY_DIR}/${fq_target_name}.${src_name}.gpubin)
-
- add_custom_command(OUTPUT ${packaged_output_name}
- COMMAND ${LIBC_CLANG_OFFLOAD_PACKAGER}
- ${packager_images} -o ${packaged_output_name}
- DEPENDS ${gpu_target_objects} ${add_gpu_obj_src} ${ADD_GPU_OBJ_HDRS}
- COMMENT "Packaging LLVM offloading binary")
- add_custom_target(${packaged_target_name} DEPENDS ${packaged_output_name})
- list(APPEND packaged_gpu_names ${packaged_target_name})
- list(APPEND packaged_gpu_binaries ${packaged_output_name})
- endforeach()
-
- # We create an empty 'stub' file for the host to contain the embedded device
- # code. This will be packaged into 'libcgpu.a'.
- # TODO: In the future we will want to combine every architecture for a target
- # into a single bitcode file and use that. For now we simply build for
- # every single one and let the offloading linker handle it.
- string(FIND ${fq_target_name} "." last_dot_loc REVERSE)
- math(EXPR name_loc "${last_dot_loc} + 1")
- string(SUBSTRING ${fq_target_name} ${name_loc} -1 target_name)
- set(stub_filename "${target_name}.cpp")
- add_custom_command(
- OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/stubs/${stub_filename}"
- COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/stubs/
- COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/stubs/${stub_filename}
- DEPENDS ${gpu_target_objects} ${ADD_GPU_OBJ_SRCS} ${ADD_GPU_OBJ_HDRS}
- )
- set(stub_target_name ${fq_target_name}.__stub__)
- add_custom_target(${stub_target_name} DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/stubs/${stub_filename})
-
- add_library(
- ${fq_target_name}
- # We want an object library as the objects will eventually get packaged into
- # an archive (like libcgpu.a).
- EXCLUDE_FROM_ALL
- OBJECT
- ${CMAKE_CURRENT_BINARY_DIR}/stubs/${stub_filename}
- )
- target_compile_options(${fq_target_name} BEFORE PRIVATE
- ${ADD_GPU_OBJ_COMPILE_OPTIONS} -nostdlib)
- foreach(packaged_gpu_binary ${packaged_gpu_binaries})
- target_compile_options(${fq_target_name} PRIVATE
- "SHELL:-Xclang -fembed-offload-object=${packaged_gpu_binary}")
- endforeach()
- target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
- target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR})
- add_dependencies(${fq_target_name}
- ${full_deps_list} ${packaged_gpu_names} ${stub_target_name})
-endfunction()
-
# Rule which is essentially a wrapper over add_library to compile a set of
# sources to object files.
# Usage:
@@ -214,53 +44,37 @@ function(create_object_library fq_target_name)
message(FATAL_ERROR "'add_object_library' rule requires SRCS to be specified.")
endif()
- # The GPU build uses a separate internal file.
- if(LIBC_TARGET_ARCHITECTURE_IS_GPU AND NOT ${ADD_OBJECT_NO_GPU_BUNDLE})
- set(internal_target_name ${fq_target_name}.__internal__)
- set(public_packaging_for_internal "")
- else()
- set(internal_target_name ${fq_target_name})
- set(public_packaging_for_internal "-DLIBC_COPT_PUBLIC_PACKAGING")
- endif()
+ set(internal_target_name ${fq_target_name}.__internal__)
+ set(public_packaging_for_internal "-DLIBC_COPT_PUBLIC_PACKAGING")
_get_common_compile_options(compile_options "${ADD_OBJECT_FLAGS}")
list(APPEND compile_options ${ADD_OBJECT_COMPILE_OPTIONS})
- # GPU builds require special handling for the objects because we want to
- # export several different targets at once, e.g. for both Nvidia and AMD.
- if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
- if(NOT ${ADD_OBJECT_NO_GPU_BUNDLE})
- _build_gpu_object_bundle(
- ${fq_target_name}
- SRCS ${ADD_OBJECT_SRCS}
- HDRS ${ADD_OBJECT_HDRS}
- CXX_STANDARD ${ADD_OBJECT_CXX_STANDARD}
- COMPILE_OPTIONS ${compile_options} "-DLIBC_COPT_PUBLIC_PACKAGING"
- DEPENDS ${fq_deps_list}
- )
- endif()
- # When the target for GPU is not bundled, internal_target_name is the same
- # as fq_targetname
- _build_gpu_object_for_single_arch(
- ${internal_target_name}
- ${LIBC_GPU_TARGET_ARCHITECTURE}
- SRCS ${ADD_OBJECT_SRCS}
- HDRS ${ADD_OBJECT_HDRS}
- CXX_STANDARD ${ADD_OBJECT_CXX_STANDARD}
- COMPILE_OPTIONS ${compile_options} ${public_packaging_for_internal}
- DEPENDS ${fq_deps_list}
- )
- else()
+ add_library(
+ ${fq_target_name}
+ EXCLUDE_FROM_ALL
+ OBJECT
+ ${ADD_OBJECT_SRCS}
+ ${ADD_OBJECT_HDRS}
+ )
+ target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
+ target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR})
+ target_compile_options(${fq_target_name} PRIVATE ${compile_options})
+
+ # The NVPTX target is installed as LLVM-IR but the internal testing toolchain
+ # cannot handle it natively. Make a separate internal target for testing.
+ if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX AND NOT LIBC_GPU_TESTS_DISABLED)
add_library(
- ${fq_target_name}
+ ${internal_target_name}
EXCLUDE_FROM_ALL
OBJECT
${ADD_OBJECT_SRCS}
${ADD_OBJECT_HDRS}
)
- target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
- target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR})
- target_compile_options(${fq_target_name} PRIVATE ${compile_options})
+ target_include_directories(${internal_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
+ target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR})
+ target_compile_options(${internal_target_name} PRIVATE ${compile_options}
+ -fno-lto -march=${LIBC_GPU_TARGET_ARCHITECTURE})
endif()
if(SHOW_INTERMEDIATE_OBJECTS)
@@ -290,13 +104,18 @@ function(create_object_library fq_target_name)
FLAGS "${ADD_OBJECT_FLAGS}"
)
+ # If we built a separate internal target we want to use those target objects
+ # for testing instead of the exported target.
+ set(target_objects ${fq_target_name})
if(TARGET ${internal_target_name})
- set_target_properties(
- ${fq_target_name}
- PROPERTIES
- OBJECT_FILES "$<TARGET_OBJECTS:${internal_target_name}>"
- )
+ set(target_objects ${internal_target_name})
endif()
+
+ set_target_properties(
+ ${fq_target_name}
+ PROPERTIES
+ OBJECT_FILES "$<TARGET_OBJECTS:${target_objects}>"
+ )
endfunction(create_object_library)
function(add_object_library target_name)
@@ -389,12 +208,19 @@ function(create_entrypoint_object fq_target_name)
get_target_property(object_file ${fq_dep_name} "OBJECT_FILE")
get_target_property(object_file_raw ${fq_dep_name} "OBJECT_FILE_RAW")
- add_library(
- ${internal_target_name}
- EXCLUDE_FROM_ALL
- OBJECT
- ${object_file_raw}
- )
+
+ # If the system cannot build the GPU tests we simply make a dummy target.
+ if(LIBC_TARGET_OS_IS_GPU AND LIBC_GPU_TESTS_DISABLED)
+ add_custom_target(${internal_target_name})
+ else()
+ add_library(
+ ${internal_target_name}
+ EXCLUDE_FROM_ALL
+ OBJECT
+ ${object_file_raw}
+ )
+ endif()
+
add_dependencies(${internal_target_name} ${fq_dep_name})
add_library(
${fq_target_name}
@@ -441,60 +267,42 @@ function(create_entrypoint_object fq_target_name)
endif()
endif()
- # GPU builds require special handling for the objects because we want to
- # export several different targets at once, e.g. for both Nvidia and AMD.
- if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
- _build_gpu_object_bundle(
- ${fq_target_name}
- SRCS ${ADD_ENTRYPOINT_OBJ_SRCS}
- HDRS ${ADD_ENTRYPOINT_OBJ_HDRS}
- COMPILE_OPTIONS ${common_compile_options} "-DLIBC_COPT_PUBLIC_PACKAGING"
- CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
- DEPENDS ${full_deps_list}
- FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
- )
- _build_gpu_object_for_single_arch(
- ${internal_target_name}
- ${LIBC_GPU_TARGET_ARCHITECTURE}
- SRCS ${ADD_ENTRYPOINT_OBJ_SRCS}
- HDRS ${ADD_ENTRYPOINT_OBJ_HDRS}
- COMPILE_OPTIONS ${common_compile_options}
- CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
- DEPENDS ${full_deps_list}
- FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
- )
- else()
- add_library(
- ${internal_target_name}
- # TODO: We don't need an object library for internal consumption.
- # A future change should switch this to a normal static library.
- EXCLUDE_FROM_ALL
- OBJECT
- ${ADD_ENTRYPOINT_OBJ_SRCS}
- ${ADD_ENTRYPOINT_OBJ_HDRS}
- )
- target_compile_options(${internal_target_name} BEFORE PRIVATE ${common_compile_options})
- target_include_directories(${internal_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
- target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR})
- add_dependencies(${internal_target_name} ${full_deps_list})
- target_link_libraries(${internal_target_name} ${full_deps_list})
-
- add_library(
- ${fq_target_name}
- # We want an object library as the objects will eventually get packaged into
- # an archive (like libc.a).
- EXCLUDE_FROM_ALL
- OBJECT
- ${ADD_ENTRYPOINT_OBJ_SRCS}
- ${ADD_ENTRYPOINT_OBJ_HDRS}
- )
- target_compile_options(${fq_target_name} BEFORE PRIVATE ${common_compile_options} -DLIBC_COPT_PUBLIC_PACKAGING)
- target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
- target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR})
- add_dependencies(${fq_target_name} ${full_deps_list})
- target_link_libraries(${fq_target_name} ${full_deps_list})
+ add_library(
+ ${internal_target_name}
+ # TODO: We don't need an object library for internal consumption.
+ # A future change should switch this to a normal static library.
+ EXCLUDE_FROM_ALL
+ OBJECT
+ ${ADD_ENTRYPOINT_OBJ_SRCS}
+ ${ADD_ENTRYPOINT_OBJ_HDRS}
+ )
+ target_compile_options(${internal_target_name} BEFORE PRIVATE ${common_compile_options})
+ target_include_directories(${internal_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
+ target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR})
+ add_dependencies(${internal_target_name} ${full_deps_list})
+ target_link_libraries(${internal_target_name} ${full_deps_list})
+
+ # The NVPTX target cannot use LTO for the internal targets used for testing.
+ if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+ target_compile_options(${internal_target_name} PRIVATE
+ -fno-lto -march=${LIBC_GPU_TARGET_ARCHITECTURE})
endif()
+ add_library(
+ ${fq_target_name}
+ # We want an object library as the objects will eventually get packaged into
+ # an archive (like libc.a).
+ EXCLUDE_FROM_ALL
+ OBJECT
+ ${ADD_ENTRYPOINT_OBJ_SRCS}
+ ${ADD_ENTRYPOINT_OBJ_HDRS}
+ )
+ target_compile_options(${fq_target_name} BEFORE PRIVATE ${common_compile_options} -DLIBC_COPT_PUBLIC_PACKAGING)
+ target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
+ target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR})
+ add_dependencies(${fq_target_name} ${full_deps_list})
+ target_link_libraries(${fq_target_name} ${full_deps_list})
+
set_target_properties(
${fq_target_name}
PROPERTIES
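The asymmetry above is deliberate: exported NVPTX objects stay as LLVM-IR (built with -flto) so the offloading toolchain can optimize them at link time, while the .__internal__ copies are compiled without LTO so the test binaries can link them natively. Conceptually, each entrypoint is built twice (the entrypoint name and sm_80 value are placeholders):

  # exported object, consumed by the offloading toolchain via LTO
  clang++ --target=nvptx64-nvidia-cuda -flto -c memcpy.cpp -o memcpy.o
  # internal object used by the tests
  clang++ --target=nvptx64-nvidia-cuda -fno-lto -march=sm_80 \
    -c memcpy.cpp -o memcpy.internal.o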
diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index 6ca9516..373cbd6 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -449,7 +449,7 @@ function(add_integration_test test_name)
${fq_build_target_name}
EXCLUDE_FROM_ALL
# The NVIDIA 'nvlink' linker does not currently support static libraries.
- $<$<BOOL:${LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX}>:${link_object_files}>
+ $<$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>:${link_object_files}>
${INTEGRATION_TEST_SRCS}
${INTEGRATION_TEST_HDRS}
)
@@ -461,8 +461,17 @@ function(add_integration_test test_name)
_get_hermetic_test_compile_options(compile_options "${INTEGRATION_TEST_COMPILE_OPTIONS}")
target_compile_options(${fq_build_target_name} PRIVATE ${compile_options})
- if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
- target_link_options(${fq_build_target_name} PRIVATE -nostdlib -static)
+ if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
+ target_link_options(${fq_build_target_name} PRIVATE
+ -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto
+ "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static
+ "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}")
+ elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+ # We need to use the internal object versions for NVPTX.
+ set(internal_suffix ".__internal__")
+ target_link_options(${fq_build_target_name} PRIVATE
+ -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static
+ "--cuda-path=${LIBC_CUDA_ROOT}")
elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP)
target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib++ -static)
else()
@@ -474,9 +483,10 @@ function(add_integration_test test_name)
target_link_libraries(
${fq_build_target_name}
# The NVIDIA 'nvlink' linker does not currently support static libraries.
- $<$<NOT:$<BOOL:${LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX}>>:${fq_target_name}.__libc__>
- libc.startup.${LIBC_TARGET_OS}.crt1
- libc.test.IntegrationTest.test)
+ $<$<NOT:$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>>:${fq_target_name}.__libc__>
+ libc.startup.${LIBC_TARGET_OS}.crt1${internal_suffix}
+ libc.test.IntegrationTest.test${internal_suffix}
+ )
add_dependencies(${fq_build_target_name}
libc.test.IntegrationTest.test
${INTEGRATION_TEST_DEPENDS})
@@ -495,7 +505,7 @@ function(add_integration_test test_name)
# makes `add_custom_target` construct the correct command and execute it.
set(test_cmd
${INTEGRATION_TEST_ENV}
- $<$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_GPU}>:${gpu_loader_exe}>
+ $<$<BOOL:${LIBC_TARGET_OS_IS_GPU}>:${gpu_loader_exe}>
${CMAKE_CROSSCOMPILING_EMULATOR}
${INTEGRATION_TEST_LOADER_ARGS}
$<TARGET_FILE:${fq_build_target_name}> ${INTEGRATION_TEST_ARGS})
@@ -606,7 +616,7 @@ function(add_libc_hermetic_test test_name)
${fq_build_target_name}
EXCLUDE_FROM_ALL
# The NVIDIA 'nvlink' linker does not currently support static libraries.
- $<$<BOOL:${LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX}>:${link_object_files}>
+ $<$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>:${link_object_files}>
${HERMETIC_TEST_SRCS}
${HERMETIC_TEST_HDRS}
)
@@ -615,6 +625,8 @@ function(add_libc_hermetic_test test_name)
RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
#OUTPUT_NAME ${fq_target_name}
)
+
+ _get_hermetic_test_compile_options(compile_options "${HERMETIC_TEST_COMPILE_OPTIONS}")
target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR})
_get_hermetic_test_compile_options(compile_options "${HERMETIC_TEST_COMPILE_OPTIONS}")
@@ -629,8 +641,17 @@ function(add_libc_hermetic_test test_name)
endif()
endforeach()
- if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
- target_link_options(${fq_build_target_name} PRIVATE -nostdlib -static)
+ if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
+ target_link_options(${fq_build_target_name} PRIVATE
+ -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto
+ "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static
+ "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}")
+ elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+ # We need to use the internal object versions for NVPTX.
+ set(internal_suffix ".__internal__")
+ target_link_options(${fq_build_target_name} PRIVATE
+ -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static
+ "--cuda-path=${LIBC_CUDA_ROOT}")
elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP)
target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib++ -static)
else()
@@ -642,12 +663,12 @@ function(add_libc_hermetic_test test_name)
target_link_libraries(
${fq_build_target_name}
PRIVATE
- libc.startup.${LIBC_TARGET_OS}.crt1
+ libc.startup.${LIBC_TARGET_OS}.crt1${internal_suffix}
${link_libraries}
LibcTest.hermetic
LibcHermeticTestSupport.hermetic
# The NVIDIA 'nvlink' linker does not currently support static libraries.
- $<$<NOT:$<BOOL:${LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX}>>:${fq_target_name}.__libc__>)
+ $<$<NOT:$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>>:${fq_target_name}.__libc__>)
add_dependencies(${fq_build_target_name}
LibcTest.hermetic
libc.test.UnitTest.ErrnoSetterMatcher
@@ -660,7 +681,7 @@ function(add_libc_hermetic_test test_name)
endif()
set(test_cmd ${HERMETIC_TEST_ENV}
- $<$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_GPU}>:${gpu_loader_exe}> ${CMAKE_CROSSCOMPILING_EMULATOR} ${HERMETIC_TEST_LOADER_ARGS}
+ $<$<BOOL:${LIBC_TARGET_OS_IS_GPU}>:${gpu_loader_exe}> ${CMAKE_CROSSCOMPILING_EMULATOR} ${HERMETIC_TEST_LOADER_ARGS}
$<TARGET_FILE:${fq_build_target_name}> ${HERMETIC_TEST_ARGS})
add_custom_target(
${fq_target_name}
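As a concrete illustration of the link lines these rules now emit, an AMDGPU hermetic test links roughly as follows (the gfx90a value, code object version, and file names are placeholders):

  clang++ --target=amdgcn-amd-amdhsa -mcpu=gfx90a -flto -nostdlib -static \
    -Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0 \
    -Wl,-mllvm,-amdhsa-code-object-version=5 \
    test.o -o test

NVPTX tests instead link with -march=<sm arch> and pull in the .__internal__ startup and support objects, since the 'nvlink' linker does not support static libraries.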
diff --git a/libc/cmake/modules/prepare_libc_gpu_build.cmake b/libc/cmake/modules/prepare_libc_gpu_build.cmake
index 2086175..75beef8 100644
--- a/libc/cmake/modules/prepare_libc_gpu_build.cmake
+++ b/libc/cmake/modules/prepare_libc_gpu_build.cmake
@@ -1,23 +1,8 @@
-if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU)
+if(NOT LIBC_TARGET_OS_IS_GPU)
message(FATAL_ERROR
"libc build: Invalid attempt to set up GPU architectures.")
endif()
-# Set up the target architectures to build the GPU libc for.
-set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906"
- "gfx908;gfx90a;gfx90c;gfx940;gfx941;gfx942"
- "gfx1010;gfx1030;gfx1031;gfx1032;gfx1033;gfx1034"
- "gfx1035;gfx1036"
- "gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151")
-set(all_nvptx_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62"
- "sm_70;sm_72;sm_75;sm_80;sm_86;sm_89;sm_90")
-set(all_gpu_architectures
- "${all_amdgpu_architectures};${all_nvptx_architectures}")
-set(LIBC_GPU_ARCHITECTURES "all" CACHE STRING
- "List of GPU architectures to build the libc for.")
-set(AMDGPU_TARGET_TRIPLE "amdgcn-amd-amdhsa")
-set(NVPTX_TARGET_TRIPLE "nvptx64-nvidia-cuda")
-
# Ensure the compiler is a valid clang when building the GPU target.
set(req_ver "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}")
if(NOT (CMAKE_CXX_COMPILER_ID MATCHES "[Cc]lang" AND
@@ -31,40 +16,6 @@ if(NOT LLVM_LIBC_FULL_BUILD)
"GPU.")
endif()
-# Identify any locally installed AMD GPUs on the system using 'amdgpu-arch'.
-find_program(LIBC_AMDGPU_ARCH
- NAMES amdgpu-arch NO_DEFAULT_PATH
- PATHS ${LLVM_BINARY_DIR}/bin /opt/rocm/llvm/bin/)
-
-# Identify any locally installed NVIDIA GPUs on the system using 'nvptx-arch'.
-find_program(LIBC_NVPTX_ARCH
- NAMES nvptx-arch NO_DEFAULT_PATH
- PATHS ${LLVM_BINARY_DIR}/bin)
-
-# Get the list of all natively supported GPU architectures.
-set(detected_gpu_architectures "")
-foreach(arch_tool ${LIBC_NVPTX_ARCH} ${LIBC_AMDGPU_ARCH})
- if(arch_tool)
- execute_process(COMMAND ${arch_tool}
- OUTPUT_VARIABLE arch_tool_output
- ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
- string(REPLACE "\n" ";" arch_list "${arch_tool_output}")
- list(APPEND detected_gpu_architectures "${arch_list}")
- endif()
-endforeach()
-list(REMOVE_DUPLICATES detected_gpu_architectures)
-
-if(LIBC_GPU_ARCHITECTURES STREQUAL "all")
- set(LIBC_GPU_ARCHITECTURES ${all_gpu_architectures})
-elseif(LIBC_GPU_ARCHITECTURES STREQUAL "native")
- if(NOT detected_gpu_architectures)
- message(FATAL_ERROR "No GPUs found on the system when using 'native'")
- endif()
- set(LIBC_GPU_ARCHITECTURES ${detected_gpu_architectures})
-endif()
-message(STATUS "Building libc for the following GPU architecture(s): "
- "${LIBC_GPU_ARCHITECTURES}")
-
# Identify the program used to package multiple images into a single binary.
find_program(LIBC_CLANG_OFFLOAD_PACKAGER
NAMES clang-offload-packager NO_DEFAULT_PATH
@@ -87,49 +38,54 @@ else()
endif()
set(LIBC_GPU_TEST_ARCHITECTURE "" CACHE STRING "Architecture for the GPU tests")
+if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
+ check_cxx_compiler_flag("-nogpulib -mcpu=native" PLATFORM_HAS_GPU)
+elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+ # Identify any locally installed NVIDIA GPUs on the system using 'nvptx-arch'.
+ # Using 'check_cxx_compiler_flag' does not work currently due to the link job.
+ find_program(LIBC_NVPTX_ARCH
+ NAMES nvptx-arch NO_DEFAULT_PATH
+ PATHS ${LLVM_BINARY_DIR}/bin)
+ if(LIBC_NVPTX_ARCH)
+ execute_process(COMMAND ${LIBC_NVPTX_ARCH}
+ OUTPUT_VARIABLE arch_tool_output
+ ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+ if(arch_tool_output MATCHES "^sm_[0-9]+")
+ set(PLATFORM_HAS_GPU TRUE)
+ endif()
+ endif()
+endif()
set(gpu_test_architecture "")
if(LIBC_GPU_TEST_ARCHITECTURE)
+ set(LIBC_GPU_TESTS_DISABLED FALSE)
set(gpu_test_architecture ${LIBC_GPU_TEST_ARCHITECTURE})
message(STATUS "Using user-specified GPU architecture for testing: "
"'${gpu_test_architecture}'")
-elseif(detected_gpu_architectures)
- list(GET detected_gpu_architectures 0 gpu_test_architecture)
+elseif(PLATFORM_HAS_GPU)
+ set(LIBC_GPU_TESTS_DISABLED FALSE)
+ set(gpu_test_architecture "native")
message(STATUS "Using GPU architecture detected on the system for testing: "
- "'${gpu_test_architecture}'")
+ "'native'")
else()
- list(LENGTH LIBC_GPU_ARCHITECTURES n_gpu_archs)
- if (${n_gpu_archs} EQUAL 1)
- set(gpu_test_architecture ${LIBC_GPU_ARCHITECTURES})
- message(STATUS "Using user-specified GPU architecture for testing: "
- "'${gpu_test_architecture}'")
- else()
- message(STATUS "No GPU architecture set for testing. GPU tests will not be "
- "availibe. Set 'LIBC_GPU_TEST_ARCHITECTURE' to override.")
- return()
- endif()
+ set(LIBC_GPU_TESTS_DISABLED TRUE)
+ message(STATUS "No GPU architecture detected or provided, tests will not be "
+ "built")
endif()
+set(LIBC_GPU_TARGET_ARCHITECTURE "${gpu_test_architecture}")
-if("${gpu_test_architecture}" IN_LIST all_amdgpu_architectures)
- set(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU TRUE)
- set(LIBC_GPU_TARGET_TRIPLE ${AMDGPU_TARGET_TRIPLE})
- set(LIBC_GPU_TARGET_ARCHITECTURE "${gpu_test_architecture}")
-elseif("${gpu_test_architecture}" IN_LIST all_nvptx_architectures)
- set(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX TRUE)
- set(LIBC_GPU_TARGET_TRIPLE ${NVPTX_TARGET_TRIPLE})
- set(LIBC_GPU_TARGET_ARCHITECTURE "${gpu_test_architecture}")
-else()
- message(FATAL_ERROR "Unknown GPU architecture '${gpu_test_architecture}'")
-endif()
+if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+ # FIXME: This is a hack required to keep the CUDA package from trying to find
+ # pthreads. We only link the CUDA driver, so this is unneeded.
+ add_library(CUDA::cudart_static_deps IMPORTED INTERFACE)
-if(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
find_package(CUDAToolkit QUIET)
if(CUDAToolkit_FOUND)
get_filename_component(LIBC_CUDA_ROOT "${CUDAToolkit_BIN_DIR}" DIRECTORY ABSOLUTE)
endif()
endif()
-if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
+if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
# The AMDGPU environment uses different code objects to encode the ABI for
# kernel calls and intrinsic functions. We want to specify this manually to
# conform to whatever the test suite was built to handle.
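When neither autodetection nor LIBC_GPU_TEST_ARCHITECTURE yields an architecture, LIBC_GPU_TESTS_DISABLED is set and the test targets are not built. Overriding the detection is a single cache entry, for example (the architecture value is a placeholder):

  cmake ... -DLIBC_GPU_TEST_ARCHITECTURE=gfx90a   # or e.g. sm_80 for NVPTX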