diff options
author | jameshu15869 <55058507+jameshu15869@users.noreply.github.com> | 2024-06-26 16:38:39 -0500 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-06-26 16:38:39 -0500 |
commit | 02b57dedb73134bc81f09e2ff3c56e286091ba13 (patch) | |
tree | 93aecc9496025974a5bde255a6081621e5f7bc15 /libc | |
parent | 49e5cd2acc0d12f7cdb80aafd9ab26719d4415aa (diff) | |
download | llvm-02b57dedb73134bc81f09e2ff3c56e286091ba13.zip llvm-02b57dedb73134bc81f09e2ff3c56e286091ba13.tar.gz llvm-02b57dedb73134bc81f09e2ff3c56e286091ba13.tar.bz2 |
[libc] NVPTX Profiling (#92009)
Adds microbenchmarking infrastructure for NVPTX. Because `nvlink`
cannot perform LTO, `libc` functions cannot be inlined into the benchmarks,
and the resulting function-call overhead is not subtracted from the measured results.
Diffstat (limited to 'libc')
-rw-r--r-- | libc/CMakeLists.txt | 4 | ||||
-rw-r--r-- | libc/benchmarks/CMakeLists.txt | 10 | ||||
-rw-r--r-- | libc/benchmarks/gpu/BenchmarkLogger.cpp | 97 | ||||
-rw-r--r-- | libc/benchmarks/gpu/BenchmarkLogger.h | 27 | ||||
-rw-r--r-- | libc/benchmarks/gpu/CMakeLists.txt | 56 | ||||
-rw-r--r-- | libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 140 | ||||
-rw-r--r-- | libc/benchmarks/gpu/LibcGpuBenchmark.h | 108 | ||||
-rw-r--r-- | libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp | 6 | ||||
-rw-r--r-- | libc/benchmarks/gpu/src/CMakeLists.txt | 1 | ||||
-rw-r--r-- | libc/benchmarks/gpu/src/ctype/CMakeLists.txt | 21 | ||||
-rw-r--r-- | libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp | 9 | ||||
-rw-r--r-- | libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp | 9 | ||||
-rw-r--r-- | libc/benchmarks/gpu/timing/CMakeLists.txt | 12 | ||||
-rw-r--r-- | libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt | 7 | ||||
-rw-r--r-- | libc/benchmarks/gpu/timing/nvptx/timing.h | 99 | ||||
-rw-r--r-- | libc/benchmarks/gpu/timing/timing.h | 22 | ||||
-rw-r--r-- | libc/cmake/modules/LLVMLibCTestRules.cmake | 27 |
17 files changed, 644 insertions, 11 deletions
diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt index f35471a..4ffcd55 100644 --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -401,9 +401,7 @@ if(LLVM_INCLUDE_TESTS) add_subdirectory(fuzzing) endif() -if(LIBC_INCLUDE_BENCHMARKS) - add_subdirectory(benchmarks) -endif() +add_subdirectory(benchmarks) if (LIBC_INCLUDE_DOCS) add_subdirectory(docs) diff --git a/libc/benchmarks/CMakeLists.txt b/libc/benchmarks/CMakeLists.txt index 4978da6..0cff6eb 100644 --- a/libc/benchmarks/CMakeLists.txt +++ b/libc/benchmarks/CMakeLists.txt @@ -1,3 +1,13 @@ +if(LIBC_TARGET_OS_IS_GPU) + add_subdirectory(gpu) + return() +endif() + +# The CPU build depends on Google benchmark. +if(NOT LIBC_INCLUDE_BENCHMARKS) + return() +endif() + find_package(Threads) set(LLVM_LINK_COMPONENTS diff --git a/libc/benchmarks/gpu/BenchmarkLogger.cpp b/libc/benchmarks/gpu/BenchmarkLogger.cpp new file mode 100644 index 0000000..2e7e8e7 --- /dev/null +++ b/libc/benchmarks/gpu/BenchmarkLogger.cpp @@ -0,0 +1,97 @@ +#include "benchmarks/gpu/BenchmarkLogger.h" +#include "src/__support/CPP/string.h" +#include "src/__support/CPP/string_view.h" +#include "src/__support/OSUtil/io.h" // write_to_stderr +#include "src/__support/big_int.h" // is_big_int +#include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128 +#include "src/__support/uint128.h" + +#include <stdint.h> + +namespace LIBC_NAMESPACE { +namespace benchmarks { + +// cpp::string_view specialization +template <> +BenchmarkLogger & + BenchmarkLogger::operator<< <cpp::string_view>(cpp::string_view str) { + LIBC_NAMESPACE::write_to_stderr(str); + return *this; +} + +// cpp::string specialization +template <> +BenchmarkLogger &BenchmarkLogger::operator<< <cpp::string>(cpp::string str) { + return *this << static_cast<cpp::string_view>(str); +} + +// const char* specialization +template <> +BenchmarkLogger &BenchmarkLogger::operator<< <const char *>(const char *str) { + return *this << cpp::string_view(str); +} + +// char* specialization 
+template <> BenchmarkLogger &BenchmarkLogger::operator<< <char *>(char *str) { + return *this << cpp::string_view(str); +} + +// char specialization +template <> BenchmarkLogger &BenchmarkLogger::operator<<(char ch) { + return *this << cpp::string_view(&ch, 1); +} + +// bool specialization +template <> BenchmarkLogger &BenchmarkLogger::operator<<(bool cond) { + return *this << (cond ? "true" : "false"); +} + +// void * specialization +template <> BenchmarkLogger &BenchmarkLogger::operator<<(void *addr) { + return *this << "0x" << cpp::to_string(reinterpret_cast<uintptr_t>(addr)); +} + +template <typename T> BenchmarkLogger &BenchmarkLogger::operator<<(T t) { + if constexpr (is_big_int_v<T> || + (cpp::is_integral_v<T> && cpp::is_unsigned_v<T> && + (sizeof(T) > sizeof(uint64_t)))) { + static_assert(sizeof(T) % 8 == 0, "Unsupported size of UInt"); + const IntegerToString<T, radix::Hex::WithPrefix> buffer(t); + return *this << buffer.view(); + } else { + return *this << cpp::to_string(t); + } +} + +// is_integral specializations +// char is already specialized to handle character +template BenchmarkLogger &BenchmarkLogger::operator<< <short>(short); +template BenchmarkLogger &BenchmarkLogger::operator<< <int>(int); +template BenchmarkLogger &BenchmarkLogger::operator<< <long>(long); +template BenchmarkLogger &BenchmarkLogger::operator<< <long long>(long long); +template BenchmarkLogger & + BenchmarkLogger::operator<< <unsigned char>(unsigned char); +template BenchmarkLogger & + BenchmarkLogger::operator<< <unsigned short>(unsigned short); +template BenchmarkLogger & + BenchmarkLogger::operator<< <unsigned int>(unsigned int); +template BenchmarkLogger & + BenchmarkLogger::operator<< <unsigned long>(unsigned long); +template BenchmarkLogger & + BenchmarkLogger::operator<< <unsigned long long>(unsigned long long); + +#ifdef LIBC_TYPES_HAS_INT128 +template BenchmarkLogger & + BenchmarkLogger::operator<< <__uint128_t>(__uint128_t); +#endif // LIBC_TYPES_HAS_INT128 
+template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<128>>(UInt<128>); +template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<192>>(UInt<192>); +template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<256>>(UInt<256>); +template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<320>>(UInt<320>); + +// TODO: Add floating point formatting once it's supported by StringStream. + +BenchmarkLogger log; + +} // namespace benchmarks +} // namespace LIBC_NAMESPACE diff --git a/libc/benchmarks/gpu/BenchmarkLogger.h b/libc/benchmarks/gpu/BenchmarkLogger.h new file mode 100644 index 0000000..332ff14 --- /dev/null +++ b/libc/benchmarks/gpu/BenchmarkLogger.h @@ -0,0 +1,27 @@ +//===-- Utilities to log to standard output during tests --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H +#define LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H + +namespace LIBC_NAMESPACE { +namespace benchmarks { + +// A class to log to standard output in the context of hermetic tests. +struct BenchmarkLogger { + constexpr BenchmarkLogger() = default; + template <typename T> BenchmarkLogger &operator<<(T); +}; + +// A global TestLogger instance to be used in tests. 
+extern BenchmarkLogger log; + +} // namespace benchmarks +} // namespace LIBC_NAMESPACE + +#endif /* LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H */ diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt new file mode 100644 index 0000000..d167abc --- /dev/null +++ b/libc/benchmarks/gpu/CMakeLists.txt @@ -0,0 +1,56 @@ +add_subdirectory(timing) + +add_custom_target(gpu-benchmark) + +function(add_benchmark benchmark_name) + cmake_parse_arguments( + "BENCHMARK" + "" # Optional arguments + "" # Single value arguments + "LINK_LIBRARIES" # Multi-value arguments + ${ARGN} + ) + if(NOT libc.src.time.clock IN_LIST TARGET_LLVMLIBC_ENTRYPOINTS) + message(FATAL_ERROR "target does not support clock") + endif() + add_libc_hermetic( + ${benchmark_name} + IS_BENCHMARK + LINK_LIBRARIES + LibcGpuBenchmark.hermetic + ${BENCHMARK_LINK_LIBRARIES} + ${BENCHMARK_UNPARSED_ARGUMENTS} + ) + get_fq_target_name(${benchmark_name} fq_target_name) + add_dependencies(gpu-benchmark ${fq_target_name}) +endfunction(add_benchmark) + +add_unittest_framework_library( + LibcGpuBenchmark + SRCS + LibcGpuBenchmark.cpp + LibcGpuBenchmarkMain.cpp + BenchmarkLogger.cpp + HDRS + LibcGpuBenchmark.h + BenchmarkLogger.h + DEPENDS + libc.src.__support.big_int + libc.src.__support.c_string + libc.src.__support.CPP.string + libc.src.__support.CPP.string_view + libc.src.__support.CPP.type_traits + libc.src.__support.CPP.functional + libc.src.__support.CPP.limits + libc.src.__support.CPP.algorithm + libc.src.__support.fixed_point.fx_rep + libc.src.__support.macros.properties.types + libc.src.__support.OSUtil.osutil + libc.src.__support.uint128 + libc.src.__support.FPUtil.sqrt + libc.src.__support.fixedvector + libc.src.time.clock + libc.benchmarks.gpu.timing.timing +) + +add_subdirectory(src) diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp new file mode 100644 index 0000000..69adb0c --- /dev/null +++ 
b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp @@ -0,0 +1,140 @@ +#include "LibcGpuBenchmark.h" +#include "src/__support/CPP/algorithm.h" +#include "src/__support/CPP/array.h" +#include "src/__support/CPP/string.h" +#include "src/__support/FPUtil/sqrt.h" +#include "src/__support/GPU/utils.h" +#include "src/__support/fixedvector.h" +#include "src/time/gpu/time_utils.h" + +namespace LIBC_NAMESPACE { +namespace benchmarks { + +FixedVector<Benchmark *, 64> benchmarks; +cpp::array<BenchmarkResult, 1024> results; + +void Benchmark::add_benchmark(Benchmark *benchmark) { + benchmarks.push_back(benchmark); +} + +BenchmarkResult reduce_results(cpp::array<BenchmarkResult, 1024> &results) { + BenchmarkResult result; + uint64_t cycles_sum = 0; + double standard_deviation_sum = 0; + uint64_t min = UINT64_MAX; + uint64_t max = 0; + uint32_t samples_sum = 0; + uint32_t iterations_sum = 0; + clock_t time_sum = 0; + uint64_t num_threads = gpu::get_num_threads(); + for (uint64_t i = 0; i < num_threads; i++) { + BenchmarkResult current_result = results[i]; + cycles_sum += current_result.cycles; + standard_deviation_sum += current_result.standard_deviation; + min = cpp::min(min, current_result.min); + max = cpp::max(max, current_result.max); + samples_sum += current_result.samples; + iterations_sum += current_result.total_iterations; + time_sum += current_result.total_time; + } + result.cycles = cycles_sum / num_threads; + result.standard_deviation = standard_deviation_sum / num_threads; + result.min = min; + result.max = max; + result.samples = samples_sum / num_threads; + result.total_iterations = iterations_sum / num_threads; + result.total_time = time_sum / num_threads; + return result; +} + +void Benchmark::run_benchmarks() { + uint64_t id = gpu::get_thread_id(); + gpu::sync_threads(); + + for (Benchmark *benchmark : benchmarks) + results[id] = benchmark->run(); + gpu::sync_threads(); + if (id == 0) { + for (Benchmark *benchmark : benchmarks) { + BenchmarkResult all_results = 
reduce_results(results); + constexpr auto GREEN = "\033[32m"; + constexpr auto RESET = "\033[0m"; + log << GREEN << "[ RUN ] " << RESET << benchmark->get_name() << '\n'; + log << GREEN << "[ OK ] " << RESET << benchmark->get_name() << ": " + << all_results.cycles << " cycles, " << all_results.min << " min, " + << all_results.max << " max, " << all_results.total_iterations + << " iterations, " << all_results.total_time << " ns, " + << static_cast<long>(all_results.standard_deviation) << " stddev\n"; + } + } + gpu::sync_threads(); +} + +BenchmarkResult benchmark(const BenchmarkOptions &options, + cpp::function<uint64_t(void)> wrapper_func) { + BenchmarkResult result; + RuntimeEstimationProgression rep; + uint32_t total_iterations = 0; + uint32_t iterations = options.initial_iterations; + if (iterations < 1u) + iterations = 1; + + uint32_t samples = 0; + uint64_t total_time = 0; + uint64_t best_guess = 0; + uint64_t total_cycles = 0; + uint64_t cycles_squared = 0; + uint64_t min = UINT64_MAX; + uint64_t max = 0; + + uint64_t overhead = UINT64_MAX; + int overhead_iterations = 10; + for (int i = 0; i < overhead_iterations; i++) + overhead = cpp::min(overhead, LIBC_NAMESPACE::overhead()); + + for (uint64_t time_budget = options.max_duration; time_budget >= 0;) { + uint64_t sample_cycles = 0; + const clock_t start = static_cast<double>(clock()); + for (uint32_t i = 0; i < iterations; i++) { + auto wrapper_intermediate = wrapper_func(); + uint64_t result = wrapper_intermediate - overhead; + max = cpp::max(max, result); + min = cpp::min(min, result); + sample_cycles += result; + } + const clock_t end = clock(); + const clock_t duration_ns = + ((end - start) * 1000 * 1000 * 1000) / CLOCKS_PER_SEC; + total_time += duration_ns; + time_budget -= duration_ns; + samples++; + total_cycles += sample_cycles; + cycles_squared += sample_cycles * sample_cycles; + + total_iterations += iterations; + const double change_ratio = + rep.compute_improvement({iterations, sample_cycles}); + 
best_guess = rep.current_estimation; + + if (samples >= options.max_samples || iterations >= options.max_iterations) + break; + if (total_time >= options.min_duration && samples >= options.min_samples && + change_ratio < options.epsilon) + break; + + iterations *= options.scaling_factor; + } + result.cycles = best_guess; + result.standard_deviation = fputil::sqrt<double>( + static_cast<double>(cycles_squared) / total_iterations - + static_cast<double>(best_guess * best_guess)); + result.min = min; + result.max = max; + result.samples = samples; + result.total_iterations = total_iterations; + result.total_time = total_time; + return result; +}; + +} // namespace benchmarks +} // namespace LIBC_NAMESPACE diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h new file mode 100644 index 0000000..59dd589 --- /dev/null +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h @@ -0,0 +1,108 @@ +#ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H +#define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H + +#include "benchmarks/gpu/BenchmarkLogger.h" +#include "benchmarks/gpu/timing/timing.h" +#include "src/__support/CPP/functional.h" +#include "src/__support/CPP/limits.h" +#include "src/__support/CPP/string_view.h" +#include "src/time/clock.h" + +#include <stdint.h> + +namespace LIBC_NAMESPACE { + +namespace benchmarks { + +struct BenchmarkOptions { + uint32_t initial_iterations = 1; + uint32_t max_iterations = 10000000; + uint32_t min_samples = 4; + uint32_t max_samples = 1000; + uint64_t min_duration = 0; // in nanoseconds (ns) + uint64_t max_duration = 1000 * 1000 * 1000; // 1e9 nanoseconds = 1 second + double epsilon = 0.01; + double scaling_factor = 1.4; +}; + +struct Measurement { + uint32_t iterations = 0; + uint64_t elapsed_cycles = 0; +}; + +class RefinableRuntimeEstimation { + uint64_t total_cycles = 0; + uint32_t total_iterations = 0; + +public: + uint64_t update(const Measurement &M) { + total_cycles += M.elapsed_cycles; + total_iterations += 
M.iterations; + return total_cycles / total_iterations; + } +}; + +// Tracks the progression of the runtime estimation +class RuntimeEstimationProgression { + RefinableRuntimeEstimation rre; + +public: + uint64_t current_estimation = 0; + + double compute_improvement(const Measurement &M) { + const uint64_t new_estimation = rre.update(M); + double ratio = + (static_cast<double>(current_estimation) / new_estimation) - 1.0; + + // Get absolute value + if (ratio < 0) + ratio *= -1; + + current_estimation = new_estimation; + return ratio; + } +}; + +struct BenchmarkResult { + uint64_t cycles = 0; + double standard_deviation = 0; + uint64_t min = UINT64_MAX; + uint64_t max = 0; + uint32_t samples = 0; + uint32_t total_iterations = 0; + clock_t total_time = 0; +}; + +BenchmarkResult benchmark(const BenchmarkOptions &options, + cpp::function<uint64_t(void)> wrapper_func); + +class Benchmark { + const cpp::function<uint64_t(void)> func; + const cpp::string_view name; + +public: + Benchmark(cpp::function<uint64_t(void)> func, char const *name) + : func(func), name(name) { + add_benchmark(this); + } + + static void run_benchmarks(); + +protected: + static void add_benchmark(Benchmark *benchmark); + +private: + BenchmarkResult run() { + BenchmarkOptions options; + return benchmark(options, func); + } + const cpp::string_view get_name() const { return name; } +}; +} // namespace benchmarks +} // namespace LIBC_NAMESPACE + +#define BENCHMARK(SuiteName, TestName, Func) \ + LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \ + Func, #SuiteName "." 
#TestName); + +#endif diff --git a/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp new file mode 100644 index 0000000..97366e5 --- /dev/null +++ b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp @@ -0,0 +1,6 @@ +#include "LibcGpuBenchmark.h" + +extern "C" int main(int argc, char **argv, char **envp) { + LIBC_NAMESPACE::benchmarks::Benchmark::run_benchmarks(); + return 0; +} diff --git a/libc/benchmarks/gpu/src/CMakeLists.txt b/libc/benchmarks/gpu/src/CMakeLists.txt new file mode 100644 index 0000000..42eb4f7 --- /dev/null +++ b/libc/benchmarks/gpu/src/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(ctype) diff --git a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt new file mode 100644 index 0000000..79f0142 --- /dev/null +++ b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt @@ -0,0 +1,21 @@ +add_custom_target(libc-gpu-ctype-benchmarks) + +add_benchmark( + isalnum_benchmark + SUITE + libc-gpu-ctype-benchmarks + SRCS + isalnum_benchmark.cpp + DEPENDS + libc.src.ctype.isalnum +) + +add_benchmark( + isalpha_benchmark + SUITE + libc-gpu-ctype-benchmarks + SRCS + isalpha_benchmark.cpp + DEPENDS + libc.src.ctype.isalpha +) diff --git a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp new file mode 100644 index 0000000..4050bc0 --- /dev/null +++ b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp @@ -0,0 +1,9 @@ +#include "benchmarks/gpu/LibcGpuBenchmark.h" + +#include "src/ctype/isalnum.h" + +uint64_t BM_IsAlnum() { + char x = 'c'; + return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x); +} +BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumWrapper, BM_IsAlnum); diff --git a/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp new file mode 100644 index 0000000..2038eb8 --- /dev/null +++ b/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp @@ -0,0 +1,9 @@ +#include 
"benchmarks/gpu/LibcGpuBenchmark.h" + +#include "src/ctype/isalpha.h" + +uint64_t BM_IsAlpha() { + char x = 'c'; + return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalpha, x); +} +BENCHMARK(LlvmLibcIsAlphaGpuBenchmark, IsAlpha, BM_IsAlpha); diff --git a/libc/benchmarks/gpu/timing/CMakeLists.txt b/libc/benchmarks/gpu/timing/CMakeLists.txt new file mode 100644 index 0000000..8bbc7e3 --- /dev/null +++ b/libc/benchmarks/gpu/timing/CMakeLists.txt @@ -0,0 +1,12 @@ +foreach(target nvptx) + add_subdirectory(${target}) + list(APPEND target_gpu_timing libc.benchmarks.gpu.timing.${target}.${target}_timing) +endforeach() + +add_header_library( + timing + HDRS + timing.h + DEPENDS + ${target_gpu_timing} +) diff --git a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt new file mode 100644 index 0000000..9958e16 --- /dev/null +++ b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt @@ -0,0 +1,7 @@ +add_header_library( + nvptx_timing + HDRS + timing.h + DEPENDS + libc.src.__support.common +) diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h new file mode 100644 index 0000000..d3851a7 --- /dev/null +++ b/libc/benchmarks/gpu/timing/nvptx/timing.h @@ -0,0 +1,99 @@ +//===------------- NVPTX implementation of timing utils ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_UTILS_GPU_TIMING_NVPTX +#define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX + +#include "src/__support/GPU/utils.h" +#include "src/__support/common.h" +#include "src/__support/macros/attributes.h" +#include "src/__support/macros/config.h" + +#include <stdint.h> + +namespace LIBC_NAMESPACE { + +// Returns the overhead associated with calling the profiling region. This +// allows us to substract the constant-time overhead from the latency to +// obtain a true result. This can vary with system load. +[[gnu::noinline]] static uint64_t overhead() { + volatile uint32_t x = 1; + uint32_t y = x; + uint64_t start = gpu::processor_clock(); + asm("" ::"r"(y), "llr"(start)); + uint32_t result = y; + asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :); + uint64_t stop = gpu::processor_clock(); + volatile auto storage = result; + return stop - start; +} + +// Stimulate a simple function and obtain its latency in clock cycles on the +// system. This function cannot be inlined or else it will disturb the very +// delicate balance of hard-coded dependencies. +template <typename F, typename T> +[[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T t) { + // We need to store the input somewhere to guarantee that the compiler will + // not constant propagate it and remove the profiling region. + volatile T storage = t; + T arg = storage; + asm("" ::"r"(arg)); + + // Get the current timestamp from the clock. + gpu::memory_fence(); + uint64_t start = gpu::processor_clock(); + + // This forces the compiler to load the input argument and run the clock cycle + // counter before the profiling region. + asm("" ::"r"(arg), "llr"(start)); + + // Run the function under test and return its value. 
+ auto result = f(arg); + + // This inline assembly performs a no-op which forces the result to both be + // used and prevents us from exiting this region before it's complete. + asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :); + + // Obtain the current timestamp after running the calculation and force + // ordering. + uint64_t stop = gpu::processor_clock(); + gpu::memory_fence(); + asm("" ::"r"(stop)); + volatile T output = result; + + // Return the time elapsed. + return stop - start; +} + +template <typename F, typename T1, typename T2> +static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) { + volatile T1 storage = t1; + volatile T2 storage2 = t2; + T1 arg = storage; + T2 arg2 = storage2; + asm("" ::"r"(arg), "r"(arg2)); + + gpu::memory_fence(); + uint64_t start = gpu::processor_clock(); + + asm("" ::"r"(arg), "r"(arg2), "llr"(start)); + + auto result = f(arg, arg2); + + asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :); + + uint64_t stop = gpu::processor_clock(); + gpu::memory_fence(); + asm("" ::"r"(stop)); + volatile auto output = result; + + return stop - start; +} +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX diff --git a/libc/benchmarks/gpu/timing/timing.h b/libc/benchmarks/gpu/timing/timing.h new file mode 100644 index 0000000..180ea77 --- /dev/null +++ b/libc/benchmarks/gpu/timing/timing.h @@ -0,0 +1,22 @@ +//===------------- Implementation of GPU timing utils -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_UTILS_GPU_TIMING_H +#define LLVM_LIBC_UTILS_GPU_TIMING_H + +#include "src/__support/macros/properties/architectures.h" + +#if defined(LIBC_TARGET_ARCH_IS_AMDGPU) +#error "amdgpu not yet supported" +#elif defined(LIBC_TARGET_ARCH_IS_NVPTX) +#include "nvptx/timing.h" +#else +#error "unsupported platform" +#endif + +#endif // LLVM_LIBC_UTILS_GPU_TIMING_H diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake index c8d7c8a..fbeec32 100644 --- a/libc/cmake/modules/LLVMLibCTestRules.cmake +++ b/libc/cmake/modules/LLVMLibCTestRules.cmake @@ -526,12 +526,15 @@ function(add_integration_test test_name) add_dependencies(${INTEGRATION_TEST_SUITE} ${fq_target_name}) endfunction(add_integration_test) -# Rule to add a hermetic test. A hermetic test is one whose executable is fully +# Rule to add a hermetic program. A hermetic program is one whose executable is fully # statically linked and consists of pieces drawn only from LLVM's libc. Nothing, # including the startup objects, come from the system libc. # +# For the GPU, these can be either tests or benchmarks, depending on the value +# of the LINK_LIBRARIES arg. +# # Usage: -# add_libc_hermetic_test( +# add_libc_hermetic( # <target name> # SUITE <the suite to which the test should belong> # SRCS <src1.cpp> [src2.cpp ...] 
@@ -543,14 +546,14 @@ endfunction(add_integration_test) # LINK_LIBRARIES <list of linking libraries for this target> # LOADER_ARGS <list of special args to loaders (like the GPU loader)> # ) -function(add_libc_hermetic_test test_name) +function(add_libc_hermetic test_name) if(NOT TARGET libc.startup.${LIBC_TARGET_OS}.crt1) message(VERBOSE "Skipping ${fq_target_name} as it is not available on ${LIBC_TARGET_OS}.") return() endif() cmake_parse_arguments( "HERMETIC_TEST" - "" # No optional arguments + "IS_BENCHMARK" # Optional arguments "SUITE" # Single value arguments "SRCS;HDRS;DEPENDS;ARGS;ENV;COMPILE_OPTIONS;LINK_LIBRARIES;LOADER_ARGS" # Multi-value arguments ${ARGN} @@ -678,7 +681,6 @@ function(add_libc_hermetic_test test_name) PRIVATE libc.startup.${LIBC_TARGET_OS}.crt1${internal_suffix} ${link_libraries} - LibcTest.hermetic LibcHermeticTestSupport.hermetic # The NVIDIA 'nvlink' linker does not currently support static libraries. $<$<NOT:$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>>:${fq_target_name}.__libc__>) @@ -714,8 +716,12 @@ function(add_libc_hermetic_test test_name) ) add_dependencies(${HERMETIC_TEST_SUITE} ${fq_target_name}) - add_dependencies(libc-hermetic-tests ${fq_target_name}) -endfunction(add_libc_hermetic_test) + if(NOT ${HERMETIC_TEST_IS_BENCHMARK}) + # If it is a benchmark, it will already have been added to the + # gpu-benchmark target + add_dependencies(libc-hermetic-tests ${fq_target_name}) + endif() +endfunction(add_libc_hermetic) # A convenience function to add both a unit test as well as a hermetic test. 
function(add_libc_test test_name) @@ -730,7 +736,12 @@ function(add_libc_test test_name) add_libc_unittest(${test_name}.__unit__ ${LIBC_TEST_UNPARSED_ARGUMENTS}) endif() if(LIBC_ENABLE_HERMETIC_TESTS AND NOT LIBC_TEST_UNIT_TEST_ONLY) - add_libc_hermetic_test(${test_name}.__hermetic__ ${LIBC_TEST_UNPARSED_ARGUMENTS}) + add_libc_hermetic( + ${test_name}.__hermetic__ + LINK_LIBRARIES + LibcTest.hermetic + ${LIBC_TEST_UNPARSED_ARGUMENTS} + ) get_fq_target_name(${test_name} fq_test_name) if(TARGET ${fq_test_name}.__hermetic__ AND TARGET ${fq_test_name}.__unit__) # Tests like the file tests perform file operations on disk file. If we |