#ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H #define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H #include "benchmarks/gpu/BenchmarkLogger.h" #include "benchmarks/gpu/timing/timing.h" #include "hdr/stdint_proxy.h" #include "src/__support/CPP/array.h" #include "src/__support/CPP/functional.h" #include "src/__support/CPP/limits.h" #include "src/__support/CPP/string_view.h" #include "src/__support/CPP/type_traits.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/macros/config.h" #include "src/stdlib/rand.h" #include "src/time/clock.h" namespace LIBC_NAMESPACE_DECL { namespace benchmarks { struct BenchmarkOptions { uint32_t initial_iterations = 1; uint32_t min_iterations = 1; uint32_t max_iterations = 10000000; uint32_t min_samples = 4; uint32_t max_samples = 1000; int64_t min_duration = 500 * 1000; // 500 * 1000 nanoseconds = 500 us int64_t max_duration = 1000 * 1000 * 1000; // 1e9 nanoseconds = 1 second double epsilon = 0.0001; double scaling_factor = 1.4; }; struct Measurement { uint32_t iterations = 0; uint64_t elapsed_cycles = 0; }; class RefinableRuntimeEstimation { uint64_t total_cycles = 0; uint32_t total_iterations = 0; public: uint64_t update(const Measurement &M) { total_cycles += M.elapsed_cycles; total_iterations += M.iterations; return total_cycles / total_iterations; } }; // Tracks the progression of the runtime estimation class RuntimeEstimationProgression { RefinableRuntimeEstimation rre; public: uint64_t current_estimation = 0; double compute_improvement(const Measurement &M) { const uint64_t new_estimation = rre.update(M); double ratio = (static_cast(current_estimation) / new_estimation) - 1.0; // Get absolute value if (ratio < 0) ratio *= -1; current_estimation = new_estimation; return ratio; } }; struct BenchmarkResult { uint64_t cycles = 0; double standard_deviation = 0; uint64_t min = UINT64_MAX; uint64_t max = 0; uint32_t samples = 0; uint32_t total_iterations = 0; clock_t total_time = 0; }; BenchmarkResult benchmark(const 
BenchmarkOptions &options, cpp::function wrapper_func); class Benchmark { const cpp::function func; const cpp::string_view suite_name; const cpp::string_view test_name; const uint32_t num_threads; public: Benchmark(cpp::function func, char const *suite_name, char const *test_name, uint32_t num_threads) : func(func), suite_name(suite_name), test_name(test_name), num_threads(num_threads) { add_benchmark(this); } static void run_benchmarks(); const cpp::string_view get_suite_name() const { return suite_name; } const cpp::string_view get_test_name() const { return test_name; } protected: static void add_benchmark(Benchmark *benchmark); private: BenchmarkResult run() { BenchmarkOptions options; return benchmark(options, func); } }; // We want our random values to be approximately // Output: a random number with the exponent field between min_exp and max_exp, // i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1), // Caveats: // -EXP_BIAS corresponding to denormal values, // EXP_BIAS + 1 corresponding to inf or nan. template static T get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits::EXP_BIAS, int min_exp = -LIBC_NAMESPACE::fputil::FPBits::EXP_BIAS) { using FPBits = LIBC_NAMESPACE::fputil::FPBits; // Required to correctly instantiate FPBits for floats and doubles. 
using RandType = typename cpp::conditional_t<(cpp::is_same_v), uint64_t, uint32_t>; RandType bits; if constexpr (cpp::is_same_v) bits = (static_cast(LIBC_NAMESPACE::rand()) << 32) | static_cast(LIBC_NAMESPACE::rand()); else bits = LIBC_NAMESPACE::rand(); double scale = static_cast(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1); FPBits fp(bits); fp.set_biased_exponent( static_cast(fp.get_biased_exponent() * scale + min_exp)); return fp.get_val(); } template class MathPerf { using FPBits = fputil::FPBits; using StorageType = typename FPBits::StorageType; static constexpr StorageType UIntMax = cpp::numeric_limits::max(); public: template static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp) { cpp::array inputs; for (size_t i = 0; i < N; ++i) inputs[i] = get_rand_input(min_exp, max_exp); uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs); return total_time / N; } // Throughput benchmarking for functions that take 2 inputs. template static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp, int arg1_max_exp, int arg2_min_exp, int arg2_max_exp) { cpp::array inputs1; cpp::array inputs2; for (size_t i = 0; i < N; ++i) { inputs1[i] = get_rand_input(arg1_min_exp, arg1_max_exp); inputs2[i] = get_rand_input(arg2_min_exp, arg2_max_exp); } uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs1, inputs2); return total_time / N; } }; } // namespace benchmarks } // namespace LIBC_NAMESPACE_DECL // Passing -1 indicates the benchmark should be run with as many threads as // allocated by the user in the benchmark's CMake. 
// Defines a Benchmark object named <SuiteName>_<TestName>_Instance wrapping
// Func, with -1 as the thread count. Constructing the object registers it (the
// Benchmark constructor calls add_benchmark(this)).
#define BENCHMARK(SuiteName, TestName, Func)                                   \
  LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance(     \
      Func, #SuiteName, #TestName, -1)

// Same as BENCHMARK, but with an explicit thread count.
#define BENCHMARK_N_THREADS(SuiteName, TestName, Func, NumThreads)             \
  LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance(     \
      Func, #SuiteName, #TestName, NumThreads)

// Benchmark with a thread count of 1.
#define SINGLE_THREADED_BENCHMARK(SuiteName, TestName, Func)                   \
  BENCHMARK_N_THREADS(SuiteName, TestName, Func, 1)

// Benchmark whose thread count is LIBC_NAMESPACE::gpu::get_lane_size().
// NOTE(review): this header does not visibly include a declaration of
// gpu::get_lane_size(); users of this macro presumably need to include it
// themselves - confirm.
#define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func)                       \
  BENCHMARK_N_THREADS(SuiteName, TestName, Func,                               \
                      LIBC_NAMESPACE::gpu::get_lane_size())

#endif // LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H