#ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H #define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H #include "benchmarks/gpu/Random.h" #include "benchmarks/gpu/timing/timing.h" #include "hdr/stdint_proxy.h" #include "src/__support/CPP/algorithm.h" #include "src/__support/CPP/array.h" #include "src/__support/CPP/string_view.h" #include "src/__support/CPP/type_traits.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/sqrt.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { namespace benchmarks { struct BenchmarkOptions { uint32_t initial_iterations = 1; uint32_t min_iterations = 1; uint32_t max_iterations = 10000000; uint32_t min_samples = 4; uint32_t max_samples = 1000; int64_t min_duration = 500 * 1000; // 500 * 1000 nanoseconds = 500 us int64_t max_duration = 1000 * 1000 * 1000; // 1e9 nanoseconds = 1 second double epsilon = 0.0001; double scaling_factor = 1.4; }; class RefinableRuntimeEstimator { uint32_t iterations = 0; uint64_t sum_of_cycles = 0; uint64_t sum_of_squared_cycles = 0; public: void update(uint64_t cycles) noexcept { iterations += 1; sum_of_cycles += cycles; sum_of_squared_cycles += cycles * cycles; } void update(const RefinableRuntimeEstimator &other) noexcept { iterations += other.iterations; sum_of_cycles += other.sum_of_cycles; sum_of_squared_cycles += other.sum_of_squared_cycles; } double get_mean() const noexcept { if (iterations == 0) return 0.0; return static_cast(sum_of_cycles) / iterations; } double get_variance() const noexcept { if (iterations == 0) return 0.0; const double num = static_cast(iterations); const double sum_x = static_cast(sum_of_cycles); const double sum_x2 = static_cast(sum_of_squared_cycles); const double mean_of_squares = sum_x2 / num; const double mean = sum_x / num; const double mean_squared = mean * mean; const double variance = mean_of_squares - mean_squared; return variance < 0.0 ? 0.0 : variance; } double get_stddev() const noexcept { return fputil::sqrt(get_variance()); } uint32_t get_iterations() const noexcept { return iterations; } }; // Tracks the progression of the runtime estimation class RuntimeEstimationProgression { RefinableRuntimeEstimator estimator; double current_mean = 0.0; public: const RefinableRuntimeEstimator &get_estimator() const noexcept { return estimator; } double compute_improvement(const RefinableRuntimeEstimator &sample_estimator) { if (sample_estimator.get_iterations() == 0) return 1.0; estimator.update(sample_estimator); const double new_mean = estimator.get_mean(); if (current_mean == 0.0 || new_mean == 0.0) { current_mean = new_mean; return 1.0; } double ratio = (current_mean / new_mean) - 1.0; if (ratio < 0) ratio = -ratio; current_mean = new_mean; return ratio; } }; struct BenchmarkResult { uint64_t total_iterations = 0; double cycles = 0; double standard_deviation = 0; uint64_t min = UINT64_MAX; uint64_t max = 0; }; struct BenchmarkTarget { using IndexedFnPtr = uint64_t (*)(uint32_t); using IndexlessFnPtr = uint64_t (*)(); enum class Kind : uint8_t { Indexed, Indexless } kind; union { IndexedFnPtr indexed_fn_ptr; IndexlessFnPtr indexless_fn_ptr; }; LIBC_INLINE BenchmarkTarget(IndexedFnPtr func) : kind(Kind::Indexed), indexed_fn_ptr(func) {} LIBC_INLINE BenchmarkTarget(IndexlessFnPtr func) : kind(Kind::Indexless), indexless_fn_ptr(func) {} LIBC_INLINE uint64_t operator()([[maybe_unused]] uint32_t call_index) const { return kind == Kind::Indexed ? indexed_fn_ptr(call_index) : indexless_fn_ptr(); } }; BenchmarkResult benchmark(const BenchmarkOptions &options, const BenchmarkTarget &target); class Benchmark { const BenchmarkTarget target; const cpp::string_view suite_name; const cpp::string_view test_name; const uint32_t num_threads; public: Benchmark(uint64_t (*f)(), const char *suite, const char *test, uint32_t threads) : target(BenchmarkTarget(f)), suite_name(suite), test_name(test), num_threads(threads) { add_benchmark(this); } Benchmark(uint64_t (*f)(uint32_t), char const *suite_name, char const *test_name, uint32_t num_threads) : target(BenchmarkTarget(f)), suite_name(suite_name), test_name(test_name), num_threads(num_threads) { add_benchmark(this); } static void run_benchmarks(); const cpp::string_view get_suite_name() const { return suite_name; } const cpp::string_view get_test_name() const { return test_name; } protected: static void add_benchmark(Benchmark *benchmark); private: BenchmarkResult run() { BenchmarkOptions options; return benchmark(options, target); } }; template class MathPerf { static LIBC_INLINE uint64_t make_seed(uint64_t base_seed, uint64_t salt) { const uint64_t tid = gpu::get_thread_id(); return base_seed ^ (salt << 32) ^ (tid * 0x9E3779B97F4A7C15ULL); } public: // Returns cycles-per-call (lower is better) template static uint64_t run_throughput(T (*f)(T), const Dist &dist, uint32_t call_index) { cpp::array inputs; uint64_t base_seed = static_cast(call_index); uint64_t salt = static_cast(N); RandomGenerator rng(make_seed(base_seed, salt)); for (size_t i = 0; i < N; ++i) inputs[i] = dist(rng); uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs); return total_time / N; } // Returns cycles-per-call (lower is better) template static uint64_t run_throughput(T (*f)(T, T), const Dist1 &dist1, const Dist2 &dist2, uint32_t call_index) { cpp::array inputs1; cpp::array inputs2; uint64_t base_seed = static_cast(call_index); uint64_t salt = static_cast(N); RandomGenerator rng(make_seed(base_seed, salt)); for (size_t i = 0; i < N; ++i) { inputs1[i] = dist1(rng); inputs2[i] = dist2(rng); } uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs1, inputs2); return total_time / N; } }; } // namespace benchmarks } // namespace LIBC_NAMESPACE_DECL // Passing -1 indicates the benchmark should be run with as many threads as // allocated by the user in the benchmark's CMake. #define BENCHMARK(SuiteName, TestName, Func) \ LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \ Func, #SuiteName, #TestName, -1) #define BENCHMARK_N_THREADS(SuiteName, TestName, Func, NumThreads) \ LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \ Func, #SuiteName, #TestName, NumThreads) #define SINGLE_THREADED_BENCHMARK(SuiteName, TestName, Func) \ BENCHMARK_N_THREADS(SuiteName, TestName, Func, 1) #define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func) \ BENCHMARK_N_THREADS(SuiteName, TestName, Func, \ LIBC_NAMESPACE::gpu::get_lane_size()) #endif // LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H