diff options
author | Jon Roelofs <jonathan_roelofs@apple.com> | 2024-06-13 18:06:20 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-06-13 18:06:20 -0700 |
commit | 4f8c961924c2e15eed54e5207111ceedc1da6568 (patch) | |
tree | be64f6c730cfb6288263e67ba7d4d15860749ae9 /compiler-rt | |
parent | 785dc76c6667d0ea81c8b877dbff9c1e843918d6 (diff) | |
download | llvm-4f8c961924c2e15eed54e5207111ceedc1da6568.zip llvm-4f8c961924c2e15eed54e5207111ceedc1da6568.tar.gz llvm-4f8c961924c2e15eed54e5207111ceedc1da6568.tar.bz2 |
[compiler-rt][AArch64][FMV] Use the hw.optional.arm.caps fast path (#95275)
macOS 15.0 and iOS 18.0 added a new sysctl to fetch a bitvector of all
the hw.optional.arm.FEAT_*'s in one go. Using this has a perf advantage
over doing multiple round-trips to the kernel and back, but since it's
not present in older OSes, we still need the slow fallback.
Diffstat (limited to 'compiler-rt')
-rw-r--r-- | compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc | 77 |
1 files changed, 75 insertions, 2 deletions
diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc index 6fef109..6f4b9ab 100644 --- a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc @@ -2,6 +2,11 @@ #if TARGET_OS_OSX || TARGET_OS_IPHONE #include <sys/sysctl.h> +#if __has_include(<arm/cpu_capabilities_public.h>) +#include <arm/cpu_capabilities_public.h> +#define HAS_CPU_CAPABILITIES_PUBLIC_H 1 +#endif + static bool isKnownAndSupported(const char *name) { int32_t val = 0; size_t size = sizeof(val); @@ -10,6 +15,19 @@ static bool isKnownAndSupported(const char *name) { return val; } +static uint64_t deriveImplicitFeatures(uint64_t features) { + // FEAT_SSBS2 implies FEAT_SSBS + if ((1ULL << FEAT_SSBS2) & features) + features |= (1ULL << FEAT_SSBS); + + // FEAT_FP is always enabled + features |= (1ULL << FEAT_FP); + + features |= (1ULL << FEAT_INIT); + + return features; +} + void __init_cpu_features_resolver(void) { // On Darwin platforms, this may be called concurrently by multiple threads // because the resolvers that use it are called lazily at runtime (unlike on @@ -21,6 +39,62 @@ void __init_cpu_features_resolver(void) { uint64_t features = 0; +#ifdef HAS_CPU_CAPABILITIES_PUBLIC_H + uint8_t feats_bitvec[(CAP_BIT_NB + 7) / 8] = {0}; + size_t len = sizeof(feats_bitvec); + // When hw.optional.arm.caps is available (macOS 15.0+, iOS 18.0+), use the + // fast path to get all the feature bits, otherwise fall back to the slow + // ~20-something sysctls path. 
+ if (!sysctlbyname("hw.optional.arm.caps", &feats_bitvec, &len, 0, 0)) { + +#define CHECK_BIT(FROM, TO) \ + do { \ + if (feats_bitvec[FROM / 8] & (1u << ((FROM) & 7))) { \ + features |= (1ULL << TO); \ + } \ + } while (0) + + CHECK_BIT(CAP_BIT_FEAT_FlagM, FEAT_FLAGM); + CHECK_BIT(CAP_BIT_FEAT_FlagM2, FEAT_FLAGM2); + CHECK_BIT(CAP_BIT_FEAT_FHM, FEAT_FP16FML); + CHECK_BIT(CAP_BIT_FEAT_DotProd, FEAT_DOTPROD); + CHECK_BIT(CAP_BIT_FEAT_SHA3, FEAT_SHA3); + CHECK_BIT(CAP_BIT_FEAT_RDM, FEAT_RDM); + CHECK_BIT(CAP_BIT_FEAT_LSE, FEAT_LSE); + CHECK_BIT(CAP_BIT_FEAT_SHA256, FEAT_SHA2); + CHECK_BIT(CAP_BIT_FEAT_SHA1, FEAT_SHA1); + CHECK_BIT(CAP_BIT_FEAT_AES, FEAT_AES); + CHECK_BIT(CAP_BIT_FEAT_PMULL, FEAT_PMULL); + CHECK_BIT(CAP_BIT_FEAT_SPECRES, FEAT_PREDRES); + CHECK_BIT(CAP_BIT_FEAT_SB, FEAT_SB); + CHECK_BIT(CAP_BIT_FEAT_FRINTTS, FEAT_FRINTTS); + CHECK_BIT(CAP_BIT_FEAT_LRCPC, FEAT_RCPC); + CHECK_BIT(CAP_BIT_FEAT_LRCPC2, FEAT_RCPC2); + CHECK_BIT(CAP_BIT_FEAT_FCMA, FEAT_FCMA); + CHECK_BIT(CAP_BIT_FEAT_JSCVT, FEAT_JSCVT); + CHECK_BIT(CAP_BIT_FEAT_DPB, FEAT_DPB); + CHECK_BIT(CAP_BIT_FEAT_DPB2, FEAT_DPB2); + CHECK_BIT(CAP_BIT_FEAT_BF16, FEAT_BF16); + CHECK_BIT(CAP_BIT_FEAT_I8MM, FEAT_I8MM); + CHECK_BIT(CAP_BIT_FEAT_DIT, FEAT_DIT); + CHECK_BIT(CAP_BIT_FEAT_FP16, FEAT_FP16); + CHECK_BIT(CAP_BIT_FEAT_SSBS, FEAT_SSBS2); + CHECK_BIT(CAP_BIT_FEAT_BTI, FEAT_BTI); + CHECK_BIT(CAP_BIT_AdvSIMD, FEAT_SIMD); + CHECK_BIT(CAP_BIT_CRC32, FEAT_CRC); + CHECK_BIT(CAP_BIT_FEAT_SME, FEAT_SME); + CHECK_BIT(CAP_BIT_FEAT_SME2, FEAT_SME2); + CHECK_BIT(CAP_BIT_FEAT_SME_F64F64, FEAT_SME_F64); + CHECK_BIT(CAP_BIT_FEAT_SME_I16I64, FEAT_SME_I64); + + features = deriveImplicitFeatures(features); + + __atomic_store(&__aarch64_cpu_features.features, &features, + __ATOMIC_RELAXED); + return; + } +#endif + // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics static const struct { const char *sysctl_name; @@ -32,7 +106,6 @@ void 
__init_cpu_features_resolver(void) { {"hw.optional.arm.FEAT_DotProd", FEAT_DOTPROD}, {"hw.optional.arm.FEAT_RDM", FEAT_RDM}, {"hw.optional.arm.FEAT_LSE", FEAT_LSE}, - {"hw.optional.floatingpoint", FEAT_FP}, {"hw.optional.AdvSIMD", FEAT_SIMD}, {"hw.optional.armv8_crc32", FEAT_CRC}, {"hw.optional.arm.FEAT_SHA1", FEAT_SHA1}, @@ -62,7 +135,7 @@ void __init_cpu_features_resolver(void) { if (isKnownAndSupported(feature_checks[I].sysctl_name)) features |= (1ULL << feature_checks[I].feature); - features |= (1ULL << FEAT_INIT); + features = deriveImplicitFeatures(features); __atomic_store(&__aarch64_cpu_features.features, &features, __ATOMIC_RELAXED); |