diff options
author | bors[bot] <26634292+bors[bot]@users.noreply.github.com> | 2021-10-18 09:24:56 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-10-18 09:24:56 +0000 |
commit | 649e3e074bf8306bf0eb042f10483dbd61cd040b (patch) | |
tree | ae3b4402a9e8fdf23dfe27df0823219bd2064cae | |
parent | a1a450641004c45b78b76034161f7b2efb0eeb1f (diff) | |
parent | fd9d37c68ca363503ef5a515c7e409a3b15b43e1 (diff) | |
download | gcc-649e3e074bf8306bf0eb042f10483dbd61cd040b.zip gcc-649e3e074bf8306bf0eb042f10483dbd61cd040b.tar.gz gcc-649e3e074bf8306bf0eb042f10483dbd61cd040b.tar.bz2 |
Merge #747
747: Base v0 mangling grammar r=philberty a=CohenArthur
This PR adds base functions to deal with the v0 mangling grammar, [found here](https://rust-lang.github.io/rfcs/2603-rust-symbol-name-mangling-v0.html#syntax-of-mangled-names).
I have a few questions regarding this implementation:
1/ Is there any existing implementation for the base62 algorithm used here? This is directly adapted from [rustc's base_n module](https://github.com/rust-lang/rust/blob/6f53ddfa74ac3c10ceb63ad4a7a9c95e55853c87/compiler/rustc_data_structures/src/base_n.rs#L16) which I'm assuming is relatively standard and might already exist in the compiler. I haven't been able to find it however.
2/ gccrs cannot yet deal with Unicode identifiers, as pointed out by `@bjorn3` in #418. This means that a big chunk of the `v0_add_identifier` implementation is missing. Should it be added in this PR too?
3/ As mentioned on Zulip, it would be great to be able to create unit tests for this piece of code. It would be quite easy to generate a bunch of base62 strings and ensure that the algorithm here matches them.
Co-authored-by: CohenArthur <arthur.cohen@epita.fr>
-rw-r--r-- | gcc/rust/Make-lang.in | 1 | ||||
-rw-r--r-- | gcc/rust/backend/rust-mangle.cc | 65 | ||||
-rw-r--r-- | gcc/rust/util/rust-base62.cc | 48 | ||||
-rw-r--r-- | gcc/rust/util/rust-base62.h | 34 |
4 files changed, 148 insertions, 0 deletions
diff --git a/gcc/rust/Make-lang.in b/gcc/rust/Make-lang.in
index 0e181a6..57e8299 100644
--- a/gcc/rust/Make-lang.in
+++ b/gcc/rust/Make-lang.in
@@ -88,6 +88,7 @@ GRS_OBJS = \
     rust/rust-lint-marklive.o \
     rust/rust-hir-type-check-path.o \
     rust/rust-compile-intrinsic.o \
+    rust/rust-base62.o \
     $(END)
 # removed object files from here
 
diff --git a/gcc/rust/backend/rust-mangle.cc b/gcc/rust/backend/rust-mangle.cc
index 0e6643c..15ac3b1 100644
--- a/gcc/rust/backend/rust-mangle.cc
+++ b/gcc/rust/backend/rust-mangle.cc
@@ -1,5 +1,7 @@
 #include "rust-mangle.h"
 #include "fnv-hash.h"
+#include "rust-base62.h"
+#include <algorithm>
 
 // FIXME: Rename those to legacy_*
 static const std::string kMangledSymbolPrefix = "_ZN";
@@ -154,6 +156,63 @@ v0_simple_type_prefix (const TyTy::BaseType *ty)
   gcc_unreachable ();
 }
 
+// Add an underscore-terminated base62 integer to the mangling string.
+// This corresponds to the `<base-62-number>` grammar in the v0 mangling RFC:
+// - 0 is encoded as "_"
+// - any other value is encoded as itself minus one in base 62, followed by "_"
+static void
+v0_add_integer_62 (std::string &mangled, uint64_t x)
+{
+  if (x > 0)
+    mangled.append (base62_integer (x - 1));
+
+  mangled.append ("_");
+}
+
+// Add a tag-prefixed base62 integer to the mangling string when the
+// integer is greater than 0:
+// - 0 is encoded as "" (nothing)
+// - any other value is encoded as <tag> + v0_add_integer_62(itself), that is
+// <tag> + base62(itself - 1) + '_'
+static void
+v0_add_opt_integer_62 (std::string &mangled, std::string tag, uint64_t x)
+{
+  if (x > 0)
+    {
+      mangled.append (tag);
+      v0_add_integer_62 (mangled, x);
+    }
+}
+
+static void
+v0_add_disambiguator (std::string &mangled, uint64_t dis)
+{
+  v0_add_opt_integer_62 (mangled, "s", dis);
+}
+
+// Add an identifier to the mangled string. This corresponds to the
+// `<identifier>` grammar in the v0 mangling RFC.
+static void
+v0_add_identifier (std::string &mangled, const std::string &identifier)
+{
+  // FIXME: gccrs cannot handle unicode identifiers yet, so we never have to
+  // create mangling for unicode values for now. However, this is handled
+  // by the v0 mangling scheme. The grammar for unicode identifier is contained
+  // in <undisambiguated-identifier>, right under the <identifier> one. If the
+  // identifier contains unicode values, then an extra "u" needs to be added
+  // to the mangling string and `punycode` must be used to encode the
+  // characters.
+
+  mangled += std::to_string (identifier.size ());
+
+  // If the first character of the identifier is a digit or an underscore, we
+  // add an extra underscore to separate it from the decimal length above
+  if (identifier[0] == '_' || (identifier[0] >= '0' && identifier[0] <= '9'))
+    mangled.append ("_");
+
+  mangled.append (identifier);
+}
+
 static std::string
 v0_type_prefix (const TyTy::BaseType *ty)
 {
@@ -194,7 +253,13 @@ static std::string
 v0_mangle_item (const TyTy::BaseType *ty, const Resolver::CanonicalPath &path,
                 const std::string &crate_name)
 {
+  std::string mangled;
+
+  // FIXME: Add real algorithm once all pieces are implemented
   auto ty_prefix = v0_type_prefix (ty);
+  v0_add_identifier (mangled, crate_name);
+  v0_add_disambiguator (mangled, 62);
+
   gcc_unreachable ();
 }
 
diff --git a/gcc/rust/util/rust-base62.cc b/gcc/rust/util/rust-base62.cc
new file mode 100644
index 0000000..f1e3202
--- /dev/null
+++ b/gcc/rust/util/rust-base62.cc
@@ -0,0 +1,48 @@
+// Copyright (C) 2020 Free Software Foundation, Inc.
+
+// This file is part of GCC.
+
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+// for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+#include "rust-base62.h"
+
+#include <algorithm>
+
+namespace Rust {
+
+std::string
+base62_integer (uint64_t value)
+{
+  const static std::string base_64
+    = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ@$";
+  std::string buffer (128, '\0');
+  size_t idx = 0;
+  size_t base = 62;
+
+  do
+    {
+      buffer[idx] = base_64[(value % base)];
+      idx++;
+      value = value / base;
+    }
+  while (value != 0);
+
+  std::reverse (buffer.begin (), buffer.begin () + idx);
+  return buffer.substr (0, idx);
+}
+
+} // namespace Rust
+
+// FIXME: Add unit testing using the selftest framework
diff --git a/gcc/rust/util/rust-base62.h b/gcc/rust/util/rust-base62.h
new file mode 100644
index 0000000..7a6e3cf
--- /dev/null
+++ b/gcc/rust/util/rust-base62.h
@@ -0,0 +1,34 @@
+// Copyright (C) 2020 Free Software Foundation, Inc.
+
+// This file is part of GCC.
+
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+// for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef RUST_BASE62_H
+#define RUST_BASE62_H
+
+#include <string>
+
+namespace Rust {
+
+/**
+ * Get the Base62 representation of an integer
+ */
+std::string
+base62_integer (uint64_t value);
+
+} // namespace Rust
+
+#endif /* !RUST_BASE62_H */
|