aboutsummaryrefslogtreecommitdiff
path: root/libiberty/rust-demangle.c
diff options
context:
space:
mode:
author: Eduard-Mihai Burtescu <eddyb@lyken.rs> 2019-11-16 16:32:50 +0100
committer: Jeff Law <law@gcc.gnu.org> 2019-11-16 08:32:50 -0700
commit: 32fc3719e06899d43e2298ad6d0028efe5ec3024 (patch)
tree: 6e8ed402acee2f31567b1089bbf6877a76127df3 /libiberty/rust-demangle.c
parent: f73cb38f6530432ba15abf4bb6a58188479f1bc3 (diff)
download: gcc-32fc3719e06899d43e2298ad6d0028efe5ec3024.zip
gcc-32fc3719e06899d43e2298ad6d0028efe5ec3024.tar.gz
gcc-32fc3719e06899d43e2298ad6d0028efe5ec3024.tar.bz2
[PATCH] Refactor rust-demangle to be independent of C++ demangling.
* demangle.h (rust_demangle_callback): Add.
* cplus-dem.c (cplus_demangle): Use rust_demangle directly.
  (rust_demangle): Remove.
* rust-demangle.c (is_prefixed_hash): Rename to is_legacy_prefixed_hash.
  (parse_lower_hex_nibble): Rename to decode_lower_hex_nibble.
  (parse_legacy_escape): Rename to decode_legacy_escape.
  (rust_is_mangled): Remove.
  (struct rust_demangler): Add.
  (peek): Add.
  (next): Add.
  (struct rust_mangled_ident): Add.
  (parse_ident): Add.
  (rust_demangle_sym): Remove.
  (print_str): Add.
  (PRINT): Add.
  (print_ident): Add.
  (rust_demangle_callback): Add.
  (struct str_buf): Add.
  (str_buf_reserve): Add.
  (str_buf_append): Add.
  (str_buf_demangle_callback): Add.
  (rust_demangle): Add.
* rust-demangle.h: Remove.

From-SVN: r278358
Diffstat (limited to 'libiberty/rust-demangle.c')
-rw-r--r-- libiberty/rust-demangle.c 572
1 files changed, 391 insertions, 181 deletions
diff --git a/libiberty/rust-demangle.c b/libiberty/rust-demangle.c
index 6b62e6d..fa9d472 100644
--- a/libiberty/rust-demangle.c
+++ b/libiberty/rust-demangle.c
@@ -33,9 +33,11 @@ If not, see <http://www.gnu.org/licenses/>. */
#include "safe-ctype.h"
+#include <inttypes.h>
#include <sys/types.h>
#include <string.h>
#include <stdio.h>
+#include <stdlib.h>
#ifdef HAVE_STRING_H
#include <string.h>
@@ -47,207 +49,110 @@ extern void *memset(void *s, int c, size_t n);
#include <demangle.h>
#include "libiberty.h"
-#include "rust-demangle.h"
+struct rust_demangler
+{
+ const char *sym;
+ size_t sym_len;
-/* Mangled (legacy) Rust symbols look like this:
- _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a
-
- The original symbol is:
- <std::sys::fd::FileDesc as core::ops::Drop>::drop
-
- The last component of the path is a 64-bit hash in lowercase hex,
- prefixed with "h". Rust does not have a global namespace between
- crates, an illusion which Rust maintains by using the hash to
- distinguish things that would otherwise have the same symbol.
-
- Any path component not starting with a XID_Start character is
- prefixed with "_".
-
- The following escape sequences are used:
-
- "," => $C$
- "@" => $SP$
- "*" => $BP$
- "&" => $RF$
- "<" => $LT$
- ">" => $GT$
- "(" => $LP$
- ")" => $RP$
- "\u{XY}" => $uXY$
-
- A double ".." means "::" and a single "." means "-".
-
- The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$ */
-
-static const char *hash_prefix = "::h";
-static const size_t hash_prefix_len = 3;
-static const size_t hash_len = 16;
-
-static int is_prefixed_hash (const char *start);
-static int parse_lower_hex_nibble (char nibble);
-static char parse_legacy_escape (const char **in);
+ void *callback_opaque;
+ demangle_callbackref callback;
-/* INPUT: sym: symbol that has been through C++ (gnu v3) demangling
+ /* Position of the next character to read from the symbol. */
+ size_t next;
- This function looks for the following indicators:
+ /* Non-zero if any error occurred. */
+ int errored;
- 1. The hash must consist of "h" followed by 16 lowercase hex digits.
+ /* Non-zero if printing should be verbose (e.g. include hashes). */
+ int verbose;
- 2. As a sanity check, the hash must use between 5 and 15 of the 16
- possible hex digits. This is true of 99.9998% of hashes so once
- in your life you may see a false negative. The point is to
- notice path components that could be Rust hashes but are
- probably not, like "haaaaaaaaaaaaaaaa". In this case a false
- positive (non-Rust symbol has an important path component
- removed because it looks like a Rust hash) is worse than a false
- negative (the rare Rust symbol is not demangled) so this sets
- the balance in favor of false negatives.
+ /* Rust mangling version, with legacy mangling being -1. */
+ int version;
+};
- 3. There must be no characters other than a-zA-Z0-9 and _.:$ */
+/* Parsing functions. */
-int
-rust_is_mangled (const char *sym)
+static char
+peek (const struct rust_demangler *rdm)
{
- size_t len, len_without_hash;
- const char *end;
+ if (rdm->next < rdm->sym_len)
+ return rdm->sym[rdm->next];
+ return 0;
+}
- if (!sym)
- return 0;
+static char
+next (struct rust_demangler *rdm)
+{
+ char c = peek (rdm);
+ if (!c)
+ rdm->errored = 1;
+ else
+ rdm->next++;
+ return c;
+}
- len = strlen (sym);
- if (len <= hash_prefix_len + hash_len)
- /* Not long enough to contain "::h" + hash + something else */
- return 0;
+struct rust_mangled_ident
+{
+ /* ASCII part of the identifier. */
+ const char *ascii;
+ size_t ascii_len;
+};
- len_without_hash = len - (hash_prefix_len + hash_len);
- if (!is_prefixed_hash (sym + len_without_hash))
- return 0;
+static struct rust_mangled_ident
+parse_ident (struct rust_demangler *rdm)
+{
+ char c;
+ size_t start, len;
+ struct rust_mangled_ident ident;
- end = sym + len_without_hash;
+ ident.ascii = NULL;
+ ident.ascii_len = 0;
- while (sym < end)
+ c = next (rdm);
+ if (!ISDIGIT (c))
{
- if (*sym == '$' || *sym == '.' || *sym == '_' || *sym == ':'
- || ISALNUM (*sym))
- sym++;
- else
- return 0;
+ rdm->errored = 1;
+ return ident;
}
+ len = c - '0';
- return 1;
-}
-
-/* A hash is the prefix "::h" followed by 16 lowercase hex digits. The
- hex digits must contain at least 5 distinct digits. */
-
-static int
-is_prefixed_hash (const char *str)
-{
- const char *end;
- char seen[16];
- size_t i;
- int count, nibble;
-
- if (strncmp (str, hash_prefix, hash_prefix_len))
- return 0;
- str += hash_prefix_len;
+ if (c != '0')
+ while (ISDIGIT (peek (rdm)))
+ len = len * 10 + (next (rdm) - '0');
- memset (seen, 0, sizeof(seen));
- for (end = str + hash_len; str < end; str++)
+ start = rdm->next;
+ rdm->next += len;
+ /* Check for overflows. */
+ if ((start > rdm->next) || (rdm->next > rdm->sym_len))
{
- nibble = parse_lower_hex_nibble (*str);
- if (nibble < 0)
- return 0;
- seen[nibble] = 1;
+ rdm->errored = 1;
+ return ident;
}
- /* Count how many distinct digits seen */
- count = 0;
- for (i = 0; i < 16; i++)
- if (seen[i])
- count++;
+ ident.ascii = rdm->sym + start;
+ ident.ascii_len = len;
- return count >= 5;
-}
+ if (ident.ascii_len == 0)
+ ident.ascii = NULL;
-/*
- INPUT: sym: symbol for which rust_is_mangled(sym) returned 1.
+ return ident;
+}
- The input is demangled in-place because the mangled name is always
- longer than the demangled one. */
+/* Printing functions. */
-void
-rust_demangle_sym (char *sym)
+static void
+print_str (struct rust_demangler *rdm, const char *data, size_t len)
{
- const char *in;
- char *out;
- const char *end;
- char unescaped;
-
- if (!sym)
- return;
-
- in = sym;
- out = sym;
- end = sym + strlen (sym) - (hash_prefix_len + hash_len);
-
- while (in < end)
- {
- if (*in == '$')
- {
- unescaped = parse_legacy_escape (&in);
- if (unescaped)
- *out++ = unescaped;
- else
- /* unexpected escape sequence, skip the rest of this segment. */
- while (in < end && *in != ':')
- *out++ = *in++;
- }
- else if (*in == '_')
- {
- /* If this is the start of a path component and the next
- character is an escape sequence, ignore the underscore. The
- mangler inserts an underscore to make sure the path
- component begins with a XID_Start character. */
- if ((in == sym || in[-1] == ':') && in[1] == '$')
- in++;
- else
- *out++ = *in++;
- }
- else if (*in == '.')
- {
- if (in[1] == '.')
- {
- /* ".." becomes "::" */
- *out++ = ':';
- *out++ = ':';
- in += 2;
- }
- else
- {
- /* "." becomes "-" */
- *out++ = '-';
- in++;
- }
- }
- else if (*in == ':' || ISALNUM (*in))
- *out++ = *in++;
- else
- {
- /* unexpected character in symbol, not rust_is_mangled. */
- *out++ = '?'; /* This is pretty lame, but it's hard to do better. */
- *out = '\0';
- return;
- }
- }
-
- *out = '\0';
+ if (!rdm->errored)
+ rdm->callback (data, len, rdm->callback_opaque);
}
+#define PRINT(s) print_str (rdm, s, strlen (s))
+
/* Return a 0x0-0xf value if the char is 0-9a-f, and -1 otherwise. */
static int
-parse_lower_hex_nibble (char nibble)
+decode_lower_hex_nibble (char nibble)
{
if ('0' <= nibble && nibble <= '9')
return nibble - '0';
@@ -258,17 +163,17 @@ parse_lower_hex_nibble (char nibble)
/* Return the unescaped character for a "$...$" escape, or 0 if invalid. */
static char
-parse_legacy_escape (const char **in)
+decode_legacy_escape (const char *e, size_t len, size_t *out_len)
{
char c = 0;
- const char *e;
size_t escape_len = 0;
int lo_nibble = -1, hi_nibble = -1;
- if ((*in)[0] != '$')
+ if (len < 3 || e[0] != '$')
return 0;
- e = *in + 1;
+ e++;
+ len--;
if (e[0] == 'C')
{
@@ -276,7 +181,7 @@ parse_legacy_escape (const char **in)
c = ',';
}
- else
+ else if (len > 2)
{
escape_len = 2;
@@ -294,14 +199,14 @@ parse_legacy_escape (const char **in)
c = '(';
else if (e[0] == 'R' && e[1] == 'P')
c = ')';
- else if (e[0] == 'u')
+ else if (e[0] == 'u' && len > 3)
{
escape_len = 3;
- hi_nibble = parse_lower_hex_nibble (e[1]);
+ hi_nibble = decode_lower_hex_nibble (e[1]);
if (hi_nibble < 0)
return 0;
- lo_nibble = parse_lower_hex_nibble (e[2]);
+ lo_nibble = decode_lower_hex_nibble (e[2]);
if (lo_nibble < 0)
return 0;
@@ -314,9 +219,314 @@ parse_legacy_escape (const char **in)
}
}
- if (!c || e[escape_len] != '$')
+ if (!c || len <= escape_len || e[escape_len] != '$')
return 0;
- *in += 2 + escape_len;
+ *out_len = 2 + escape_len;
return c;
}
+
+static void
+print_ident (struct rust_demangler *rdm, struct rust_mangled_ident ident)
+{
+ char unescaped;
+ size_t len;
+
+ if (rdm->errored)
+ return;
+
+ if (rdm->version == -1)
+ {
+ /* Ignore leading underscores preceding escape sequences.
+ The mangler inserts an underscore to make sure the
+ identifier begins with a XID_Start character. */
+ if (ident.ascii_len >= 2 && ident.ascii[0] == '_'
+ && ident.ascii[1] == '$')
+ {
+ ident.ascii++;
+ ident.ascii_len--;
+ }
+
+ while (ident.ascii_len > 0)
+ {
+ /* Handle legacy escape sequences ("$...$", ".." or "."). */
+ if (ident.ascii[0] == '$')
+ {
+ unescaped
+ = decode_legacy_escape (ident.ascii, ident.ascii_len, &len);
+ if (unescaped)
+ print_str (rdm, &unescaped, 1);
+ else
+ {
+ /* Unexpected escape sequence, print the rest verbatim. */
+ print_str (rdm, ident.ascii, ident.ascii_len);
+ return;
+ }
+ }
+ else if (ident.ascii[0] == '.')
+ {
+ if (ident.ascii_len >= 2 && ident.ascii[1] == '.')
+ {
+ /* ".." becomes "::" */
+ PRINT ("::");
+ len = 2;
+ }
+ else
+ {
+ /* "." becomes "-" */
+ PRINT ("-");
+ len = 1;
+ }
+ }
+ else
+ {
+ /* Print everything before the next escape sequence, at once. */
+ for (len = 0; len < ident.ascii_len; len++)
+ if (ident.ascii[len] == '$' || ident.ascii[len] == '.')
+ break;
+
+ print_str (rdm, ident.ascii, len);
+ }
+
+ ident.ascii += len;
+ ident.ascii_len -= len;
+ }
+
+ return;
+ }
+}
+
+/* A legacy hash is the prefix "h" followed by 16 lowercase hex digits.
+ The hex digits must contain at least 5 distinct digits. */
+static int
+is_legacy_prefixed_hash (struct rust_mangled_ident ident)
+{
+ uint16_t seen;
+ int nibble;
+ size_t i, count;
+
+ if (ident.ascii_len != 17 || ident.ascii[0] != 'h')
+ return 0;
+
+ seen = 0;
+ for (i = 0; i < 16; i++)
+ {
+ nibble = decode_lower_hex_nibble (ident.ascii[1 + i]);
+ if (nibble < 0)
+ return 0;
+ seen |= (uint16_t)1 << nibble;
+ }
+
+ /* Count how many distinct digits were seen. */
+ count = 0;
+ while (seen)
+ {
+ if (seen & 1)
+ count++;
+ seen >>= 1;
+ }
+
+ return count >= 5;
+}
+
+int
+rust_demangle_callback (const char *mangled, int options,
+ demangle_callbackref callback, void *opaque)
+{
+ const char *p;
+ struct rust_demangler rdm;
+ struct rust_mangled_ident ident;
+
+ rdm.sym = mangled;
+ rdm.sym_len = 0;
+
+ rdm.callback_opaque = opaque;
+ rdm.callback = callback;
+
+ rdm.next = 0;
+ rdm.errored = 0;
+ rdm.verbose = (options & DMGL_VERBOSE) != 0;
+ rdm.version = 0;
+
+ /* Rust symbols always start with _ZN (legacy). */
+ if (rdm.sym[0] == '_' && rdm.sym[1] == 'Z' && rdm.sym[2] == 'N')
+ {
+ rdm.sym += 3;
+ rdm.version = -1;
+ }
+ else
+ return 0;
+
+ /* Legacy Rust symbols use only [_0-9a-zA-Z.:$] characters. */
+ for (p = rdm.sym; *p; p++)
+ {
+ rdm.sym_len++;
+
+ if (*p == '_' || ISALNUM (*p))
+ continue;
+
+ if (rdm.version == -1 && (*p == '$' || *p == '.' || *p == ':'))
+ continue;
+
+ return 0;
+ }
+
+ /* Legacy Rust symbols need to be handled separately. */
+ if (rdm.version == -1)
+ {
+ /* Legacy Rust symbols always end with E. */
+ if (!(rdm.sym_len > 0 && rdm.sym[rdm.sym_len - 1] == 'E'))
+ return 0;
+ rdm.sym_len--;
+
+ /* Legacy Rust symbols also always end with a path segment
+ that encodes a 16 hex digit hash, i.e. '17h[a-f0-9]{16}'.
+ This early check, before any parse_ident calls, should
+ quickly filter out most C++ symbols unrelated to Rust. */
+ if (!(rdm.sym_len > 19
+ && !memcmp (&rdm.sym[rdm.sym_len - 19], "17h", 3)))
+ return 0;
+
+ do
+ {
+ ident = parse_ident (&rdm);
+ if (rdm.errored || !ident.ascii)
+ return 0;
+ }
+ while (rdm.next < rdm.sym_len);
+
+ /* The last path segment should be the hash. */
+ if (!is_legacy_prefixed_hash (ident))
+ return 0;
+
+ /* Reset the state for a second pass, to print the symbol. */
+ rdm.next = 0;
+ if (!rdm.verbose && rdm.sym_len > 19)
+ {
+ /* Hide the last segment, containing the hash, if not verbose. */
+ rdm.sym_len -= 19;
+ }
+
+ do
+ {
+ if (rdm.next > 0)
+ print_str (&rdm, "::", 2);
+
+ ident = parse_ident (&rdm);
+ print_ident (&rdm, ident);
+ }
+ while (rdm.next < rdm.sym_len);
+ }
+ else
+ return 0;
+
+ return !rdm.errored;
+}
+
+/* Growable string buffers. */
+struct str_buf
+{
+ char *ptr;
+ size_t len;
+ size_t cap;
+ int errored;
+};
+
+static void
+str_buf_reserve (struct str_buf *buf, size_t extra)
+{
+ size_t available, min_new_cap, new_cap;
+ char *new_ptr;
+
+ /* Allocation failed before. */
+ if (buf->errored)
+ return;
+
+ available = buf->cap - buf->len;
+
+ if (extra <= available)
+ return;
+
+ min_new_cap = buf->cap + (extra - available);
+
+ /* Check for overflows. */
+ if (min_new_cap < buf->cap)
+ {
+ buf->errored = 1;
+ return;
+ }
+
+ new_cap = buf->cap;
+
+ if (new_cap == 0)
+ new_cap = 4;
+
+ /* Double capacity until sufficiently large. */
+ while (new_cap < min_new_cap)
+ {
+ new_cap *= 2;
+
+ /* Check for overflows. */
+ if (new_cap < buf->cap)
+ {
+ buf->errored = 1;
+ return;
+ }
+ }
+
+ new_ptr = (char *)realloc (buf->ptr, new_cap);
+ if (new_ptr == NULL)
+ {
+ free (buf->ptr);
+ buf->ptr = NULL;
+ buf->len = 0;
+ buf->cap = 0;
+ buf->errored = 1;
+ }
+ else
+ {
+ buf->ptr = new_ptr;
+ buf->cap = new_cap;
+ }
+}
+
+static void
+str_buf_append (struct str_buf *buf, const char *data, size_t len)
+{
+ str_buf_reserve (buf, len);
+ if (buf->errored)
+ return;
+
+ memcpy (buf->ptr + buf->len, data, len);
+ buf->len += len;
+}
+
+static void
+str_buf_demangle_callback (const char *data, size_t len, void *opaque)
+{
+ str_buf_append ((struct str_buf *)opaque, data, len);
+}
+
+char *
+rust_demangle (const char *mangled, int options)
+{
+ struct str_buf out;
+ int success;
+
+ out.ptr = NULL;
+ out.len = 0;
+ out.cap = 0;
+ out.errored = 0;
+
+ success = rust_demangle_callback (mangled, options,
+ str_buf_demangle_callback, &out);
+
+ if (!success)
+ {
+ free (out.ptr);
+ return NULL;
+ }
+
+ str_buf_append (&out, "\0", 1);
+ return out.ptr;
+}