diff options
Diffstat (limited to 'clang/tools/include-mapping/cppreference_parser.py')
-rw-r--r-- | clang/tools/include-mapping/cppreference_parser.py | 184 |
1 files changed, 184 insertions, 0 deletions
diff --git a/clang/tools/include-mapping/cppreference_parser.py b/clang/tools/include-mapping/cppreference_parser.py new file mode 100644 index 0000000..e56c8a5 --- /dev/null +++ b/clang/tools/include-mapping/cppreference_parser.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python +#===- cppreference_parser.py - ------------------------------*- python -*--===# +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +#===------------------------------------------------------------------------===# + +from bs4 import BeautifulSoup, NavigableString + +import collections +import multiprocessing +import os +import re +import signal +import sys + + +class Symbol: + + def __init__(self, name, namespace, headers): + # unqualifed symbol name, e.g. "move" + self.name = name + # namespace of the symbol (with trailing "::"), e.g. "std::", "" (global scope) + # None for C symbols. + self.namespace = namespace + # a list of corresponding headers + self.headers = headers + + +def _HasClass(tag, *classes): + for c in tag.get('class', []): + if c in classes: + return True + return False + + +def _ParseSymbolPage(symbol_page_html, symbol_name): + """Parse symbol page and retrieve the include header defined in this page. + The symbol page provides header for the symbol, specifically in + "Defined in header <header>" section. An example: + + <tr class="t-dsc-header"> + <td colspan="2"> <div>Defined in header <code><ratio></code> </div> + </td></tr> + + Returns a list of headers. + """ + headers = set() + all_headers = set() + + soup = BeautifulSoup(symbol_page_html, "html.parser") + # Rows in table are like: + # Defined in header <foo> .t-dsc-header + # Defined in header <bar> .t-dsc-header + # decl1 .t-dcl + # Defined in header <baz> .t-dsc-header + # decl2 .t-dcl + for table in soup.select('table.t-dcl-begin, table.t-dsc-begin'): + current_headers = [] + was_decl = False + for row in table.select('tr'): + if _HasClass(row, 't-dcl', 't-dsc'): + was_decl = True + # Symbols are in the first cell. + found_symbols = row.find('td').stripped_strings + if not symbol_name in found_symbols: + continue + headers.update(current_headers) + elif _HasClass(row, 't-dsc-header'): + # If we saw a decl since the last header, this is a new block of headers + # for a new block of decls. + if was_decl: + current_headers = [] + was_decl = False + # There are also .t-dsc-header for "defined in namespace". + if not "Defined in header " in row.text: + continue + # The interesting header content (e.g. <cstdlib>) is wrapped in <code>. + for header_code in row.find_all("code"): + current_headers.append(header_code.text) + all_headers.add(header_code.text) + # If the symbol was never named, consider all named headers. + return headers or all_headers + + +def _ParseIndexPage(index_page_html): + """Parse index page. + The index page lists all std symbols and hrefs to their detailed pages + (which contain the defined header). An example: + + <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br> + <a href="acos.html" title="acos"><tt>acos()</tt></a> <br> + + Returns a list of tuple (symbol_name, relative_path_to_symbol_page, variant). + """ + symbols = [] + soup = BeautifulSoup(index_page_html, "html.parser") + for symbol_href in soup.select("a[title]"): + # Ignore annotated symbols like "acos<>() (std::complex)". + # These tend to be overloads, and we the primary is more useful. + # This accidentally accepts begin/end despite the (iterator) caption: the + # (since C++11) note is first. They are good symbols, so the bug is unfixed. + caption = symbol_href.next_sibling + variant = None + if isinstance(caption, NavigableString) and "(" in caption: + variant = caption.text.strip(" ()") + symbol_tt = symbol_href.find("tt") + if symbol_tt: + symbols.append((symbol_tt.text.rstrip("<>()"), # strip any trailing <>() + symbol_href["href"], variant)) + return symbols + + +def _ReadSymbolPage(path, name): + with open(path) as f: + return _ParseSymbolPage(f.read(), name) + + +def _GetSymbols(pool, root_dir, index_page_name, namespace, variants_to_accept): + """Get all symbols listed in the index page. All symbols should be in the + given namespace. + + Returns a list of Symbols. + """ + + # Workflow steps: + # 1. Parse index page which lists all symbols to get symbol + # name (unqualified name) and its href link to the symbol page which + # contains the defined header. + # 2. Parse the symbol page to get the defined header. + index_page_path = os.path.join(root_dir, index_page_name) + with open(index_page_path, "r") as f: + # Read each symbol page in parallel. + results = [] # (symbol_name, promise of [header...]) + for symbol_name, symbol_page_path, variant in _ParseIndexPage(f.read()): + # Variant symbols (e.g. the std::locale version of isalpha) add ambiguity. + # FIXME: use these as a fallback rather than ignoring entirely. + variants_for_symbol = variants_to_accept.get( + (namespace or "") + symbol_name, ()) + if variant and variant not in variants_for_symbol: + continue + path = os.path.join(root_dir, symbol_page_path) + results.append((symbol_name, + pool.apply_async(_ReadSymbolPage, (path, symbol_name)))) + + # Build map from symbol name to a set of headers. + symbol_headers = collections.defaultdict(set) + for symbol_name, lazy_headers in results: + symbol_headers[symbol_name].update(lazy_headers.get()) + + symbols = [] + for name, headers in sorted(symbol_headers.items(), key=lambda t : t[0]): + symbols.append(Symbol(name, namespace, list(headers))) + return symbols + + +def GetSymbols(parse_pages): + """Get all symbols by parsing the given pages. + + Args: + parse_pages: a list of tuples (page_root_dir, index_page_name, namespace) + """ + # By default we prefer the non-variant versions, as they're more common. But + # there are some symbols, whose variant is more common. This list describes + # those symbols. + variants_to_accept = { + # std::remove<> has variant algorithm. + "std::remove": ("algorithm"), + } + symbols = [] + # Run many workers to process individual symbol pages under the symbol index. + # Don't allow workers to capture Ctrl-C. + pool = multiprocessing.Pool( + initializer=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN)) + try: + for root_dir, page_name, namespace in parse_pages: + symbols.extend(_GetSymbols(pool, root_dir, page_name, namespace, + variants_to_accept)) + finally: + pool.terminate() + pool.join() + return symbols |