diff options
author | Tobias Hieta <tobias@hieta.se> | 2023-05-17 10:56:49 +0200 |
---|---|---|
committer | Tobias Hieta <tobias@hieta.se> | 2023-05-23 08:29:52 +0200 |
commit | dd3c26a045c081620375a878159f536758baba6e (patch) | |
tree | 434af8abf8bcbc53fa834a2aa000c04edc6e8449 /clang/tools/include-mapping/cppreference_parser.py | |
parent | 9a515d8142da2cef27760466f286406e1ce1469c (diff) | |
download | llvm-dd3c26a045c081620375a878159f536758baba6e.zip llvm-dd3c26a045c081620375a878159f536758baba6e.tar.gz llvm-dd3c26a045c081620375a878159f536758baba6e.tar.bz2 |
[NFC][Py Reformat] Reformat python files in clang and clang-tools-extra
This is an ongoing series of commits that are reformatting our
Python code.
Reformatting is done with `black`.
If you end up having problems merging this commit because you
have made changes to a python file, the best way to handle that
is to run git checkout --ours <yourfile> and then reformat it
with black.
If you run into any problems, post to discourse about it and
we will try to help.
RFC Thread below:
https://discourse.llvm.org/t/rfc-document-and-standardize-python-code-style
Reviewed By: MatzeB
Differential Revision: https://reviews.llvm.org/D150761
Diffstat (limited to 'clang/tools/include-mapping/cppreference_parser.py')
-rw-r--r-- | clang/tools/include-mapping/cppreference_parser.py | 323 |
1 files changed, 168 insertions, 155 deletions
diff --git a/clang/tools/include-mapping/cppreference_parser.py b/clang/tools/include-mapping/cppreference_parser.py index 19bdde7..cefdbea 100644 --- a/clang/tools/include-mapping/cppreference_parser.py +++ b/clang/tools/include-mapping/cppreference_parser.py @@ -1,11 +1,11 @@ #!/usr/bin/env python -#===- cppreference_parser.py - ------------------------------*- python -*--===# +# ===- cppreference_parser.py - ------------------------------*- python -*--===# # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===------------------------------------------------------------------------===# +# ===------------------------------------------------------------------------===# from bs4 import BeautifulSoup, NavigableString @@ -18,176 +18,189 @@ import sys class Symbol: + def __init__(self, name, namespace, headers): + # unqualifed symbol name, e.g. "move" + self.name = name + # namespace of the symbol (with trailing "::"), e.g. "std::", "" (global scope) + # None for C symbols. + self.namespace = namespace + # a list of corresponding headers + self.headers = headers - def __init__(self, name, namespace, headers): - # unqualifed symbol name, e.g. "move" - self.name = name - # namespace of the symbol (with trailing "::"), e.g. "std::", "" (global scope) - # None for C symbols. - self.namespace = namespace - # a list of corresponding headers - self.headers = headers - - def __lt__(self, other): - if self.namespace != other.namespace: - return str(self.namespace) < str(other.namespace) - return self.name < other.name + def __lt__(self, other): + if self.namespace != other.namespace: + return str(self.namespace) < str(other.namespace) + return self.name < other.name def _HasClass(tag, *classes): - for c in tag.get('class', []): - if c in classes: - return True - return False + for c in tag.get("class", []): + if c in classes: + return True + return False def _ParseSymbolPage(symbol_page_html, symbol_name): - """Parse symbol page and retrieve the include header defined in this page. - The symbol page provides header for the symbol, specifically in - "Defined in header <header>" section. An example: - - <tr class="t-dsc-header"> - <td colspan="2"> <div>Defined in header <code><ratio></code> </div> - </td></tr> - - Returns a list of headers. - """ - headers = set() - all_headers = set() - - soup = BeautifulSoup(symbol_page_html, "html.parser") - # Rows in table are like: - # Defined in header <foo> .t-dsc-header - # Defined in header <bar> .t-dsc-header - # decl1 .t-dcl - # Defined in header <baz> .t-dsc-header - # decl2 .t-dcl - for table in soup.select('table.t-dcl-begin, table.t-dsc-begin'): - current_headers = [] - was_decl = False - for row in table.select('tr'): - if _HasClass(row, 't-dcl', 't-dsc'): - was_decl = True - # Symbols are in the first cell. - found_symbols = row.find('td').stripped_strings - if not symbol_name in found_symbols: - continue - headers.update(current_headers) - elif _HasClass(row, 't-dsc-header'): - # If we saw a decl since the last header, this is a new block of headers - # for a new block of decls. - if was_decl: - current_headers = [] + """Parse symbol page and retrieve the include header defined in this page. + The symbol page provides header for the symbol, specifically in + "Defined in header <header>" section. An example: + + <tr class="t-dsc-header"> + <td colspan="2"> <div>Defined in header <code><ratio></code> </div> + </td></tr> + + Returns a list of headers. + """ + headers = set() + all_headers = set() + + soup = BeautifulSoup(symbol_page_html, "html.parser") + # Rows in table are like: + # Defined in header <foo> .t-dsc-header + # Defined in header <bar> .t-dsc-header + # decl1 .t-dcl + # Defined in header <baz> .t-dsc-header + # decl2 .t-dcl + for table in soup.select("table.t-dcl-begin, table.t-dsc-begin"): + current_headers = [] was_decl = False - # There are also .t-dsc-header for "defined in namespace". - if not "Defined in header " in row.text: - continue - # The interesting header content (e.g. <cstdlib>) is wrapped in <code>. - for header_code in row.find_all("code"): - current_headers.append(header_code.text) - all_headers.add(header_code.text) - # If the symbol was never named, consider all named headers. - return headers or all_headers + for row in table.select("tr"): + if _HasClass(row, "t-dcl", "t-dsc"): + was_decl = True + # Symbols are in the first cell. + found_symbols = row.find("td").stripped_strings + if not symbol_name in found_symbols: + continue + headers.update(current_headers) + elif _HasClass(row, "t-dsc-header"): + # If we saw a decl since the last header, this is a new block of headers + # for a new block of decls. + if was_decl: + current_headers = [] + was_decl = False + # There are also .t-dsc-header for "defined in namespace". + if not "Defined in header " in row.text: + continue + # The interesting header content (e.g. <cstdlib>) is wrapped in <code>. + for header_code in row.find_all("code"): + current_headers.append(header_code.text) + all_headers.add(header_code.text) + # If the symbol was never named, consider all named headers. + return headers or all_headers def _ParseIndexPage(index_page_html): - """Parse index page. - The index page lists all std symbols and hrefs to their detailed pages - (which contain the defined header). An example: - - <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br> - <a href="acos.html" title="acos"><tt>acos()</tt></a> <br> - - Returns a list of tuple (symbol_name, relative_path_to_symbol_page, variant). - """ - symbols = [] - soup = BeautifulSoup(index_page_html, "html.parser") - for symbol_href in soup.select("a[title]"): - # Ignore annotated symbols like "acos<>() (std::complex)". - # These tend to be overloads, and we the primary is more useful. - # This accidentally accepts begin/end despite the (iterator) caption: the - # (since C++11) note is first. They are good symbols, so the bug is unfixed. - caption = symbol_href.next_sibling - variant = None - if isinstance(caption, NavigableString) and "(" in caption: - variant = caption.text.strip(" ()") - symbol_tt = symbol_href.find("tt") - if symbol_tt: - symbols.append((symbol_tt.text.rstrip("<>()"), # strip any trailing <>() - symbol_href["href"], variant)) - return symbols + """Parse index page. + The index page lists all std symbols and hrefs to their detailed pages + (which contain the defined header). An example: + + <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br> + <a href="acos.html" title="acos"><tt>acos()</tt></a> <br> + + Returns a list of tuple (symbol_name, relative_path_to_symbol_page, variant). + """ + symbols = [] + soup = BeautifulSoup(index_page_html, "html.parser") + for symbol_href in soup.select("a[title]"): + # Ignore annotated symbols like "acos<>() (std::complex)". + # These tend to be overloads, and we the primary is more useful. + # This accidentally accepts begin/end despite the (iterator) caption: the + # (since C++11) note is first. They are good symbols, so the bug is unfixed. + caption = symbol_href.next_sibling + variant = None + if isinstance(caption, NavigableString) and "(" in caption: + variant = caption.text.strip(" ()") + symbol_tt = symbol_href.find("tt") + if symbol_tt: + symbols.append( + ( + symbol_tt.text.rstrip("<>()"), # strip any trailing <>() + symbol_href["href"], + variant, + ) + ) + return symbols def _ReadSymbolPage(path, name): - with open(path) as f: - return _ParseSymbolPage(f.read(), name) + with open(path) as f: + return _ParseSymbolPage(f.read(), name) def _GetSymbols(pool, root_dir, index_page_name, namespace, variants_to_accept): - """Get all symbols listed in the index page. All symbols should be in the - given namespace. - - Returns a list of Symbols. - """ - - # Workflow steps: - # 1. Parse index page which lists all symbols to get symbol - # name (unqualified name) and its href link to the symbol page which - # contains the defined header. - # 2. Parse the symbol page to get the defined header. - index_page_path = os.path.join(root_dir, index_page_name) - with open(index_page_path, "r") as f: - # Read each symbol page in parallel. - results = [] # (symbol_name, promise of [header...]) - for symbol_name, symbol_page_path, variant in _ParseIndexPage(f.read()): - # Variant symbols (e.g. the std::locale version of isalpha) add ambiguity. - # FIXME: use these as a fallback rather than ignoring entirely. - variants_for_symbol = variants_to_accept.get( - (namespace or "") + symbol_name, ()) - if variant and variant not in variants_for_symbol: - continue - path = os.path.join(root_dir, symbol_page_path) - if os.path.isfile(path): - results.append((symbol_name, - pool.apply_async(_ReadSymbolPage, (path, symbol_name)))) - else: - sys.stderr.write("Discarding information for symbol: %s. Page %s does not exist.\n" - % (symbol_name, path)) - - # Build map from symbol name to a set of headers. - symbol_headers = collections.defaultdict(set) - for symbol_name, lazy_headers in results: - symbol_headers[symbol_name].update(lazy_headers.get()) - - symbols = [] - for name, headers in sorted(symbol_headers.items(), key=lambda t : t[0]): - symbols.append(Symbol(name, namespace, list(headers))) - return symbols + """Get all symbols listed in the index page. All symbols should be in the + given namespace. + + Returns a list of Symbols. + """ + + # Workflow steps: + # 1. Parse index page which lists all symbols to get symbol + # name (unqualified name) and its href link to the symbol page which + # contains the defined header. + # 2. Parse the symbol page to get the defined header. + index_page_path = os.path.join(root_dir, index_page_name) + with open(index_page_path, "r") as f: + # Read each symbol page in parallel. + results = [] # (symbol_name, promise of [header...]) + for symbol_name, symbol_page_path, variant in _ParseIndexPage(f.read()): + # Variant symbols (e.g. the std::locale version of isalpha) add ambiguity. + # FIXME: use these as a fallback rather than ignoring entirely. + variants_for_symbol = variants_to_accept.get( + (namespace or "") + symbol_name, () + ) + if variant and variant not in variants_for_symbol: + continue + path = os.path.join(root_dir, symbol_page_path) + if os.path.isfile(path): + results.append( + ( + symbol_name, + pool.apply_async(_ReadSymbolPage, (path, symbol_name)), + ) + ) + else: + sys.stderr.write( + "Discarding information for symbol: %s. Page %s does not exist.\n" + % (symbol_name, path) + ) + + # Build map from symbol name to a set of headers. + symbol_headers = collections.defaultdict(set) + for symbol_name, lazy_headers in results: + symbol_headers[symbol_name].update(lazy_headers.get()) + + symbols = [] + for name, headers in sorted(symbol_headers.items(), key=lambda t: t[0]): + symbols.append(Symbol(name, namespace, list(headers))) + return symbols def GetSymbols(parse_pages): - """Get all symbols by parsing the given pages. - - Args: - parse_pages: a list of tuples (page_root_dir, index_page_name, namespace) - """ - # By default we prefer the non-variant versions, as they're more common. But - # there are some symbols, whose variant is more common. This list describes - # those symbols. - variants_to_accept = { - # std::remove<> has variant algorithm. - "std::remove": ("algorithm"), - } - symbols = [] - # Run many workers to process individual symbol pages under the symbol index. - # Don't allow workers to capture Ctrl-C. - pool = multiprocessing.Pool( - initializer=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN)) - try: - for root_dir, page_name, namespace in parse_pages: - symbols.extend(_GetSymbols(pool, root_dir, page_name, namespace, - variants_to_accept)) - finally: - pool.terminate() - pool.join() - return sorted(symbols) + """Get all symbols by parsing the given pages. + + Args: + parse_pages: a list of tuples (page_root_dir, index_page_name, namespace) + """ + # By default we prefer the non-variant versions, as they're more common. But + # there are some symbols, whose variant is more common. This list describes + # those symbols. + variants_to_accept = { + # std::remove<> has variant algorithm. + "std::remove": ("algorithm"), + } + symbols = [] + # Run many workers to process individual symbol pages under the symbol index. + # Don't allow workers to capture Ctrl-C. + pool = multiprocessing.Pool( + initializer=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN) + ) + try: + for root_dir, page_name, namespace in parse_pages: + symbols.extend( + _GetSymbols(pool, root_dir, page_name, namespace, variants_to_accept) + ) + finally: + pool.terminate() + pool.join() + return sorted(symbols) |