From e6e6184bed490403811771fa527eb95b4ae53c7c Mon Sep 17 00:00:00 2001 From: Florian Weimer Date: Thu, 22 Sep 2022 12:10:41 +0200 Subject: scripts: Enhance glibcpp to do basic macro processing Reviewed-by: Siddhesh Poyarekar --- scripts/glibcpp.py | 317 +++++++++++++++++++++++++++++++++++++++++++++++++ support/Makefile | 10 +- support/tst-glibcpp.py | 217 +++++++++++++++++++++++++++++++++ 3 files changed, 542 insertions(+), 2 deletions(-) create mode 100644 support/tst-glibcpp.py diff --git a/scripts/glibcpp.py b/scripts/glibcpp.py index b44c6a4..455459a 100644 --- a/scripts/glibcpp.py +++ b/scripts/glibcpp.py @@ -33,7 +33,9 @@ Accepts non-ASCII characters only within comments and strings. """ import collections +import operator import re +import sys # Caution: The order of the outermost alternation matters. # STRING must be before BAD_STRING, CHARCONST before BAD_CHARCONST, @@ -210,3 +212,318 @@ def tokenize_c(file_contents, reporter): yield tok pos = mo.end() + +class MacroDefinition(collections.namedtuple('MacroDefinition', + 'name_token args body error')): + """A preprocessor macro definition. + + name_token is the Token_ for the name. + + args is None for a macro that is not function-like. Otherwise, it + is a tuple that contains the macro argument name tokens. + + body is a tuple that contains the tokens that constitue the body + of the macro definition (excluding whitespace). + + error is None if no error was detected, or otherwise a problem + description associated with this macro definition. + + """ + + @property + def function(self): + """Return true if the macro is function-like.""" + return self.args is not None + + @property + def name(self): + """Return the name of the macro being defined.""" + return self.name_token.text + + @property + def line(self): + """Return the line number of the macro defintion.""" + return self.name_token.line + + @property + def args_lowered(self): + """Return the macro argument list as a list of strings""" + if self.function: + return [token.text for token in self.args] + else: + return None + + @property + def body_lowered(self): + """Return the macro body as a list of strings.""" + return [token.text for token in self.body] + +def macro_definitions(tokens): + """A generator for C macro definitions among tokens. + + The generator yields MacroDefinition objects. + + tokens must be iterable, yielding Token_ objects. + + """ + + macro_name = None + macro_start = False # Set to false after macro name and one otken. + macro_args = None # Set to a list during the macro argument sequence. + in_macro_args = False # True while processing macro identifier-list. + error = None + body = [] + + for token in tokens: + if token.context == 'define' and macro_name is None \ + and token.kind == 'IDENT': + # Starting up macro processing. + if macro_start: + # First identifier is the macro name. + macro_name = token + else: + # Next token is the name. + macro_start = True + continue + + if macro_name is None: + # Drop tokens not in macro definitions. + continue + + if token.context != 'define': + # End of the macro definition. + if in_macro_args and error is None: + error = 'macro definition ends in macro argument list' + yield MacroDefinition(macro_name, macro_args, tuple(body), error) + # No longer in a macro definition. + macro_name = None + macro_start = False + macro_args = None + in_macro_args = False + error = None + body.clear() + continue + + if macro_start: + # First token after the macro name. 
+ macro_start = False + if token.kind == 'PUNCTUATOR' and token.text == '(': + macro_args = [] + in_macro_args = True + continue + + if in_macro_args: + if token.kind == 'IDENT' \ + or (token.kind == 'PUNCTUATOR' and token.text == '...'): + # Macro argument or ... placeholder. + macro_args.append(token) + if token.kind == 'PUNCTUATOR': + if token.text == ')': + macro_args = tuple(macro_args) + in_macro_args = False + elif token.text == ',': + pass # Skip. Not a full syntax check. + elif error is None: + error = 'invalid punctuator in macro argument list: ' \ + + repr(token.text) + elif error is None: + error = 'invalid {} token in macro argument list'.format( + token.kind) + continue + + if token.kind not in ('WHITESPACE', 'BLOCK_COMMENT'): + body.append(token) + + # Emit the macro in case the last line does not end with a newline. + if macro_name is not None: + if in_macro_args and error is None: + error = 'macro definition ends in macro argument list' + yield MacroDefinition(macro_name, macro_args, tuple(body), error) + +# Used to split UL etc. suffixes from numbers such as 123UL. +RE_SPLIT_INTEGER_SUFFIX = re.compile(r'([^ullULL]+)([ullULL]*)') + +BINARY_OPERATORS = { + '+': operator.add, + '<<': operator.lshift, +} + +# Use the general-purpose dict type if it is order-preserving. +if (sys.version_info[0], sys.version_info[1]) <= (3, 6): + OrderedDict = collections.OrderedDict +else: + OrderedDict = dict + +def macro_eval(macro_defs, reporter): + """Compute macro values + + macro_defs is the output from macro_definitions. reporter is an + object that accepts reporter.error(line_number, message) and + reporter.note(line_number, message) calls to report errors + and error context invocations. + + The returned dict contains the values of macros which are not + function-like, pairing their names with their computed values. + + The current implementation is incomplete. It is deliberately not + entirely faithful to C, even in the implemented parts. It checks + that macro replacements follow certain syntactic rules even if + they are never evaluated. + + """ + + # Unevaluated macro definitions by name. + definitions = OrderedDict() + for md in macro_defs: + if md.name in definitions: + reporter.error(md.line, 'macro {} redefined'.format(md.name)) + reporter.note(definitions[md.name].line, + 'location of previous definition') + else: + definitions[md.name] = md + + # String to value mappings for fully evaluated macros. + evaluated = OrderedDict() + + # String to macro definitions during evaluation. Nice error + # reporting relies on determinstic iteration order. + stack = OrderedDict() + + def eval_token(current, token): + """Evaluate one macro token. + + Integers and strings are returned as such (the latter still + quoted). Identifiers are expanded. + + None indicates an empty expansion or an error. 
+ + """ + + if token.kind == 'PP_NUMBER': + value = None + m = RE_SPLIT_INTEGER_SUFFIX.match(token.text) + if m: + try: + value = int(m.group(1), 0) + except ValueError: + pass + if value is None: + reporter.error(token.line, + 'invalid number {!r} in definition of {}'.format( + token.text, current.name)) + return value + + if token.kind == 'STRING': + return token.text + + if token.kind == 'CHARCONST' and len(token.text) == 3: + return ord(token.text[1]) + + if token.kind == 'IDENT': + name = token.text + result = eval1(current, name) + if name not in evaluated: + evaluated[name] = result + return result + + reporter.error(token.line, + 'unrecognized {!r} in definition of {}'.format( + token.text, current.name)) + return None + + + def eval1(current, name): + """Evaluate one name. + + The name is looked up and the macro definition evaluated + recursively if necessary. The current argument is the macro + definition being evaluated. + + None as a return value indicates an error. + + """ + + # Fast path if the value has already been evaluated. + if name in evaluated: + return evaluated[name] + + try: + md = definitions[name] + except KeyError: + reporter.error(current.line, + 'reference to undefined identifier {} in definition of {}' + .format(name, current.name)) + return None + + if md.name in stack: + # Recursive macro definition. + md = stack[name] + reporter.error(md.line, + 'macro definition {} refers to itself'.format(md.name)) + for md1 in reversed(list(stack.values())): + if md1 is md: + break + reporter.note(md1.line, + 'evaluated from {}'.format(md1.name)) + return None + + stack[md.name] = md + if md.function: + reporter.error(current.line, + 'attempt to evaluate function-like macro {}'.format(name)) + reporter.note(md.line, 'definition of {}'.format(md.name)) + return None + + try: + body = md.body + if len(body) == 0: + # Empty expansion. + return None + + # Remove surrounding (). + if body[0].text == '(' and body[-1].text == ')': + body = body[1:-1] + had_parens = True + else: + had_parens = False + + if len(body) == 1: + return eval_token(md, body[0]) + + # Minimal expression evaluator for binary operators. + op = body[1].text + if len(body) == 3 and op in BINARY_OPERATORS: + if not had_parens: + reporter.error(body[1].line, + 'missing parentheses around {} expression'.format(op)) + reporter.note(md.line, + 'in definition of macro {}'.format(md.name)) + + left = eval_token(md, body[0]) + right = eval_token(md, body[2]) + + if type(left) != type(1): + reporter.error(left.line, + 'left operand of {} is not an integer'.format(op)) + reporter.note(md.line, + 'in definition of macro {}'.format(md.name)) + if type(right) != type(1): + reporter.error(left.line, + 'right operand of {} is not an integer'.format(op)) + reporter.note(md.line, + 'in definition of macro {}'.format(md.name)) + return BINARY_OPERATORS[op](left, right) + + reporter.error(md.line, + 'uninterpretable macro token sequence: {}'.format( + ' '.join(md.body_lowered))) + return None + finally: + del stack[md.name] + + # Start of main body of macro_eval. 
+ for md in definitions.values(): + name = md.name + if name not in evaluated and not md.function: + evaluated[name] = eval1(md, name) + return evaluated diff --git a/support/Makefile b/support/Makefile index 9b50eac..551d029 100644 --- a/support/Makefile +++ b/support/Makefile @@ -274,12 +274,12 @@ $(objpfx)test-run-command : $(libsupport) $(common-objpfx)elf/static-stubs.o tests = \ README-testing \ tst-support-namespace \ + tst-support-open-dev-null-range \ + tst-support-process_state \ tst-support_blob_repeat \ tst-support_capture_subprocess \ tst-support_descriptors \ tst-support_format_dns_packet \ - tst-support-open-dev-null-range \ - tst-support-process_state \ tst-support_quote_blob \ tst-support_quote_blob_wide \ tst-support_quote_string \ @@ -304,6 +304,12 @@ $(objpfx)tst-support_record_failure-2.out: tst-support_record_failure-2.sh \ $(evaluate-test) endif +tests-special += $(objpfx)tst-glibcpp.out + +$(objpfx)tst-glibcpp.out: tst-glibcpp.py $(..)scripts/glibcpp.py + PYTHONPATH=$(..)scripts $(PYTHON) tst-glibcpp.py > $@ 2>&1; \ + $(evaluate-test) + $(objpfx)tst-support_format_dns_packet: $(common-objpfx)resolv/libresolv.so tst-support_capture_subprocess-ARGS = -- $(host-test-program-cmd) diff --git a/support/tst-glibcpp.py b/support/tst-glibcpp.py new file mode 100644 index 0000000..a2db191 --- /dev/null +++ b/support/tst-glibcpp.py @@ -0,0 +1,217 @@ +#! /usr/bin/python3 +# Tests for scripts/glibcpp.py +# Copyright (C) 2022 Free Software Foundation, Inc. +# This file is part of the GNU C Library. +# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# . + +import inspect +import sys + +import glibcpp + +# Error counter. +errors = 0 + +class TokenizerErrors: + """Used as the error reporter during tokenization.""" + + def __init__(self): + self.errors = [] + + def error(self, token, message): + self.errors.append((token, message)) + +def check_macro_definitions(source, expected): + reporter = TokenizerErrors() + tokens = glibcpp.tokenize_c(source, reporter) + + actual = [] + for md in glibcpp.macro_definitions(tokens): + if md.function: + md_name = '{}({})'.format(md.name, ','.join(md.args_lowered)) + else: + md_name = md.name + actual.append((md_name, md.body_lowered)) + + if actual != expected or reporter.errors: + global errors + errors += 1 + # Obtain python source line information. + frame = inspect.stack(2)[1] + print('{}:{}: error: macro definition mismatch, actual definitions:' + .format(frame[1], frame[2])) + for md in actual: + print('note: {} {!r}'.format(md[0], md[1])) + + if reporter.errors: + for err in reporter.errors: + print('note: tokenizer error: {}: {}'.format( + err[0].line, err[1])) + +def check_macro_eval(source, expected, expected_errors=''): + reporter = TokenizerErrors() + tokens = list(glibcpp.tokenize_c(source, reporter)) + + if reporter.errors: + # Obtain python source line information. 
+ frame = inspect.stack(2)[1] + for err in reporter.errors: + print('{}:{}: tokenizer error: {}: {}'.format( + frame[1], frame[2], err[0].line, err[1])) + return + + class EvalReporter: + """Used as the error reporter during evaluation.""" + + def __init__(self): + self.lines = [] + + def error(self, line, message): + self.lines.append('{}: error: {}\n'.format(line, message)) + + def note(self, line, message): + self.lines.append('{}: note: {}\n'.format(line, message)) + + reporter = EvalReporter() + actual = glibcpp.macro_eval(glibcpp.macro_definitions(tokens), reporter) + actual_errors = ''.join(reporter.lines) + if actual != expected or actual_errors != expected_errors: + global errors + errors += 1 + # Obtain python source line information. + frame = inspect.stack(2)[1] + print('{}:{}: error: macro evaluation mismatch, actual results:' + .format(frame[1], frame[2])) + for k, v in actual.items(): + print(' {}: {!r}'.format(k, v)) + for msg in reporter.lines: + sys.stdout.write(' | ' + msg) + +# Individual test cases follow. + +check_macro_definitions('', []) +check_macro_definitions('int main()\n{\n{\n', []) +check_macro_definitions(""" +#define A 1 +#define B 2 /* ignored */ +#define C 3 // also ignored +#define D \ + 4 +#define STRING "string" +#define FUNCLIKE(a, b) (a + b) +#define FUNCLIKE2(a, b) (a + \ + b) +""", [('A', ['1']), + ('B', ['2']), + ('C', ['3']), + ('D', ['4']), + ('STRING', ['"string"']), + ('FUNCLIKE(a,b)', list('(a+b)')), + ('FUNCLIKE2(a,b)', list('(a+b)')), + ]) +check_macro_definitions('#define MACRO', [('MACRO', [])]) +check_macro_definitions('#define MACRO\n', [('MACRO', [])]) +check_macro_definitions('#define MACRO()', [('MACRO()', [])]) +check_macro_definitions('#define MACRO()\n', [('MACRO()', [])]) + +check_macro_eval('#define A 1', {'A': 1}) +check_macro_eval('#define A (1)', {'A': 1}) +check_macro_eval('#define A (1 + 1)', {'A': 2}) +check_macro_eval('#define A (1U << 31)', {'A': 1 << 31}) +check_macro_eval('''\ +#define A (B + 1) +#define B 10 +#define F(x) ignored +#define C "not ignored" +''', { + 'A': 11, + 'B': 10, + 'C': '"not ignored"', +}) + +# Checking for evaluation errors. +check_macro_eval('''\ +#define A 1 +#define A 2 +''', { + 'A': 1, +}, '''\ +2: error: macro A redefined +1: note: location of previous definition +''') + +check_macro_eval('''\ +#define A A +#define B 1 +''', { + 'A': None, + 'B': 1, +}, '''\ +1: error: macro definition A refers to itself +''') + +check_macro_eval('''\ +#define A B +#define B A +''', { + 'A': None, + 'B': None, +}, '''\ +1: error: macro definition A refers to itself +2: note: evaluated from B +''') + +check_macro_eval('''\ +#define A B +#define B C +#define C A +''', { + 'A': None, + 'B': None, + 'C': None, +}, '''\ +1: error: macro definition A refers to itself +3: note: evaluated from C +2: note: evaluated from B +''') + +check_macro_eval('''\ +#define A 1 + +''', { + 'A': None, +}, '''\ +1: error: uninterpretable macro token sequence: 1 + +''') + +check_macro_eval('''\ +#define A 3*5 +''', { + 'A': None, +}, '''\ +1: error: uninterpretable macro token sequence: 3 * 5 +''') + +check_macro_eval('''\ +#define A 3 + 5 +''', { + 'A': 8, +}, '''\ +1: error: missing parentheses around + expression +1: note: in definition of macro A +''') + +if errors: + sys.exit(1) -- cgit v1.1
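
A minimal driver sketch for the interfaces added above, assuming scripts/ is
on PYTHONPATH (as in the Makefile rule).  The header path and the two
stderr-printing reporter classes are illustrative placeholders, not part of
the commit; the entry points and reporter protocols follow tst-glibcpp.py:
tokenize_c reports errors with a token argument, macro_eval with a line
number.

#! /usr/bin/python3
# Sketch only: tokenize a C header, collect its macro definitions, and
# print the values of the object-like macros that macro_eval can compute.
import sys

import glibcpp

class TokenizerReporter:
    """Receives tokenize_c errors as error(token, message)."""
    def error(self, token, message):
        print('{}: tokenizer error: {}'.format(token.line, message),
              file=sys.stderr)

class EvalReporter:
    """Receives macro_eval diagnostics as error/note(line, message)."""
    def error(self, line, message):
        print('{}: error: {}'.format(line, message), file=sys.stderr)
    def note(self, line, message):
        print('{}: note: {}'.format(line, message), file=sys.stderr)

# Placeholder input path; any C header works here.
with open('example.h') as f:
    source = f.read()

tokens = glibcpp.tokenize_c(source, TokenizerReporter())
definitions = glibcpp.macro_definitions(tokens)
values = glibcpp.macro_eval(definitions, EvalReporter())

# macro_eval returns only macros that are not function-like; a value of
# None marks an empty expansion or a definition it could not evaluate
# (already reported through EvalReporter).
for name, value in values.items():
    print('{} = {!r}'.format(name, value))

The split between the two reporter protocols mirrors the test script: the
tokenizer hands back the offending token, while macro evaluation reports by
line number, so callers can format diagnostics however their own script
requires.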