From 11ef2a536c6c9fec0b048da74b36a0231ef2199a Mon Sep 17 00:00:00 2001 From: Charles Brunet Date: Fri, 25 Aug 2023 10:08:24 -0400 Subject: parser: preserve whitespaces and comments --- mesonbuild/mparser.py | 67 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 60 insertions(+), 7 deletions(-) (limited to 'mesonbuild') diff --git a/mesonbuild/mparser.py b/mesonbuild/mparser.py index 75a12aa..a161842 100644 --- a/mesonbuild/mparser.py +++ b/mesonbuild/mparser.py @@ -116,7 +116,7 @@ class Lexer: self.keywords.update({'testcase', 'endtestcase'}) self.token_specification = [ # Need to be sorted longest to shortest. - ('ignore', re.compile(r'[ \t]')), + ('whitespace', re.compile(r'[ \t]+')), ('multiline_fstring', re.compile(r"f'''(.|\n)*?'''", re.M)), ('fstring', re.compile(r"f'([^'\\]|(\\.))*'")), ('id', re.compile('[_a-zA-Z][_0-9a-zA-Z]*')), @@ -178,9 +178,7 @@ class Lexer: span_end = loc bytespan = (span_start, span_end) value = mo.group() - if tid in {'ignore', 'comment'}: - break - elif tid == 'lparen': + if tid == 'lparen': par_count += 1 elif tid == 'rparen': par_count -= 1 @@ -210,12 +208,12 @@ class Lexer: elif tid == 'eol_cont': lineno += 1 line_start = loc - break + tid = 'whitespace' elif tid == 'eol': lineno += 1 line_start = loc if par_count > 0 or bracket_count > 0 or curl_count > 0: - break + tid = 'whitespace' elif tid == 'id': if value in self.keywords: tid = value @@ -235,6 +233,7 @@ class BaseNode: filename: str = field(hash=False) end_lineno: int = field(hash=False) end_colno: int = field(hash=False) + whitespaces: T.Optional[WhitespaceNode] = field(hash=False) def __init__(self, lineno: int, colno: int, filename: str, end_lineno: T.Optional[int] = None, end_colno: T.Optional[int] = None) -> None: @@ -257,6 +256,26 @@ class BaseNode: if callable(func): func(self) + def append_whitespaces(self, token: Token) -> None: + if self.whitespaces is None: + self.whitespaces = WhitespaceNode(token) + else: + self.whitespaces.append(token) + 
+ +@dataclass(unsafe_hash=True) +class WhitespaceNode(BaseNode): + + value: str + + def __init__(self, token: Token[str]): + super().__init__(token.lineno, token.colno, token.filename) + self.value = '' + self.append(token) + + def append(self, token: Token[str]) -> None: + self.value += token.value + @dataclass(unsafe_hash=True) class ElementaryNode(T.Generic[TV_TokenTypes], BaseNode): @@ -456,6 +475,7 @@ class UMinusNode(UnaryOperatorNode): @dataclass(unsafe_hash=True) class CodeBlockNode(BaseNode): + pre_whitespaces: T.Optional[WhitespaceNode] = field(hash=False) lines: T.List[BaseNode] = field(hash=False) def __init__(self, token: Token[TV_TokenTypes]): @@ -463,6 +483,14 @@ class CodeBlockNode(BaseNode): self.pre_whitespaces = None self.lines = [] + def append_whitespaces(self, token: Token) -> None: + if self.lines: + self.lines[-1].append_whitespaces(token) + elif self.pre_whitespaces is None: + self.pre_whitespaces = WhitespaceNode(token) + else: + self.pre_whitespaces.append(token) + @dataclass(unsafe_hash=True) class IndexNode(BaseNode): @@ -669,12 +697,16 @@ class Parser: self.stream = self.lexer.lex(filename) self.current: Token = Token('eof', '', 0, 0, 0, (0, 0), None) self.previous = self.current + self.current_ws: T.List[Token] = [] self.getsym() self.in_ternary = False def create_node(self, node_type: T.Type[BaseNodeT], *args: T.Any, **kwargs: T.Any) -> BaseNodeT: node = node_type(*args, **kwargs) + for ws_token in self.current_ws: + node.append_whitespaces(ws_token) + self.current_ws = [] return node def getsym(self) -> None: @@ -682,6 +714,12 @@ class Parser: try: self.current = next(self.stream) + while self.current.tid in {'eol', 'comment', 'whitespace'}: + self.current_ws.append(self.current) + if self.current.tid == 'eol': + break + self.current = next(self.stream) + except StopIteration: self.current = Token('eof', '', self.current.line_start, self.current.lineno, self.current.colno + self.current.bytespan[1] - self.current.bytespan[0], (0, 
0), None) @@ -782,11 +820,17 @@ class Parser: operator = self.create_node(SymbolNode, self.previous) return self.create_node(ComparisonNode, operator_type, left, operator, self.e5()) if self.accept('not'): + ws = self.current_ws.copy() not_token = self.previous if self.accept('in'): in_token = self.previous + self.current_ws = self.current_ws[len(ws):] # remove whitespaces between not and in + temp_node = EmptyNode(in_token.lineno, in_token.colno, in_token.filename) + for w in ws: + temp_node.append_whitespaces(w) + not_token.bytespan = (not_token.bytespan[0], in_token.bytespan[1]) - not_token.value += in_token.value + not_token.value += temp_node.whitespaces.value + in_token.value operator = self.create_node(SymbolNode, not_token) return self.create_node(ComparisonNode, 'notin', left, operator, self.e5()) return left @@ -1054,6 +1098,10 @@ class Parser: try: while cond: + for ws_token in self.current_ws: + block.append_whitespaces(ws_token) + self.current_ws = [] + curline = self.line() if not isinstance(curline, EmptyNode): @@ -1065,4 +1113,9 @@ class Parser: e.ast = block raise + # Remaining whitespaces will not be caught since there are no more nodes + for ws_token in self.current_ws: + block.append_whitespaces(ws_token) + self.current_ws = [] + return block -- cgit v1.1