From 11ef2a536c6c9fec0b048da74b36a0231ef2199a Mon Sep 17 00:00:00 2001
From: Charles Brunet <charles.brunet@optelgroup.com>
Date: Fri, 25 Aug 2023 10:08:24 -0400
Subject: parser: preserve whitespaces and comments

---
 mesonbuild/mparser.py | 67 +++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 60 insertions(+), 7 deletions(-)

(limited to 'mesonbuild')

diff --git a/mesonbuild/mparser.py b/mesonbuild/mparser.py
index 75a12aa..a161842 100644
--- a/mesonbuild/mparser.py
+++ b/mesonbuild/mparser.py
@@ -116,7 +116,7 @@ class Lexer:
             self.keywords.update({'testcase', 'endtestcase'})
         self.token_specification = [
             # Need to be sorted longest to shortest.
-            ('ignore', re.compile(r'[ \t]')),
+            ('whitespace', re.compile(r'[ \t]+')),
             ('multiline_fstring', re.compile(r"f'''(.|\n)*?'''", re.M)),
             ('fstring', re.compile(r"f'([^'\\]|(\\.))*'")),
             ('id', re.compile('[_a-zA-Z][_0-9a-zA-Z]*')),
@@ -178,9 +178,7 @@ class Lexer:
                     span_end = loc
                     bytespan = (span_start, span_end)
                     value = mo.group()
-                    if tid in {'ignore', 'comment'}:
-                        break
-                    elif tid == 'lparen':
+                    if tid == 'lparen':
                         par_count += 1
                     elif tid == 'rparen':
                         par_count -= 1
@@ -210,12 +208,12 @@ class Lexer:
                     elif tid == 'eol_cont':
                         lineno += 1
                         line_start = loc
-                        break
+                        tid = 'whitespace'
                     elif tid == 'eol':
                         lineno += 1
                         line_start = loc
                         if par_count > 0 or bracket_count > 0 or curl_count > 0:
-                            break
+                            tid = 'whitespace'
                     elif tid == 'id':
                         if value in self.keywords:
                             tid = value
@@ -235,6 +233,7 @@ class BaseNode:
     filename: str = field(hash=False)
     end_lineno: int = field(hash=False)
     end_colno: int = field(hash=False)
+    whitespaces: T.Optional[WhitespaceNode] = field(hash=False)
 
     def __init__(self, lineno: int, colno: int, filename: str,
                  end_lineno: T.Optional[int] = None, end_colno: T.Optional[int] = None) -> None:
@@ -257,6 +256,26 @@ class BaseNode:
             if callable(func):
                 func(self)
 
+    def append_whitespaces(self, token: Token) -> None:  # record a whitespace/comment token on this node
+        if self.whitespaces is None:  # first token seen: create the holder node from it
+            self.whitespaces = WhitespaceNode(token)
+        else:  # later tokens are concatenated onto the existing holder
+            self.whitespaces.append(token)
+
+
+@dataclass(unsafe_hash=True)
+class WhitespaceNode(BaseNode):  # node that accumulates the text of the tokens appended to it
+
+    value: str  # concatenation of the values of every appended token, in order
+
+    def __init__(self, token: Token[str]):  # token: first token of the run
+        super().__init__(token.lineno, token.colno, token.filename)  # position comes from the first token
+        self.value = ''  # start empty; append() below adds the first token's text
+        self.append(token)
+
+    def append(self, token: Token[str]) -> None:  # extend the stored text with one more token
+        self.value += token.value
+
 @dataclass(unsafe_hash=True)
 class ElementaryNode(T.Generic[TV_TokenTypes], BaseNode):
 
@@ -456,6 +475,7 @@ class UMinusNode(UnaryOperatorNode):
 @dataclass(unsafe_hash=True)
 class CodeBlockNode(BaseNode):
 
+    pre_whitespaces: T.Optional[WhitespaceNode] = field(hash=False)
     lines: T.List[BaseNode] = field(hash=False)
 
     def __init__(self, token: Token[TV_TokenTypes]):
@@ -463,6 +483,14 @@ class CodeBlockNode(BaseNode):
         self.pre_whitespaces = None
         self.lines = []
 
+    def append_whitespaces(self, token: Token) -> None:  # route a whitespace token to the right holder
+        if self.lines:  # block already has lines: delegate to the most recent one
+            self.lines[-1].append_whitespaces(token)
+        elif self.pre_whitespaces is None:  # no lines yet: start the block's leading whitespace
+            self.pre_whitespaces = WhitespaceNode(token)
+        else:  # no lines yet, leading whitespace already started: extend it
+            self.pre_whitespaces.append(token)
+
 @dataclass(unsafe_hash=True)
 class IndexNode(BaseNode):
 
@@ -669,12 +697,16 @@ class Parser:
         self.stream = self.lexer.lex(filename)
         self.current: Token = Token('eof', '', 0, 0, 0, (0, 0), None)
         self.previous = self.current
+        self.current_ws: T.List[Token] = []
 
         self.getsym()
         self.in_ternary = False
 
     def create_node(self, node_type: T.Type[BaseNodeT], *args: T.Any, **kwargs: T.Any) -> BaseNodeT:
         node = node_type(*args, **kwargs)
+        for ws_token in self.current_ws:  # hand every pending whitespace token to the new node
+            node.append_whitespaces(ws_token)
+        self.current_ws = []  # pending tokens are consumed; start a fresh list
         return node
 
     def getsym(self) -> None:
@@ -682,6 +714,12 @@ class Parser:
         try:
             self.current = next(self.stream)
 
+            while self.current.tid in {'eol', 'comment', 'whitespace'}:
+                self.current_ws.append(self.current)
+                if self.current.tid == 'eol':
+                    break
+                self.current = next(self.stream)
+
         except StopIteration:
             self.current = Token('eof', '', self.current.line_start, self.current.lineno, self.current.colno + self.current.bytespan[1] - self.current.bytespan[0], (0, 0), None)
 
@@ -782,11 +820,17 @@ class Parser:
                 operator = self.create_node(SymbolNode, self.previous)
                 return self.create_node(ComparisonNode, operator_type, left, operator, self.e5())
         if self.accept('not'):
+            ws = self.current_ws.copy()
             not_token = self.previous
             if self.accept('in'):
                 in_token = self.previous
+                self.current_ws = self.current_ws[len(ws):]  # remove whitespaces between not and in
+                temp_node = EmptyNode(in_token.lineno, in_token.colno, in_token.filename)
+                for w in ws:
+                    temp_node.append_whitespaces(w)
+
                 not_token.bytespan = (not_token.bytespan[0], in_token.bytespan[1])
-                not_token.value += in_token.value
+                not_token.value += temp_node.whitespaces.value + in_token.value
                 operator = self.create_node(SymbolNode, not_token)
                 return self.create_node(ComparisonNode, 'notin', left, operator, self.e5())
         return left
@@ -1054,6 +1098,10 @@ class Parser:
 
         try:
             while cond:
+                for ws_token in self.current_ws:
+                    block.append_whitespaces(ws_token)
+                self.current_ws = []
+
                 curline = self.line()
 
                 if not isinstance(curline, EmptyNode):
@@ -1065,4 +1113,9 @@ class Parser:
             e.ast = block
             raise
 
+        # Remaining whitespaces will not be caught since there are no more nodes
+        for ws_token in self.current_ws:
+            block.append_whitespaces(ws_token)
+        self.current_ws = []
+
         return block
-- 
cgit v1.1