author    Jussi Pakkanen <jpakkane@gmail.com>  2014-03-12 19:40:39 +0200
committer Jussi Pakkanen <jpakkane@gmail.com>  2014-03-12 19:40:39 +0200
commit    fc42ae04507d6723d0331a9f9b71d3328ec56ce4 (patch)
tree      aff0151269b6c23edc64cf3fb77b10f5856fc936
parent    c7865cd98f0a420845cb50cf77e1f0c8f266475f (diff)
Some experiments with a self-written parser.
-rwxr-xr-x  parsertest.py  106
1 file changed, 106 insertions, 0 deletions
diff --git a/parsertest.py b/parsertest.py
new file mode 100755
index 0000000..704c188
--- /dev/null
+++ b/parsertest.py
@@ -0,0 +1,106 @@
+#!/usr/bin/python3
+
+# Copyright 2014 Jussi Pakkanen
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+import sys
+
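+# Raised when no token pattern matches at the current source position.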
+class ParseException(Exception):
+ def __init__(self, lineno, colno):
+ super().__init__()
+ self.lineno = lineno
+ self.colno = colno
+
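+# A single lexed token: its type id and the line/column where it starts.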
+class Token:
+ def __init__(self, id, lineno, colno):
+ self.id = id
+ self.lineno = lineno
+ self.colno = colno
+
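+# Hand-written regex-based lexer that turns source text into a stream of Tokens.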
+class Lexer:
+ def __init__(self):
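+        # Reserved words; the generic 'id' pattern below also matches these.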
+ self.keywords = {'true', 'false', 'if', 'else', 'elif',
+ 'endif', 'and', 'or', 'not'}
+ self.token_specification = [
+ # Need to be sorted longest to shortest.
+ ('ignore', re.compile(r'[ \t]')),
+ ('id', re.compile('[_a-zA-Z][_0-9a-zA-Z]*')),
+ ('number', re.compile(r'\d+')),
+ ('eol_cont', re.compile(r'\\\n')),
+ ('eol', re.compile(r'\n')),
+ ('multiline_string', re.compile(r"'''(.|\n)*?'''", re.M)),
+ ('comment', re.compile(r'\#.*')),
+ ('lparen', re.compile(r'\(')),
+ ('rparen', re.compile(r'\)')),
+ ('lbracket', re.compile(r'\[')),
+            ('rbracket', re.compile(r'\]')),
+ ('string', re.compile("'[^']*?'")),
+ ('comma', re.compile(r',')),
+ ('dot', re.compile(r'\.')),
+            ('colon', re.compile(r':')),
+            ('equal', re.compile(r'==')),
+            ('assign', re.compile(r'=')),
+            ('nequals', re.compile(r'!=')),
+ ]
+
+ def lex(self, code):
+ lineno = 1
+ line_start = 0
+        loc = 0
+ par_count = 0
+ bracket_count = 0
+ col = 0
+        while loc < len(code):
+ matched = False
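+            # Try each token pattern at the current offset; the first match wins.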
+ for (tid, reg) in self.token_specification:
+ mo = reg.match(code, loc)
+ if mo:
+ curline = lineno
+ col = mo.start()-line_start
+ matched = True
+ loc = mo.end()
+ match_text = mo.group()
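+                    # Update nesting and line bookkeeping depending on the token type.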
+ if tid == 'ignore':
+ break
+ elif tid == 'lparen':
+ par_count += 1
+ elif tid == 'rparen':
+ par_count -= 1
+ elif tid == 'lbracket':
+ bracket_count += 1
+ elif tid == 'rbracket':
+ bracket_count -= 1
+ elif tid == 'multiline_string':
+ lines = match_text.split('\n')
+ if len(lines) > 1:
+ lineno += len(lines) - 1
+ line_start = mo.end() - len(lines[-1])
+ elif tid == 'eol' or tid == 'eol_cont':
+ lineno += 1
+ line_start = loc
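+                        # Newlines inside parentheses or brackets do not end a
+                        # statement, so no token is emitted for them.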
+ if par_count > 0 or bracket_count > 0:
+ break
+                    yield Token(tid, curline, col)
+                    # Restart pattern matching from the top of the list for the next token.
+                    break
+            if not matched:
+                raise ParseException(lineno, loc - line_start)
+
+if __name__ == '__main__':
+    with open(sys.argv[1]) as f:
+        code = f.read()
+ lex = Lexer()
+ try:
+ for i in lex.lex(code):
+ print('Token:', i.id, 'Line:', i.lineno, 'Column:', i.colno)
+ except ParseException as e:
+        print('Error line', e.lineno, 'column', e.colno)
\ No newline at end of file