#!/usr/bin/python3

# Copyright 2014 Jussi Pakkanen

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
import sys


class ParseException(Exception):
    """Raised by Lexer.lex() when no token pattern matches the input.

    Attributes:
        lineno: 1-based line number of the offending position.
        colno: 0-based column of the offending position within that line.
    """

    def __init__(self, lineno, colno):
        super().__init__()
        self.lineno = lineno
        self.colno = colno


class Token:
    """A single lexed token.

    Attributes:
        id: Name of the token kind (first element of the matching entry in
            Lexer.token_specification).
        lineno: 1-based line on which the token starts.
        colno: 0-based column at which the token starts.
    """

    def __init__(self, id, lineno, colno):
        self.id = id
        self.lineno = lineno
        self.colno = colno


class Lexer:
    """Regex-driven lexer that turns source text into a stream of Tokens."""

    def __init__(self):
        # Not consulted by lex(): keyword words are currently emitted as
        # plain 'id' tokens.
        self.keywords = {'true', 'false', 'if', 'else', 'elif', 'endif',
                         'and', 'or', 'not'}
        # Patterns are tried in order and the first match wins, so entries
        # sharing a prefix must be sorted longest to shortest.
        # NOTE(review): 'semicolon' matches ':' and 'assign'/'equal' look
        # swapped ('==' vs '='). The ids are kept unchanged because
        # consumers may depend on them -- confirm before renaming.
        self.token_specification = [
            ('ignore', re.compile(r'[ \t]')),
            ('id', re.compile('[_a-zA-Z][_0-9a-zA-Z]*')),
            ('number', re.compile(r'\d+')),
            ('eol_cont', re.compile(r'\\\n')),
            ('eol', re.compile(r'\n')),
            ('multiline_string', re.compile(r"'''(.|\n)*?'''", re.M)),
            ('comment', re.compile(r'\#.*')),
            ('lparen', re.compile(r'\(')),
            ('rparen', re.compile(r'\)')),
            ('lbracket', re.compile(r'\[')),
            ('rbracket', re.compile(r'\]')),
            ('string', re.compile("'[^']*?'")),
            ('comma', re.compile(r',')),
            ('dot', re.compile(r'\.')),
            ('semicolon', re.compile(r':')),
            ('assign', re.compile(r'==')),
            ('equal', re.compile(r'=')),
            ('nequals', re.compile(r'\!=')),
        ]

    def lex(self, code):
        """Yield a Token for each lexeme in *code*.

        Spaces and tabs are skipped. Line ends ('eol' and 'eol_cont') that
        occur inside open parentheses or brackets are consumed without
        producing a token.

        Args:
            code: The complete source text to tokenize.

        Yields:
            Token objects carrying the token id, the 1-based line and the
            0-based column where the token starts.

        Raises:
            ParseException: If no pattern matches at the current position.
                It carries the line and column of the offending character.
        """
        lineno = 1
        line_start = 0  # Offset in code of the first character of the line.
        loc = 0
        par_count = 0
        bracket_count = 0
        while loc < len(code):
            for tid, reg in self.token_specification:
                mo = reg.match(code, loc)
                if not mo:
                    continue
                # Record the position before the line bookkeeping below
                # moves on to the next line.
                curline = lineno
                col = mo.start() - line_start
                loc = mo.end()
                if tid == 'ignore':
                    break
                elif tid == 'lparen':
                    par_count += 1
                elif tid == 'rparen':
                    par_count -= 1
                elif tid == 'lbracket':
                    bracket_count += 1
                elif tid == 'rbracket':
                    bracket_count -= 1
                elif tid == 'multiline_string':
                    lines = mo.group().split('\n')
                    if len(lines) > 1:
                        lineno += len(lines) - 1
                        line_start = mo.end() - len(lines[-1])
                elif tid == 'eol' or tid == 'eol_cont':
                    lineno += 1
                    line_start = loc
                    if par_count > 0 or bracket_count > 0:
                        break
                yield Token(tid, curline, col)
                # Restart from the top of the specification so that
                # higher-priority patterns are tried first for the next token.
                break
            else:
                # Every successful match breaks out of the for loop, so
                # reaching here means nothing matched at loc.
                raise ParseException(lineno, loc - line_start)


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print('Usage:', sys.argv[0], '<source file>')
        sys.exit(1)
    with open(sys.argv[1]) as f:
        code = f.read()
    lex = Lexer()
    try:
        for i in lex.lex(code):
            print('Token:', i.id, 'Line:', i.lineno, 'Column:', i.colno)
    except ParseException as e:
        print('Error line', e.lineno, 'column', e.colno)