Complete python escape sequences aware strings

Fixes #3169
author: Tim 'mithro' Ansell <me@mith.ro> 2018-03-03 15:00:55 -0800
committer: Nirbheek Chauhan <nirbheek.chauhan@gmail.com> 2018-04-17 09:55:34 +0000
commit: 36aab0f4b2a05a1ae4c8603e57b4c3c684fe8ea8 (patch)
tree: 314bc7cc0306163447aa53022f19943812d40683 /mesonbuild/mparser.py
parent: 6089631a1b9cd5b9b4e75598721b53b9abc33950 (diff)
download: meson-36aab0f4b2a05a1ae4c8603e57b4c3c684fe8ea8.zip meson-36aab0f4b2a05a1ae4c8603e57b4c3c684fe8ea8.tar.gz meson-36aab0f4b2a05a1ae4c8603e57b4c3c684fe8ea8.tar.bz2
1 file changed, 30 insertions, 4 deletions
diff --git a/mesonbuild/mparser.py b/mesonbuild/mparser.py
index 0e7524c..bf7c271 100644
--- a/mesonbuild/mparser.py
+++ b/mesonbuild/mparser.py
@@ -13,9 +13,36 @@
 # limitations under the License.
 
 import re
+import codecs
 from .mesonlib import MesonException
 from . import mlog
 
+# This is the regex for the supported escape sequences of a regular string
+# literal, like 'abc\x00'
+ESCAPE_SEQUENCE_SINGLE_RE = re.compile(r'''
+    ( \\U........      # 8-digit hex escapes
+    | \\u....          # 4-digit hex escapes
+    | \\x..            # 2-digit hex escapes
+    | \\[0-7]{1,3}     # Octal escapes
+    | \\N\{[^}]+\}     # Unicode characters by name
+    | \\[\\'abfnrtv]   # Single-character escapes
+    )''', re.UNICODE | re.VERBOSE)
+
+# This is the regex for the supported escape sequences of a multiline string
+# literal, like '''abc\x00'''. The only difference is that single quote (')
+# doesn't require escaping.
+ESCAPE_SEQUENCE_MULTI_RE = re.compile(r'''
+    ( \\U........      # 8-digit hex escapes
+    | \\u....          # 4-digit hex escapes
+    | \\x..            # 2-digit hex escapes
+    | \\[0-7]{1,3}     # Octal escapes
+    | \\N\{[^}]+\}     # Unicode characters by name
+    | \\[\\abfnrtv]    # Single-character escapes
+    )''', re.UNICODE | re.VERBOSE)
+
+def decode_match(match):
+    return codecs.decode(match.group(0), 'unicode_escape')
+
 class ParseException(MesonException):
     def __init__(self, text, line, lineno, colno):
         # Format as error message, followed by the line with the error, followed by a caret to show the error column.
@@ -112,7 +139,6 @@ class Lexer:
         par_count = 0
         bracket_count = 0
         col = 0
-        newline_rx = re.compile(r'(?<!\\)((?:\\\\)*)\\n')
         while loc < len(self.code):
             matched = False
             value = None
@@ -145,12 +171,12 @@ class Lexer:
                         if match_text.find("\n") != -1:
                             mlog.warning("""Newline character in a string detected, use ''' (three single quotes) for multiline strings instead.
 This will become a hard error in a future Meson release.""", self.getline(line_start), lineno, col)
-                        value = match_text[1:-1].replace(r"\'", "'")
-                        value = newline_rx.sub(r'\1\n', value)
-                        value = value.replace(r" \\ ".strip(), r" \ ".strip())
+                        value = match_text[1:-1]
+                        value = ESCAPE_SEQUENCE_SINGLE_RE.sub(decode_match, value)
                     elif tid == 'multiline_string':
                         tid = 'string'
                         value = match_text[3:-3]
+                        value = ESCAPE_SEQUENCE_MULTI_RE.sub(decode_match, value)
                         lines = match_text.split('\n')
                         if len(lines) > 1:
                             lineno += len(lines) - 1
author	Tim 'mithro' Ansell <me@mith.ro>	2018-03-03 15:00:55 -0800
committer	Nirbheek Chauhan <nirbheek.chauhan@gmail.com>	2018-04-17 09:55:34 +0000
commit	36aab0f4b2a05a1ae4c8603e57b4c3c684fe8ea8 (patch)
tree	314bc7cc0306163447aa53022f19943812d40683 /mesonbuild/mparser.py
parent	6089631a1b9cd5b9b4e75598721b53b9abc33950 (diff)
download	meson-36aab0f4b2a05a1ae4c8603e57b4c3c684fe8ea8.zip meson-36aab0f4b2a05a1ae4c8603e57b4c3c684fe8ea8.tar.gz meson-36aab0f4b2a05a1ae4c8603e57b4c3c684fe8ea8.tar.bz2