consistent invalid escape sequence behaviour

* docs: document unrecognized escape sequence behaviour [skip ci] Document that unrecognized escape sequence behaviour is like Python, not C. * Don't try to decode invalid hex escape sequences Don't try to decode escape sequences which should contain a sequence of hex digits but don't, as doing so throws a Python exception. These will be treated literally instead. * Extend test case to cover invalid escape sequences
5 years ago · ccc4ce28cc
parent 03d34a79f8
commit ccc4ce28cc
3 changed files with 23 additions and 6 deletions
--- a/docs/markdown/Syntax.md
+++ b/docs/markdown/Syntax.md
@ -116,6 +116,9 @@ The full list of escape sequences is:

 As in python and C, up to three octal digits are accepted in `\ooo`.

+Unrecognized escape sequences are left in the string unchanged, i.e., the
+backslash is left in the string.
+
 #### String concatenation

 Strings can be concatenated to form a new string using the `+` symbol.
--- a/mesonbuild/mparser.py
+++ b/mesonbuild/mparser.py
@ -21,12 +21,12 @@ from . import mlog
 # This is the regex for the supported escape sequences of a regular string
 # literal, like 'abc\x00'
 ESCAPE_SEQUENCE_SINGLE_RE = re.compile(r'''
-    ( \\U........      # 8-digit hex escapes
-    | \\u....          # 4-digit hex escapes
-    | \\x..            # 2-digit hex escapes
-    | \\[0-7]{1,3}     # Octal escapes
-    | \\N\{[^}]+\}     # Unicode characters by name
-    | \\[\\'abfnrtv]   # Single-character escapes
+    ( \\U[A-Fa-f0-9]{8}   # 8-digit hex escapes
+    | \\u[A-Fa-f0-9]{4}   # 4-digit hex escapes
+    | \\x[A-Fa-f0-9]{2}   # 2-digit hex escapes
+    | \\[0-7]{1,3}        # Octal escapes
+    | \\N\{[^}]+\}        # Unicode characters by name
+    | \\[\\'abfnrtv]      # Single-character escapes
    )''', re.UNICODE | re.VERBOSE)

 class MesonUnicodeDecodeError(MesonException):
--- a/unicode/meson.build
+++ b/unicode/meson.build
@ -22,3 +22,17 @@ foreach l : find_file_list.stdout().strip('\x00').split('\x00')
 endforeach

 test('second', executable('second', found_files_hex + [gen_file]))
+
+# Unrecognized and malformed escape sequences are literal
+
+malformed = [
+ [ '\c', 'c' ],
+ [ '\Uabcdefghi', 'Uabcdefghi'],
+ [ '\u123 ', 'u123 '],
+ [ '\xqr', 'xqr'],
+]
+
+foreach m : malformed
+  assert(m[0].endswith(m[1]), 'bad escape sequence had unexpected end')
+  assert(m[0].startswith('\\'), 'bad escape sequence had unexpected start')
+endforeach