aboutsummaryrefslogtreecommitdiff
path: root/gcc/d/dmd/lexer.d
diff options
context:
space:
mode:
Diffstat (limited to 'gcc/d/dmd/lexer.d')
-rw-r--r--gcc/d/dmd/lexer.d3273
1 files changed, 3273 insertions, 0 deletions
diff --git a/gcc/d/dmd/lexer.d b/gcc/d/dmd/lexer.d
new file mode 100644
index 0000000..afffc2d
--- /dev/null
+++ b/gcc/d/dmd/lexer.d
@@ -0,0 +1,3273 @@
+/**
+ * Implements the lexical analyzer, which converts source code into lexical tokens.
+ *
+ * Specification: $(LINK2 https://dlang.org/spec/lex.html, Lexical)
+ *
+ * Copyright: Copyright (C) 1999-2021 by The D Language Foundation, All Rights Reserved
+ * Authors: $(LINK2 http://www.digitalmars.com, Walter Bright)
+ * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
+ * Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/lexer.d, _lexer.d)
+ * Documentation: https://dlang.org/phobos/dmd_lexer.html
+ * Coverage: https://codecov.io/gh/dlang/dmd/src/master/src/dmd/lexer.d
+ */
+
+module dmd.lexer;
+
+import core.stdc.ctype;
+import core.stdc.errno;
+import core.stdc.stdarg;
+import core.stdc.stdio;
+import core.stdc.stdlib : getenv;
+import core.stdc.string;
+import core.stdc.time;
+
+import dmd.entity;
+import dmd.errors;
+import dmd.globals;
+import dmd.id;
+import dmd.identifier;
+import dmd.root.ctfloat;
+import dmd.root.outbuffer;
+import dmd.root.port;
+import dmd.root.rmem;
+import dmd.root.string;
+import dmd.tokens;
+import dmd.utf;
+import dmd.utils;
+
+nothrow:
+
+private enum LS = 0x2028; // Unicode line separator (U+2028); the scanners below count it as an end of line
+private enum PS = 0x2029; // Unicode paragraph separator (U+2029); the scanners below count it as an end of line
+
+/********************************************
+ * Character classification table, evaluated once at compile time.
+ *
+ * Entry `i` is the set of `CM*` flags that apply to the byte value `i`.
+ * The private predicates below (`isoctal`, `ishex`, ...) are thin lookups
+ * into this table.
+ */
+private static immutable cmtable = () {
+    ubyte[256] flags;
+    foreach (const ch; 0 .. flags.length)
+    {
+        ubyte bits = 0;
+
+        // Digit and identifier classes.
+        if (ch >= '0' && ch <= '7')
+            bits |= CMoctal;
+        if (c_isxdigit(ch))
+            bits |= CMhex;
+        if (ch == '_' || c_isalnum(ch))
+            bits |= CMidchar;
+
+        // Characters that may follow the first digit of a numeric literal.
+        const bool afterAnyDigit =
+            (ch >= '0' && ch <= '9') ||
+            ch == 'e' || ch == 'E' ||
+            ch == 'f' || ch == 'F' ||
+            ch == 'l' || ch == 'L' ||
+            ch == 'p' || ch == 'P' ||
+            ch == 'u' || ch == 'U' ||
+            ch == 'i' || ch == '.' || ch == '_';
+        // Radix prefixes are only meaningful after a leading '0'.
+        const bool afterZeroOnly =
+            ch == 'x' || ch == 'X' || ch == 'b' || ch == 'B';
+
+        if (afterAnyDigit)
+            bits |= CMzerosecond | CMdigitsecond;
+        else if (afterZeroOnly)
+            bits |= CMzerosecond;
+
+        // ASCII characters that can stand alone between two single quotes.
+        const bool neverSingle =
+            ch == '\\' || ch == '\n' || ch == '\r' ||
+            ch == 0 || ch == 0x1A || ch == '\'';
+        if (!neverSingle && (ch & 0x80) == 0)
+            bits |= CMsinglechar;
+
+        flags[ch] = bits;
+    }
+    return flags;
+}();
+
+// Bit flags stored in `cmtable`, one bit per character class.
+private enum : int
+{
+    CMoctal       = 1 << 0, // '0' .. '7'
+    CMhex         = 1 << 1, // hexadecimal digit
+    CMidchar      = 1 << 2, // ASCII letter, digit or '_'
+    CMzerosecond  = 1 << 3, // may follow a leading '0' in a numeric literal
+    CMdigitsecond = 1 << 4, // may follow a leading '1' .. '9' in a numeric literal
+    CMsinglechar  = 1 << 5, // usable as a simple one character literal
+}
+
+/// Returns: `true` if `cmtable` marks `c` with `CMoctal` (an octal digit).
+private bool isoctal(const char c) pure @nogc @safe
+{
+    const flags = cmtable[c];
+    return (flags & CMoctal) != 0;
+}
+
+/// Returns: `true` if `cmtable` marks `c` with `CMhex` (a hexadecimal digit).
+private bool ishex(const char c) pure @nogc @safe
+{
+    const flags = cmtable[c];
+    return (flags & CMhex) != 0;
+}
+
+/// Returns: `true` if `cmtable` marks `c` with `CMidchar` (ASCII letter, digit or '_').
+private bool isidchar(const char c) pure @nogc @safe
+{
+    const flags = cmtable[c];
+    return (flags & CMidchar) != 0;
+}
+
+/// Returns: `true` if `c` can continue a numeric literal that started with '0'.
+private bool isZeroSecond(const char c) pure @nogc @safe
+{
+    const flags = cmtable[c];
+    return (flags & CMzerosecond) != 0;
+}
+
+/// Returns: `true` if `c` can continue a numeric literal that started with '1' .. '9'.
+private bool isDigitSecond(const char c) pure @nogc @safe
+{
+    const flags = cmtable[c];
+    return (flags & CMdigitsecond) != 0;
+}
+
+/// Returns: `true` if `c` can appear by itself between single quotes as a character literal.
+private bool issinglechar(const char c) pure @nogc @safe
+{
+    const flags = cmtable[c];
+    return (flags & CMsinglechar) != 0;
+}
+
+/// Locale-independent test for an ASCII hexadecimal digit; usable at compile time.
+private bool c_isxdigit(const int c) pure @nogc @safe
+{
+    switch (c)
+    {
+        case '0': .. case '9':
+        case 'a': .. case 'f':
+        case 'A': .. case 'F':
+            return true;
+        default:
+            return false;
+    }
+}
+
+/// Locale-independent test for an ASCII letter or decimal digit; usable at compile time.
+private bool c_isalnum(const int c) pure @nogc @safe
+{
+    const bool digit = c >= '0' && c <= '9';
+    const bool lower = c >= 'a' && c <= 'z';
+    const bool upper = c >= 'A' && c <= 'Z';
+    return digit || lower || upper;
+}
+
+unittest
+{
+    // Smoke test: a lone keyword lexes to its token, after which every
+    // further call keeps reporting end of file.
+    string source = "int"; // We rely on the implicit null-terminator
+    scope Lexer lexer = new Lexer(null, source.ptr, 0, source.length, 0, 0);
+
+    TOK tok = lexer.nextToken();
+    assert(tok == TOK.int32);
+
+    foreach (_; 0 .. 3)
+    {
+        tok = lexer.nextToken();
+        assert(tok == TOK.endOfFile);
+    }
+}
+
+unittest
+{
+    // The inputs below are deliberately broken; keep the resulting
+    // diagnostics out of the test output.
+    uint gagged = global.startGagging();
+    scope(exit) global.endGagging(gagged);
+
+    // However malformed the input is, lexing must end in TOK.endOfFile.
+    // Every input ends with the 0 or 0x1A terminator the Lexer requires.
+    static immutable char[][] inputs =
+    [
+        [0],                       // not malformed, but pathological
+        ['\'', 0],
+        ['\'', 0x1A],
+        ['{', '{', 'q', '{', 0],
+        [0xFF, 0],
+        [0xFF, 0x80, 0],
+        [0xFF, 0xFF, 0],
+        [0xFF, 0xFF, 0],           // NOTE(review): duplicate of the previous entry
+        ['x', '"', 0x1A],
+    ];
+
+    foreach (input; inputs)
+    {
+        scope Lexer lexer = new Lexer(null, input.ptr, 0, input.length-1, 0, 0);
+
+        // Allow at most one token per input byte before giving up.
+        TOK tok = lexer.nextToken();
+        for (size_t calls = 1; tok != TOK.endOfFile && calls < input.length; ++calls)
+            tok = lexer.nextToken();
+        assert(tok == TOK.endOfFile);
+
+        // Once reached, end of file must be reported again.
+        tok = lexer.nextToken();
+        assert(tok == TOK.endOfFile);
+    }
+}
+
+// Builds that define version DMDLIB also switch on version LocOffset.
+version (DMDLIB)
+    version = LocOffset;
+
+/***********************************************************
+ */
+class Lexer
+{
+ // Scratch buffer the string-literal scanners build their result in.
+ // `__gshared`: a single instance is shared by every Lexer.
+ private __gshared OutBuffer stringbuffer;
+
+ Loc scanloc; // for error messages
+ Loc prevloc; // location of token before current
+
+ const(char)* p; // current character
+
+ // Current token; tokens that were peeked ahead hang off its `next` link.
+ Token token;
+
+ // For ImportC
+ bool Ccompile; /// true if compiling ImportC
+
+ // The following are valid only if (Ccompile == true)
+ ubyte longsize; /// size of C long, 4 or 8
+ ubyte long_doublesize; /// size of C long double, 8 or D real.sizeof
+ ubyte wchar_tsize; /// size of C wchar_t, 2 or 4
+
+ private
+ {
+ const(char)* base; // pointer to start of buffer
+ const(char)* end; // pointer to last element of buffer
+ const(char)* line; // start of current line
+
+ bool doDocComment; // collect doc comment information
+ bool anyToken; // seen at least one token
+ bool commentToken; // comments are TOK.comment's
+ int inTokenStringConstant; // can be larger than 1 when in nested q{} strings
+ int lastDocLine; // last line of previous doc comment
+
+ // Singly linked list (through Token.next) of tokens handed back by
+ // releaseToken() and reused by allocateToken().
+ Token* tokenFreelist;
+ }
+
+ nothrow:
+
+    /*********************
+     * Creates a Lexer for the source code base[begoffset..endoffset+1].
+     * The last character, base[endoffset], must be null (0) or EOF (0x1A).
+     *
+     * If the text starts with `#!`, that whole first line is skipped before
+     * any token is produced.
+     *
+     * Params:
+     *  filename = used for error messages
+     *  base = source code, must be terminated by a null (0) or EOF (0x1A) character
+     *  begoffset = starting offset into base[]
+     *  endoffset = the last offset to read into base[]
+     *  doDocComment = handle documentation comments
+     *  commentToken = comments become TOK.comment's
+     */
+    this(const(char)* filename, const(char)* base, size_t begoffset,
+        size_t endoffset, bool doDocComment, bool commentToken) pure
+    {
+        scanloc = Loc(filename, 1, 1);
+        token = Token.init;
+
+        this.base = base;
+        this.end = base + endoffset;
+        this.p = base + begoffset;
+        this.line = this.p;
+
+        this.doDocComment = doDocComment;
+        this.commentToken = commentToken;
+        this.inTokenStringConstant = 0;
+        this.lastDocLine = 0;
+
+        // Nothing more to do unless the text opens with "#!".
+        const bool hasShebang = p && p[0] == '#' && p[1] == '!';
+        if (!hasShebang)
+            return;
+
+        p += 2;
+        for (;; ++p)
+        {
+            const ch = *p;
+            if (ch == '\n')
+            {
+                ++p;    // consume the newline that ends the "#!" line
+                break;
+            }
+            if (ch == 0 || ch == 0x1A)
+                break;  // stay on the end-of-file marker
+        }
+        endOfLine();
+    }
+
+    /// Returns: a `Token` taken from the freelist when one is available,
+    /// otherwise a newly allocated one. Its `next` link is always null.
+    Token* allocateToken() pure nothrow @safe
+    {
+        if (!tokenFreelist)
+            return new Token();
+
+        // Pop the head of the freelist.
+        Token* recycled = tokenFreelist;
+        tokenFreelist = recycled.next;
+        recycled.next = null;
+        return recycled;
+    }
+
+    /// Hands `token` back to the lexer by pushing it onto the freelist.
+    private void releaseToken(Token* token) pure nothrow @nogc @safe
+    {
+        // With the GC enabled the token is wiped first -- presumably so a
+        // pooled token does not keep other data reachable (confirm).
+        if (mem.isGCEnabled)
+        {
+            *token = Token.init;
+        }
+
+        // Push onto the head of the freelist.
+        token.next = tokenFreelist;
+        tokenFreelist = token;
+    }
+
+    /// Advances to the next token and returns its value. A token that was
+    /// already produced by `peek` is reused instead of scanning again.
+    final TOK nextToken()
+    {
+        prevloc = token.loc;
+
+        Token* lookahead = token.next;
+        if (lookahead is null)
+        {
+            scan(&token);
+        }
+        else
+        {
+            // Promote the peeked token to current and recycle its storage.
+            memcpy(&token, lookahead, Token.sizeof);
+            releaseToken(lookahead);
+        }
+        return token.value;
+    }
+
+    /***********************
+     * Returns: the value of the token following the current one,
+     * without consuming it.
+     */
+    final TOK peekNext()
+    {
+        Token* following = peek(&token);
+        return following.value;
+    }
+
+    /***********************
+     * Returns: the value of the token two positions after the current one,
+     * without consuming anything.
+     */
+    final TOK peekNext2()
+    {
+        return peek(peek(&token)).value;
+    }
+
+ /****************************
+ * Turn next token in buffer into a token.
+ *
+ * Reads from `p`, skipping white space and (unless `commentToken` is set)
+ * comments, and fills in `*t`. On a 0 or 0x1A character `t.value` becomes
+ * `TOK.endOfFile` and `p` stays on that character.
+ * Params:
+ * t = the token to fill in
+ */
+ final void scan(Token* t)
+ {
+ // Line number on entry; compared with a comment's start line when calling getDocComment().
+ const lastLine = scanloc.linnum;
+ Loc startLoc;
+ t.blockComment = null;
+ t.lineComment = null;
+
+ // Each iteration either returns a token or `continue`s after skipping
+ // white space, a comment, a handled `#line` directive or an invalid character.
+ while (1)
+ {
+ t.ptr = p;
+ //printf("p = %p, *p = '%c'\n",p,*p);
+ t.loc = loc();
+ switch (*p)
+ {
+ case 0:
+ case 0x1A:
+ t.value = TOK.endOfFile; // end of file
+ // Intentionally not advancing `p`, such that subsequent calls keep returning TOK.endOfFile.
+ return;
+ case ' ':
+ case '\t':
+ case '\v':
+ case '\f':
+ p++;
+ continue; // skip white space
+ case '\r':
+ p++;
+ if (*p != '\n') // if CR stands by itself
+ {
+ endOfLine();
+ goto skipFourSpaces;
+ }
+ continue; // skip white space
+ case '\n':
+ p++;
+ endOfLine();
+ skipFourSpaces:
+ // Fast path for indentation: consume four spaces per step.
+ // NOTE(review): this loads 4 bytes through a cast pointer, so it can read
+ // up to 3 bytes beyond the 0/0x1A terminator -- confirm callers pad the buffer.
+ while (*(cast(uint*)p) == 0x20202020) //' ' == 0x20
+ {
+ p+=4;
+ }
+ continue; // skip white space
+ case '0':
+ if (!isZeroSecond(p[1])) // if numeric literal does not continue
+ {
+ ++p;
+ t.unsvalue = 0;
+ t.value = TOK.int32Literal;
+ return;
+ }
+ goto Lnumber;
+
+ case '1': .. case '9':
+ if (!isDigitSecond(p[1])) // if numeric literal does not continue
+ {
+ t.unsvalue = *p - '0';
+ ++p;
+ t.value = TOK.int32Literal;
+ return;
+ }
+ Lnumber:
+ t.value = number(t);
+ return;
+
+ case '\'':
+ if (issinglechar(p[1]) && p[2] == '\'')
+ {
+ t.unsvalue = p[1]; // simple one character literal
+ t.value = Ccompile ? TOK.int32Literal : TOK.charLiteral;
+ p += 3;
+ }
+ else if (Ccompile)
+ {
+ clexerCharConstant(*t, 0);
+ }
+ else
+ {
+ t.value = charConstant(t);
+ }
+ return;
+
+ // In D these letters just start an identifier; under ImportC they may
+ // prefix a wide character constant or wide string literal.
+ case 'u':
+ case 'U':
+ case 'L':
+ if (!Ccompile)
+ goto case_ident;
+ if (p[1] == '\'') // C wide character constant
+ {
+ char c = *p;
+ // NOTE(review): the string-literal branch below gives 'L' with wchar_tsize == 2
+ // the same postfix as 'u' ('w'), while this line maps wchar_tsize == 4 to 'u'.
+ // The two mappings look inverted relative to each other -- confirm against
+ // clexerCharConstant.
+ if (c == 'L') // convert L to u or U
+ c = (wchar_tsize == 4) ? 'u' : 'U';
+ ++p;
+ clexerCharConstant(*t, c);
+ return;
+ }
+ else if (p[1] == '\"') // C wide string literal
+ {
+ const c = *p;
+ ++p;
+ escapeStringConstant(t);
+ t.postfix = c == 'L' ? (wchar_tsize == 2 ? 'w' : 'd') :
+ c == 'u' ? 'w' :
+ 'd';
+ return;
+ }
+ goto case_ident;
+
+ case 'r':
+ if (p[1] != '"')
+ goto case_ident;
+ p++;
+ goto case '`';
+ case '`':
+ wysiwygStringConstant(t);
+ return;
+ case 'x':
+ if (p[1] != '"')
+ goto case_ident;
+ p++;
+ // The hex string is still lexed, then reported as an error; the text
+ // from the opening quote up to the new `p` is quoted in the message.
+ auto start = p;
+ OutBuffer hexString;
+ t.value = hexStringConstant(t);
+ hexString.write(start[0 .. p - start]);
+ error("Built-in hex string literals are obsolete, use `std.conv.hexString!%s` instead.", hexString.extractChars());
+ return;
+ case 'q':
+ if (p[1] == '"')
+ {
+ p++;
+ delimitedStringConstant(t);
+ return;
+ }
+ else if (p[1] == '{')
+ {
+ p++;
+ tokenStringConstant(t);
+ return;
+ }
+ else
+ goto case_ident;
+ case '"':
+ escapeStringConstant(t);
+ return;
+ case 'a':
+ case 'b':
+ case 'c':
+ case 'd':
+ case 'e':
+ case 'f':
+ case 'g':
+ case 'h':
+ case 'i':
+ case 'j':
+ case 'k':
+ case 'l':
+ case 'm':
+ case 'n':
+ case 'o':
+ case 'p':
+ /*case 'q': case 'r':*/
+ case 's':
+ case 't':
+ //case 'u':
+ case 'v':
+ case 'w':
+ /*case 'x':*/
+ case 'y':
+ case 'z':
+ case 'A':
+ case 'B':
+ case 'C':
+ case 'D':
+ case 'E':
+ case 'F':
+ case 'G':
+ case 'H':
+ case 'I':
+ case 'J':
+ case 'K':
+ //case 'L':
+ case 'M':
+ case 'N':
+ case 'O':
+ case 'P':
+ case 'Q':
+ case 'R':
+ case 'S':
+ case 'T':
+ //case 'U':
+ case 'V':
+ case 'W':
+ case 'X':
+ case 'Y':
+ case 'Z':
+ case '_':
+ case_ident:
+ {
+ // Consume the rest of the identifier: ASCII identifier characters
+ // plus any UTF-8 sequence that decodes to a isUniAlpha() character.
+ while (1)
+ {
+ const c = *++p;
+ if (isidchar(c))
+ continue;
+ else if (c & 0x80)
+ {
+ const s = p;
+ const u = decodeUTF();
+ if (isUniAlpha(u))
+ continue;
+ error("char 0x%04x not allowed in identifier", u);
+ p = s;
+ }
+ break;
+ }
+ Identifier id = Identifier.idPool(cast(char*)t.ptr, cast(uint)(p - t.ptr));
+ t.ident = id;
+ t.value = cast(TOK)id.getValue();
+
+ anyToken = 1;
+
+ /* Different keywords for C and D
+ */
+ if (Ccompile)
+ {
+ if (t.value != TOK.identifier)
+ {
+ t.value = Ckeywords[t.value]; // filter out D keywords
+ }
+ }
+ else if (t.value >= FirstCKeyword)
+ t.value = TOK.identifier; // filter out C keywords
+
+ else if (*t.ptr == '_') // if special identifier token
+ {
+ // Lazy initialization
+ TimeStampInfo.initialize(t.loc);
+
+ if (id == Id.DATE)
+ {
+ t.ustring = TimeStampInfo.date.ptr;
+ goto Lstr;
+ }
+ else if (id == Id.TIME)
+ {
+ t.ustring = TimeStampInfo.time.ptr;
+ goto Lstr;
+ }
+ else if (id == Id.VENDOR)
+ {
+ t.ustring = global.vendor.xarraydup.ptr;
+ goto Lstr;
+ }
+ else if (id == Id.TIMESTAMP)
+ {
+ t.ustring = TimeStampInfo.timestamp.ptr;
+ Lstr:
+ t.value = TOK.string_;
+ t.postfix = 0;
+ t.len = cast(uint)strlen(t.ustring);
+ }
+ else if (id == Id.VERSIONX)
+ {
+ t.value = TOK.int64Literal;
+ t.unsvalue = global.versionNumber();
+ }
+ else if (id == Id.EOFX)
+ {
+ t.value = TOK.endOfFile;
+ // Advance scanner to end of file
+ while (!(*p == 0 || *p == 0x1A))
+ p++;
+ }
+ }
+ //printf("t.value = %d\n",t.value);
+ return;
+ }
+ case '/':
+ p++;
+ switch (*p)
+ {
+ case '=':
+ p++;
+ t.value = TOK.divAssign;
+ return;
+ case '*':
+ p++;
+ startLoc = loc();
+ // Outer loop: repeat until a '/' that really closes the comment is found.
+ // Inner loop: advance to the next '/', tracking line ends on the way.
+ while (1)
+ {
+ while (1)
+ {
+ const c = *p;
+ switch (c)
+ {
+ case '/':
+ break;
+ case '\n':
+ endOfLine();
+ p++;
+ continue;
+ case '\r':
+ p++;
+ if (*p != '\n')
+ endOfLine();
+ continue;
+ case 0:
+ case 0x1A:
+ error("unterminated /* */ comment");
+ p = end;
+ t.loc = loc();
+ t.value = TOK.endOfFile;
+ return;
+ default:
+ if (c & 0x80)
+ {
+ const u = decodeUTF();
+ if (u == PS || u == LS)
+ endOfLine();
+ }
+ p++;
+ continue;
+ }
+ break;
+ }
+ p++;
+ // The '/' closes the comment only if preceded by a '*' that is not
+ // the '*' of the opening "/*" (i.e. "/*/" does not close).
+ if (p[-2] == '*' && p - 3 != t.ptr)
+ break;
+ }
+ if (commentToken)
+ {
+ t.loc = startLoc;
+ t.value = TOK.comment;
+ return;
+ }
+ else if (doDocComment && t.ptr[2] == '*' && p - 4 != t.ptr)
+ {
+ // if /** but not /**/
+ getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
+ lastDocLine = scanloc.linnum;
+ }
+ continue;
+ case '/': // do // style comments
+ startLoc = loc();
+ while (1)
+ {
+ const c = *++p;
+ switch (c)
+ {
+ case '\n':
+ break;
+ case '\r':
+ if (p[1] == '\n')
+ p++;
+ break;
+ case 0:
+ case 0x1A:
+ // The comment runs to the end of the file.
+ if (commentToken)
+ {
+ p = end;
+ t.loc = startLoc;
+ t.value = TOK.comment;
+ return;
+ }
+ if (doDocComment && t.ptr[2] == '/')
+ {
+ getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
+ lastDocLine = scanloc.linnum;
+ }
+ p = end;
+ t.loc = loc();
+ t.value = TOK.endOfFile;
+ return;
+ default:
+ if (c & 0x80)
+ {
+ const u = decodeUTF();
+ if (u == PS || u == LS)
+ break;
+ }
+ continue;
+ }
+ break;
+ }
+ if (commentToken)
+ {
+ p++;
+ endOfLine();
+ t.loc = startLoc;
+ t.value = TOK.comment;
+ return;
+ }
+ if (doDocComment && t.ptr[2] == '/')
+ {
+ getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
+ lastDocLine = scanloc.linnum;
+ }
+ p++;
+ endOfLine();
+ continue;
+ case '+':
+ {
+ // Nesting comment: `nest` counts the "/+" openers not yet closed by "+/".
+ int nest;
+ startLoc = loc();
+ p++;
+ nest = 1;
+ while (1)
+ {
+ char c = *p;
+ switch (c)
+ {
+ case '/':
+ p++;
+ if (*p == '+')
+ {
+ p++;
+ nest++;
+ }
+ continue;
+ case '+':
+ p++;
+ if (*p == '/')
+ {
+ p++;
+ if (--nest == 0)
+ break;
+ }
+ continue;
+ case '\r':
+ p++;
+ if (*p != '\n')
+ endOfLine();
+ continue;
+ case '\n':
+ endOfLine();
+ p++;
+ continue;
+ case 0:
+ case 0x1A:
+ error("unterminated /+ +/ comment");
+ p = end;
+ t.loc = loc();
+ t.value = TOK.endOfFile;
+ return;
+ default:
+ if (c & 0x80)
+ {
+ uint u = decodeUTF();
+ if (u == PS || u == LS)
+ endOfLine();
+ }
+ p++;
+ continue;
+ }
+ break;
+ }
+ if (commentToken)
+ {
+ t.loc = startLoc;
+ t.value = TOK.comment;
+ return;
+ }
+ if (doDocComment && t.ptr[2] == '+' && p - 4 != t.ptr)
+ {
+ // if /++ but not /++/
+ getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
+ lastDocLine = scanloc.linnum;
+ }
+ continue;
+ }
+ default:
+ break;
+ }
+ t.value = TOK.div;
+ return;
+ case '.':
+ p++;
+ if (isdigit(*p))
+ {
+ /* Note that we don't allow ._1 and ._ as being
+ * valid floating point numbers.
+ */
+ p--;
+ t.value = inreal(t);
+ }
+ else if (p[0] == '.')
+ {
+ if (p[1] == '.')
+ {
+ p += 2;
+ t.value = TOK.dotDotDot;
+ }
+ else
+ {
+ p++;
+ t.value = TOK.slice;
+ }
+ }
+ else
+ t.value = TOK.dot;
+ return;
+ case '&':
+ p++;
+ if (*p == '=')
+ {
+ p++;
+ t.value = TOK.andAssign;
+ }
+ else if (*p == '&')
+ {
+ p++;
+ t.value = TOK.andAnd;
+ }
+ else
+ t.value = TOK.and;
+ return;
+ case '|':
+ p++;
+ if (*p == '=')
+ {
+ p++;
+ t.value = TOK.orAssign;
+ }
+ else if (*p == '|')
+ {
+ p++;
+ t.value = TOK.orOr;
+ }
+ else
+ t.value = TOK.or;
+ return;
+ case '-':
+ p++;
+ if (*p == '=')
+ {
+ p++;
+ t.value = TOK.minAssign;
+ }
+ else if (*p == '-')
+ {
+ p++;
+ t.value = TOK.minusMinus;
+ }
+ else if (*p == '>')
+ {
+ ++p;
+ t.value = TOK.arrow;
+ }
+ else
+ t.value = TOK.min;
+ return;
+ case '+':
+ p++;
+ if (*p == '=')
+ {
+ p++;
+ t.value = TOK.addAssign;
+ }
+ else if (*p == '+')
+ {
+ p++;
+ t.value = TOK.plusPlus;
+ }
+ else
+ t.value = TOK.add;
+ return;
+ case '<':
+ p++;
+ if (*p == '=')
+ {
+ p++;
+ t.value = TOK.lessOrEqual; // <=
+ }
+ else if (*p == '<')
+ {
+ p++;
+ if (*p == '=')
+ {
+ p++;
+ t.value = TOK.leftShiftAssign; // <<=
+ }
+ else
+ t.value = TOK.leftShift; // <<
+ }
+ else if (*p == ':' && Ccompile)
+ {
+ ++p;
+ t.value = TOK.leftBracket; // <:
+ }
+ else if (*p == '%' && Ccompile)
+ {
+ ++p;
+ t.value = TOK.leftCurly; // <%
+ }
+ else
+ t.value = TOK.lessThan; // <
+ return;
+ case '>':
+ p++;
+ if (*p == '=')
+ {
+ p++;
+ t.value = TOK.greaterOrEqual; // >=
+ }
+ else if (*p == '>')
+ {
+ p++;
+ if (*p == '=')
+ {
+ p++;
+ t.value = TOK.rightShiftAssign; // >>=
+ }
+ else if (*p == '>')
+ {
+ p++;
+ if (*p == '=')
+ {
+ p++;
+ t.value = TOK.unsignedRightShiftAssign; // >>>=
+ }
+ else
+ t.value = TOK.unsignedRightShift; // >>>
+ }
+ else
+ t.value = TOK.rightShift; // >>
+ }
+ else
+ t.value = TOK.greaterThan; // >
+ return;
+ case '!':
+ p++;
+ if (*p == '=')
+ {
+ p++;
+ t.value = TOK.notEqual; // !=
+ }
+ else
+ t.value = TOK.not; // !
+ return;
+ case '=':
+ p++;
+ if (*p == '=')
+ {
+ p++;
+ t.value = TOK.equal; // ==
+ }
+ else if (*p == '>')
+ {
+ p++;
+ t.value = TOK.goesTo; // =>
+ }
+ else
+ t.value = TOK.assign; // =
+ return;
+ case '~':
+ p++;
+ if (*p == '=')
+ {
+ p++;
+ t.value = TOK.concatenateAssign; // ~=
+ }
+ else
+ t.value = TOK.tilde; // ~
+ return;
+ case '^':
+ p++;
+ if (*p == '^')
+ {
+ p++;
+ if (*p == '=')
+ {
+ p++;
+ t.value = TOK.powAssign; // ^^=
+ }
+ else
+ t.value = TOK.pow; // ^^
+ }
+ else if (*p == '=')
+ {
+ p++;
+ t.value = TOK.xorAssign; // ^=
+ }
+ else
+ t.value = TOK.xor; // ^
+ return;
+ case '(':
+ p++;
+ t.value = TOK.leftParenthesis;
+ return;
+ case ')':
+ p++;
+ t.value = TOK.rightParenthesis;
+ return;
+ case '[':
+ p++;
+ t.value = TOK.leftBracket;
+ return;
+ case ']':
+ p++;
+ t.value = TOK.rightBracket;
+ return;
+ case '{':
+ p++;
+ t.value = TOK.leftCurly;
+ return;
+ case '}':
+ p++;
+ t.value = TOK.rightCurly;
+ return;
+ case '?':
+ p++;
+ t.value = TOK.question;
+ return;
+ case ',':
+ p++;
+ t.value = TOK.comma;
+ return;
+ case ';':
+ p++;
+ t.value = TOK.semicolon;
+ return;
+ case ':':
+ p++;
+ if (*p == ':')
+ {
+ ++p;
+ t.value = TOK.colonColon;
+ }
+ else if (*p == '>' && Ccompile)
+ {
+ ++p;
+ t.value = TOK.rightBracket;
+ }
+ else
+ t.value = TOK.colon;
+ return;
+ case '$':
+ p++;
+ t.value = TOK.dollar;
+ return;
+ case '@':
+ p++;
+ t.value = TOK.at;
+ return;
+ case '*':
+ p++;
+ if (*p == '=')
+ {
+ p++;
+ t.value = TOK.mulAssign;
+ }
+ else
+ t.value = TOK.mul;
+ return;
+ case '%':
+ p++;
+ if (*p == '=')
+ {
+ p++;
+ t.value = TOK.modAssign;
+ }
+ else if (*p == '>' && Ccompile)
+ {
+ ++p;
+ t.value = TOK.rightCurly;
+ }
+ else if (*p == ':' && Ccompile)
+ {
+ goto case '#'; // %: means #
+ }
+ else
+ t.value = TOK.mod;
+ return;
+ case '#':
+ {
+ p++;
+ // Scan the token after '#' to recognise line directives; any other
+ // '#' is returned as TOK.pound (after a diagnostic for identifiers and `if`).
+ Token n;
+ scan(&n);
+ if (Ccompile && n.value == TOK.int32Literal)
+ {
+ poundLine(n, true);
+ continue;
+ }
+ if (n.value == TOK.identifier)
+ {
+ if (n.ident == Id.line)
+ {
+ poundLine(n, false);
+ continue;
+ }
+ else
+ {
+ const locx = loc();
+ warning(locx, "C preprocessor directive `#%s` is not supported", n.ident.toChars());
+ }
+ }
+ else if (n.value == TOK.if_)
+ {
+ error("C preprocessor directive `#if` is not supported, use `version` or `static if`");
+ }
+ t.value = TOK.pound;
+ return;
+ }
+ default:
+ {
+ // Anything else: a non-ASCII identifier start, a Unicode line end,
+ // or an invalid character that is reported and skipped.
+ dchar c = *p;
+ if (c & 0x80)
+ {
+ c = decodeUTF();
+ // Check for start of unicode identifier
+ if (isUniAlpha(c))
+ goto case_ident;
+ if (c == PS || c == LS)
+ {
+ endOfLine();
+ p++;
+ continue;
+ }
+ }
+ if (c < 0x80 && isprint(c))
+ error("character '%c' is not a valid token", c);
+ else
+ error("character 0x%02x is not a valid token", c);
+ p++;
+ continue;
+ }
+ }
+ }
+ }
+
+    /// Returns: the token following `ct`, scanning it (and linking it in
+    /// through `ct.next`) if it has not been looked at before.
+    final Token* peek(Token* ct)
+    {
+        if (ct.next)
+            return ct.next;
+
+        Token* fresh = allocateToken();
+        scan(fresh);
+        ct.next = fresh;
+        return fresh;
+    }
+
+    /*********************************
+     * tk is on the opening (.
+     * Look ahead and return token that is past the closing ).
+     *
+     * The search also stops, returning the token it stopped on, at an
+     * unmatched `}`, at a `;` outside any `{ }` pair, or at end of file.
+     */
+    final Token* peekPastParen(Token* tk)
+    {
+        int parenDepth = 1;
+        int curlyDepth = 0;
+        for (;;)
+        {
+            tk = peek(tk);
+            const value = tk.value;
+
+            if (value == TOK.leftParenthesis)
+            {
+                ++parenDepth;
+            }
+            else if (value == TOK.rightParenthesis)
+            {
+                // The matching ')' was found: answer with the token after it.
+                if (--parenDepth == 0)
+                    return peek(tk);
+            }
+            else if (value == TOK.leftCurly)
+            {
+                ++curlyDepth;
+            }
+            else if (value == TOK.rightCurly)
+            {
+                if (--curlyDepth < 0)
+                    return tk;
+            }
+            else if (value == TOK.semicolon)
+            {
+                if (curlyDepth == 0)
+                    return tk;
+            }
+            else if (value == TOK.endOfFile)
+            {
+                return tk;
+            }
+        }
+    }
+
+    /*******************************************
+     * Parse the escape sequence at `p`, advancing `p` past it.
+     * Errors are reported at the current token's location.
+     */
+    private uint escapeSequence()
+    {
+        const dchar decoded = Lexer.escapeSequence(token.loc, p, Ccompile);
+        return decoded;
+    }
+
+    /********
+     * Parse the given string literal escape sequence into a single character.
+     * D https://dlang.org/spec/lex.html#escape_sequences
+     * C11 6.4.4.4
+     *
+     * `sequence` points at the character following the backslash. A 0 or
+     * 0x1A end-of-file marker is never consumed, so the caller still sees it.
+     * Params:
+     *  loc = location to use for error messages
+     *  sequence = pointer to string with escape sequence to parse. Updated to
+     *             point past the end of the escape sequence
+     *  Ccompile = true for compile C11 escape sequences
+     * Returns:
+     *  the escape sequence as a single character
+     */
+    private static dchar escapeSequence(const ref Loc loc, ref const(char)* sequence, bool Ccompile)
+    {
+        const(char)* p = sequence; // cache sequence reference on stack
+        scope(exit) sequence = p;
+
+        uint c = *p;
+        int ndigits;
+        switch (c)
+        {
+        case '\'':
+        case '"':
+        case '?':
+        case '\\':
+        Lconsume:
+            p++;
+            break;
+        case 'a':
+            c = 7;
+            goto Lconsume;
+        case 'b':
+            c = 8;
+            goto Lconsume;
+        case 'f':
+            c = 12;
+            goto Lconsume;
+        case 'n':
+            c = 10;
+            goto Lconsume;
+        case 'r':
+            c = 13;
+            goto Lconsume;
+        case 't':
+            c = 9;
+            goto Lconsume;
+        case 'v':
+            c = 11;
+            goto Lconsume;
+        case 'u':
+            ndigits = 4;
+            goto Lhex;
+        case 'U':
+            ndigits = 8;
+            goto Lhex;
+        case 'x':
+            ndigits = 2;
+        Lhex:
+            p++;
+            c = *p;
+            if (ishex(cast(char)c))
+            {
+                // Accumulate up to `ndigits` hex digits into `v`.
+                uint v = 0;
+                int n = 0;
+                while (1)
+                {
+                    if (isdigit(cast(char)c))
+                        c -= '0';
+                    else if (islower(c))
+                        c -= 'a' - 10;
+                    else
+                        c -= 'A' - 10;
+                    v = v * 16 + c;
+                    c = *++p;
+                    if (++n == ndigits)
+                        break;
+                    if (!ishex(cast(char)c))
+                    {
+                        .error(loc, "escape hex sequence has %d hex digits instead of %d", n, ndigits);
+                        break;
+                    }
+                }
+                // \x yields a raw byte; \u and \U must name a valid code point.
+                if (ndigits != 2 && !utf_isValidDchar(v))
+                {
+                    .error(loc, "invalid UTF character \\U%08x", v);
+                    v = '?'; // recover with valid UTF character
+                }
+                c = v;
+            }
+            else
+            {
+                .error(loc, "undefined escape hex sequence \\%c%c", sequence[0], c);
+                // Skip the offending character, but never step over the
+                // end-of-file marker: the rest of the lexer relies on `p`
+                // staying on it.
+                if (c != 0 && c != 0x1A)
+                    p++;
+            }
+            break;
+        case '&':
+            if (Ccompile)
+                goto default;
+
+            // named character entity
+            for (const idstart = ++p; 1; p++)
+            {
+                switch (*p)
+                {
+                case ';':
+                    c = HtmlNamedEntity(idstart, p - idstart);
+                    if (c == ~0)
+                    {
+                        .error(loc, "unnamed character entity &%.*s;", cast(int)(p - idstart), idstart);
+                        c = '?';
+                    }
+                    p++;
+                    break;
+                default:
+                    if (isalpha(*p) || (p != idstart && isdigit(*p)))
+                        continue;
+                    .error(loc, "unterminated named entity &%.*s;", cast(int)(p - idstart + 1), idstart);
+                    c = '?';
+                    break;
+                }
+                break;
+            }
+            break;
+        case 0:
+        case 0x1A:
+            // end of file: yield the backslash itself and leave `p` in place
+            c = '\\';
+            break;
+        default:
+            if (isoctal(cast(char)c))
+            {
+                // One to three octal digits.
+                uint v = 0;
+                int n = 0;
+                do
+                {
+                    v = v * 8 + (c - '0');
+                    c = *++p;
+                }
+                while (++n < 3 && isoctal(cast(char)c));
+                c = v;
+                if (c > 0xFF)
+                    .error(loc, "escape octal sequence \\%03o is larger than \\377", c);
+            }
+            else
+            {
+                .error(loc, "undefined escape sequence \\%c", c);
+                p++;
+            }
+            break;
+        }
+        return c;
+    }
+
+    /**
+    Lex a wysiwyg string. `p` must be pointing to the first character before the
+    contents of the string literal. The character pointed to by `p` will be used as
+    the terminating character (i.e. backtick or double-quote).
+
+    No escape processing takes place; the only rewriting is that "\r\n" and a
+    lone '\r' are both stored as '\n'.
+    Params:
+        result = pointer to the token that accepts the result
+    */
+    private void wysiwygStringConstant(Token* result)
+    {
+        result.value = TOK.string_;
+        Loc start = loc();
+        const terminator = *p++;
+        stringbuffer.setsize(0);
+        for (;;)
+        {
+            dchar ch = *p++;
+
+            if (ch == 0 || ch == 0x1A)
+            {
+                error("unterminated string constant starting at %s", start.toChars());
+                result.setString();
+                // rewind `p` so it points to the EOF character
+                p--;
+                return;
+            }
+
+            if (ch == '\n')
+            {
+                endOfLine();
+            }
+            else if (ch == '\r')
+            {
+                if (*p == '\n')
+                    continue; // the '\n' that follows is handled next time round
+                ch = '\n'; // treat EndOfLine as \n character
+                endOfLine();
+            }
+            else if (ch == terminator)
+            {
+                result.setString(stringbuffer);
+                stringPostfix(result);
+                return;
+            }
+            else if (ch & 0x80)
+            {
+                // Multi-byte character: decode it from its first byte.
+                p--;
+                const u = decodeUTF();
+                p++;
+                if (u == PS || u == LS)
+                    endOfLine();
+                stringbuffer.writeUTF8(u);
+                continue;
+            }
+
+            stringbuffer.writeByte(ch);
+        }
+    }
+
+    /**************************************
+     * Lex hex strings:
+     *      x"0A ae 34FE BD"
+     *
+     * `p` must point at the opening double-quote. White space and line ends
+     * between digits are ignored; every pair of hex digits produces one byte.
+     * Params:
+     *  t = the token that receives the string
+     * Returns:
+     *  `TOK.hexadecimalString`
+     */
+    private TOK hexStringConstant(Token* t)
+    {
+        Loc start = loc();
+        uint n = 0;     // number of digits seen so far
+        uint v = ~0; // dead assignment, needed to suppress warning
+        p++;
+        stringbuffer.setsize(0);
+        while (1)
+        {
+            dchar c = *p++;
+            switch (c)
+            {
+            case ' ':
+            case '\t':
+            case '\v':
+            case '\f':
+                continue; // skip white space
+            case '\r':
+                if (*p == '\n')
+                    continue; // ignore '\r' if followed by '\n'
+                // Treat isolated '\r' as if it were a '\n'
+                goto case '\n';
+            case '\n':
+                endOfLine();
+                continue;
+            case 0:
+            case 0x1A:
+                error("unterminated string constant starting at %s", start.toChars());
+                t.setString();
+                // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
+                p--;
+                return TOK.hexadecimalString;
+            case '"':
+                if (n & 1)
+                {
+                    error("odd number (%d) of hex characters in hex string", n);
+                    stringbuffer.writeByte(v);
+                }
+                t.setString(stringbuffer);
+                stringPostfix(t);
+                return TOK.hexadecimalString;
+            default:
+                if (c >= '0' && c <= '9')
+                    c -= '0';
+                else if (c >= 'a' && c <= 'f')
+                    c -= 'a' - 10;
+                else if (c >= 'A' && c <= 'F')
+                    c -= 'A' - 10;
+                else if (c & 0x80)
+                {
+                    p--;
+                    const u = decodeUTF();
+                    p++;
+                    if (u == PS || u == LS)
+                    {
+                        // A Unicode line end is skipped like '\n'; it must
+                        // not be counted as a hex digit.
+                        endOfLine();
+                        continue;
+                    }
+                    error("non-hex character \\u%04x in hex string", u);
+                }
+                else
+                    error("non-hex character '%c' in hex string", c);
+                // First digit of a pair is remembered, second completes a byte.
+                if (n & 1)
+                {
+                    v = (v << 4) | c;
+                    stringbuffer.writeByte(v);
+                }
+                else
+                    v = c;
+                n++;
+                break;
+            }
+        }
+        assert(0); // see bug 15731
+    }
+
+ /**
+ Lex a delimited string. Some examples of delimited strings are:
+ ---
+ q"(foo(xxx))" // "foo(xxx)"
+ q"[foo$(LPAREN)]" // "foo$(LPAREN)"
+ q"/foo]/" // "foo]"
+ q"HERE
+ foo
+ HERE" // "foo\n"
+ ---
+ It is assumed that `p` points to the opening double-quote '"'.
+ The first character after the quote selects the delimiter: one of `( { [ <`
+ nests with its closing counterpart, an identifier starts a heredoc, and any
+ other character delimits itself.
+ Params:
+ result = pointer to the token that accepts the result
+ */
+ private void delimitedStringConstant(Token* result)
+ {
+ result.value = TOK.string_;
+ Loc start = loc();
+ dchar delimleft = 0; // opening delimiter; 0 until the first character has been read
+ dchar delimright = 0; // character that ends the string
+ uint nest = 1; // 1 when the delimiter pair nests, else 0
+ uint nestcount = ~0; // dead assignment, needed to suppress warning
+ Identifier hereid = null; // heredoc identifier, if any
+ uint blankrol = 0; // 1 while the rest of the heredoc's opening line must be blank
+ uint startline = 0; // 1 while positioned at the start of a line
+ p++;
+ stringbuffer.setsize(0);
+ while (1)
+ {
+ dchar c = *p++;
+ //printf("c = '%c'\n", c);
+ switch (c)
+ {
+ case '\n':
+ Lnextline:
+ endOfLine();
+ startline = 1;
+ if (blankrol)
+ {
+ // End of the heredoc's opening line: not part of the string.
+ blankrol = 0;
+ continue;
+ }
+ if (hereid)
+ {
+ stringbuffer.writeUTF8(c);
+ continue;
+ }
+ break;
+ case '\r':
+ if (*p == '\n')
+ continue; // ignore
+ c = '\n'; // treat EndOfLine as \n character
+ goto Lnextline;
+ case 0:
+ case 0x1A:
+ error("unterminated delimited string constant starting at %s", start.toChars());
+ result.setString();
+ // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
+ p--;
+ return;
+ default:
+ if (c & 0x80)
+ {
+ p--;
+ c = decodeUTF();
+ p++;
+ if (c == PS || c == LS)
+ goto Lnextline;
+ }
+ break;
+ }
+ if (delimleft == 0)
+ {
+ // First character after the quote: work out the delimiter.
+ delimleft = c;
+ nest = 1;
+ nestcount = 1;
+ if (c == '(')
+ delimright = ')';
+ else if (c == '{')
+ delimright = '}';
+ else if (c == '[')
+ delimright = ']';
+ else if (c == '<')
+ delimright = '>';
+ else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
+ {
+ // Start of identifier; must be a heredoc
+ Token tok;
+ p--;
+ scan(&tok); // read in heredoc identifier
+ if (tok.value != TOK.identifier)
+ {
+ error("identifier expected for heredoc, not %s", tok.toChars());
+ delimright = c;
+ }
+ else
+ {
+ hereid = tok.ident;
+ //printf("hereid = '%s'\n", hereid.toChars());
+ blankrol = 1;
+ }
+ nest = 0;
+ }
+ else
+ {
+ delimright = c;
+ nest = 0;
+ if (isspace(c))
+ error("delimiter cannot be whitespace");
+ }
+ }
+ else
+ {
+ if (blankrol)
+ {
+ error("heredoc rest of line should be blank");
+ blankrol = 0;
+ continue;
+ }
+ if (nest == 1)
+ {
+ // Nesting delimiters: only the closer matching the first opener ends the string.
+ if (c == delimleft)
+ nestcount++;
+ else if (c == delimright)
+ {
+ nestcount--;
+ if (nestcount == 0)
+ goto Ldone;
+ }
+ }
+ else if (c == delimright)
+ goto Ldone;
+ if (startline && (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) && hereid)
+ {
+ // An identifier at the start of a line may be the closing heredoc
+ // identifier; if it is not, rewind `p` and treat it as content.
+ Token tok;
+ auto psave = p;
+ p--;
+ scan(&tok); // read in possible heredoc identifier
+ //printf("endid = '%s'\n", tok.ident.toChars());
+ if (tok.value == TOK.identifier && tok.ident is hereid)
+ {
+ /* should check that rest of line is blank
+ */
+ goto Ldone;
+ }
+ p = psave;
+ }
+ stringbuffer.writeUTF8(c);
+ startline = 0;
+ }
+ }
+ Ldone:
+ // The closing delimiter must be followed directly by the closing quote.
+ if (*p == '"')
+ p++;
+ else if (hereid)
+ error("delimited string must end in %s\"", hereid.toChars());
+ else
+ error("delimited string must end in %c\"", delimright);
+ result.setString(stringbuffer);
+ stringPostfix(result);
+ }
+
+    /**
+    Lex a token string. Some examples of token strings are:
+    ---
+    q{ foo(xxx) }    // " foo(xxx) "
+    q{foo$(LPAREN)}  // "foo$(LPAREN)"
+    q{{foo}"}"}      // "{foo}"}""
+    ---
+    It is assumed that `p` points to the opening curly-brace.
+
+    The contents are scanned as ordinary tokens purely to find the matching
+    closing brace; the string value is the raw source text between the braces.
+    Params:
+        result = pointer to the token that accepts the result
+    */
+    private void tokenStringConstant(Token* result)
+    {
+        result.value = TOK.string_;
+
+        uint depth = 1;
+        const start = loc();
+        const pstart = ++p;
+        inTokenStringConstant++;
+        scope(exit) inTokenStringConstant--;
+        for (;;)
+        {
+            Token inner;
+            scan(&inner);
+
+            if (inner.value == TOK.leftCurly)
+            {
+                ++depth;
+            }
+            else if (inner.value == TOK.rightCurly)
+            {
+                if (--depth == 0)
+                {
+                    // `p` is just past the closing brace; exclude the brace.
+                    result.setString(pstart, p - 1 - pstart);
+                    stringPostfix(result);
+                    return;
+                }
+            }
+            else if (inner.value == TOK.endOfFile)
+            {
+                error("unterminated token string constant starting at %s", start.toChars());
+                result.setString();
+                return;
+            }
+        }
+    }
+
+ /**
+ Scan a quoted string while building the processed string value by
+ handling escape sequences. The result is returned in the given `t` token.
+ This function assumes that `p` currently points to the opening quote
+ of the string.
+ When `Ccompile` is set, a line end inside the string is reported as an
+ unterminated string and no string postfix is read.
+ Params:
+ t = the token to set the resulting string to
+ References:
+ D https://dlang.org/spec/lex.html#double_quoted_strings
+ ImportC C11 6.4.5
+ */
+ private void escapeStringConstant(Token* t)
+ {
+ t.value = TOK.string_;
+
+ const start = loc();
+ const tc = *p++; // opening quote
+ stringbuffer.setsize(0);
+ while (1)
+ {
+ dchar c = *p++;
+ switch (c)
+ {
+ case '\\':
+ switch (*p)
+ {
+ case '&':
+ if (Ccompile)
+ goto default;
+ goto case;
+
+ // These escapes are written to the buffer UTF-8 encoded...
+ case 'u':
+ case 'U':
+ c = escapeSequence();
+ stringbuffer.writeUTF8(c);
+ continue;
+ // ...all others are written as a single byte at the bottom of the loop.
+ default:
+ c = escapeSequence();
+ break;
+ }
+ break;
+ case '\n':
+ endOfLine();
+ if (Ccompile)
+ goto Lunterminated;
+ break;
+ case '\r':
+ if (*p == '\n')
+ continue; // ignore
+ c = '\n'; // treat EndOfLine as \n character
+ endOfLine();
+ if (Ccompile)
+ goto Lunterminated;
+ break;
+ case '\'':
+ case '"':
+ // Only the same quote character that opened the literal closes it.
+ if (c != tc)
+ goto default;
+ t.setString(stringbuffer);
+ if (!Ccompile)
+ stringPostfix(t);
+ return;
+ case 0:
+ case 0x1A:
+ // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
+ p--;
+ Lunterminated:
+ error("unterminated string constant starting at %s", start.toChars());
+ t.setString();
+ return;
+ default:
+ if (c & 0x80)
+ {
+ // Multi-byte character: decode it from its first byte.
+ p--;
+ c = decodeUTF();
+ if (c == LS || c == PS)
+ {
+ c = '\n';
+ endOfLine();
+ if (Ccompile)
+ goto Lunterminated;
+ }
+ p++;
+ stringbuffer.writeUTF8(c);
+ continue;
+ }
+ break;
+ }
+ stringbuffer.writeByte(c);
+ }
+ }
+
+ /**************************************
+ * Lex a D character literal. On entry `p` points to the opening `'`.
+ *
+ * The character value is stored in `t.unsvalue`. If the literal is
+ * malformed (empty, unterminated, or containing more than one character)
+ * an error is reported and `t.unsvalue` is set to '?'.
+ * Params:
+ * t = token that receives the character value
+ * Returns:
+ * TOK.charLiteral, TOK.wcharLiteral or TOK.dcharLiteral, chosen from the
+ * kind of escape sequence or the magnitude of the decoded code point
+ * Reference:
+ * https://dlang.org/spec/lex.html#characterliteral
+ */
+ private TOK charConstant(Token* t)
+ {
+ TOK tk = TOK.charLiteral;
+ //printf("Lexer::charConstant\n");
+ p++;
+ dchar c = *p++;
+ switch (c)
+ {
+ case '\\':
+ switch (*p)
+ {
+ case 'u':
+ // \u escape => wchar literal
+ t.unsvalue = escapeSequence();
+ tk = TOK.wcharLiteral;
+ break;
+ case 'U':
+ case '&':
+ // \U escape or named entity => dchar literal
+ t.unsvalue = escapeSequence();
+ tk = TOK.dcharLiteral;
+ break;
+ default:
+ t.unsvalue = escapeSequence();
+ break;
+ }
+ break;
+ case '\n':
+ L1:
+ endOfLine();
+ goto case;
+ case '\r':
+ goto case '\'';
+ case 0:
+ case 0x1A:
+ // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
+ p--;
+ goto case;
+ case '\'':
+ // reached directly for an empty literal ('') and via the cases above
+ // for a literal cut short by end of line or end of file
+ error("unterminated character constant");
+ t.unsvalue = '?';
+ return tk;
+ default:
+ if (c & 0x80)
+ {
+ // non-ASCII: step back to the lead byte, decode, then step past the last byte
+ p--;
+ c = decodeUTF();
+ p++;
+ if (c == LS || c == PS)
+ goto L1;
+ // code points outside the surrogate range and below 0xFFFE are wchar literals
+ if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
+ tk = TOK.wcharLiteral;
+ else
+ tk = TOK.dcharLiteral;
+ }
+ t.unsvalue = c;
+ break;
+ }
+ if (*p != '\'')
+ {
+ // Error recovery: skip ahead to a closing quote, or stop at an end of
+ // line, end of file, or one of the delimiters `;` `)` `]` `}`.
+ while (*p != '\'' && *p != 0x1A && *p != 0 && *p != '\n' &&
+ *p != '\r' && *p != ';' && *p != ')' && *p != ']' && *p != '}')
+ {
+ if (*p & 0x80)
+ {
+ const s = p;
+ c = decodeUTF();
+ if (c == LS || c == PS)
+ {
+ // leave the line separator unconsumed
+ p = s;
+ break;
+ }
+ }
+ p++;
+ }
+
+ if (*p == '\'')
+ {
+ error("character constant has multiple characters");
+ p++;
+ }
+ else
+ error("unterminated character constant");
+ t.unsvalue = '?';
+ return tk;
+ }
+ p++;
+ return tk;
+ }
+
+ /***************************************
+  * Lex C character constant.
+  * Parser is on the opening quote.
+  *
+  * The literal is first lexed like a string by `escapeStringConstant`; the
+  * resulting bytes are then folded into one integer. On success the token
+  * becomes a `TOK.int32Literal` whose `unsvalue` holds that integer. An
+  * empty constant is reported and turned into `TOK.semicolon`.
+  * Params:
+  *      t = token to fill in
+  *      prefix = one of `u`, `U` or 0.
+  * Reference:
+  *      C11 6.4.4.4
+  */
+ private void clexerCharConstant(ref Token t, char prefix)
+ {
+     escapeStringConstant(&t);
+     const(char)[] str = t.ustring[0 .. t.len];
+     const n = str.length;
+     const loc = t.loc;
+     if (n == 0)
+     {
+         error(loc, "empty character constant");
+         t.value = TOK.semicolon;
+         return;
+     }
+
+     uint u;
+     switch (prefix)
+     {
+         case 0:
+             if (n == 1) // fast case
+             {
+                 u = str[0];
+             }
+             else if (n > 4)
+                 error(loc, "max number of chars in character literal is 4, had %d",
+                     cast(int)n);
+             else
+             {
+                 // pack the characters into `u`, the last character going
+                 // into the byte at the lowest address
+                 foreach (i, c; str)
+                     (cast(char*)&u)[n - 1 - i] = c;
+             }
+             break;
+
+         case 'u':
+             dchar d1;
+             size_t idx;
+             auto msg = utf_decodeChar(str, idx, d1);
+             dchar d2 = 0;
+             if (idx < n && !msg)
+                 msg = utf_decodeChar(str, idx, d2);
+             if (msg)
+                 // `msg` is a D slice, not a NUL-terminated C string: print it with %.*s
+                 error(loc, "%.*s", cast(int)msg.length, msg.ptr);
+             else if (idx < n)
+                 // the variadic argument must be an `int` to match %d
+                 error(loc, "max number of chars in 16 bit character literal is 2, had %d",
+                     cast(int)((n + 1) >> 1));
+             else if (d1 >= 0x1_0000) // 0x1_0000 itself already needs 17 bits
+                 error(loc, "%d does not fit in 16 bits", d1);
+             else if (d2 >= 0x1_0000)
+                 error(loc, "%d does not fit in 16 bits", d2);
+             u = d1;
+             if (d2)
+                 u = (d1 << 16) | d2;
+             break;
+
+         case 'U':
+             dchar d;
+             size_t idx;
+             auto msg = utf_decodeChar(str, idx, d);
+             if (msg)
+                 error(loc, "%.*s", cast(int)msg.length, msg.ptr);
+             else if (idx < n)
+                 error(loc, "max number of chars in 32 bit character literal is 1, had %d",
+                     cast(int)((n + 3) >> 2));
+             u = d;
+             break;
+
+         default:
+             assert(0);
+     }
+     t.value = TOK.int32Literal;
+     t.unsvalue = u;
+ }
+
+ /***************************************
+  * Get postfix of string literal.
+  *
+  * If the character at `p` is one of `c`, `w` or `d` it is stored in
+  * `t.postfix` and consumed; otherwise `t.postfix` is set to 0 and `p`
+  * is left untouched.
+  * Params:
+  *      t = string token whose `postfix` field is set
+  */
+ private void stringPostfix(Token* t) pure @nogc
+ {
+     const suffix = *p;
+     if (suffix == 'c' || suffix == 'w' || suffix == 'd')
+     {
+         t.postfix = suffix;
+         ++p;
+     }
+     else
+     {
+         t.postfix = 0;
+     }
+ }
+
+ /**************************************
+ * Read in a number.
+ * If it's an integer, its value is stored in `t.unsvalue`.
+ * Integers can be decimal, octal, hex or binary.
+ * Handle the suffixes U, UL, LU, L, etc.
+ * If the text turns out to be a floating point literal, lexing restarts
+ * from the first character in `inreal`, which fills in the token instead.
+ * When `Ccompile` is set, suffix handling is delegated to `cnumber`.
+ * Params:
+ * t = token to fill in
+ * Returns:
+ * the TOK of the integer literal (int32/uns32/int64/uns64), or whatever
+ * `inreal` returns for a floating point literal
+ */
+ private TOK number(Token* t)
+ {
+ int base = 10;
+ const start = p;
+ uinteger_t n = 0; // unsigned >=64 bit integer type
+ int d;
+ bool err = false;
+ bool overflow = false;
+ bool anyBinaryDigitsNoSingleUS = false;
+ bool anyHexDigitsNoSingleUS = false;
+ dchar c = *p;
+ // A leading '0' may introduce an octal, hex or binary literal, or a real.
+ if (c == '0')
+ {
+ ++p;
+ c = *p;
+ switch (c)
+ {
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ base = 8;
+ break;
+
+ case '8':
+ case '9':
+ // still treated as base 8; outside C mode the digit loop below
+ // reports the bad digit through its `d >= base` check
+ if (Ccompile)
+ error("octal digit expected, not `%c`", c);
+ base = 8;
+ break;
+ case 'x':
+ case 'X':
+ ++p;
+ base = 16;
+ break;
+ case 'b':
+ case 'B':
+ if (Ccompile)
+ error("binary constants not allowed");
+ ++p;
+ base = 2;
+ break;
+ case '.':
+ if (p[1] == '.')
+ goto Ldone; // if ".."
+ if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)
+ goto Ldone; // if ".identifier" or ".unicode"
+ goto Lreal; // '.' is part of current token
+ case 'i':
+ case 'f':
+ case 'F':
+ goto Lreal;
+ case '_':
+ if (Ccompile)
+ error("embedded `_` not allowed");
+ ++p;
+ base = 8;
+ break;
+ case 'L':
+ if (p[1] == 'i')
+ goto Lreal;
+ break;
+ default:
+ break;
+ }
+ }
+ // Accumulate digits into `n` until a character that cannot be part of
+ // an integer is seen.
+ while (1)
+ {
+ c = *p;
+ switch (c)
+ {
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ ++p;
+ d = c - '0';
+ break;
+ case 'a':
+ case 'b':
+ case 'c':
+ case 'd':
+ case 'e':
+ case 'f':
+ case 'A':
+ case 'B':
+ case 'C':
+ case 'D':
+ case 'E':
+ case 'F':
+ ++p;
+ if (base != 16)
+ {
+ // outside hex, 'e'/'E' starts an exponent and 'f'/'F' is a float suffix
+ if (c == 'e' || c == 'E' || c == 'f' || c == 'F')
+ goto Lreal;
+ }
+ if (c >= 'a')
+ d = c + 10 - 'a';
+ else
+ d = c + 10 - 'A';
+ break;
+ case 'L':
+ if (p[1] == 'i')
+ goto Lreal;
+ goto Ldone;
+ case '.':
+ if (p[1] == '.')
+ goto Ldone; // if ".."
+ if (base == 10 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80))
+ goto Ldone; // if ".identifier" or ".unicode"
+ if (base == 16 && (!ishex(p[1]) || p[1] == '_' || p[1] & 0x80))
+ goto Ldone; // if ".identifier" or ".unicode"
+ if (base == 2)
+ goto Ldone; // if ".identifier" or ".unicode"
+ goto Lreal; // otherwise as part of a floating point literal
+ case 'p':
+ case 'P':
+ case 'i':
+ Lreal:
+ // it is a floating point literal: rewind and lex it again as a real
+ p = start;
+ return inreal(t);
+ case '_':
+ // digit separators are skipped, except in C mode where `_` ends the number
+ if (Ccompile)
+ goto default;
+ ++p;
+ continue;
+ default:
+ goto Ldone;
+ }
+ // got a digit here, set any necessary flags, check for errors
+ anyHexDigitsNoSingleUS = true;
+ anyBinaryDigitsNoSingleUS = true;
+ if (!err && d >= base)
+ {
+ error("%s digit expected, not `%c`", base == 2 ? "binary".ptr :
+ base == 8 ? "octal".ptr :
+ "decimal".ptr, c);
+ err = true;
+ }
+ // Avoid expensive overflow check if we aren't at risk of overflow
+ if (n <= 0x0FFF_FFFF_FFFF_FFFFUL)
+ n = n * base + d;
+ else
+ {
+ import core.checkedint : mulu, addu;
+
+ n = mulu(n, base, overflow);
+ n = addu(n, d, overflow);
+ }
+ }
+ Ldone:
+ if (overflow && !err)
+ {
+ error("integer overflow");
+ err = true;
+ }
+ // a "0x" or "0b" prefix that was not followed by any digit
+ if ((base == 2 && !anyBinaryDigitsNoSingleUS) ||
+ (base == 16 && !anyHexDigitsNoSingleUS))
+ error("`%.*s` isn't a valid integer literal, use `%.*s0` instead", cast(int)(p - start), start, 2, start);
+
+ t.unsvalue = n;
+
+ if (Ccompile)
+ return cnumber(base, n);
+
+ enum FLAGS : int
+ {
+ none = 0,
+ decimal = 1, // decimal
+ unsigned = 2, // u or U suffix
+ long_ = 4, // L suffix
+ }
+
+ FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.none;
+ // Parse trailing 'u', 'U', 'l' or 'L' in any combination
+ const psuffix = p;
+ while (1)
+ {
+ FLAGS f;
+ switch (*p)
+ {
+ case 'U':
+ case 'u':
+ f = FLAGS.unsigned;
+ goto L1;
+ case 'l':
+ f = FLAGS.long_;
+ error("lower case integer suffix 'l' is not allowed. Please use 'L' instead");
+ goto L1;
+ case 'L':
+ f = FLAGS.long_;
+ L1:
+ p++;
+ // the same kind of suffix given twice
+ if ((flags & f) && !err)
+ {
+ error("unrecognized token");
+ err = true;
+ }
+ flags = cast(FLAGS)(flags | f);
+ continue;
+ default:
+ break;
+ }
+ break;
+ }
+ // C-style octal literals (other than the values 0 through 7) are rejected
+ if (base == 8 && n >= 8)
+ {
+ if (err)
+ // can't translate invalid octal value, just show a generic message
+ error("octal literals larger than 7 are no longer supported");
+ else
+ error("octal literals `0%llo%.*s` are no longer supported, use `std.conv.octal!%llo%.*s` instead",
+ n, cast(int)(p - psuffix), psuffix, n, cast(int)(p - psuffix), psuffix);
+ }
+ // Choose the literal type from the base, the suffixes and the magnitude of `n`.
+ TOK result;
+ switch (flags)
+ {
+ case FLAGS.none:
+ /* Octal or Hexadecimal constant.
+ * First that fits: int, uint, long, ulong
+ */
+ if (n & 0x8000000000000000L)
+ result = TOK.uns64Literal;
+ else if (n & 0xFFFFFFFF00000000L)
+ result = TOK.int64Literal;
+ else if (n & 0x80000000)
+ result = TOK.uns32Literal;
+ else
+ result = TOK.int32Literal;
+ break;
+ case FLAGS.decimal:
+ /* First that fits: int, long, long long
+ */
+ if (n & 0x8000000000000000L)
+ {
+ result = TOK.uns64Literal;
+ }
+ else if (n & 0xFFFFFFFF80000000L)
+ result = TOK.int64Literal;
+ else
+ result = TOK.int32Literal;
+ break;
+ case FLAGS.unsigned:
+ case FLAGS.decimal | FLAGS.unsigned:
+ /* First that fits: uint, ulong
+ */
+ if (n & 0xFFFFFFFF00000000L)
+ result = TOK.uns64Literal;
+ else
+ result = TOK.uns32Literal;
+ break;
+ case FLAGS.decimal | FLAGS.long_:
+ if (n & 0x8000000000000000L)
+ {
+ if (!err)
+ {
+ error("signed integer overflow");
+ err = true;
+ }
+ result = TOK.uns64Literal;
+ }
+ else
+ result = TOK.int64Literal;
+ break;
+ case FLAGS.long_:
+ if (n & 0x8000000000000000L)
+ result = TOK.uns64Literal;
+ else
+ result = TOK.int64Literal;
+ break;
+ case FLAGS.unsigned | FLAGS.long_:
+ case FLAGS.decimal | FLAGS.unsigned | FLAGS.long_:
+ result = TOK.uns64Literal;
+ break;
+ default:
+ debug
+ {
+ printf("%x\n", flags);
+ }
+ assert(0);
+ }
+ return result;
+ }
+
+ /**************************************
+  * Lex C integer-suffix
+  *
+  * Consumes any `u`/`U`, `l`/`L`, `ll`/`LL` suffixes at `p` and picks the
+  * token type from the base, the suffixes and the magnitude of the value.
+  * Params:
+  *      base = number base
+  *      n = raw integer value
+  * Returns:
+  *      token value
+  */
+ private TOK cnumber(int base, uinteger_t n)
+ {
+     /* C11 6.4.4.1
+      * Parse trailing suffixes:
+      *    u or U
+      *    l or L
+      *    ll or LL
+      */
+     enum FLAGS : uint
+     {
+         octalhex = 1, // octal or hexadecimal
+         decimal = 2,  // decimal
+         unsigned = 4, // u or U suffix
+         long_ = 8,    // l or L suffix
+         llong = 0x10  // ll or LL
+     }
+     FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.octalhex;
+     bool err;
+     for (;;)
+     {
+         const cs = *p;
+         FLAGS suffix;
+         if (cs == 'u' || cs == 'U')
+         {
+             suffix = FLAGS.unsigned;
+         }
+         else if (cs == 'l' || cs == 'L')
+         {
+             suffix = FLAGS.long_;
+             // the same letter twice in the same case is the `long long` suffix
+             if (p[1] == cs)
+             {
+                 suffix = FLAGS.long_ | FLAGS.llong;
+                 ++p;
+             }
+         }
+         else
+         {
+             break;
+         }
+         ++p;
+         if ((flags & suffix) && !err)
+         {
+             error("duplicate integer suffixes");
+             err = true;
+         }
+         flags = cast(FLAGS)(flags | suffix);
+     }
+
+     // Which ranges of bits are occupied by the value.
+     const bool bit63 = (n & 0x8000000000000000L) != 0;
+     const bool above32 = (n & 0xFFFFFFFF00000000L) != 0;
+     const bool above31 = (n & 0xFFFFFFFF80000000L) != 0;
+     const bool bit31 = (n & 0x80000000) != 0;
+
+     // First that fits: int, uint, long, ulong
+     const TOK fitAny = bit63 ? TOK.uns64Literal
+         : above32 ? TOK.int64Literal
+         : bit31 ? TOK.uns32Literal
+         : TOK.int32Literal;
+     // First that fits: int, long (ulong when bit 63 is set)
+     const TOK fitDecimal = bit63 ? TOK.uns64Literal
+         : above31 ? TOK.int64Literal
+         : TOK.int32Literal;
+     // First that fits: uint, ulong
+     const TOK fitUnsigned = above32 ? TOK.uns64Literal : TOK.uns32Literal;
+     // First that fits: long, ulong
+     const TOK fit64 = bit63 ? TOK.uns64Literal : TOK.int64Literal;
+
+     TOK result = TOK.int32Literal; // default
+     switch (flags)
+     {
+         /* Since D doesn't have a variable sized `long` or `unsigned long` type,
+          * this code deviates from C by picking D int, uint, long, or ulong instead
+          */
+
+         case FLAGS.octalhex:
+             /* Octal or Hexadecimal constant.
+              * First that fits: int, unsigned, long, unsigned long,
+              * long long, unsigned long long
+              * (the outcome does not depend on `longsize`)
+              */
+             result = fitAny;
+             break;
+
+         case FLAGS.decimal:
+             /* First that fits: int, long, long long
+              * (the outcome does not depend on `longsize`)
+              */
+             result = fitDecimal;
+             break;
+
+         case FLAGS.octalhex | FLAGS.unsigned:
+         case FLAGS.decimal | FLAGS.unsigned:
+             /* First that fits: unsigned, unsigned long, unsigned long long
+              * (the outcome does not depend on `longsize`)
+              */
+             result = fitUnsigned;
+             break;
+
+         case FLAGS.decimal | FLAGS.long_:
+             /* First that fits: long, long long
+              */
+             if (bit63)
+                 error("integer overflow"); // result stays at the default
+             else if (longsize == 4)
+                 result = above31 ? TOK.int64Literal : TOK.int32Literal;
+             else
+                 result = TOK.int64Literal; // long
+             break;
+
+         case FLAGS.octalhex | FLAGS.long_:
+             /* First that fits: long, unsigned long, long long,
+              * unsigned long long
+              */
+             result = (longsize == 4) ? fitAny : fit64;
+             break;
+
+         case FLAGS.octalhex | FLAGS.unsigned | FLAGS.long_:
+         case FLAGS.decimal | FLAGS.unsigned | FLAGS.long_:
+             /* First that fits: unsigned long, unsigned long long
+              */
+             result = (longsize == 4) ? fitUnsigned : TOK.uns64Literal;
+             break;
+
+         case FLAGS.octalhex | FLAGS.long_ | FLAGS.llong:
+             /* First that fits: long long, unsigned long long
+              */
+             result = fit64;
+             break;
+
+         case FLAGS.decimal | FLAGS.long_ | FLAGS.llong:
+             /* long long
+              */
+             result = TOK.int64Literal;
+             break;
+
+         case FLAGS.octalhex | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong:
+         case FLAGS.decimal | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong:
+             result = TOK.uns64Literal;
+             break;
+
+         default:
+             debug printf("%x\n",flags);
+             assert(0);
+     }
+     return result;
+ }
+
+ /**************************************
+ * Read in characters, converting them to real.
+ *
+ * On entry `p` points to the first character of the literal (a digit or
+ * '.'). The literal text, with `_` separators removed, is copied into
+ * `stringbuffer` and converted with `CTFloat.parse`; the value is stored
+ * in `t.floatvalue` (zero if the literal was malformed).
+ * Params:
+ * t = token to fill in
+ * Returns:
+ * one of the float or imaginary literal TOKs, selected by the suffix
+ * Bugs:
+ * Exponent overflow not detected.
+ * Too much requested precision is not detected.
+ */
+ private TOK inreal(Token* t)
+ {
+ //printf("Lexer::inreal()\n");
+ debug
+ {
+ assert(*p == '.' || isdigit(*p));
+ }
+ bool isWellformedString = true;
+ stringbuffer.setsize(0);
+ auto pstart = p;
+ bool hex = false;
+ // `c` always holds the character just read; `p` is one past it.
+ dchar c = *p++;
+ // Leading '0x'
+ if (c == '0')
+ {
+ c = *p++;
+ if (c == 'x' || c == 'X')
+ {
+ hex = true;
+ c = *p++;
+ }
+ }
+ // Digits to left of '.'
+ while (1)
+ {
+ if (c == '.')
+ {
+ c = *p++;
+ break;
+ }
+ if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
+ {
+ c = *p++;
+ continue;
+ }
+ break;
+ }
+ // Digits to right of '.'
+ while (1)
+ {
+ if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
+ {
+ c = *p++;
+ continue;
+ }
+ break;
+ }
+ // Exponent: 'e'/'E', or 'p'/'P' for hex floats
+ if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P')))
+ {
+ c = *p++;
+ if (c == '-' || c == '+')
+ {
+ c = *p++;
+ }
+ bool anyexp = false;
+ while (1)
+ {
+ if (isdigit(c))
+ {
+ anyexp = true;
+ c = *p++;
+ continue;
+ }
+ if (c == '_')
+ {
+ if (Ccompile)
+ error("embedded `_` in numeric literals not allowed");
+ c = *p++;
+ continue;
+ }
+ if (!anyexp)
+ {
+ error("missing exponent");
+ isWellformedString = false;
+ }
+ break;
+ }
+ }
+ else if (hex)
+ {
+ error("exponent required for hex float");
+ isWellformedString = false;
+ }
+ // The loops above read one character beyond the literal; step back onto it.
+ --p;
+ // Copy the literal text without '_' separators, NUL-terminated, for parsing.
+ while (pstart < p)
+ {
+ if (*pstart != '_')
+ stringbuffer.writeByte(*pstart);
+ ++pstart;
+ }
+ stringbuffer.writeByte(0);
+ auto sbufptr = cast(const(char)*)stringbuffer[].ptr;
+ TOK result;
+ bool isOutOfRange = false;
+ t.floatvalue = (isWellformedString ? CTFloat.parse(sbufptr, &isOutOfRange) : CTFloat.zero);
+ // The suffix selects the literal's precision.
+ switch (*p)
+ {
+ case 'F':
+ case 'f':
+ if (isWellformedString && !isOutOfRange)
+ isOutOfRange = Port.isFloat32LiteralOutOfRange(sbufptr);
+ result = TOK.float32Literal;
+ p++;
+ break;
+ default:
+ if (isWellformedString && !isOutOfRange)
+ isOutOfRange = Port.isFloat64LiteralOutOfRange(sbufptr);
+ result = TOK.float64Literal;
+ break;
+ case 'l':
+ if (!Ccompile)
+ error("use 'L' suffix instead of 'l'");
+ goto case 'L';
+ case 'L':
+ ++p;
+ // in C mode with an 8 byte `long double`, the literal is treated as a double
+ if (Ccompile && long_doublesize == 8)
+ goto default;
+ result = TOK.float80Literal;
+ break;
+ }
+ // Imaginary suffix (not recognized in C mode)
+ if ((*p == 'i' || *p == 'I') && !Ccompile)
+ {
+ if (*p == 'I')
+ error("use 'i' suffix instead of 'I'");
+ p++;
+ switch (result)
+ {
+ case TOK.float32Literal:
+ result = TOK.imaginary32Literal;
+ break;
+ case TOK.float64Literal:
+ result = TOK.imaginary64Literal;
+ break;
+ case TOK.float80Literal:
+ result = TOK.imaginary80Literal;
+ break;
+ default:
+ break;
+ }
+ }
+ // Out-of-range values are only diagnosed for 32 and 64 bit literals.
+ const isLong = (result == TOK.float80Literal || result == TOK.imaginary80Literal);
+ if (isOutOfRange && !isLong)
+ {
+ const char* suffix = (result == TOK.float32Literal || result == TOK.imaginary32Literal) ? "f" : "";
+ error(scanloc, "number `%s%s` is not representable", sbufptr, suffix);
+ }
+ debug
+ {
+ switch (result)
+ {
+ case TOK.float32Literal:
+ case TOK.float64Literal:
+ case TOK.float80Literal:
+ case TOK.imaginary32Literal:
+ case TOK.imaginary64Literal:
+ case TOK.imaginary80Literal:
+ break;
+ default:
+ assert(0);
+ }
+ }
+ return result;
+ }
+
+ /**
+  * Bring `scanloc` up to date with the current scan pointer.
+  *
+  * The column is the 1-based distance of `p` from `line`. When built with
+  * version `LocOffset`, the offset of `p` from `base` is recorded as well.
+  * Returns:
+  *      the updated `scanloc`
+  */
+ final Loc loc() pure @nogc
+ {
+     const column = (p - line) + 1;
+     scanloc.charnum = cast(uint)column;
+     version (LocOffset)
+     {
+         scanloc.fileOffset = cast(uint)(p - base);
+     }
+     return scanloc;
+ }
+
+ /**
+ * Report an error at the location of the current token (`token.loc`).
+ * Params:
+ * format = printf-style format string
+ * ... = arguments for `format`
+ */
+ final void error(const(char)* format, ...)
+ {
+ va_list args;
+ va_start(args, format);
+ // the leading dot selects the module-level `verror`
+ .verror(token.loc, format, args);
+ va_end(args);
+ }
+
+ /**
+ * Report an error at an explicitly given location.
+ * Params:
+ * loc = location to attach to the message
+ * format = printf-style format string
+ * ... = arguments for `format`
+ */
+ final void error(const ref Loc loc, const(char)* format, ...)
+ {
+ va_list args;
+ va_start(args, format);
+ // the leading dot selects the module-level `verror`
+ .verror(loc, format, args);
+ va_end(args);
+ }
+
+ /**
+ * Report a deprecation at the location of the current token (`token.loc`).
+ * Params:
+ * format = printf-style format string
+ * ... = arguments for `format`
+ */
+ final void deprecation(const(char)* format, ...)
+ {
+ va_list args;
+ va_start(args, format);
+ // the leading dot selects the module-level `vdeprecation`
+ .vdeprecation(token.loc, format, args);
+ va_end(args);
+ }
+
+ /*********************************************
+  * Parse line/file preprocessor directive:
+  *    #line linnum [filespec]
+  * Allow __LINE__ for linnum, and __FILE__ for filespec.
+  * Accept linemarker format:
+  *    # linnum [filespec] {flags}
+  * There can be zero or more flags, which are one of the digits 1..4, and
+  * must be in ascending order. The flags are ignored.
+  *
+  * The new line number and file name are applied to `scanloc` when the end
+  * of the directive's line is reached, unless the lexer is currently inside
+  * a token string. A malformed directive is reported as an error and leaves
+  * `scanloc` unchanged.
+  * Params:
+  *      tok = token we're on, which is linnum of linemarker
+  *      linemarker = true if line marker format and lexer is on linnum
+  * References:
+  *      linemarker https://gcc.gnu.org/onlinedocs/gcc-11.1.0/cpp/Preprocessor-Output.html
+  */
+ private void poundLine(ref Token tok, bool linemarker)
+ {
+     auto linnum = this.scanloc.linnum;
+     const(char)* filespec = null;
+     const loc = this.loc();
+     bool flags;
+
+     if (!linemarker)
+         scan(&tok);
+     if (tok.value == TOK.int32Literal || tok.value == TOK.int64Literal)
+     {
+         // One less than requested: the end of this line increments it again.
+         const lin = cast(int)(tok.unsvalue - 1);
+         if (lin != tok.unsvalue - 1)
+             error("line number `%lld` out of range", cast(ulong)tok.unsvalue);
+         else
+             linnum = lin;
+     }
+     else if (tok.value == TOK.line) // #line __LINE__
+     {
+     }
+     else
+         goto Lerr;
+     while (1)
+     {
+         switch (*p)
+         {
+         case 0:
+         case 0x1A:
+         case '\n':
+         Lnewline:
+             if (!inTokenStringConstant)
+             {
+                 this.scanloc.linnum = linnum;
+                 if (filespec)
+                     this.scanloc.filename = filespec;
+             }
+             return;
+         case '\r':
+             p++;
+             if (*p != '\n')
+             {
+                 p--;
+                 goto Lnewline;
+             }
+             continue;
+         case ' ':
+         case '\t':
+         case '\v':
+         case '\f':
+             p++;
+             continue; // skip white space
+         case '_':
+             if (filespec || flags)
+                 goto Lerr;
+             if (memcmp(p, "__FILE__".ptr, 8) == 0)
+             {
+                 p += 8;
+                 filespec = mem.xstrdup(scanloc.filename);
+                 continue;
+             }
+             goto Lerr;
+         case '"':
+             if (filespec || flags)
+                 goto Lerr;
+             stringbuffer.setsize(0);
+             p++;
+             while (1)
+             {
+                 uint c;
+                 c = *p;
+                 switch (c)
+                 {
+                 case '\n':
+                 case '\r':
+                 case 0:
+                 case 0x1A:
+                     goto Lerr;
+                 case '"':
+                     stringbuffer.writeByte(0);
+                     filespec = mem.xstrdup(cast(const(char)*)stringbuffer[].ptr);
+                     p++;
+                     break;
+                 default:
+                     if (c & 0x80)
+                     {
+                         const seqStart = p;
+                         uint u = decodeUTF();
+                         if (u == PS || u == LS)
+                             goto Lerr;
+                         /* decodeUTF() left `p` on the last byte of the
+                          * sequence. Copy the leading bytes verbatim here and
+                          * let the common code below write the final one, so
+                          * that non-ASCII file names are kept intact instead
+                          * of being cut down to their lead byte.
+                          */
+                         for (auto q = seqStart; q < p; ++q)
+                             stringbuffer.writeByte(*q);
+                         c = *p;
+                     }
+                     stringbuffer.writeByte(c);
+                     p++;
+                     continue;
+                 }
+                 break;
+             }
+             continue;
+
+         case '1':
+         case '2':
+         case '3':
+         case '4':
+             flags = true; // linemarker flags seen
+             ++p;
+             if ('0' <= *p && *p <= '9')
+                 goto Lerr; // only one digit allowed
+             continue;
+
+         default:
+             if (*p & 0x80)
+             {
+                 // Unicode line/paragraph separators end the directive too
+                 uint u = decodeUTF();
+                 if (u == PS || u == LS)
+                     goto Lnewline;
+             }
+             goto Lerr;
+         }
+     }
+ Lerr:
+     if (linemarker)
+         error(loc, "# integer [\"filespec\"] { 1 | 2 | 3 | 4 }\\n expected");
+     else
+         error(loc, "#line integer [\"filespec\"]\\n expected");
+ }
+
+ /********************************************
+  * Decode the UTF-8 sequence starting at `p`, which must be a byte with
+  * the high bit set.
+  * Issue error messages for invalid sequences.
+  * Returns:
+  *      the decoded character; `p` is advanced to the last byte of the
+  *      sequence (not past it)
+  */
+ private uint decodeUTF()
+ {
+     const seq = p;
+     assert(*seq & 0x80);
+     // Look at no more than 4 bytes, and never beyond a terminating 0 byte.
+     size_t avail = 1;
+     while (avail < 4 && seq[avail])
+         ++avail;
+     dchar decoded;
+     size_t consumed = 0;
+     const problem = utf_decodeChar(seq[0 .. avail], consumed, decoded);
+     p += consumed - 1;
+     if (problem)
+     {
+         error("%.*s", cast(int)problem.length, problem.ptr);
+     }
+     return decoded;
+ }
+
+ /***************************************************
+ * Parse doc comment embedded between t.ptr and p.
+ * Remove trailing blanks and tabs from lines.
+ * Replace all newlines with \n.
+ * Remove leading comment character from each line.
+ * Decide if it's a lineComment or a blockComment.
+ * Append to previous one for this token.
+ *
+ * If newParagraph is true, an extra newline will be
+ * added between adjoining doc comments.
+ * Params:
+ * t = token whose `ptr` marks the start of the comment and whose
+ * `lineComment` or `blockComment` receives the text
+ * lineComment = non-zero if the comment may be a line comment; it is
+ * stored as one only if `anyToken` is set as well
+ * newParagraph = separate this comment from a previous one with an
+ * extra newline
+ */
+ private void getDocComment(Token* t, uint lineComment, bool newParagraph) pure
+ {
+ /* ct tells us which kind of comment it is: '/', '*', or '+'
+ */
+ const ct = t.ptr[2];
+ /* Start of comment text skips over / * *, / + +, or / / /
+ */
+ const(char)* q = t.ptr + 3; // start of comment text
+ const(char)* qend = p;
+ // block comments end with a two character terminator, which is excluded
+ if (ct == '*' || ct == '+')
+ qend -= 2;
+ /* Scan over initial row of ****'s or ++++'s or ////'s
+ */
+ for (; q < qend; q++)
+ {
+ if (*q != ct)
+ break;
+ }
+ /* Remove leading spaces until start of the comment
+ */
+ int linestart = 0;
+ if (ct == '/')
+ {
+ while (q < qend && (*q == ' ' || *q == '\t'))
+ ++q;
+ }
+ else if (q < qend)
+ {
+ // for block comments only an immediately following newline is dropped
+ if (*q == '\r')
+ {
+ ++q;
+ if (q < qend && *q == '\n')
+ ++q;
+ linestart = 1;
+ }
+ else if (*q == '\n')
+ {
+ ++q;
+ linestart = 1;
+ }
+ }
+ /* Remove trailing row of ****'s or ++++'s
+ */
+ if (ct != '/')
+ {
+ for (; q < qend; qend--)
+ {
+ if (qend[-1] != ct)
+ break;
+ }
+ }
+ /* Comment is now [q .. qend].
+ * Canonicalize it into buf[].
+ */
+ OutBuffer buf;
+
+ // Drops blanks and tabs from the end of what has been written to buf so far.
+ void trimTrailingWhitespace()
+ {
+ const s = buf[];
+ auto len = s.length;
+ while (len && (s[len - 1] == ' ' || s[len - 1] == '\t'))
+ --len;
+ buf.setsize(len);
+ }
+
+ // `linestart` is non-zero while only whitespace has been seen on the current line.
+ for (; q < qend; q++)
+ {
+ char c = *q;
+ switch (c)
+ {
+ case '*':
+ case '+':
+ if (linestart && c == ct)
+ {
+ // first comment character on a line: drop it and the indentation before it
+ linestart = 0;
+ /* Trim preceding whitespace up to preceding \n
+ */
+ trimTrailingWhitespace();
+ continue;
+ }
+ break;
+ case ' ':
+ case '\t':
+ break;
+ case '\r':
+ if (q[1] == '\n')
+ continue; // skip the \r
+ goto Lnewline;
+ default:
+ // 226 128 168 and 226 128 169 are the UTF-8 encodings of LS and PS
+ if (c == 226)
+ {
+ // If LS or PS
+ if (q[1] == 128 && (q[2] == 168 || q[2] == 169))
+ {
+ q += 2;
+ goto Lnewline;
+ }
+ }
+ linestart = 0;
+ break;
+ Lnewline:
+ c = '\n'; // replace all newlines with \n
+ goto case;
+ case '\n':
+ linestart = 1;
+ /* Trim trailing whitespace
+ */
+ trimTrailingWhitespace();
+ break;
+ }
+ buf.writeByte(c);
+ }
+ /* Trim trailing whitespace (if the last line does not have newline)
+ */
+ trimTrailingWhitespace();
+
+ // Always end with a newline
+ const s = buf[];
+ if (s.length == 0 || s[$ - 1] != '\n')
+ buf.writeByte('\n');
+
+ // It's a line comment if the start of the doc comment comes
+ // after other non-whitespace on the same line.
+ auto dc = (lineComment && anyToken) ? &t.lineComment : &t.blockComment;
+ // Combine with previous doc comment, if any
+ if (*dc)
+ *dc = combineComments(*dc, buf[], newParagraph).toDString();
+ else
+ *dc = buf.extractSlice(true);
+ }
+
+ /********************************************
+  * Combine two document comments into one,
+  * separated by an extra newline if newParagraph is true.
+  *
+  * If `c1` is not empty and does not end in a newline, one is inserted
+  * after it before anything else is appended.
+  * Params:
+  *      c1 = first comment
+  *      c2 = second comment
+  *      newParagraph = insert an additional newline between the two
+  * Returns:
+  *      the pointer of the other comment if `c1` or `c2` is null, otherwise
+  *      a newly allocated, 0-terminated string holding the combination
+  */
+ static const(char)* combineComments(const(char)[] c1, const(char)[] c2, bool newParagraph) pure
+ {
+     //printf("Lexer::combineComments('%s', '%s', '%i')\n", c1, c2, newParagraph);
+     if (!c1)
+         return c2.ptr;
+     if (!c2)
+         return c1.ptr;
+
+     const bool needsLineEnd = c1.length != 0 && c1[$ - 1] != '\n';
+     const total = c1.length + (needsLineEnd ? 1 : 0) + (newParagraph ? 1 : 0) + c2.length;
+     auto merged = cast(char*)mem.xmalloc_noscan(total + 1); // + 1 for the terminating 0
+
+     size_t pos = 0; // write position within `merged`
+     merged[pos .. pos + c1.length] = c1[];
+     pos += c1.length;
+     if (needsLineEnd)
+         merged[pos++] = '\n';
+     if (newParagraph)
+         merged[pos++] = '\n';
+     merged[pos .. pos + c2.length] = c2[];
+     merged[total] = 0;
+     return merged;
+ }
+
+private:
+ /// Record that a new source line begins at `p`: the line start is reset and the line number incremented.
+ void endOfLine() pure @nogc @safe
+ {
+     line = p;
+     ++scanloc.linnum;
+ }
+}
+
+/// Support for `__DATE__`, `__TIME__`, and `__TIMESTAMP__`
+private struct TimeStampInfo
+{
+    private __gshared bool initdone = false;
+
+    // Note: Those properties need to be guarded by a call to `init`
+    // The API isn't safe, and quite brittle, but it was left this way
+    // over performance concerns.
+    // This is currently only called once, from the lexer.
+    __gshared char[11 + 1] date;
+    __gshared char[8 + 1] time;
+    __gshared char[24 + 1] timestamp;
+
+    /**
+     * Fill in `date`, `time` and `timestamp`. Only the first call does any
+     * work; later calls return immediately.
+     *
+     * The time used is the value of the environment variable
+     * `SOURCE_DATE_EPOCH` if it is set, otherwise the current time.
+     * Params:
+     *      loc = location used when reporting an invalid `SOURCE_DATE_EPOCH`
+     */
+    public static void initialize(const ref Loc loc) nothrow
+    {
+        if (initdone)
+            return;
+
+        initdone = true;
+        time_t ct;
+        // https://issues.dlang.org/show_bug.cgi?id=20444
+        if (auto p = getenv("SOURCE_DATE_EPOCH"))
+        {
+            if (!ct.parseDigits(p.toDString()))
+                error(loc, "Value of environment variable `SOURCE_DATE_EPOCH` should be a valid UNIX timestamp, not: `%s`", p);
+        }
+        else
+            .time(&ct); // leading dot: the C function, not the `time` member above
+        const p = ctime(&ct);
+        assert(p);
+        // Slice the fields out of the text returned by ctime(). snprintf keeps
+        // every write within its destination buffer, whatever ctime() returned.
+        snprintf(&date[0], date.length, "%.6s %.4s", p + 4, p + 20);
+        snprintf(&time[0], time.length, "%.8s", p + 11);
+        snprintf(&timestamp[0], timestamp.length, "%.24s", p);
+    }
+}
+
+// Valid escape sequences: each must decode to the expected character,
+// consume the whole input, and produce no diagnostic.
+unittest
+{
+ import dmd.console;
+ // Any diagnostic emitted while these tests run is a failure.
+ nothrow bool assertDiagnosticHandler(const ref Loc loc, Color headerColor, const(char)* header,
+ const(char)* format, va_list ap, const(char)* p1, const(char)* p2)
+ {
+ assert(0);
+ }
+ diagnosticHandler = &assertDiagnosticHandler;
+
+ // `sequence` is the text following the backslash of the escape.
+ static void test(T)(string sequence, T expected, bool Ccompile = false)
+ {
+ auto p = cast(const(char)*)sequence.ptr;
+ assert(expected == Lexer.escapeSequence(Loc.initial, p, Ccompile));
+ // the scan pointer must end up exactly at the end of the sequence
+ assert(p == sequence.ptr + sequence.length);
+ }
+
+ // simple one character escapes
+ test(`'`, '\'');
+ test(`"`, '"');
+ test(`?`, '?');
+ test(`\`, '\\');
+ test(`0`, '\0');
+ test(`a`, '\a');
+ test(`b`, '\b');
+ test(`f`, '\f');
+ test(`n`, '\n');
+ test(`r`, '\r');
+ test(`t`, '\t');
+ test(`v`, '\v');
+
+ // two digit hex escapes
+ test(`x00`, 0x00);
+ test(`xff`, 0xff);
+ test(`xFF`, 0xff);
+ test(`xa7`, 0xa7);
+ test(`x3c`, 0x3c);
+ test(`xe2`, 0xe2);
+
+ // octal escapes of one to three digits
+ test(`1`, '\1');
+ test(`42`, '\42');
+ test(`357`, '\357');
+
+ // four and eight digit Unicode escapes
+ test(`u1234`, '\u1234');
+ test(`uf0e4`, '\uf0e4');
+
+ test(`U0001f603`, '\U0001f603');
+
+ // named character entities
+ test(`&quot;`, '"');
+ test(`&lt;`, '<');
+ test(`&gt;`, '>');
+
+ diagnosticHandler = null;
+}
+// Invalid escape sequences: each must report exactly the expected error,
+// return the expected replacement value and consume the expected number
+// of characters.
+unittest
+{
+    import dmd.console;
+    string expected;
+    bool gotError;
+
+    // Checks the diagnostic against `expected` and records that one was seen.
+    nothrow bool expectDiagnosticHandler(const ref Loc loc, Color headerColor, const(char)* header,
+        const(char)* format, va_list ap, const(char)* p1, const(char)* p2)
+    {
+        assert(cast(Classification)headerColor == Classification.error);
+
+        gotError = true;
+        char[100] buffer = void;
+        // Format with an explicit bound so that an unexpectedly long message
+        // fails the assert below instead of overrunning the stack buffer.
+        const len = vsnprintf(buffer.ptr, buffer.length, format, ap);
+        assert(len >= 0 && len < buffer.length);
+        auto actual = buffer[0 .. len];
+        assert(expected == actual);
+        return true;
+    }
+
+    diagnosticHandler = &expectDiagnosticHandler;
+
+    // `sequence` is the text following the backslash of the escape.
+    void test(string sequence, string expectedError, dchar expectedReturnValue, uint expectedScanLength, bool Ccompile = false)
+    {
+        uint errors = global.errors;
+        gotError = false;
+        expected = expectedError;
+        auto p = cast(const(char)*)sequence.ptr;
+        auto actualReturnValue = Lexer.escapeSequence(Loc.initial, p, Ccompile);
+        assert(gotError);
+        assert(expectedReturnValue == actualReturnValue);
+
+        auto actualScanLength = p - sequence.ptr;
+        assert(expectedScanLength == actualScanLength);
+        // restore the global error count as it was before this test case
+        global.errors = errors;
+    }
+
+    test("c", `undefined escape sequence \c`, 'c', 1);
+    test("!", `undefined escape sequence \!`, '!', 1);
+    test("&quot;", `undefined escape sequence \&`, '&', 1, true);
+
+    test("x1", `escape hex sequence has 1 hex digits instead of 2`, '\x01', 2);
+
+    test("u1"  , `escape hex sequence has 1 hex digits instead of 4`,   0x1, 2);
+    test("u12" , `escape hex sequence has 2 hex digits instead of 4`,  0x12, 3);
+    test("u123", `escape hex sequence has 3 hex digits instead of 4`, 0x123, 4);
+
+    test("U0"      , `escape hex sequence has 1 hex digits instead of 8`,       0x0, 2);
+    test("U00"     , `escape hex sequence has 2 hex digits instead of 8`,      0x00, 3);
+    test("U000"    , `escape hex sequence has 3 hex digits instead of 8`,     0x000, 4);
+    test("U0000"   , `escape hex sequence has 4 hex digits instead of 8`,    0x0000, 5);
+    test("U0001f"  , `escape hex sequence has 5 hex digits instead of 8`,   0x0001f, 6);
+    test("U0001f6" , `escape hex sequence has 6 hex digits instead of 8`,  0x0001f6, 7);
+    test("U0001f60", `escape hex sequence has 7 hex digits instead of 8`, 0x0001f60, 8);
+
+    test("ud800"    , `invalid UTF character \U0000d800`, '?', 5);
+    test("udfff"    , `invalid UTF character \U0000dfff`, '?', 5);
+    test("U00110000", `invalid UTF character \U00110000`, '?', 9);
+
+    test("xg0"      , `undefined escape hex sequence \xg`, 'g', 2);
+    test("ug000"    , `undefined escape hex sequence \ug`, 'g', 2);
+    test("Ug0000000", `undefined escape hex sequence \Ug`, 'g', 2);
+
+    test("&BAD;", `unnamed character entity &BAD;`  , '?', 5);
+    test("&quot", `unterminated named entity &quot;`, '?', 5);
+
+    test("400", `escape octal sequence \400 is larger than \377`, 0x100, 3);
+
+    diagnosticHandler = null;
+}