diff options
Diffstat (limited to 'gcc/d/dmd/lexer.d')
-rw-r--r-- | gcc/d/dmd/lexer.d | 3273 |
1 files changed, 3273 insertions, 0 deletions
diff --git a/gcc/d/dmd/lexer.d b/gcc/d/dmd/lexer.d new file mode 100644 index 0000000..afffc2d --- /dev/null +++ b/gcc/d/dmd/lexer.d @@ -0,0 +1,3273 @@ +/** + * Implements the lexical analyzer, which converts source code into lexical tokens. + * + * Specification: $(LINK2 https://dlang.org/spec/lex.html, Lexical) + * + * Copyright: Copyright (C) 1999-2021 by The D Language Foundation, All Rights Reserved + * Authors: $(LINK2 http://www.digitalmars.com, Walter Bright) + * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) + * Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/lexer.d, _lexer.d) + * Documentation: https://dlang.org/phobos/dmd_lexer.html + * Coverage: https://codecov.io/gh/dlang/dmd/src/master/src/dmd/lexer.d + */ + +module dmd.lexer; + +import core.stdc.ctype; +import core.stdc.errno; +import core.stdc.stdarg; +import core.stdc.stdio; +import core.stdc.stdlib : getenv; +import core.stdc.string; +import core.stdc.time; + +import dmd.entity; +import dmd.errors; +import dmd.globals; +import dmd.id; +import dmd.identifier; +import dmd.root.ctfloat; +import dmd.root.outbuffer; +import dmd.root.port; +import dmd.root.rmem; +import dmd.root.string; +import dmd.tokens; +import dmd.utf; +import dmd.utils; + +nothrow: + +private enum LS = 0x2028; // UTF line separator +private enum PS = 0x2029; // UTF paragraph separator + +/******************************************** + * Do our own char maps + */ +private static immutable cmtable = () { + ubyte[256] table; + foreach (const c; 0 .. table.length) + { + if ('0' <= c && c <= '7') + table[c] |= CMoctal; + if (c_isxdigit(c)) + table[c] |= CMhex; + if (c_isalnum(c) || c == '_') + table[c] |= CMidchar; + + switch (c) + { + case 'x': case 'X': + case 'b': case 'B': + table[c] |= CMzerosecond; + break; + + case '0': .. 
case '9': + case 'e': case 'E': + case 'f': case 'F': + case 'l': case 'L': + case 'p': case 'P': + case 'u': case 'U': + case 'i': + case '.': + case '_': + table[c] |= CMzerosecond | CMdigitsecond; + break; + + default: + break; + } + + switch (c) + { + case '\\': + case '\n': + case '\r': + case 0: + case 0x1A: + case '\'': + break; + default: + if (!(c & 0x80)) + table[c] |= CMsinglechar; + break; + } + } + return table; +}(); + +private +{ + enum CMoctal = 0x1; + enum CMhex = 0x2; + enum CMidchar = 0x4; + enum CMzerosecond = 0x8; + enum CMdigitsecond = 0x10; + enum CMsinglechar = 0x20; +} + +private bool isoctal(const char c) pure @nogc @safe +{ + return (cmtable[c] & CMoctal) != 0; +} + +private bool ishex(const char c) pure @nogc @safe +{ + return (cmtable[c] & CMhex) != 0; +} + +private bool isidchar(const char c) pure @nogc @safe +{ + return (cmtable[c] & CMidchar) != 0; +} + +private bool isZeroSecond(const char c) pure @nogc @safe +{ + return (cmtable[c] & CMzerosecond) != 0; +} + +private bool isDigitSecond(const char c) pure @nogc @safe +{ + return (cmtable[c] & CMdigitsecond) != 0; +} + +private bool issinglechar(const char c) pure @nogc @safe +{ + return (cmtable[c] & CMsinglechar) != 0; +} + +private bool c_isxdigit(const int c) pure @nogc @safe +{ + return (( c >= '0' && c <= '9') || + ( c >= 'a' && c <= 'f') || + ( c >= 'A' && c <= 'F')); +} + +private bool c_isalnum(const int c) pure @nogc @safe +{ + return (( c >= '0' && c <= '9') || + ( c >= 'a' && c <= 'z') || + ( c >= 'A' && c <= 'Z')); +} + +unittest +{ + //printf("lexer.unittest\n"); + /* Not much here, just trying things out. 
+ */ + string text = "int"; // We rely on the implicit null-terminator + scope Lexer lex1 = new Lexer(null, text.ptr, 0, text.length, 0, 0); + TOK tok; + tok = lex1.nextToken(); + //printf("tok == %s, %d, %d\n", Token::toChars(tok), tok, TOK.int32); + assert(tok == TOK.int32); + tok = lex1.nextToken(); + assert(tok == TOK.endOfFile); + tok = lex1.nextToken(); + assert(tok == TOK.endOfFile); + tok = lex1.nextToken(); + assert(tok == TOK.endOfFile); +} + +unittest +{ + // We don't want to see Lexer error output during these tests. + uint errors = global.startGagging(); + scope(exit) global.endGagging(errors); + + // Test malformed input: even malformed input should end in a TOK.endOfFile. + static immutable char[][] testcases = + [ // Testcase must end with 0 or 0x1A. + [0], // not malformed, but pathological + ['\'', 0], + ['\'', 0x1A], + ['{', '{', 'q', '{', 0], + [0xFF, 0], + [0xFF, 0x80, 0], + [0xFF, 0xFF, 0], + [0xFF, 0xFF, 0], + ['x', '"', 0x1A], + ]; + + foreach (testcase; testcases) + { + scope Lexer lex2 = new Lexer(null, testcase.ptr, 0, testcase.length-1, 0, 0); + TOK tok = lex2.nextToken(); + size_t iterations = 1; + while ((tok != TOK.endOfFile) && (iterations++ < testcase.length)) + { + tok = lex2.nextToken(); + } + assert(tok == TOK.endOfFile); + tok = lex2.nextToken(); + assert(tok == TOK.endOfFile); + } +} + +version (DMDLIB) +{ + version = LocOffset; +} + +/*********************************************************** + */ +class Lexer +{ + private __gshared OutBuffer stringbuffer; + + Loc scanloc; // for error messages + Loc prevloc; // location of token before current + + const(char)* p; // current character + + Token token; + + // For ImportC + bool Ccompile; /// true if compiling ImportC + + // The following are valid only if (Ccompile == true) + ubyte longsize; /// size of C long, 4 or 8 + ubyte long_doublesize; /// size of C long double, 8 or D real.sizeof + ubyte wchar_tsize; /// size of C wchar_t, 2 or 4 + + private + { + const(char)* base; // 
pointer to start of buffer + const(char)* end; // pointer to last element of buffer + const(char)* line; // start of current line + + bool doDocComment; // collect doc comment information + bool anyToken; // seen at least one token + bool commentToken; // comments are TOK.comment's + int inTokenStringConstant; // can be larger than 1 when in nested q{} strings + int lastDocLine; // last line of previous doc comment + + Token* tokenFreelist; + } + + nothrow: + + /********************* + * Creates a Lexer for the source code base[begoffset..endoffset+1]. + * The last character, base[endoffset], must be null (0) or EOF (0x1A). + * + * Params: + * filename = used for error messages + * base = source code, must be terminated by a null (0) or EOF (0x1A) character + * begoffset = starting offset into base[] + * endoffset = the last offset to read into base[] + * doDocComment = handle documentation comments + * commentToken = comments become TOK.comment's + */ + this(const(char)* filename, const(char)* base, size_t begoffset, + size_t endoffset, bool doDocComment, bool commentToken) pure + { + scanloc = Loc(filename, 1, 1); + //printf("Lexer::Lexer(%p,%d)\n",base,length); + //printf("lexer.filename = %s\n", filename); + token = Token.init; + this.base = base; + this.end = base + endoffset; + p = base + begoffset; + line = p; + this.doDocComment = doDocComment; + this.commentToken = commentToken; + this.inTokenStringConstant = 0; + this.lastDocLine = 0; + //initKeywords(); + /* If first line starts with '#!', ignore the line + */ + if (p && p[0] == '#' && p[1] == '!') + { + p += 2; + while (1) + { + char c = *p++; + switch (c) + { + case 0: + case 0x1A: + p--; + goto case; + case '\n': + break; + default: + continue; + } + break; + } + endOfLine(); + } + } + + /// Returns: a newly allocated `Token`. 
+ Token* allocateToken() pure nothrow @safe + { + if (tokenFreelist) + { + Token* t = tokenFreelist; + tokenFreelist = t.next; + t.next = null; + return t; + } + return new Token(); + } + + /// Frees the given token by returning it to the freelist. + private void releaseToken(Token* token) pure nothrow @nogc @safe + { + if (mem.isGCEnabled) + *token = Token.init; + token.next = tokenFreelist; + tokenFreelist = token; + } + + final TOK nextToken() + { + prevloc = token.loc; + if (token.next) + { + Token* t = token.next; + memcpy(&token, t, Token.sizeof); + releaseToken(t); + } + else + { + scan(&token); + } + //printf(token.toChars()); + return token.value; + } + + /*********************** + * Look ahead at next token's value. + */ + final TOK peekNext() + { + return peek(&token).value; + } + + /*********************** + * Look 2 tokens ahead at value. + */ + final TOK peekNext2() + { + Token* t = peek(&token); + return peek(t).value; + } + + /**************************** + * Turn next token in buffer into a token. + */ + final void scan(Token* t) + { + const lastLine = scanloc.linnum; + Loc startLoc; + t.blockComment = null; + t.lineComment = null; + + while (1) + { + t.ptr = p; + //printf("p = %p, *p = '%c'\n",p,*p); + t.loc = loc(); + switch (*p) + { + case 0: + case 0x1A: + t.value = TOK.endOfFile; // end of file + // Intentionally not advancing `p`, such that subsequent calls keep returning TOK.endOfFile. 
+ return; + case ' ': + case '\t': + case '\v': + case '\f': + p++; + continue; // skip white space + case '\r': + p++; + if (*p != '\n') // if CR stands by itself + { + endOfLine(); + goto skipFourSpaces; + } + continue; // skip white space + case '\n': + p++; + endOfLine(); + skipFourSpaces: + while (*(cast(uint*)p) == 0x20202020) //' ' == 0x20 + { + p+=4; + } + continue; // skip white space + case '0': + if (!isZeroSecond(p[1])) // if numeric literal does not continue + { + ++p; + t.unsvalue = 0; + t.value = TOK.int32Literal; + return; + } + goto Lnumber; + + case '1': .. case '9': + if (!isDigitSecond(p[1])) // if numeric literal does not continue + { + t.unsvalue = *p - '0'; + ++p; + t.value = TOK.int32Literal; + return; + } + Lnumber: + t.value = number(t); + return; + + case '\'': + if (issinglechar(p[1]) && p[2] == '\'') + { + t.unsvalue = p[1]; // simple one character literal + t.value = Ccompile ? TOK.int32Literal : TOK.charLiteral; + p += 3; + } + else if (Ccompile) + { + clexerCharConstant(*t, 0); + } + else + { + t.value = charConstant(t); + } + return; + + case 'u': + case 'U': + case 'L': + if (!Ccompile) + goto case_ident; + if (p[1] == '\'') // C wide character constant + { + char c = *p; + if (c == 'L') // convert L to u or U + c = (wchar_tsize == 4) ? 'u' : 'U'; + ++p; + clexerCharConstant(*t, c); + return; + } + else if (p[1] == '\"') // C wide string literal + { + const c = *p; + ++p; + escapeStringConstant(t); + t.postfix = c == 'L' ? (wchar_tsize == 2 ? 'w' : 'd') : + c == 'u' ? 'w' : + 'd'; + return; + } + goto case_ident; + + case 'r': + if (p[1] != '"') + goto case_ident; + p++; + goto case '`'; + case '`': + wysiwygStringConstant(t); + return; + case 'x': + if (p[1] != '"') + goto case_ident; + p++; + auto start = p; + OutBuffer hexString; + t.value = hexStringConstant(t); + hexString.write(start[0 .. 
p - start]); + error("Built-in hex string literals are obsolete, use `std.conv.hexString!%s` instead.", hexString.extractChars()); + return; + case 'q': + if (p[1] == '"') + { + p++; + delimitedStringConstant(t); + return; + } + else if (p[1] == '{') + { + p++; + tokenStringConstant(t); + return; + } + else + goto case_ident; + case '"': + escapeStringConstant(t); + return; + case 'a': + case 'b': + case 'c': + case 'd': + case 'e': + case 'f': + case 'g': + case 'h': + case 'i': + case 'j': + case 'k': + case 'l': + case 'm': + case 'n': + case 'o': + case 'p': + /*case 'q': case 'r':*/ + case 's': + case 't': + //case 'u': + case 'v': + case 'w': + /*case 'x':*/ + case 'y': + case 'z': + case 'A': + case 'B': + case 'C': + case 'D': + case 'E': + case 'F': + case 'G': + case 'H': + case 'I': + case 'J': + case 'K': + //case 'L': + case 'M': + case 'N': + case 'O': + case 'P': + case 'Q': + case 'R': + case 'S': + case 'T': + //case 'U': + case 'V': + case 'W': + case 'X': + case 'Y': + case 'Z': + case '_': + case_ident: + { + while (1) + { + const c = *++p; + if (isidchar(c)) + continue; + else if (c & 0x80) + { + const s = p; + const u = decodeUTF(); + if (isUniAlpha(u)) + continue; + error("char 0x%04x not allowed in identifier", u); + p = s; + } + break; + } + Identifier id = Identifier.idPool(cast(char*)t.ptr, cast(uint)(p - t.ptr)); + t.ident = id; + t.value = cast(TOK)id.getValue(); + + anyToken = 1; + + /* Different keywords for C and D + */ + if (Ccompile) + { + if (t.value != TOK.identifier) + { + t.value = Ckeywords[t.value]; // filter out D keywords + } + } + else if (t.value >= FirstCKeyword) + t.value = TOK.identifier; // filter out C keywords + + else if (*t.ptr == '_') // if special identifier token + { + // Lazy initialization + TimeStampInfo.initialize(t.loc); + + if (id == Id.DATE) + { + t.ustring = TimeStampInfo.date.ptr; + goto Lstr; + } + else if (id == Id.TIME) + { + t.ustring = TimeStampInfo.time.ptr; + goto Lstr; + } + else if (id == 
Id.VENDOR) + { + t.ustring = global.vendor.xarraydup.ptr; + goto Lstr; + } + else if (id == Id.TIMESTAMP) + { + t.ustring = TimeStampInfo.timestamp.ptr; + Lstr: + t.value = TOK.string_; + t.postfix = 0; + t.len = cast(uint)strlen(t.ustring); + } + else if (id == Id.VERSIONX) + { + t.value = TOK.int64Literal; + t.unsvalue = global.versionNumber(); + } + else if (id == Id.EOFX) + { + t.value = TOK.endOfFile; + // Advance scanner to end of file + while (!(*p == 0 || *p == 0x1A)) + p++; + } + } + //printf("t.value = %d\n",t.value); + return; + } + case '/': + p++; + switch (*p) + { + case '=': + p++; + t.value = TOK.divAssign; + return; + case '*': + p++; + startLoc = loc(); + while (1) + { + while (1) + { + const c = *p; + switch (c) + { + case '/': + break; + case '\n': + endOfLine(); + p++; + continue; + case '\r': + p++; + if (*p != '\n') + endOfLine(); + continue; + case 0: + case 0x1A: + error("unterminated /* */ comment"); + p = end; + t.loc = loc(); + t.value = TOK.endOfFile; + return; + default: + if (c & 0x80) + { + const u = decodeUTF(); + if (u == PS || u == LS) + endOfLine(); + } + p++; + continue; + } + break; + } + p++; + if (p[-2] == '*' && p - 3 != t.ptr) + break; + } + if (commentToken) + { + t.loc = startLoc; + t.value = TOK.comment; + return; + } + else if (doDocComment && t.ptr[2] == '*' && p - 4 != t.ptr) + { + // if /** but not /**/ + getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1); + lastDocLine = scanloc.linnum; + } + continue; + case '/': // do // style comments + startLoc = loc(); + while (1) + { + const c = *++p; + switch (c) + { + case '\n': + break; + case '\r': + if (p[1] == '\n') + p++; + break; + case 0: + case 0x1A: + if (commentToken) + { + p = end; + t.loc = startLoc; + t.value = TOK.comment; + return; + } + if (doDocComment && t.ptr[2] == '/') + { + getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1); + lastDocLine = scanloc.linnum; + } + p = end; + t.loc = loc(); + 
t.value = TOK.endOfFile; + return; + default: + if (c & 0x80) + { + const u = decodeUTF(); + if (u == PS || u == LS) + break; + } + continue; + } + break; + } + if (commentToken) + { + p++; + endOfLine(); + t.loc = startLoc; + t.value = TOK.comment; + return; + } + if (doDocComment && t.ptr[2] == '/') + { + getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1); + lastDocLine = scanloc.linnum; + } + p++; + endOfLine(); + continue; + case '+': + { + int nest; + startLoc = loc(); + p++; + nest = 1; + while (1) + { + char c = *p; + switch (c) + { + case '/': + p++; + if (*p == '+') + { + p++; + nest++; + } + continue; + case '+': + p++; + if (*p == '/') + { + p++; + if (--nest == 0) + break; + } + continue; + case '\r': + p++; + if (*p != '\n') + endOfLine(); + continue; + case '\n': + endOfLine(); + p++; + continue; + case 0: + case 0x1A: + error("unterminated /+ +/ comment"); + p = end; + t.loc = loc(); + t.value = TOK.endOfFile; + return; + default: + if (c & 0x80) + { + uint u = decodeUTF(); + if (u == PS || u == LS) + endOfLine(); + } + p++; + continue; + } + break; + } + if (commentToken) + { + t.loc = startLoc; + t.value = TOK.comment; + return; + } + if (doDocComment && t.ptr[2] == '+' && p - 4 != t.ptr) + { + // if /++ but not /++/ + getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1); + lastDocLine = scanloc.linnum; + } + continue; + } + default: + break; + } + t.value = TOK.div; + return; + case '.': + p++; + if (isdigit(*p)) + { + /* Note that we don't allow ._1 and ._ as being + * valid floating point numbers. 
+ */ + p--; + t.value = inreal(t); + } + else if (p[0] == '.') + { + if (p[1] == '.') + { + p += 2; + t.value = TOK.dotDotDot; + } + else + { + p++; + t.value = TOK.slice; + } + } + else + t.value = TOK.dot; + return; + case '&': + p++; + if (*p == '=') + { + p++; + t.value = TOK.andAssign; + } + else if (*p == '&') + { + p++; + t.value = TOK.andAnd; + } + else + t.value = TOK.and; + return; + case '|': + p++; + if (*p == '=') + { + p++; + t.value = TOK.orAssign; + } + else if (*p == '|') + { + p++; + t.value = TOK.orOr; + } + else + t.value = TOK.or; + return; + case '-': + p++; + if (*p == '=') + { + p++; + t.value = TOK.minAssign; + } + else if (*p == '-') + { + p++; + t.value = TOK.minusMinus; + } + else if (*p == '>') + { + ++p; + t.value = TOK.arrow; + } + else + t.value = TOK.min; + return; + case '+': + p++; + if (*p == '=') + { + p++; + t.value = TOK.addAssign; + } + else if (*p == '+') + { + p++; + t.value = TOK.plusPlus; + } + else + t.value = TOK.add; + return; + case '<': + p++; + if (*p == '=') + { + p++; + t.value = TOK.lessOrEqual; // <= + } + else if (*p == '<') + { + p++; + if (*p == '=') + { + p++; + t.value = TOK.leftShiftAssign; // <<= + } + else + t.value = TOK.leftShift; // << + } + else if (*p == ':' && Ccompile) + { + ++p; + t.value = TOK.leftBracket; // <: + } + else if (*p == '%' && Ccompile) + { + ++p; + t.value = TOK.leftCurly; // <% + } + else + t.value = TOK.lessThan; // < + return; + case '>': + p++; + if (*p == '=') + { + p++; + t.value = TOK.greaterOrEqual; // >= + } + else if (*p == '>') + { + p++; + if (*p == '=') + { + p++; + t.value = TOK.rightShiftAssign; // >>= + } + else if (*p == '>') + { + p++; + if (*p == '=') + { + p++; + t.value = TOK.unsignedRightShiftAssign; // >>>= + } + else + t.value = TOK.unsignedRightShift; // >>> + } + else + t.value = TOK.rightShift; // >> + } + else + t.value = TOK.greaterThan; // > + return; + case '!': + p++; + if (*p == '=') + { + p++; + t.value = TOK.notEqual; // != + } + else + t.value = 
TOK.not; // ! + return; + case '=': + p++; + if (*p == '=') + { + p++; + t.value = TOK.equal; // == + } + else if (*p == '>') + { + p++; + t.value = TOK.goesTo; // => + } + else + t.value = TOK.assign; // = + return; + case '~': + p++; + if (*p == '=') + { + p++; + t.value = TOK.concatenateAssign; // ~= + } + else + t.value = TOK.tilde; // ~ + return; + case '^': + p++; + if (*p == '^') + { + p++; + if (*p == '=') + { + p++; + t.value = TOK.powAssign; // ^^= + } + else + t.value = TOK.pow; // ^^ + } + else if (*p == '=') + { + p++; + t.value = TOK.xorAssign; // ^= + } + else + t.value = TOK.xor; // ^ + return; + case '(': + p++; + t.value = TOK.leftParenthesis; + return; + case ')': + p++; + t.value = TOK.rightParenthesis; + return; + case '[': + p++; + t.value = TOK.leftBracket; + return; + case ']': + p++; + t.value = TOK.rightBracket; + return; + case '{': + p++; + t.value = TOK.leftCurly; + return; + case '}': + p++; + t.value = TOK.rightCurly; + return; + case '?': + p++; + t.value = TOK.question; + return; + case ',': + p++; + t.value = TOK.comma; + return; + case ';': + p++; + t.value = TOK.semicolon; + return; + case ':': + p++; + if (*p == ':') + { + ++p; + t.value = TOK.colonColon; + } + else if (*p == '>' && Ccompile) + { + ++p; + t.value = TOK.rightBracket; + } + else + t.value = TOK.colon; + return; + case '$': + p++; + t.value = TOK.dollar; + return; + case '@': + p++; + t.value = TOK.at; + return; + case '*': + p++; + if (*p == '=') + { + p++; + t.value = TOK.mulAssign; + } + else + t.value = TOK.mul; + return; + case '%': + p++; + if (*p == '=') + { + p++; + t.value = TOK.modAssign; + } + else if (*p == '>' && Ccompile) + { + ++p; + t.value = TOK.rightCurly; + } + else if (*p == ':' && Ccompile) + { + goto case '#'; // %: means # + } + else + t.value = TOK.mod; + return; + case '#': + { + p++; + Token n; + scan(&n); + if (Ccompile && n.value == TOK.int32Literal) + { + poundLine(n, true); + continue; + } + if (n.value == TOK.identifier) + { + if 
(n.ident == Id.line) + { + poundLine(n, false); + continue; + } + else + { + const locx = loc(); + warning(locx, "C preprocessor directive `#%s` is not supported", n.ident.toChars()); + } + } + else if (n.value == TOK.if_) + { + error("C preprocessor directive `#if` is not supported, use `version` or `static if`"); + } + t.value = TOK.pound; + return; + } + default: + { + dchar c = *p; + if (c & 0x80) + { + c = decodeUTF(); + // Check for start of unicode identifier + if (isUniAlpha(c)) + goto case_ident; + if (c == PS || c == LS) + { + endOfLine(); + p++; + continue; + } + } + if (c < 0x80 && isprint(c)) + error("character '%c' is not a valid token", c); + else + error("character 0x%02x is not a valid token", c); + p++; + continue; + } + } + } + } + + final Token* peek(Token* ct) + { + Token* t; + if (ct.next) + t = ct.next; + else + { + t = allocateToken(); + scan(t); + ct.next = t; + } + return t; + } + + /********************************* + * tk is on the opening (. + * Look ahead and return token that is past the closing ). + */ + final Token* peekPastParen(Token* tk) + { + //printf("peekPastParen()\n"); + int parens = 1; + int curlynest = 0; + while (1) + { + tk = peek(tk); + //tk.print(); + switch (tk.value) + { + case TOK.leftParenthesis: + parens++; + continue; + case TOK.rightParenthesis: + --parens; + if (parens) + continue; + tk = peek(tk); + break; + case TOK.leftCurly: + curlynest++; + continue; + case TOK.rightCurly: + if (--curlynest >= 0) + continue; + break; + case TOK.semicolon: + if (curlynest) + continue; + break; + case TOK.endOfFile: + break; + default: + continue; + } + return tk; + } + } + + /******************************************* + * Parse escape sequence. + */ + private uint escapeSequence() + { + return Lexer.escapeSequence(token.loc, p, Ccompile); + } + + /******** + * Parse the given string literal escape sequence into a single character. 
+ * D https://dlang.org/spec/lex.html#escape_sequences + * C11 6.4.4.4 + * Params: + * loc = location to use for error messages + * sequence = pointer to string with escape sequence to parse. Updated to + * point past the end of the escape sequence + * Ccompile = true for compile C11 escape sequences + * Returns: + * the escape sequence as a single character + */ + private static dchar escapeSequence(const ref Loc loc, ref const(char)* sequence, bool Ccompile) + { + const(char)* p = sequence; // cache sequence reference on stack + scope(exit) sequence = p; + + uint c = *p; + int ndigits; + switch (c) + { + case '\'': + case '"': + case '?': + case '\\': + Lconsume: + p++; + break; + case 'a': + c = 7; + goto Lconsume; + case 'b': + c = 8; + goto Lconsume; + case 'f': + c = 12; + goto Lconsume; + case 'n': + c = 10; + goto Lconsume; + case 'r': + c = 13; + goto Lconsume; + case 't': + c = 9; + goto Lconsume; + case 'v': + c = 11; + goto Lconsume; + case 'u': + ndigits = 4; + goto Lhex; + case 'U': + ndigits = 8; + goto Lhex; + case 'x': + ndigits = 2; + Lhex: + p++; + c = *p; + if (ishex(cast(char)c)) + { + uint v = 0; + int n = 0; + while (1) + { + if (isdigit(cast(char)c)) + c -= '0'; + else if (islower(c)) + c -= 'a' - 10; + else + c -= 'A' - 10; + v = v * 16 + c; + c = *++p; + if (++n == ndigits) + break; + if (!ishex(cast(char)c)) + { + .error(loc, "escape hex sequence has %d hex digits instead of %d", n, ndigits); + break; + } + } + if (ndigits != 2 && !utf_isValidDchar(v)) + { + .error(loc, "invalid UTF character \\U%08x", v); + v = '?'; // recover with valid UTF character + } + c = v; + } + else + { + .error(loc, "undefined escape hex sequence \\%c%c", sequence[0], c); + p++; + } + break; + case '&': + if (Ccompile) + goto default; + + // named character entity + for (const idstart = ++p; 1; p++) + { + switch (*p) + { + case ';': + c = HtmlNamedEntity(idstart, p - idstart); + if (c == ~0) + { + .error(loc, "unnamed character entity &%.*s;", cast(int)(p - 
idstart), idstart); + c = '?'; + } + p++; + break; + default: + if (isalpha(*p) || (p != idstart && isdigit(*p))) + continue; + .error(loc, "unterminated named entity &%.*s;", cast(int)(p - idstart + 1), idstart); + c = '?'; + break; + } + break; + } + break; + case 0: + case 0x1A: + // end of file + c = '\\'; + break; + default: + if (isoctal(cast(char)c)) + { + uint v = 0; + int n = 0; + do + { + v = v * 8 + (c - '0'); + c = *++p; + } + while (++n < 3 && isoctal(cast(char)c)); + c = v; + if (c > 0xFF) + .error(loc, "escape octal sequence \\%03o is larger than \\377", c); + } + else + { + .error(loc, "undefined escape sequence \\%c", c); + p++; + } + break; + } + return c; + } + + /** + Lex a wysiwyg string. `p` must be pointing to the first character before the + contents of the string literal. The character pointed to by `p` will be used as + the terminating character (i.e. backtick or double-quote). + Params: + result = pointer to the token that accepts the result + */ + private void wysiwygStringConstant(Token* result) + { + result.value = TOK.string_; + Loc start = loc(); + auto terminator = p[0]; + p++; + stringbuffer.setsize(0); + while (1) + { + dchar c = p[0]; + p++; + switch (c) + { + case '\n': + endOfLine(); + break; + case '\r': + if (p[0] == '\n') + continue; // ignore + c = '\n'; // treat EndOfLine as \n character + endOfLine(); + break; + case 0: + case 0x1A: + error("unterminated string constant starting at %s", start.toChars()); + result.setString(); + // rewind `p` so it points to the EOF character + p--; + return; + default: + if (c == terminator) + { + result.setString(stringbuffer); + stringPostfix(result); + return; + } + else if (c & 0x80) + { + p--; + const u = decodeUTF(); + p++; + if (u == PS || u == LS) + endOfLine(); + stringbuffer.writeUTF8(u); + continue; + } + break; + } + stringbuffer.writeByte(c); + } + } + + /************************************** + * Lex hex strings: + * x"0A ae 34FE BD" + */ + private TOK 
hexStringConstant(Token* t) + { + Loc start = loc(); + uint n = 0; + uint v = ~0; // dead assignment, needed to suppress warning + p++; + stringbuffer.setsize(0); + while (1) + { + dchar c = *p++; + switch (c) + { + case ' ': + case '\t': + case '\v': + case '\f': + continue; // skip white space + case '\r': + if (*p == '\n') + continue; // ignore '\r' if followed by '\n' + // Treat isolated '\r' as if it were a '\n' + goto case '\n'; + case '\n': + endOfLine(); + continue; + case 0: + case 0x1A: + error("unterminated string constant starting at %s", start.toChars()); + t.setString(); + // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token). + p--; + return TOK.hexadecimalString; + case '"': + if (n & 1) + { + error("odd number (%d) of hex characters in hex string", n); + stringbuffer.writeByte(v); + } + t.setString(stringbuffer); + stringPostfix(t); + return TOK.hexadecimalString; + default: + if (c >= '0' && c <= '9') + c -= '0'; + else if (c >= 'a' && c <= 'f') + c -= 'a' - 10; + else if (c >= 'A' && c <= 'F') + c -= 'A' - 10; + else if (c & 0x80) + { + p--; + const u = decodeUTF(); + p++; + if (u == PS || u == LS) + endOfLine(); + else + error("non-hex character \\u%04x in hex string", u); + } + else + error("non-hex character '%c' in hex string", c); + if (n & 1) + { + v = (v << 4) | c; + stringbuffer.writeByte(v); + } + else + v = c; + n++; + break; + } + } + assert(0); // see bug 15731 + } + + /** + Lex a delimited string. Some examples of delimited strings are: + --- + q"(foo(xxx))" // "foo(xxx)" + q"[foo$(LPAREN)]" // "foo$(LPAREN)" + q"/foo]/" // "foo]" + q"HERE + foo + HERE" // "foo\n" + --- + It is assumed that `p` points to the opening double-quote '"'. 
+ Params: + result = pointer to the token that accepts the result + */ + private void delimitedStringConstant(Token* result) + { + result.value = TOK.string_; + Loc start = loc(); + dchar delimleft = 0; + dchar delimright = 0; + uint nest = 1; + uint nestcount = ~0; // dead assignment, needed to suppress warning + Identifier hereid = null; + uint blankrol = 0; + uint startline = 0; + p++; + stringbuffer.setsize(0); + while (1) + { + dchar c = *p++; + //printf("c = '%c'\n", c); + switch (c) + { + case '\n': + Lnextline: + endOfLine(); + startline = 1; + if (blankrol) + { + blankrol = 0; + continue; + } + if (hereid) + { + stringbuffer.writeUTF8(c); + continue; + } + break; + case '\r': + if (*p == '\n') + continue; // ignore + c = '\n'; // treat EndOfLine as \n character + goto Lnextline; + case 0: + case 0x1A: + error("unterminated delimited string constant starting at %s", start.toChars()); + result.setString(); + // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token). 
+ p--; + return; + default: + if (c & 0x80) + { + p--; + c = decodeUTF(); + p++; + if (c == PS || c == LS) + goto Lnextline; + } + break; + } + if (delimleft == 0) + { + delimleft = c; + nest = 1; + nestcount = 1; + if (c == '(') + delimright = ')'; + else if (c == '{') + delimright = '}'; + else if (c == '[') + delimright = ']'; + else if (c == '<') + delimright = '>'; + else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) + { + // Start of identifier; must be a heredoc + Token tok; + p--; + scan(&tok); // read in heredoc identifier + if (tok.value != TOK.identifier) + { + error("identifier expected for heredoc, not %s", tok.toChars()); + delimright = c; + } + else + { + hereid = tok.ident; + //printf("hereid = '%s'\n", hereid.toChars()); + blankrol = 1; + } + nest = 0; + } + else + { + delimright = c; + nest = 0; + if (isspace(c)) + error("delimiter cannot be whitespace"); + } + } + else + { + if (blankrol) + { + error("heredoc rest of line should be blank"); + blankrol = 0; + continue; + } + if (nest == 1) + { + if (c == delimleft) + nestcount++; + else if (c == delimright) + { + nestcount--; + if (nestcount == 0) + goto Ldone; + } + } + else if (c == delimright) + goto Ldone; + if (startline && (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) && hereid) + { + Token tok; + auto psave = p; + p--; + scan(&tok); // read in possible heredoc identifier + //printf("endid = '%s'\n", tok.ident.toChars()); + if (tok.value == TOK.identifier && tok.ident is hereid) + { + /* should check that rest of line is blank + */ + goto Ldone; + } + p = psave; + } + stringbuffer.writeUTF8(c); + startline = 0; + } + } + Ldone: + if (*p == '"') + p++; + else if (hereid) + error("delimited string must end in %s\"", hereid.toChars()); + else + error("delimited string must end in %c\"", delimright); + result.setString(stringbuffer); + stringPostfix(result); + } + + /** + Lex a token string. 
Some examples of token strings are: + --- + q{ foo(xxx) } // " foo(xxx) " + q{foo$(LPAREN)} // "foo$(LPAREN)" + q{{foo}"}"} // "{foo}"}"" + --- + It is assumed that `p` points to the opening curly-brace. + Params: + result = pointer to the token that accepts the result + */ + private void tokenStringConstant(Token* result) + { + result.value = TOK.string_; + + uint nest = 1; + const start = loc(); + const pstart = ++p; + inTokenStringConstant++; + scope(exit) inTokenStringConstant--; + while (1) + { + Token tok; + scan(&tok); + switch (tok.value) + { + case TOK.leftCurly: + nest++; + continue; + case TOK.rightCurly: + if (--nest == 0) + { + result.setString(pstart, p - 1 - pstart); + stringPostfix(result); + return; + } + continue; + case TOK.endOfFile: + error("unterminated token string constant starting at %s", start.toChars()); + result.setString(); + return; + default: + continue; + } + } + } + + /** + Scan a quoted string while building the processed string value by + handling escape sequences. The result is returned in the given `t` token. + This function assumes that `p` currently points to the opening quote + of the string. 
Params:
        t = the token to set the resulting string to
    * References:
    *   D https://dlang.org/spec/lex.html#double_quoted_strings
    *   ImportC C11 6.4.5
    */
    private void escapeStringConstant(Token* t)
    {
        t.value = TOK.string_;

        const start = loc();
        const tc = *p++; // opening quote
        stringbuffer.setsize(0);
        while (1)
        {
            dchar c = *p++;
            switch (c)
            {
            case '\\':
                switch (*p)
                {
                case '&':
                    // named character entities are not escapes when lexing C
                    if (Ccompile)
                        goto default;
                    goto case;

                case 'u':
                case 'U':
                    // these escapes yield a code point, so it is written UTF-8 encoded
                    c = escapeSequence();
                    stringbuffer.writeUTF8(c);
                    continue;
                default:
                    c = escapeSequence();
                    break;
                }
                break;
            case '\n':
                endOfLine();
                if (Ccompile)
                    goto Lunterminated;
                break;
            case '\r':
                if (*p == '\n')
                    continue; // ignore
                c = '\n'; // treat EndOfLine as \n character
                endOfLine();
                if (Ccompile)
                    goto Lunterminated;
                break;
            case '\'':
            case '"':
                // only the quote character that opened the literal closes it
                if (c != tc)
                    goto default;
                t.setString(stringbuffer);
                if (!Ccompile)
                    stringPostfix(t);
                return;
            case 0:
            case 0x1A:
                // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
                p--;
            Lunterminated:
                error("unterminated string constant starting at %s", start.toChars());
                t.setString();
                return;
            default:
                if (c & 0x80)
                {
                    p--;
                    c = decodeUTF();
                    if (c == LS || c == PS)
                    {
                        c = '\n';
                        endOfLine();
                        if (Ccompile)
                            goto Lunterminated;
                    }
                    p++;
                    stringbuffer.writeUTF8(c);
                    continue;
                }
                break;
            }
            stringbuffer.writeByte(c);
        }
    }

    /**************************************
     * Lex a D character literal. On entry `p` is on the opening quote.
     * The character value is stored in `t.unsvalue`; on error it is set to '?'.
     * Params:
     *  t = token to fill in
     * Returns:
     *  TOK.charLiteral, TOK.wcharLiteral or TOK.dcharLiteral
     * Reference:
     *  https://dlang.org/spec/lex.html#characterliteral
     */
    private TOK charConstant(Token* t)
    {
        TOK tk = TOK.charLiteral;
        //printf("Lexer::charConstant\n");
        p++;
        dchar c = *p++;
        switch (c)
        {
        case '\\':
            switch (*p)
            {
            case 'u':
                t.unsvalue = escapeSequence();
                tk = TOK.wcharLiteral;
                break;
            case 'U':
            case '&':
                t.unsvalue = escapeSequence();
                tk = TOK.dcharLiteral;
                break;
            default:
                t.unsvalue = escapeSequence();
                break;
            }
            break;
        case '\n':
        L1:
            endOfLine();
            goto case;
        case '\r':
            goto case '\'';
        case 0:
        case 0x1A:
            // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
            p--;
            goto case;
        case '\'':
            error("unterminated character constant");
            t.unsvalue = '?';
            return tk;
        default:
            if (c & 0x80)
            {
                p--;
                c = decodeUTF();
                p++;
                if (c == LS || c == PS)
                    goto L1;
                if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
                    tk = TOK.wcharLiteral;
                else
                    tk = TOK.dcharLiteral;
            }
            t.unsvalue = c;
            break;
        }
        if (*p != '\'')
        {
            // Error recovery: skip ahead to the closing quote or to a character
            // that plausibly ends the literal.
            while (*p != '\'' && *p != 0x1A && *p != 0 && *p != '\n' &&
                    *p != '\r' && *p != ';' && *p != ')' && *p != ']' && *p != '}')
            {
                if (*p & 0x80)
                {
                    const s = p;
                    c = decodeUTF();
                    if (c == LS || c == PS)
                    {
                        p = s;
                        break;
                    }
                }
                p++;
            }

            if (*p == '\'')
            {
                error("character constant has multiple characters");
                p++;
            }
            else
                error("unterminated character constant");
            t.unsvalue = '?';
            return tk;
        }
        p++;
        return tk;
    }

    /***************************************
     * Lex C character constant.
     * Parser is on the opening quote.
     * The literal is first lexed as a string by `escapeStringConstant`, then
     * its bytes are folded into a single integer stored in `t.unsvalue`, and
     * the token becomes TOK.int32Literal. An empty constant is reported and
     * turned into TOK.semicolon.
     * Params:
     *  t = token to fill in
     *  prefix = one of `u`, `U` or 0.
     * Reference:
     *  C11 6.4.4.4
     */
    private void clexerCharConstant(ref Token t, char prefix)
    {
        escapeStringConstant(&t);
        const(char)[] str = t.ustring[0 .. t.len];
        const n = str.length;
        const loc = t.loc;
        if (n == 0)
        {
            error(loc, "empty character constant");
            t.value = TOK.semicolon;
            return;
        }

        uint u;
        switch (prefix)
        {
            case 0:
                if (n == 1) // fast case
                {
                    u = str[0];
                }
                else if (n > 4)
                    error(loc, "max number of chars in character literal is 4, had %d",
                        cast(int)n);
                else
                {
                    foreach (i, c; str)
                        (cast(char*)&u)[n - 1 - i] = c;
                }
                break;

            case 'u':
                dchar d1;
                size_t idx;
                auto msg = utf_decodeChar(str, idx, d1);
                dchar d2 = 0;
                if (idx < n && !msg)
                    msg = utf_decodeChar(str, idx, d2);
                if (msg)
                    // `msg` is a D slice, not a C string: it must be printed with
                    // an explicit length, as `decodeUTF` does.
                    error(loc, "%.*s", cast(int)msg.length, msg.ptr);
                else if (idx < n)
                    error(loc, "max number of chars in 16 bit character literal is 2, had %d",
                        cast(int)((n + 1) >> 1));
                else if (d1 >= 0x1_0000) // 0x1_0000 itself already needs 17 bits
                    error(loc, "%d does not fit in 16 bits", d1);
                else if (d2 >= 0x1_0000)
                    error(loc, "%d does not fit in 16 bits", d2);
                u = d1;
                if (d2)
                    u = (d1 << 16) | d2;
                break;

            case 'U':
                dchar d;
                size_t idx;
                auto msg = utf_decodeChar(str, idx, d);
                if (msg)
                    error(loc, "%.*s", cast(int)msg.length, msg.ptr);
                else if (idx < n)
                    error(loc, "max number of chars in 32 bit character literal is 1, had %d",
                        cast(int)((n + 3) >> 2));
                u = d;
                break;

            default:
                assert(0);
        }
        t.value = TOK.int32Literal;
        t.unsvalue = u;
    }

    /***************************************
     * Get postfix of string literal.
     * If `*p` is one of `c`, `w` or `d` it is stored in `t.postfix` and
     * consumed; otherwise `t.postfix` is set to 0 and `p` is left unchanged.
     */
    private void stringPostfix(Token* t) pure @nogc
    {
        switch (*p)
        {
        case 'c':
        case 'w':
        case 'd':
            t.postfix = *p;
            p++;
            break;
        default:
            t.postfix = 0;
            break;
        }
    }

    /**************************************
     * Read in a number.
     * If it's an integer, store it in tok.TKutok.Vlong.
     *      integers can be decimal, octal or hex
     *      Handle the suffixes U, UL, LU, L, etc.
     * If it's double, store it in tok.TKutok.Vdouble.
     * Returns:
     *      TKnum
     *      TKdouble,...
*/
    private TOK number(Token* t)
    {
        const literalStart = p;
        int radix = 10;
        uinteger_t value = 0; // unsigned >=64 bit integer type
        int digit;
        bool reported = false;   // a diagnostic was already issued for this literal
        bool overflowed = false;
        bool anyBinaryDigitsNoSingleUS = false;
        bool anyHexDigitsNoSingleUS = false;
        dchar c = *p;

        // A leading '0' selects the radix, or hands over to the float lexer.
        if (c == '0')
        {
            c = *++p;
            switch (c)
            {
            case '0': .. case '7':
                radix = 8;
                break;

            case '8':
            case '9':
                if (Ccompile)
                    error("octal digit expected, not `%c`", c);
                radix = 8;
                break;

            case 'x':
            case 'X':
                radix = 16;
                ++p;
                break;

            case 'b':
            case 'B':
                if (Ccompile)
                    error("binary constants not allowed");
                radix = 2;
                ++p;
                break;

            case '.':
                // "0.." and "0.identifier" / "0.unicode" end the integer here
                if (p[1] == '.' || isalpha(p[1]) || p[1] == '_' || (p[1] & 0x80))
                    goto Ldone;
                goto Lreal; // '.' is part of current token

            case 'i':
            case 'f':
            case 'F':
                goto Lreal;

            case '_':
                if (Ccompile)
                    error("embedded `_` not allowed");
                radix = 8;
                ++p;
                break;

            case 'L':
                if (p[1] == 'i')
                    goto Lreal;
                break;

            default:
                break;
            }
        }

        // Accumulate digits until something that cannot belong to an integer.
        for (;;)
        {
            c = *p;
            switch (c)
            {
            case '0': .. case '9':
                digit = c - '0';
                ++p;
                break;

            case 'a': .. case 'f':
            case 'A': .. case 'F':
                ++p;
                // outside hex, an exponent or float suffix means a floating literal
                if (radix != 16 && (c == 'e' || c == 'E' || c == 'f' || c == 'F'))
                    goto Lreal;
                digit = (c >= 'a') ? (c + 10 - 'a') : (c + 10 - 'A');
                break;

            case 'L':
                if (p[1] == 'i')
                    goto Lreal;
                goto Ldone;

            case '.':
                if (p[1] == '.')
                    goto Ldone; // if ".."
                if (radix == 10 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80))
                    goto Ldone; // if ".identifier" or ".unicode"
                if (radix == 16 && (!ishex(p[1]) || p[1] == '_' || p[1] & 0x80))
                    goto Ldone; // if ".identifier" or ".unicode"
                if (radix == 2)
                    goto Ldone; // if ".identifier" or ".unicode"
                goto Lreal; // otherwise as part of a floating point literal

            case 'p':
            case 'P':
            case 'i':
            Lreal:
                // rewind and let the floating point lexer handle the whole literal
                p = literalStart;
                return inreal(t);

            case '_':
                if (Ccompile)
                    goto default;
                ++p;
                continue;

            default:
                goto Ldone;
            }

            // got a digit here, set any necessary flags, check for errors
            anyHexDigitsNoSingleUS = true;
            anyBinaryDigitsNoSingleUS = true;
            if (digit >= radix && !reported)
            {
                error("%s digit expected, not `%c`", radix == 2 ? "binary".ptr :
                                                     radix == 8 ? "octal".ptr :
                                                     "decimal".ptr, c);
                reported = true;
            }

            // The checked arithmetic is only needed once the value is large
            // enough that multiplying by the radix could overflow.
            if (value > 0x0FFF_FFFF_FFFF_FFFFUL)
            {
                import core.checkedint : mulu, addu;

                value = mulu(value, radix, overflowed);
                value = addu(value, digit, overflowed);
            }
            else
                value = value * radix + digit;
        }

    Ldone:
        if (overflowed && !reported)
        {
            error("integer overflow");
            reported = true;
        }
        if ((radix == 2 && !anyBinaryDigitsNoSingleUS) ||
            (radix == 16 && !anyHexDigitsNoSingleUS))
            error("`%.*s` isn't a valid integer literal, use `%.*s0` instead", cast(int)(p - literalStart), literalStart, 2, literalStart);

        t.unsvalue = value;

        // C has its own suffix rules
        if (Ccompile)
            return cnumber(radix, value);

        enum FLAGS : int
        {
            none = 0,
            decimal = 1, // decimal
            unsigned = 2, // u or U suffix
            long_ = 4, // L suffix
        }

        FLAGS flags = (radix == 10) ? FLAGS.decimal : FLAGS.none;
        // Parse trailing 'u', 'U', 'l' or 'L' in any combination
        const suffixStart = p;
    Lsuffix:
        for (;;)
        {
            FLAGS f;
            switch (*p)
            {
            case 'U':
            case 'u':
                f = FLAGS.unsigned;
                break;
            case 'l':
                f = FLAGS.long_;
                error("lower case integer suffix 'l' is not allowed. Please use 'L' instead");
                break;
            case 'L':
                f = FLAGS.long_;
                break;
            default:
                break Lsuffix;
            }
            p++;
            if ((flags & f) && !reported)
            {
                error("unrecognized token");
                reported = true;
            }
            flags = cast(FLAGS)(flags | f);
        }

        if (radix == 8 && value >= 8)
        {
            if (reported)
                // can't translate invalid octal value, just show a generic message
                error("octal literals larger than 7 are no longer supported");
            else
            {
                const suffixLen = cast(int)(p - suffixStart);
                error("octal literals `0%llo%.*s` are no longer supported, use `std.conv.octal!%llo%.*s` instead",
                      value, suffixLen, suffixStart, value, suffixLen, suffixStart);
            }
        }

        // Pick the smallest token type that fits the value and the suffixes.
        const bool bit63 = (value & 0x8000000000000000L) != 0;
        const bool above32 = (value & 0xFFFFFFFF00000000L) != 0;
        TOK kind;
        switch (flags)
        {
        case FLAGS.none:
            /* Octal or Hexadecimal constant.
             * First that fits: int, uint, long, ulong
             */
            kind = bit63 ? TOK.uns64Literal
                 : above32 ? TOK.int64Literal
                 : (value & 0x80000000) != 0 ? TOK.uns32Literal
                 : TOK.int32Literal;
            break;

        case FLAGS.decimal:
            /* First that fits: int, long, long long
             */
            kind = bit63 ? TOK.uns64Literal
                 : (value & 0xFFFFFFFF80000000L) != 0 ? TOK.int64Literal
                 : TOK.int32Literal;
            break;

        case FLAGS.unsigned:
        case FLAGS.decimal | FLAGS.unsigned:
            /* First that fits: uint, ulong
             */
            kind = above32 ? TOK.uns64Literal : TOK.uns32Literal;
            break;

        case FLAGS.decimal | FLAGS.long_:
            if (bit63)
            {
                if (!reported)
                {
                    error("signed integer overflow");
                    reported = true;
                }
                kind = TOK.uns64Literal;
            }
            else
                kind = TOK.int64Literal;
            break;

        case FLAGS.long_:
            kind = bit63 ? TOK.uns64Literal : TOK.int64Literal;
            break;

        case FLAGS.unsigned | FLAGS.long_:
        case FLAGS.decimal | FLAGS.unsigned | FLAGS.long_:
            kind = TOK.uns64Literal;
            break;

        default:
            debug
            {
                printf("%x\n", flags);
            }
            assert(0);
        }
        return kind;
    }

/**************************************
     * Lex C integer-suffix
     * Params:
     *  base = number base
     *  n = raw integer value
     * Returns:
     *  token value
     */
    private TOK cnumber(int base, uinteger_t n)
    {
        /* C11 6.4.4.1
         * Parse trailing suffixes:
         *   u or U
         *   l or L
         *   ll or LL
         */
        enum FLAGS : uint
        {
            octalhex = 1, // octal or hexadecimal
            decimal = 2, // decimal
            unsigned = 4, // u or U suffix
            long_ = 8, // l or L suffix
            llong = 0x10 // ll or LL
        }
        FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.octalhex;
        bool err;
    Lsuffixes:
        while (1)
        {
            FLAGS f;
            const cs = *p;
            switch (cs)
            {
                case 'U':
                case 'u':
                    f = FLAGS.unsigned;
                    break;

                case 'l':
                case 'L':
                    f = FLAGS.long_;
                    // `ll` / `LL`: both letters must have the same case
                    if (cs == p[1])
                    {
                        f = FLAGS.long_ | FLAGS.llong;
                        ++p;
                    }
                    break;

                default:
                    break Lsuffixes;
            }
            ++p;
            if ((flags & f) && !err)
            {
                error("duplicate integer suffixes");
                err = true;
            }
            flags = cast(FLAGS)(flags | f);
        }

        void overflow()
        {
            error("integer overflow");
        }

        TOK result = TOK.int32Literal; // default
        switch (flags)
        {
            /* Since D doesn't have a variable sized `long` or `unsigned long` type,
             * this code deviates from C by picking D int, uint, long, or ulong instead
             */

            case FLAGS.octalhex:
                /* Octal or Hexadecimal constant.
                 * First that fits: int, unsigned, long, unsigned long,
                 * long long, unsigned long long
                 */
                if (longsize == 4)
                {
                    if (n & 0x8000000000000000L)
                        result = TOK.uns64Literal;
                    else if (n & 0xFFFFFFFF00000000L)
                        result = TOK.int64Literal;
                    else if (n & 0x80000000)
                        result = TOK.uns32Literal;
                    else
                        result = TOK.int32Literal;
                }
                else
                {
                    if (n & 0x8000000000000000L)
                        result = TOK.uns64Literal;      // unsigned long
                    else if (n & 0xFFFFFFFF00000000L)
                        result = TOK.int64Literal;      // long
                    else if (n & 0x80000000)
                        result = TOK.uns32Literal;
                    else
                        result = TOK.int32Literal;
                }
                break;

            case FLAGS.decimal:
                /* First that fits: int, long, long long
                 */
                if (longsize == 4)
                {
                    if (n & 0x8000000000000000L)
                        result = TOK.uns64Literal;
                    else if (n & 0xFFFFFFFF80000000L)
                        result = TOK.int64Literal;
                    else
                        result = TOK.int32Literal;
                }
                else
                {
                    if (n & 0x8000000000000000L)
                        result = TOK.uns64Literal;      // unsigned long
                    else if (n & 0xFFFFFFFF80000000L)
                        result = TOK.int64Literal;      // long
                    else
                        result = TOK.int32Literal;
                }
                break;

            case FLAGS.octalhex | FLAGS.unsigned:
            case FLAGS.decimal | FLAGS.unsigned:
                /* First that fits: unsigned, unsigned long, unsigned long long
                 */
                if (longsize == 4)
                {
                    if (n & 0xFFFFFFFF00000000L)
                        result = TOK.uns64Literal;
                    else
                        result = TOK.uns32Literal;
                }
                else
                {
                    if (n & 0xFFFFFFFF00000000L)
                        result = TOK.uns64Literal;      // unsigned long
                    else
                        result = TOK.uns32Literal;
                }
                break;

            case FLAGS.decimal | FLAGS.long_:
                /* First that fits: long, long long
                 */
                if (longsize == 4)
                {
                    if (n & 0x8000000000000000L)
                        overflow();
                    else if (n & 0xFFFFFFFF_80000000L)
                        result = TOK.int64Literal;
                    else
                        result = TOK.int32Literal;      // long
                }
                else
                {
                    if (n & 0x8000000000000000L)
                        overflow();
                    else
                        result = TOK.int64Literal;      // long
                }
                break;

            case FLAGS.octalhex | FLAGS.long_:
                /* First that fits: long, unsigned long, long long,
                 * unsigned long long
                 */
                if (longsize == 4)
                {
                    if (n & 0x8000000000000000L)
                        result = TOK.uns64Literal;
                    else if (n & 0xFFFFFFFF00000000L)
                        result = TOK.int64Literal;
                    else if (n & 0x80000000)
                        result = TOK.uns32Literal;      // unsigned long
                    else
                        result = TOK.int32Literal;      // long
                }
                else
                {
                    if (n & 0x80000000_00000000L)
                        result = TOK.uns64Literal;      // unsigned long
                    else
                        result = TOK.int64Literal;      // long
                }
                break;

            case FLAGS.octalhex | FLAGS.unsigned | FLAGS.long_:
            case FLAGS.decimal  | FLAGS.unsigned | FLAGS.long_:
                /* First that fits: unsigned long, unsigned long long
                 */
                if (longsize == 4)
                {
                    if (n & 0xFFFFFFFF00000000L)
                        result = TOK.uns64Literal;
                    else
                        result = TOK.uns32Literal;      // unsigned long
                }
                else
                {
                    result = TOK.uns64Literal;  // unsigned long
                }
                break;

            case FLAGS.octalhex | FLAGS.long_ | FLAGS.llong:
                /* First that fits: long long, unsigned long long
                 */
                if (n & 0x8000000000000000L)
                    result = TOK.uns64Literal;
                else
                    result = TOK.int64Literal;
                break;

            case FLAGS.decimal | FLAGS.long_ | FLAGS.llong:
                /* long long
                 */
                result = TOK.int64Literal;
                break;

            case FLAGS.octalhex | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong:
            case FLAGS.decimal  | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong:
                result = TOK.uns64Literal;
                break;

            default:
                debug printf("%x\n",flags);
                assert(0);
        }
        return result;
    }

    /**************************************
     * Read in characters, converting them to real.
     * The parsed value is stored in `t.floatvalue`.
     * Bugs:
     *      Exponent overflow not detected.
     *      Too much requested precision is not detected.
     */
    private TOK inreal(Token* t)
    {
        //printf("Lexer::inreal()\n");
        debug
        {
            assert(*p == '.' || isdigit(*p));
        }
        bool isWellformedString = true;
        stringbuffer.setsize(0);
        auto pstart = p;
        bool hex = false;
        dchar c = *p++;
        // Leading '0x'
        if (c == '0')
        {
            c = *p++;
            if (c == 'x' || c == 'X')
            {
                hex = true;
                c = *p++;
            }
        }
        // Digits to left of '.'
        while (1)
        {
            if (c == '.')
            {
                c = *p++;
                break;
            }
            if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
            {
                c = *p++;
                continue;
            }
            break;
        }
        // Digits to right of '.'
        while (1)
        {
            if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
            {
                c = *p++;
                continue;
            }
            break;
        }
        if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P')))
        {
            c = *p++;
            if (c == '-' || c == '+')
            {
                c = *p++;
            }
            bool anyexp = false;
            while (1)
            {
                if (isdigit(c))
                {
                    anyexp = true;
                    c = *p++;
                    continue;
                }
                if (c == '_')
                {
                    if (Ccompile)
                        error("embedded `_` in numeric literals not allowed");
                    c = *p++;
                    continue;
                }
                if (!anyexp)
                {
                    error("missing exponent");
                    isWellformedString = false;
                }
                break;
            }
        }
        else if (hex)
        {
            error("exponent required for hex float");
            isWellformedString = false;
        }
        --p;
        // Copy the literal text without the `_` separators, 0-terminated,
        // so it can be handed to the float parser.
        while (pstart < p)
        {
            if (*pstart != '_')
                stringbuffer.writeByte(*pstart);
            ++pstart;
        }
        stringbuffer.writeByte(0);
        auto sbufptr = cast(const(char)*)stringbuffer[].ptr;
        TOK result;
        bool isOutOfRange = false;
        t.floatvalue = (isWellformedString ? CTFloat.parse(sbufptr, &isOutOfRange) : CTFloat.zero);
        switch (*p)
        {
        case 'F':
        case 'f':
            if (isWellformedString && !isOutOfRange)
                isOutOfRange = Port.isFloat32LiteralOutOfRange(sbufptr);
            result = TOK.float32Literal;
            p++;
            break;
        default:
            if (isWellformedString && !isOutOfRange)
                isOutOfRange = Port.isFloat64LiteralOutOfRange(sbufptr);
            result = TOK.float64Literal;
            break;
        case 'l':
            if (!Ccompile)
                error("use 'L' suffix instead of 'l'");
            goto case 'L';
        case 'L':
            ++p;
            if (Ccompile && long_doublesize == 8)
                goto default;
            result = TOK.float80Literal;
            break;
        }
        if ((*p == 'i' || *p == 'I') && !Ccompile)
        {
            if (*p == 'I')
                error("use 'i' suffix instead of 'I'");
            p++;
            switch (result)
            {
            case TOK.float32Literal:
                result = TOK.imaginary32Literal;
                break;
            case TOK.float64Literal:
                result = TOK.imaginary64Literal;
                break;
            case TOK.float80Literal:
                result = TOK.imaginary80Literal;
                break;
            default:
                break;
            }
        }
        const isLong = (result == TOK.float80Literal || result == TOK.imaginary80Literal);
        if (isOutOfRange && !isLong)
        {
            const char* suffix = (result == TOK.float32Literal || result == TOK.imaginary32Literal) ? "f" : "";
            error(scanloc, "number `%s%s` is not representable", sbufptr, suffix);
        }
        debug
        {
            switch (result)
            {
            case TOK.float32Literal:
            case TOK.float64Literal:
            case TOK.float80Literal:
            case TOK.imaginary32Literal:
            case TOK.imaginary64Literal:
            case TOK.imaginary80Literal:
                break;
            default:
                assert(0);
            }
        }
        return result;
    }

    /// Updates `scanloc.charnum` from the current position `p` and returns `scanloc`.
    final Loc loc() pure @nogc
    {
        scanloc.charnum = cast(uint)(1 + p - line);
        version (LocOffset)
            scanloc.fileOffset = cast(uint)(p - base);
        return scanloc;
    }

    /// Reports a printf-style error at `token.loc`.
    final void error(const(char)* format, ...)
    {
        va_list args;
        va_start(args, format);
        .verror(token.loc, format, args);
        va_end(args);
    }

    /// Reports a printf-style error at the given location `loc`.
    final void error(const ref Loc loc, const(char)* format, ...)
    {
        va_list args;
        va_start(args, format);
        .verror(loc, format, args);
        va_end(args);
    }

    /// Reports a printf-style deprecation at `token.loc`.
    final void deprecation(const(char)* format, ...)
    {
        va_list args;
        va_start(args, format);
        .vdeprecation(token.loc, format, args);
        va_end(args);
    }

    /*********************************************
     * Parse line/file preprocessor directive:
     *    #line linnum [filespec]
     * Allow __LINE__ for linnum, and __FILE__ for filespec.
     * Accept linemarker format:
     *    # linnum [filespec] {flags}
     * There can be zero or more flags, which are one of the digits 1..4, and
     * must be in ascending order. The flags are ignored.
     * Params:
     *  tok = token we're on, which is linnum of linemarker
     *  linemarker = true if line marker format and lexer is on linnum
     * References:
     *  linemarker https://gcc.gnu.org/onlinedocs/gcc-11.1.0/cpp/Preprocessor-Output.html
     */
    private void poundLine(ref Token tok, bool linemarker)
    {
        auto linnum = this.scanloc.linnum;
        const(char)* filespec = null;
        const loc = this.loc();
        bool flags;

        if (!linemarker)
            scan(&tok);
        if (tok.value == TOK.int32Literal || tok.value == TOK.int64Literal)
        {
            const lin = cast(int)(tok.unsvalue - 1);
            if (lin != tok.unsvalue - 1)
                error("line number `%lld` out of range", cast(ulong)tok.unsvalue);
            else
                linnum = lin;
        }
        else if (tok.value == TOK.line) // #line __LINE__
        {
        }
        else
            goto Lerr;
        while (1)
        {
            switch (*p)
            {
            case 0:
            case 0x1A:
            case '\n':
            Lnewline:
                // the directive only takes effect outside of token strings
                if (!inTokenStringConstant)
                {
                    this.scanloc.linnum = linnum;
                    if (filespec)
                        this.scanloc.filename = filespec;
                }
                return;
            case '\r':
                p++;
                if (*p != '\n')
                {
                    p--;
                    goto Lnewline;
                }
                continue;
            case ' ':
            case '\t':
            case '\v':
            case '\f':
                p++;
                continue; // skip white space
            case '_':
                if (filespec || flags)
                    goto Lerr;
                if (memcmp(p, "__FILE__".ptr, 8) == 0)
                {
                    p += 8;
                    filespec = mem.xstrdup(scanloc.filename);
                    continue;
                }
                goto Lerr;
            case '"':
                if (filespec || flags)
                    goto Lerr;
                stringbuffer.setsize(0);
                p++;
                while (1)
                {
                    uint c;
                    c = *p;
                    switch (c)
                    {
                    case '\n':
                    case '\r':
                    case 0:
                    case 0x1A:
                        goto Lerr;
                    case '"':
                        stringbuffer.writeByte(0);
                        filespec = mem.xstrdup(cast(const(char)*)stringbuffer[].ptr);
                        p++;
                        break;
                    default:
                        if (c & 0x80)
                        {
                            auto seqStart = p;
                            const uint u = decodeUTF();
                            if (u == PS || u == LS)
                                goto Lerr;
                            // decodeUTF() leaves `p` on the last byte of the
                            // sequence; copy every byte of it so that non-ASCII
                            // file names are preserved intact.
                            for (auto q = seqStart; q <= p; ++q)
                                stringbuffer.writeByte(*q);
                            p++;
                            continue;
                        }
                        stringbuffer.writeByte(c);
                        p++;
                        continue;
                    }
                    break;
                }
                continue;

            case '1':
            case '2':
            case '3':
            case '4':
                flags = true;   // linemarker flags seen
                ++p;
                if ('0' <= *p && *p <= '9')
                    goto Lerr;  // only one digit allowed
                continue;

            default:
                if (*p & 0x80)
                {
                    uint u = decodeUTF();
                    if (u == PS || u == LS)
                        goto Lnewline;
                }
                goto Lerr;
            }
        }
    Lerr:
        if (linemarker)
            error(loc, "# integer [\"filespec\"] { 1 | 2 | 3 | 4 }\\n expected");
        else
            error(loc, "#line integer [\"filespec\"]\\n expected");
    }

    /********************************************
     * Decode UTF character.
     * Issue error messages for invalid sequences.
     * Return decoded character, advance p to last character in UTF sequence.
     */
    private uint decodeUTF()
    {
        const s = p;
        assert(*s & 0x80);
        // Check length of remaining string up to 4 UTF-8 characters
        size_t len;
        for (len = 1; len < 4 && s[len]; len++)
        {
        }
        size_t idx = 0;
        dchar u;
        const msg = utf_decodeChar(s[0 .. len], idx, u);
        p += idx - 1;
        if (msg)
        {
            error("%.*s", cast(int)msg.length, msg.ptr);
        }
        return u;
    }

    /***************************************************
     * Parse doc comment embedded between t.ptr and p.
     * Remove trailing blanks and tabs from lines.
     * Replace all newlines with \n.
     * Remove leading comment character from each line.
     * Decide if it's a lineComment or a blockComment.
     * Append to previous one for this token.
     *
     * If newParagraph is true, an extra newline will be
     * added between adjoining doc comments.
*/
    private void getDocComment(Token* t, uint lineComment, bool newParagraph) pure
    {
        /* ct tells us which kind of comment it is: '/', '*', or '+'
         */
        const ct = t.ptr[2];
        /* Start of comment text skips over / * *, / + +, or / / /
         */
        const(char)* q = t.ptr + 3; // start of comment text
        const(char)* qend = p;
        if (ct == '*' || ct == '+')
            qend -= 2;
        /* Scan over initial row of ****'s or ++++'s or ////'s
         */
        for (; q < qend; q++)
        {
            if (*q != ct)
                break;
        }
        /* Remove leading spaces until start of the comment
         */
        int linestart = 0;
        if (ct == '/')
        {
            while (q < qend && (*q == ' ' || *q == '\t'))
                ++q;
        }
        else if (q < qend)
        {
            if (*q == '\r')
            {
                ++q;
                if (q < qend && *q == '\n')
                    ++q;
                linestart = 1;
            }
            else if (*q == '\n')
            {
                ++q;
                linestart = 1;
            }
        }
        /* Remove trailing row of ****'s or ++++'s
         */
        if (ct != '/')
        {
            for (; q < qend; qend--)
            {
                if (qend[-1] != ct)
                    break;
            }
        }
        /* Comment is now [q .. qend].
         * Canonicalize it into buf[].
         */
        OutBuffer buf;

        void trimTrailingWhitespace()
        {
            const s = buf[];
            auto len = s.length;
            while (len && (s[len - 1] == ' ' || s[len - 1] == '\t'))
                --len;
            buf.setsize(len);
        }

        for (; q < qend; q++)
        {
            char c = *q;
            switch (c)
            {
            case '*':
            case '+':
                if (linestart && c == ct)
                {
                    linestart = 0;
                    /* Trim preceding whitespace up to preceding \n
                     */
                    trimTrailingWhitespace();
                    continue;
                }
                break;
            case ' ':
            case '\t':
                break;
            case '\r':
                if (q[1] == '\n')
                    continue; // skip the \r
                goto Lnewline;
            default:
                if (c == 226)
                {
                    // If LS or PS
                    if (q[1] == 128 && (q[2] == 168 || q[2] == 169))
                    {
                        q += 2;
                        goto Lnewline;
                    }
                }
                linestart = 0;
                break;
            Lnewline:
                c = '\n'; // replace all newlines with \n
                goto case;
            case '\n':
                linestart = 1;
                /* Trim trailing whitespace
                 */
                trimTrailingWhitespace();
                break;
            }
            buf.writeByte(c);
        }
        /* Trim trailing whitespace (if the last line does not have newline)
         */
        trimTrailingWhitespace();

        // Always end with a newline
        const s = buf[];
        if (s.length == 0 || s[$ - 1] != '\n')
            buf.writeByte('\n');

        // It's a line comment if the start of the doc comment comes
        // after other non-whitespace on the same line.
        auto dc = (lineComment && anyToken) ? &t.lineComment : &t.blockComment;
        // Combine with previous doc comment, if any
        if (*dc)
            *dc = combineComments(*dc, buf[], newParagraph).toDString();
        else
            *dc = buf.extractSlice(true);
    }

    /********************************************
     * Combine two document comments into one,
     * separated by an extra newline if newParagraph is true.
     * If either comment is null, the other one's pointer is returned as is;
     * otherwise a newly allocated, 0-terminated string is returned.
     */
    static const(char)* combineComments(const(char)[] c1, const(char)[] c2, bool newParagraph) pure
    {
        //printf("Lexer::combineComments('%s', '%s', '%i')\n", c1, c2, newParagraph);
        const(int) newParagraphSize = newParagraph ? 1 : 0; // Size of the combining '\n'
        if (!c1)
            return c2.ptr;
        if (!c2)
            return c1.ptr;

        // make sure c1 ends with a newline before c2 is appended
        int insertNewLine = 0;
        if (c1.length && c1[$ - 1] != '\n')
            insertNewLine = 1;
        const retSize = c1.length + insertNewLine + newParagraphSize + c2.length;
        auto p = cast(char*)mem.xmalloc_noscan(retSize + 1);
        p[0 .. c1.length] = c1[];
        if (insertNewLine)
            p[c1.length] = '\n';
        if (newParagraph)
            p[c1.length + insertNewLine] = '\n';
        p[retSize - c2.length .. retSize] = c2[];
        p[retSize] = 0;
        return p;
    }

private:
    /// Advances `scanloc` to the next line and records `p` as the start of that line.
    void endOfLine() pure @nogc @safe
    {
        scanloc.linnum++;
        line = p;
    }
}

/// Support for `__DATE__`, `__TIME__`, and `__TIMESTAMP__`
private struct TimeStampInfo
{
    private __gshared bool initdone = false;

    // Note: Those properties need to be guarded by a call to `init`
    // The API isn't safe, and quite brittle, but it was left this way
    // over performance concerns.
    // This is currently only called once, from the lexer.
    __gshared char[11 + 1] date;
    __gshared char[8 + 1] time;
    __gshared char[24 + 1] timestamp;

    /// Fills `date`, `time` and `timestamp` on the first call; later calls do nothing.
    /// The time is taken from the `SOURCE_DATE_EPOCH` environment variable when
    /// it is set, otherwise from the current time.
    public static void initialize(const ref Loc loc) nothrow
    {
        if (initdone)
            return;

        initdone = true;
        time_t ct;
        // https://issues.dlang.org/show_bug.cgi?id=20444
        if (auto p = getenv("SOURCE_DATE_EPOCH"))
        {
            if (!ct.parseDigits(p.toDString()))
                error(loc, "Value of environment variable `SOURCE_DATE_EPOCH` should be a valid UNIX timestamp, not: `%s`", p);
        }
        else
            .time(&ct);
        const p = ctime(&ct);
        assert(p);
        // ctime() yields "Www Mmm dd hh:mm:ss yyyy\n"; slice the pieces out of it
        sprintf(&date[0], "%.6s %.4s", p + 4, p + 20);
        sprintf(&time[0], "%.8s", p + 11);
        sprintf(&timestamp[0], "%.24s", p);
    }
}

// Valid escape sequences: no diagnostic may be issued and the whole input
// must be consumed.
unittest
{
    import dmd.console;
    nothrow bool assertDiagnosticHandler(const ref Loc loc, Color headerColor, const(char)* header,
                                   const(char)* format, va_list ap, const(char)* p1, const(char)* p2)
    {
        assert(0);
    }
    diagnosticHandler = &assertDiagnosticHandler;

    static void test(T)(string sequence, T expected, bool Ccompile = false)
    {
        auto p = cast(const(char)*)sequence.ptr;
        assert(expected == Lexer.escapeSequence(Loc.initial, p, Ccompile));
        assert(p == sequence.ptr + sequence.length);
    }

    test(`'`, '\'');
    test(`"`, '"');
    test(`?`, '?');
    test(`\`, '\\');
    test(`0`, '\0');
    test(`a`, '\a');
    test(`b`, '\b');
    test(`f`, '\f');
    test(`n`, '\n');
    test(`r`, '\r');
    test(`t`, '\t');
    test(`v`, '\v');

    test(`x00`, 0x00);
    test(`xff`, 0xff);
    test(`xFF`, 0xff);
    test(`xa7`, 0xa7);
    test(`x3c`, 0x3c);
    test(`xe2`, 0xe2);

    test(`1`, '\1');
    test(`42`, '\42');
    test(`357`, '\357');

    test(`u1234`, '\u1234');
    test(`uf0e4`, '\uf0e4');

    test(`U0001f603`, '\U0001f603');

    // named character entities
    test(`&quot;`, '"');
    test(`&lt;`, '<');
    test(`&gt;`, '>');

    diagnosticHandler = null;
}

// Invalid escape sequences: exactly the expected diagnostic must be issued,
// with the expected return value and scan length.
unittest
{
    import dmd.console;
    string expected;
    bool gotError;

    nothrow bool expectDiagnosticHandler(const ref Loc loc, Color headerColor, const(char)* header,
                                         const(char)* format, va_list ap, const(char)* p1, const(char)* p2)
    {
        assert(cast(Classification)headerColor == Classification.error);

        gotError = true;
        char[100] buffer = void;
        auto actual = buffer[0 .. vsprintf(buffer.ptr, format, ap)];
        assert(expected == actual);
        return true;
    }

    diagnosticHandler = &expectDiagnosticHandler;

    void test(string sequence, string expectedError, dchar expectedReturnValue, uint expectedScanLength, bool Ccompile = false)
    {
        uint errors = global.errors;
        gotError = false;
        expected = expectedError;
        auto p = cast(const(char)*)sequence.ptr;
        auto actualReturnValue = Lexer.escapeSequence(Loc.initial, p, Ccompile);
        assert(gotError);
        assert(expectedReturnValue == actualReturnValue);

        auto actualScanLength = p - sequence.ptr;
        assert(expectedScanLength == actualScanLength);
        global.errors = errors;
    }

    test("c", `undefined escape sequence \c`, 'c', 1);
    test("!", `undefined escape sequence \!`, '!', 1);
    test("&quot;", `undefined escape sequence \&`, '&', 1, true);

    test("x1", `escape hex sequence has 1 hex digits instead of 2`, '\x01', 2);

    test("u1"  , `escape hex sequence has 1 hex digits instead of 4`,   0x1, 2);
    test("u12" , `escape hex sequence has 2 hex digits instead of 4`,  0x12, 3);
    test("u123", `escape hex sequence has 3 hex digits instead of 4`, 0x123, 4);

    test("U0"      , `escape hex sequence has 1 hex digits instead of 8`,       0x0, 2);
    test("U00"     , `escape hex sequence has 2 hex digits instead of 8`,      0x00, 3);
    test("U000"    , `escape hex sequence has 3 hex digits instead of 8`,     0x000, 4);
    test("U0000"   , `escape hex sequence has 4 hex digits instead of 8`,    0x0000, 5);
    test("U0001f"  , `escape hex sequence has 5 hex digits instead of 8`,   0x0001f, 6);
    test("U0001f6" , `escape hex sequence has 6 hex digits instead of 8`,  0x0001f6, 7);
    test("U0001f60", `escape hex sequence has 7 hex digits instead of 8`, 0x0001f60, 8);

    test("ud800"    , `invalid UTF character \U0000d800`, '?', 5);
    test("udfff"    , `invalid UTF character \U0000dfff`, '?', 5);
    test("U00110000", `invalid UTF character \U00110000`, '?', 9);

    test("xg0"      , `undefined escape hex sequence \xg`, 'g', 2);
    test("ug000"    , `undefined escape hex sequence \ug`, 'g', 2);
    test("Ug0000000", `undefined escape hex sequence \Ug`, 'g', 2);

    test("&BAD;", `unnamed character entity &BAD;`  , '?', 5);
    test("&quot", `unterminated named entity &quot;`, '?', 5);
    test("&quot", `unterminated named entity &quot;`, '?', 5);

    test("400", `escape octal sequence \400 is larger than \377`, 0x100, 3);

    diagnosticHandler = null;
}