diff options
Diffstat (limited to 'gcc/d/dmd/lexer.d')
-rw-r--r-- | gcc/d/dmd/lexer.d | 335 |
1 files changed, 204 insertions, 131 deletions
diff --git a/gcc/d/dmd/lexer.d b/gcc/d/dmd/lexer.d index c9c506e..26a56c2 100644 --- a/gcc/d/dmd/lexer.d +++ b/gcc/d/dmd/lexer.d @@ -22,9 +22,11 @@ import dmd.errorsink; import dmd.id; import dmd.identifier; import dmd.location; +import dmd.common.smallbuffer; +import dmd.common.outbuffer; +import dmd.common.charactertables; import dmd.root.array; import dmd.root.ctfloat; -import dmd.common.outbuffer; import dmd.root.port; import dmd.root.rmem; import dmd.root.utf; @@ -42,6 +44,8 @@ version (DMDLIB) */ struct CompileEnv { + import dmd.common.charactertables; + uint versionNumber; /// __VERSION__ const(char)[] date; /// __DATE__ const(char)[] time; /// __TIME__ @@ -51,6 +55,10 @@ struct CompileEnv bool previewIn; /// `in` means `[ref] scope const`, accepts rvalues bool ddocOutput; /// collect embedded documentation comments bool masm; /// use MASM inline asm syntax + + // these need a default otherwise tests won't work. + IdentifierCharLookup cCharLookupTable; /// C identifier table (set to the lexer by the C parser) + IdentifierCharLookup dCharLookupTable; /// D identifier table } /*********************************************************** @@ -66,6 +74,8 @@ class Lexer Token token; + IdentifierCharLookup charLookup; /// Character table for identifiers + // For ImportC bool Ccompile; /// true if compiling ImportC @@ -142,6 +152,8 @@ class Lexer { this.compileEnv.versionNumber = 1; this.compileEnv.vendor = "DLF"; + this.compileEnv.cCharLookupTable = IdentifierCharLookup.forTable(IdentifierTable.LR); + this.compileEnv.dCharLookupTable = IdentifierCharLookup.forTable(IdentifierTable.LR); } //initKeywords(); /* If first line starts with '#!', ignore the line @@ -175,6 +187,11 @@ class Lexer } endOfLine(); } + + // setup the identifier table lookup functions + // C tables are setup in its parser constructor + // Due to us not knowing if we're in C at this point in time. + charLookup = this.compileEnv.dCharLookupTable; } /*********************** @@ -306,6 +323,8 @@ class Lexer t.blockComment = null; t.lineComment = null; + size_t universalCharacterName4, universalCharacterName8; + while (1) { t.ptr = p; @@ -395,10 +414,35 @@ class Lexer continue; // skip white space case '\\': - if (Ccompile && (p[1] == '\r' || p[1] == '\n')) + if (Ccompile) { - ++p; // ignore \ followed by new line, like VC does - continue; + if (p[1] == '\r' || p[1] == '\n') + { + ++p; // ignore \ followed by new line, like VC does + continue; + } + else if (p[1] == 'u') + { + // Universal Character Name (C) 2 byte + // \uXXXX + // let the main case handling for identifiers process this + + // case_indent will always increment, so subtract to prevent branching on the fast path + p--; + + goto case_ident; + } + else if (p[1] == 'U') + { + // Universal Character Name (C) 4 byte + // \UXXXXXXXX + // let the main case handling for identifiers process this + + // case_indent will always increment, so subtract to prevent branching on the fast path + p--; + + goto case_ident; + } } goto default; @@ -586,23 +630,161 @@ class Lexer case '_': case_ident: { - while (1) + IdentLoop: while (1) { + // If this is changed, change the decrement in C's universal character name code above + // For syntax \uXXXX and \UXXXXXXXX const c = *++p; + + // Is this the first character of the identifier + // For the universal character name this will line up, + // for the main switch it won't since it wasn't the first, + // for the default it won't either because a decode increments. + const isStartCharacter = t.ptr is p; + if (isidchar(c)) continue; else if (c & 0x80) { const s = p; const u = decodeUTF(); - if (isUniAlpha(u)) - continue; - error(t.loc, "char 0x%04x not allowed in identifier", u); + + if (isStartCharacter) + { + if (charLookup.isStart(u)) + continue; + error(t.loc, "character 0x%04x is not allowed as a start character in an identifier", u); + } + else + { + if (charLookup.isContinue(u)) + continue; + error(t.loc, "character 0x%04x is not allowed as a continue character in an identifier", u); + } + p = s; } + else if (Ccompile && c == '\\') + { + uint times; + const s = p; + p++; + + if (*p == 'u') + { + // Universal Character Name (C) 2 byte + // \uXXXX + p++; + times = 4; + } + else if (*p == 'U') + { + // Universal Character Name (C) 4 byte + // \UXXXXXXXX + p++; + times = 8; + } + else + { + error(t.loc, "char 0x%x is not allowed to follow '\\' expecting a C universal character name in format \\uXXXX or \\UXXXXXXXX with hex digits instead of X with invalid u/U", *p); + p = s; + break; + } + + foreach(_; 0 .. times) + { + const hc = *p; + p++; + + if ((hc >= '0' && hc <= '9') || (hc >= 'a' && hc <= 'f') || (hc >= 'A' && hc <= 'F')) + continue; + + error(t.loc, "char 0x%x is not allowed to follow '\\' expecting a C universal character name in format \\uXXXX or \\UXXXXXXXX with hex digits instead of X with invalid hex digit", hc); + p = s; + break IdentLoop; + } + + continue; + } break; } - Identifier id = Identifier.idPool((cast(char*)t.ptr)[0 .. p - t.ptr], false); + + Identifier id; + + if (universalCharacterName4 > 0 || universalCharacterName8 > 0) + { + auto priorValidation = t.ptr[0 .. p - t.ptr]; + const(char)* priorVPtr = priorValidation.ptr; + const possibleLength = ( + priorValidation.length - ( + (universalCharacterName4 * 6) + + (universalCharacterName8 * 10) + )) + ( + (universalCharacterName4 * 3) + + (universalCharacterName8 * 4) + ); + + char[64] buffer = void; + SmallBuffer!char sb = SmallBuffer!char(possibleLength, buffer[]); + + char[] storage = sb.extent; + size_t offset; + + while(priorVPtr < &priorValidation[$-1] + 1) + { + if (*priorVPtr == '\\') + { + dchar tempDchar = 0; + uint times; + + // universal character name (C) + if (priorVPtr[1] == 'u') + times = 4; + else if (priorVPtr[1] == 'U') + times = 8; + else + assert(0, "ICE: Universal character name is 2 or 4 bytes only"); + priorVPtr += 2; + + foreach(_; 0 .. times) + { + char c = *++priorVPtr; + if (c >= '0' && c <= '9') + c -= '0'; + else if (c >= 'a' && c <= 'f') + c -= 'a' - 10; + else if (c >= 'A' && c <= 'F') + c -= 'A' - 10; + + tempDchar <<= 4; + tempDchar |= c; + } + + utf_encodeChar(&storage[offset], tempDchar); + offset += utf_codeLengthChar(tempDchar); + + // Could be an error instead of a warning, + // but hey it was written specifically so why worry? + if (priorVPtr is priorValidation.ptr) + { + if (!charLookup.isStart(tempDchar)) + warning(t.loc, "char 0x%x is not allowed start character for an identifier", tempDchar); + } + else + { + if (!charLookup.isContinue(tempDchar)) + warning(t.loc, "char 0x%x is not allowed continue character for an identifier", tempDchar); + } + } + else + storage[offset++] = *++priorVPtr; + } + + id = Identifier.idPool(storage[0 .. offset], false); + } + else + id = Identifier.idPool((cast(char*)t.ptr)[0 .. p - t.ptr], false); + t.ident = id; t.value = cast(TOK)id.getValue(); @@ -1174,9 +1356,11 @@ class Lexer if (c & 0x80) { c = decodeUTF(); - // Check for start of unicode identifier - if (isUniAlpha(c)) + + // Check for start of an identifier + if (charLookup.isStart(c)) goto case_ident; + if (c == PS || c == LS) { endOfLine(); @@ -1688,7 +1872,7 @@ class Lexer delimright = ']'; else if (c == '<') delimright = '>'; - else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) + else if (isalpha(c) || c == '_' || (c >= 0x80 && charLookup.isStart(c))) { // Start of identifier; must be a heredoc Token tok; @@ -1736,7 +1920,9 @@ class Lexer } else if (c == delimright) goto Ldone; - if (startline && (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) && hereid) + + // we're looking for a new identifier token + if (startline && (isalpha(c) || c == '_' || (c >= 0x80 && charLookup.isStart(c))) && hereid) { Token tok; auto psave = p; @@ -2988,6 +3174,11 @@ class Lexer eSink.deprecation(loc, format, args); } + void warning(T...)(const ref Loc loc, const(char)* format, T args) + { + eSink.warning(loc, format, args); + } + void deprecation(T...)(const(char)* format, T args) { eSink.deprecation(token.loc, format, args); @@ -3416,124 +3607,6 @@ class Lexer } } - -/******************************* Private *****************************************/ - -private: - -private enum LS = 0x2028; // UTF line separator -private enum PS = 0x2029; // UTF paragraph separator - -/******************************************** - * Do our own char maps - */ -private static immutable cmtable = () -{ - ubyte[256] table; - foreach (const c; 0 .. table.length) - { - if ('0' <= c && c <= '7') - table[c] |= CMoctal; - if (c_isxdigit(c)) - table[c] |= CMhex; - if (c_isalnum(c) || c == '_') - table[c] |= CMidchar; - - switch (c) - { - case 'x': case 'X': - case 'b': case 'B': - table[c] |= CMzerosecond; - break; - - case '0': .. case '9': - case 'e': case 'E': - case 'f': case 'F': - case 'l': case 'L': - case 'p': case 'P': - case 'u': case 'U': - case 'i': - case '.': - case '_': - table[c] |= CMzerosecond | CMdigitsecond; - break; - - default: - break; - } - - switch (c) - { - case '\\': - case '\n': - case '\r': - case 0: - case 0x1A: - case '\'': - break; - default: - if (!(c & 0x80)) - table[c] |= CMsinglechar; - break; - } - } - return table; -}(); - -private -{ - enum CMoctal = 0x1; - enum CMhex = 0x2; - enum CMidchar = 0x4; - enum CMzerosecond = 0x8; - enum CMdigitsecond = 0x10; - enum CMsinglechar = 0x20; -} - -private bool isoctal(const char c) pure @nogc @safe -{ - return (cmtable[c] & CMoctal) != 0; -} - -private bool ishex(const char c) pure @nogc @safe -{ - return (cmtable[c] & CMhex) != 0; -} - -private bool isidchar(const char c) pure @nogc @safe -{ - return (cmtable[c] & CMidchar) != 0; -} - -private bool isZeroSecond(const char c) pure @nogc @safe -{ - return (cmtable[c] & CMzerosecond) != 0; -} - -private bool isDigitSecond(const char c) pure @nogc @safe -{ - return (cmtable[c] & CMdigitsecond) != 0; -} - -private bool issinglechar(const char c) pure @nogc @safe -{ - return (cmtable[c] & CMsinglechar) != 0; -} - -private bool c_isxdigit(const int c) pure @nogc @safe -{ - return (( c >= '0' && c <= '9') || - ( c >= 'a' && c <= 'f') || - ( c >= 'A' && c <= 'F')); -} - -private bool c_isalnum(const int c) pure @nogc @safe -{ - return (( c >= '0' && c <= '9') || - ( c >= 'a' && c <= 'z') || - ( c >= 'A' && c <= 'Z')); -} - /******************************* Unittest *****************************************/ unittest |