From f400c8d27e11477d79ba67ca930ca7e7511b9ee5 Mon Sep 17 00:00:00 2001 From: Alan Modra Date: Fri, 13 Aug 2021 17:20:10 +0930 Subject: ld lexer tidy, possibly break the world This tidies the states in which ld lexer rules are enabled. This change will quite likely trip over issues similar to those mentioned in the new ldlex.l comments, so please test it out. * ldgram.y (wildcard_name): Remove now unnecessary components. * ldlex.l: Restrict many rules' states. Remove -l expression state rule. Comment on lookahead state madness and need for /DISCARD/ in expression state. --- ld/ldgram.y | 10 --- ld/ldlex.l | 259 ++++++++++++++++++++++++++++++++---------------------------- 2 files changed, 140 insertions(+), 129 deletions(-) diff --git a/ld/ldgram.y b/ld/ldgram.y index 24979de..1f6c44a 100644 --- a/ld/ldgram.y +++ b/ld/ldgram.y @@ -421,21 +421,11 @@ statement_anywhere: lang_add_assignment (exp_assert ($4, $6)); } ; -/* The '*' and '?' cases are there because the lexer returns them as - separate tokens rather than as NAME. */ wildcard_name: NAME { $$ = $1; } - | '*' - { - $$ = "*"; - } - | '?' - { - $$ = "?"; - } ; wildcard_maybe_exclude: diff --git a/ld/ldlex.l b/ld/ldlex.l index b0861d7..6aeba6d 100644 --- a/ld/ldlex.l +++ b/ld/ldlex.l @@ -192,132 +192,155 @@ V_IDENTIFIER [*?.$_a-zA-Z\[\]\-\!\^\\]([*?.$_a-zA-Z0-9\[\]\-\!\^\\]|::)* } return INT; } -"]" { RTOKEN(']');} -"[" { RTOKEN('[');} -"<<=" { RTOKEN(LSHIFTEQ);} -">>=" { RTOKEN(RSHIFTEQ);} -"||" { RTOKEN(OROR);} -"==" { RTOKEN(EQ);} -"!=" { RTOKEN(NE);} -">=" { RTOKEN(GE);} -"<=" { RTOKEN(LE);} -"<<" { RTOKEN(LSHIFT);} -">>" { RTOKEN(RSHIFT);} -"+=" { RTOKEN(PLUSEQ);} -"-=" { RTOKEN(MINUSEQ);} -"*=" { RTOKEN(MULTEQ);} -"/=" { RTOKEN(DIVEQ);} -"&=" { RTOKEN(ANDEQ);} -"|=" { RTOKEN(OREQ);} -"&&" { RTOKEN(ANDAND);} -">" { RTOKEN('>');} -"," { RTOKEN(',');} -"&" { RTOKEN('&');} -"|" { RTOKEN('|');} -"~" { RTOKEN('~');} -"!" { RTOKEN('!');} -"?" { RTOKEN('?');} -"*" { RTOKEN('*');} -"+" { RTOKEN('+');} -"-" { RTOKEN('-');} -"/" { RTOKEN('/');} -"%" { RTOKEN('%');} -"<" { RTOKEN('<');} -"=" { RTOKEN('=');} + + /* Some tokens that only appear in expressions must be enabled for + states other than EXPRESSION, since parser lookahead means they + must be recognised before the parser switches the lexer out of + SCRIPT or WILD state into EXPRESSION state. + + This sort of thing happens for example with NAME in ldgram.y + "section" rule, which is immediately followed by ldlex_expression. + However, if you follow the grammar from "sec_or_group_p1" you see + "assignment" appearing in "statement_anywhere". Now, + "assignment" also has NAME as its first token, just like + "section". So the parser can't know whether it is in the + "section" or the "assignment" rule until it has scanned the next + token to find an assignment operator. Thus the next token after + NAME in the "section" rule may be lexed before the lexer is + switched to EXPRESSION state, and there are quite a number of + optional components. The first token in all those components + must be able to be lexed in SCRIPT state, as well as the + assignment operators. In fact, due to "opt_exp_with_type", + anything that can appear on the left hand side of "exp" might + need to be lexed in SCRIPT state. + + MRI mode tends to cover everything in MRI scripts. + */ +"]" { RTOKEN(']'); } +"[" { RTOKEN('['); } +"<<=" { RTOKEN(LSHIFTEQ); } +">>=" { RTOKEN(RSHIFTEQ); } +"||" { RTOKEN(OROR); } +"==" { RTOKEN(EQ); } +"!=" { RTOKEN(NE); } +">=" { RTOKEN(GE); } +"<=" { RTOKEN(LE); } +"<<" { RTOKEN(LSHIFT); } +">>" { RTOKEN(RSHIFT); } +"+=" { RTOKEN(PLUSEQ); } +"-=" { RTOKEN(MINUSEQ); } +"*=" { RTOKEN(MULTEQ); } +"/=" { RTOKEN(DIVEQ); } +"&=" { RTOKEN(ANDEQ); } +"|=" { RTOKEN(OREQ); } +"&&" { RTOKEN(ANDAND); } +">" { RTOKEN('>'); } +"," { RTOKEN(','); } +"&" { RTOKEN('&'); } +"|" { RTOKEN('|'); } +"~" { RTOKEN('~'); } +"!" { RTOKEN('!'); } +"?" { RTOKEN('?'); } +"*" { RTOKEN('*'); } +"+" { RTOKEN('+'); } +"-" { RTOKEN('-'); } +"/" { RTOKEN('/'); } +"%" { RTOKEN('%'); } +"<" { RTOKEN('<'); } +"=" { RTOKEN('='); } "}" { RTOKEN('}'); } "{" { RTOKEN('{'); } -")" { RTOKEN(')');} -"(" { RTOKEN('(');} +")" { RTOKEN(')'); } +"(" { RTOKEN('('); } ":" { RTOKEN(':'); } -";" { RTOKEN(';');} -