aboutsummaryrefslogtreecommitdiff
path: root/gcc/cobol/scan_post.h
blob: dabb168ae90f579431d6c943d07714bcc248dae0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
/*
 * Copyright (c) 2021-2025 Symas Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 * * Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * * Redistributions in binary form must reproduce the above
 *   copyright notice, this list of conditions and the following disclaimer
 *   in the documentation and/or other materials provided with the
 *   distribution.
 * * Neither the name of the Symas Corporation nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

static const char *
start_condition_str( int sc ) {
  const char *state = "???";
  switch(sc) {
  case INITIAL: state = "INITIAL"; break;
  case author_state: state = "author_state"; break;
  case basis: state = "basis"; break;
  case bool_state: state = "bool_state"; break;
  case cdf_state: state = "cdf_state"; break;
  case classify: state = "classify"; break;
  case copy_state: state = "copy_state"; break;
  case comment_entries: state = "comment_entries"; break;
  case date_state: state = "date_state"; break;
  case datetime_fmt: state = "datetime_fmt"; break;
  case dot_state: state = "dot_state"; break;
  case exception: state = "exception"; break;
  case field_level: state = "field_level"; break;
  case field_state: state = "field_state"; break;
  case function: state = "function"; break;
  case hex_state: state = "hex_state"; break;
  case ident_state: state = "ident_state"; break;
  case integer_count: state = "integer_count"; break;
  case name_state: state = "name_state"; break;
  case numeric_state: state = "numeric_state"; break;
  case numstr_state: state = "numstr_state"; break;
  case partial_name: state = "partial_name"; break;
  case picture: state = "picture"; break;
  case picture_count: state = "picture_count"; break;
  case procedure_div: state = "procedure_div"; break;
  case program_id_state: state = "program_id_state"; break;
  case quoted1: state = "quoted1"; break;
  case quoted2: state = "quoted2"; break;
  case quoteq: state = "quoteq"; break;
  case raising: state = "raising"; break;
  case subscripts: state = "subscripts"; break;
  case sort_state: state = "sort_state"; break;
  }
  return state;
}

static const char *
start_condition_is() { return start_condition_str( YY_START ); }

/*
 * Match datetime constants.
 *
 * A 78 or CONSTANT could have a special literal for formatted
 * date/time functions.
 */

static int
datetime_format_of( const char input[] ) {

  static const char date_fmt_b[] = "YYYYMMDD|YYYYDDD|YYYYWwwD";
  static const char date_fmt_e[] = "YYYY-MM-DD|YYYY-DDD|YYYY-Www-D";

  static const char time_fmt_b[] =
    "hhmmss([.,]s+)?|hhmmss([.,]s+)?Z|hhmmss([.,]s+)?[+]hhmm|";
  static const char time_fmt_e[] =
    "hh:mm:ss([.,]s+)?|hh:mm:ss([.,]s+)?Z|hh:mm:ss([.,]s+)?[+]hh:mm";

  static char date_pattern[ 3 * sizeof(date_fmt_e) ];
  static char time_pattern[ 3 * sizeof(time_fmt_e) ];
  static char datetime_pattern[ 6 * sizeof(time_fmt_e) ];

  static struct pattern_t {
    regex_t re;
    const char *regex;
    int token;
  } patterns[] = {
    { {}, datetime_pattern, DATETIME_FMT },
    { {}, date_pattern, DATE_FMT },
    { {}, time_pattern, TIME_FMT },
  }, * eopatterns = patterns + COUNT_OF(patterns);;

  // compile patterns
  if( ! date_pattern[0] ) {
    sprintf(date_pattern, "%s|%s", date_fmt_b, date_fmt_e);
    sprintf(time_pattern, "%s|%s", time_fmt_b, time_fmt_e);

    sprintf(datetime_pattern, "(%sT%s)|(%sT%s)",
            date_fmt_b, time_fmt_b,
            date_fmt_e, time_fmt_e);

    for( auto p = patterns; p < eopatterns; p++ ) {
      static const int cflags = REG_EXTENDED | REG_ICASE;
      static char msg[80];
      int erc;

      if( 0 != (erc = regcomp(&p->re, p->regex, cflags)) ) {
        regerror(erc, &p->re, msg, sizeof(msg));
        yywarn("%s:%d: %s: %s", __func__, __LINE__, keyword_str(p->token), msg);
      }
    }
  }

  // applies only in the datetime_fmt start condition
  if( datetime_fmt == YY_START ) {
    yy_pop_state();
    if( input == NULL ) return 0;

    // See if the input is a date, time, or datetime pattern string.
    static const int nmatch = 3;
    regmatch_t matches[nmatch];

    auto p = std::find_if( patterns, eopatterns,
                           [input, &matches]( auto& pattern ) {
                             auto erc = regexec( &pattern.re, input,
                                                 COUNT_OF(matches), matches, 0 );
                             return erc == 0;
                           } );

    return p != eopatterns? p->token : 0;
  }
  return 0;
}


/*
 * >>DEFINE, >>IF, and >>EVALUATE
 */

static bool
is_cdf_token( int token ) {
  switch(token) {
  case CDF_DEFINE:
  case CDF_DISPLAY:
  case CDF_IF:       case CDF_ELSE: case CDF_END_IF:
  case CDF_EVALUATE: case CDF_WHEN: case CDF_END_EVALUATE:
    return true;
  case CALL_COBOL:
  case CALL_VERBATIM:
  case COPY:
  case TURN:
    return true;
  }
  return false;
}

static bool
is_cdf_condition_token( int token ) {
  switch(token) {
  case CDF_IF:       case CDF_ELSE: case CDF_END_IF:
  case CDF_EVALUATE: case CDF_WHEN: case CDF_END_EVALUATE:
    return true;
  }
  return false;
}

/*
 * IF and EVALUATE are partially parsed in cdf.y.  ELSE and WHEN, etc., are
 * valid only in context.
 */
static bool
valid_conditional_context( int token ) {
  switch(token) {
  case CDF_DEFINE:
  case CDF_IF:
  case CDF_EVALUATE:
    return true;
  case CDF_ELSE:
  case CDF_END_IF:
    return scanner_token() == CDF_IF;
  case CDF_WHEN:
  case CDF_END_EVALUATE:
    return scanner_token() == CDF_EVALUATE;
  }
  return true; // all other CDF tokens valid regardless of context
}

static bool
run_cdf( int token ) {
  if( ! valid_conditional_context(token) ) {
    error_msg(yylloc, "CDF syntax error at '%s'", keyword_str(token));
    return false;
  }

  parsing.inject_token(token); // because it will be needed by CDF parser

  if( yy_flex_debug ) dbgmsg("CDF parser start with '%s'", keyword_str(token));

  parsing.parser_save(ydfparse);

  int erc = ydfparse();  // Parse the CDF directive.

  parsing.parser_restore();

  if( YY_START == cdf_state ) yy_pop_state();

  if( yy_flex_debug ) {
    dbgmsg("CDF parser returned %d, scanner SC <%s>", erc, start_condition_is());
  }

  return  0 == erc;
}

#include <queue>
struct pending_token_t {
  int token;
  YYSTYPE value;
  pending_token_t( int token, YYSTYPE value ) : token(token), value(value) {}
};
#define PENDING(T) pending_token_t( (T), yylval )

static std::queue<pending_token_t> pending_tokens;

int next_token() {
  int token = lexer();
  return token;
}

extern int ydfchar;
bool in_procedure_division(void);

// act on CDF tokens
int
prelex() {
  static bool in_cdf = false;
  int token = next_token();

  if( in_cdf ) { return token; }
  if( ! is_cdf_token(token) ) { return token; }

  in_cdf = true;

  assert(is_cdf_token(token));

  while( is_cdf_token(token) ) {

    if( ! run_cdf(token) ) {
      dbgmsg( ">>CDF parser failed" );
      return NO_CONDITION;
    }
    // Return the CDF's discarded lookahead token, if extant.
    token = ydfchar > 0? ydfchar : next_token();
    if( token == NO_CONDITION && parsing.at_eof() ) {
      return token = YYEOF;
    }

    // Reenter cdf parser only if next token could affect parsing state.
    if( ! parsing.on() && ! is_cdf_condition_token(token) ) break;
  }

  if( yy_flex_debug ) {
    dbgmsg("scanner SC <%s>", start_condition_is());
  }

  if( YY_START == copy_state || YY_START == cdf_state ) {
    if( token == NAME ) {
      auto tok = keyword_tok(ydflval.string);
      if( tok ) token = tok;
    }
    yy_pop_state();
    dbgmsg("scanner SC <%s>, token now %s",
           start_condition_is(), keyword_str(token));
  }

  /*
   * The final, rejected CDF token might be a LEVEL number.
   */
  if( YY_START == field_state && level_needed() ) {
    switch( token ) {
    case NUMSTR:
      if( yy_flex_debug ) yywarn("final token is NUMSTR");
      yylval.number = level_of(yylval.numstr.string);
      token = LEVEL;
      break;
    case YDF_NUMBER:
      if( yy_flex_debug ) yywarn("final token is YDF_NUMBER");
      yylval.number = ydflval.number;
      token = LEVEL;
      break;
    }
    if( token == LEVEL ) {
      switch(yylval.number) {
      case 66:
        token = LEVEL66;
        break;
      case 78:
        token = LEVEL78;
        break;
      case 88:
        token = LEVEL78;
        break;
      }
    }
  }

  dbgmsg( ">>CDF parser done, %s returning "
          "%s (because final_token %s, lookhead %d) on line %d", __func__,
          keyword_str(token), keyword_str(final_token),
          ydfchar, yylineno );
  in_cdf = false;
  return token;
}

/* There are 2 parsers and one scanner.
 * yyparse calls yylex.
 * yylex calls prelex
 *             prelex calls lexer, the scanner produced by flex.
 *                          lexer reads input from yyin via lexer_input.
 *
 * prelex intercepts CDF statements, each of which it parses with ydfparse.
 * ydfparse affects CDF variables, which may affect how yylex treats
 * the input stream.
 *
 * Because the lexer is called recursively:
 *
 *   yyparse -> yylex -> ydfparse -> yylex
 *
 * the global state of the scanner has changed when ydfparse returns.  Part of
 * that state is the unused lookahead token that ydfparse discarded, stored in
 * final_token.  prelex then returns final_token as its own, which is duly
 * returned to yyparse.
 */

int
yylex(void) {
  static bool produce_next_sentence_target = false;
  int token = parsing.pending_token();

  if( parsing.at_eof() ) return  YYEOF;
  if( token ) return token;

  /*
   * NEXT SENTENCE jumps to an implied CONTINUE at the next dot ('.').
   * Documentation says variously that the implied CONTINUE is before or after
   * that dot, but the meaning is one: after the statement that precedes the
   * dot.
   *
   * When the lexer encounters the dot, it returns it to the parser, which may
   * use it as a look-ahead token to decide the grammar production.  By the
   * time it returns to the lexer looking for its next token, the parser will
   * have taken whatever actions the dot decided.  At that point, the lexer
   * injects the label that NEXT SENTENCE jumps to.
   */
  if( produce_next_sentence_target ) {
    next_sentence_label(next_sentence);
    produce_next_sentence_target = false;
  }

  do {
    token = prelex();
    if( yy_flex_debug ) {
      if( parsing.in_cdf() ) {
        dbgmsg( "%s:%d: %s routing %s to CDF parser", __func__, __LINE__,
               start_condition_is(), keyword_str(token) );
      } else if( !parsing.on() ) {
        dbgmsg( "eating %s because conditional compilation is FALSE",
                 keyword_str(token) );
      }
    }

  } while( token && ! parsing.feed_a_parser() );

  if( next_sentence && token == '.' ) {
    produce_next_sentence_target = true;
  }

  if( parsing.normal() ) {
    final_token = token;
  }

  if( token == YYEOF && parsing.in_cdf() ) {
    if( yy_flex_debug) dbgmsg("deflecting EOF");
    parsing.at_eof(true);
    return NO_CONDITION;
  }

  return token;
}