/*
 * Copyright (c) 2021-2025 Symas Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 * * Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * * Redistributions in binary form must reproduce the above
 *   copyright notice, this list of conditions and the following disclaimer
 *   in the documentation and/or other materials provided with the
 *   distribution.
 * * Neither the name of the Symas Corporation nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"
#include <ext/stdio_filebuf.h>
#include "cobol-system.h"
#include "cbldiag.h"
#include "util.h"
#include "copybook.h"
#include "lexio.h"

extern int yy_flex_debug;

source_format_t& cdf_source_format();

void
source_format_t::infer( const char *bol, bool want_reference_format ) {
  if( bol ) {
    left = 7;
    if( want_reference_format ) {
      right = 73;
    }
  }
  dbgmsg("%s:%d: %s format detected", __func__, __LINE__,
         description());
}


// public source format test functions
bool is_fixed_format() { return cdf_source_format().is_fixed(); }
bool is_reference_format() { return cdf_source_format().is_reffmt(); }

static bool debug_mode = false;

/*
 * The "debug mode" is a little odd, because we have to make sure a
 * leading "D" doesn't start the verb DISPLAY (for example).  If
 * debug_mode is on, debug lines are included in the parse.  If
 * debug_mode is off but we're not in fixed_format, lines starting
 * with "D" are also included.
 *
 * So, the line is excluded if: fixed format and not debug mode
 * Else, it's included.
*/

static inline int left_margin() {
  return cdf_source_format().left_margin();
}
static inline int right_margin() {
  return cdf_source_format().right_margin();
}

/*
 * When setting the indicator column explicity:
 *   To get strict fixed 72-column lines, use a negative column number.
 *   When setting back to 0 (free), the right margin is also reset to 0.
 */
void
cobol_set_indicator_column( int column );

bool include_debug()      { return is_fixed_format() && debug_mode; }
bool set_debug( bool tf ) { return debug_mode = tf && is_fixed_format(); }

static bool nonblank( const char ch ) { return !isblank(ch); }

static inline char *
start_of_line( char *bol, char *eol ) {
  bol = std::find_if(bol, eol, nonblank);
  gcc_assert(bol < eol); // must exist
  return bol;
}

static inline char *
continues_at( char *bol, char *eol ) {
  if( cdf_source_format().is_free() ) return NULL;  // cannot continue in free format
  bol += left_margin();
  if( *bol != '-' ) return NULL; // not a continuation line
  return start_of_line(++bol, eol);
}

// Return pointer to indicator column. Test ch if provided.
// NULL means no indicator column or tested value not present.
static inline char *
indicated( char *bol, const char *eol, char ch = '\0' ) {
  if( cdf_source_format().left_margin() == 0 && *bol != '*' ) {
    return NULL;  // no indicator column in free format, except for comments
  }
  gcc_assert(bol != NULL);
  auto ind = bol + left_margin();
  if( eol <= ind ) return NULL; // left margin would be after end of line
  // If TAB is in the line-number region, nothing is in the indicator column.
  bool has_tab = std::any_of(bol, ind,
                             [](const char ch) { return ch == '\t'; } );
  if( has_tab ) return NULL;
  if( (bol += left_margin()) > eol ) return NULL;
  return ch == '\0' || ch == *bol? bol : NULL;
}

static char *
remove_inline_comment( char *bol, char *eol ) {
  char *nl = std::find(bol, eol, '\n');

  if( bol < nl ) {
    static char ends = '\0';
    std::swap(*nl, ends);
    char *comment = strstr(bol, "*>");
    if( comment ) {
      std::fill(comment, nl, SPACE);
    }
    std::swap(*nl, ends);
  }
  return eol;
}

static void
erase_line( char *src, char *esrc ) {
  dbgmsg( "%s: erasing %.*s from input", __func__, int(esrc-src), src);
  erase_source(src, esrc);
}

static size_t
count_newlines( const char *beg, const char *end ) {
  return std::count(beg, end, '\n');
}

size_t
filespan_t::tab_check( const char *src, const char *esrc ) {
  static const char tab = '\t';

  const char *data = src + left_margin();
  if( data < esrc ) { // not a blank line
    const char *tab_at = std::find(src, data, tab);
    if( tab_at < data ) {
      return (tab_at - src) + 1;
    }
  }
  return 0;
}

static const auto extended_icase = regex::extended | regex::icase;

std::stack< std::list<replace_t> > replace_directives;

static bool
is_word_or_quote( char ch ) {
  return ch == '"' || ch == '\'' || ISALNUM(ch);
}
/*
 * If the replacement is not leading/trailing, the edges of the
 * matched pattern must delimit a Cobol word.  If not, add a space to
 * the replacement.
 */
static void
maybe_add_space(const span_t& pattern, replace_t& recognized) {
  static const char blank[] = " ";
  const char *befter[2] = { "", "" };
  gcc_assert(0 < recognized.before.size());

  // start of pattern and end of preceding text
  if( pattern.p[0] == '.' && is_word_or_quote(recognized.before.p[-1]) ) {
    befter[0] = blank;
  }
  // end of pattern, and start of succeeding text
  if( pattern.pend[-1] == '.' && is_word_or_quote(recognized.before.pend[0]) ) {
    befter[1] = blank;
  }

  if( befter[0] == blank || befter[1] == blank ) {
    const char *s = xasprintf( "%s%.*s%s",
                               befter[0],
                               recognized.after.size(), recognized.after.p,
                               befter[1] );
    recognized.after = span_t(s, s + strlen(s));
  }
}

/*
 * Keep track of the next pending replacement for each active REPLACE
 * directive.  For the current line, apply patterns that begins on the
 * line.  (It may match input extending beyond the current eol.)
 *
 * As each replacement is identified, append it to the passsed list of
 * pending replacements.  For these elements:
 *
 *     before is a span in mfile
 *     after is dynamically allocated
 */
static void
recognize_replacements( filespan_t mfile, std::list<replace_t>& pending_replacements ) {
  static const char *top_of_stack_cache, *applies_to;

  struct future_replacement_t {
    replace_t directive;
    span_t found;
    future_replacement_t( const replace_t& replace, span_t found )
      : directive(replace), found(found)
    {}
    bool operator<( const future_replacement_t& that ) const {
      return found.p < that.found.p;
    }
  };

  static std::list<future_replacement_t> futures;

  if( replace_directives.empty() ) return;

  if( ! (top_of_stack_cache == replace_directives.top().front().before.p
         &&
         applies_to == mfile.data) )
  {
    futures.clear();
    top_of_stack_cache = replace_directives.top().front().before.p;
    applies_to = mfile.data;
  }

  if( futures.empty() ) {
    /*
     * From the current point in the file, find the next match for each
     * pattern at the top of the replacement stack.
     */
    for( const auto& directive : replace_directives.top() ) {
      regex re(directive.before.p, extended_icase);
      cmatch cm;

      span_t found(mfile.eodata, mfile.eodata);

      if( regex_search( mfile.ccur(),
                        const_cast<const char *>(mfile.eodata),
                        cm, re) ) {
        gcc_assert(cm[1].matched);
        found = span_t( cm[1].first, cm[1].second );
        if( yy_flex_debug ) {
          size_t n = count_newlines(mfile.data, found.p);
          dbgmsg("%s:%d first '%.*s' is on line " HOST_SIZE_T_PRINT_UNSIGNED
                 " (offset " HOST_SIZE_T_PRINT_UNSIGNED ")", __func__, __LINE__,
                 directive.before.size(), directive.before.p,
                 (fmt_size_t)++n, (fmt_size_t)(found.p - mfile.data));
        }
      } else {
        dbgmsg("%s:%d not found: '%s' in \n'%.*s'", __func__, __LINE__,
              directive.before.p, int(strlen(directive.before.p)), mfile.cur);
      }
      futures.push_back( future_replacement_t(directive, found) );
    }
  }

  gcc_assert(!futures.empty());
  gcc_assert(futures.size() == replace_directives.top().size());

  replace_t recognized;

  auto pnext = std::min_element(futures.begin(), futures.end());

  for( const char *bol = mfile.cur; // more than one replacement may apply to a line
       bol <= pnext->found.p && pnext->found.p < mfile.eol; ) {
    auto& next(*pnext);
    recognized = replace_t( next.found, span_t(strlen(next.directive.after.p),
                                               next.directive.after.p) );
    maybe_add_space(next.directive.before, recognized);
    pending_replacements.push_back(recognized);
    bol = next.found.pend;

    if( yy_flex_debug ) {
      size_t n = std::count(const_cast<const char *>(mfile.data),
                            recognized.before.p, '\n');
      dbgmsg( "%s:%d: line " HOST_SIZE_T_PRINT_UNSIGNED
              " @ " HOST_SIZE_T_PRINT_UNSIGNED ": '%s'\n/%.*s/%.*s/",
              __func__, __LINE__,
              (fmt_size_t)++n, (fmt_size_t)(next.found.p - mfile.data),
              next.directive.before.p,
              int(recognized.before.size()), recognized.before.p,
              int(recognized.after.size()), recognized.after.p );
    }

    // Update the futures element for this pattern
    cmatch cm;

    next.found = span_t(mfile.eodata, mfile.eodata);

    regex re(next.directive.before.p, extended_icase);
    if( regex_search(bol, const_cast<const char *>(mfile.eodata), cm, re) ) {
      gcc_assert(cm[1].matched);
      next.found = span_t( cm[1].first, cm[1].second );
      size_t n = std::count(const_cast<const char *>(mfile.data),
                            next.found.p, '\n');
      if( false )
        dbgmsg("%s:%d next '%.*s' will be on line " HOST_SIZE_T_PRINT_UNSIGNED
               " (offset " HOST_SIZE_T_PRINT_UNSIGNED ")", __func__, __LINE__,
               next.directive.before.size(), next.directive.before.p,
               (fmt_size_t)++n, (fmt_size_t)(next.found.p - mfile.data));
    }
    pnext = std::min_element(futures.begin(), futures.end());
  }
}

static void
check_push_pop_directive( filespan_t& mfile ) {
  char eol = '\0';
  const char *p = std::find(mfile.cur, mfile.eol, '>');
  if( ! (p < mfile.eol && p[1] == *p ) ) return;

  const char pattern[] =
    ">>[[:blank:]]*(push|pop)[[:blank:]]+"
    "("
      "all|"
      "call-convention|"
      "cobol-words|"
      "define|"
      "source[[:blank:]]+format|"
      "turn"
    ")";
  static regex re(pattern, extended_icase);

  // show contents of marked subexpressions within each match
  cmatch cm;

  std::swap(*mfile.eol, eol); // see implementation for excuses
  bool ok = regex_search(p, const_cast<const char *>(mfile.eol), cm, re);
  std::swap(*mfile.eol, eol);

  if( ok ) {
    gcc_assert(cm.size() > 1);
    bool push = TOUPPER(cm[1].first[1]) == 'U';
    switch( TOUPPER(cm[2].first[0]) ) {
    case 'A': // ALL
      push? cdf_push() : cdf_pop();
      break;
    case 'C':
      switch( TOUPPER(cm[2].first[1]) ) {
      case 'A': // CALL-CONVENTION
        push? cdf_push_call_convention() : cdf_pop_call_convention();
        break;
      case 'O': // COBOL-WORDS
        push? cdf_push_current_tokens() : cdf_pop_current_tokens();
        break;
      default:
        gcc_unreachable();
      }
      break;
    case 'D': // DEFINE
      push? cdf_push_dictionary() : cdf_pop_dictionary();
      break;
    case 'S': // SOURCE FORMAT
      push? cdf_push_source_format() : cdf_pop_source_format();
      break;
    case 'T': // TURN
      push? cdf_push_enabled_exceptions() : cdf_pop_enabled_exceptions();
      break;
    default:
      gcc_unreachable();
    }
    erase_line(const_cast<char*>(cm[0].first),
               const_cast<char*>(cm[0].second));
  }
}

static void
check_source_format_directive( filespan_t& mfile ) {
  char eol = '\0';
  const char *p = std::find(mfile.cur, mfile.eol, '>');
  if( ! (p < mfile.eol && p[1] == *p ) ) return;

  const char pattern[] =
    ">>[[:blank:]]*source[[:blank:]]+"
    "(format[[:blank:]]+)?"
    "(is[[:blank:]]+)?"
    "(fixed|free)";
  static regex re(pattern, extended_icase);

  // show contents of marked subexpressions within each match
  cmatch cm;

  std::swap(*mfile.eol, eol); // see implementation for excuses
  bool ok = regex_search(p, const_cast<const char *>(mfile.eol), cm, re);
  std::swap(*mfile.eol, eol);

  if( ok ) {
    gcc_assert(cm.size() > 1);
    switch( cm[3].length() ) {
    case 4:
      cobol_set_indicator_column(0);
      break;
    case 5:
      cobol_set_indicator_column(-7);
      break;
    default:
      gcc_assert(cm[3].length() == 4 || cm[3].length() == 5);
      break;
    }

    dbgmsg( "%s:%d: %s format set, on line " HOST_SIZE_T_PRINT_UNSIGNED,
            __func__, __LINE__,
            cdf_source_format().description(),
            (fmt_size_t)mfile.lineno() );
    char *bol = cdf_source_format().is_fixed()? mfile.cur : const_cast<char*>(cm[0].first);
    gcc_assert(cm[0].second <= mfile.eol);
    erase_line(bol, const_cast<char*>(cm[0].second));
  }
}

struct buffer_t : public bytespan_t {
  char *pos;    // current output position

  buffer_t( char *data, char *eodata )
    : bytespan_t(data, eodata)
    , pos(data)
  {
    if(pos) *pos = '\0';
  }

  size_t nline() const {
    gcc_assert(data <= pos);
    return std::count(data, pos, '\n');
  }
  size_t free_space() const { gcc_assert(pos <= eodata); return eodata - pos; }

  bool pad_lines( size_t goal ) {
    while( nline() < goal ) {
      if( pos == eodata ) return false;
      *pos++ = '\n';
    }
    return true;
  }

  void show() const {
    gcc_assert(data <= pos);
    dbgmsg("flex input buffer: '%.*s'\n[xelf]", int(pos - data), data);
  }
  void dump() const {
#ifdef GETENV_OK
    if( getenv("lexer_input") ) show();
#endif
  }
};

static inline bool is_p( char ch ) { return TOUPPER(ch) == 'P'; }

static bool
is_program_id( const char *p, const char *eol ) {
  static const std::string program_id("PROGRAM-ID");
  auto eop = p + program_id.size();
  if( eop < eol ) {
    // PROGRAM-ID must be followed by a dot, perhaps with intervening whitespace.
    for( const char *dot=eop; dot < eol && *dot != '.'; dot++ ) {
      if( !ISSPACE(*dot) ) return false;
    }
    std::string line (p, eop);
    std::transform(line.begin(), line.end(), line.begin(), ::toupper);
    return line == program_id;
  }
  return false;
}

const char * esc( size_t len, const char input[] );

static bool
is_word_char( char ch ) {
  switch(ch) {
  case '$':
  case '-':
  case '_':
    return true;
  }
  return ISALNUM(ch);
}

static bool
is_numeric_char( char ch ) {
  return ISDIGIT(ch)
    || TOUPPER(ch) == 'E'
    || ch == '.'
    || ch == ','
    ;
}

static bool
is_numeric_term( span_t term ) {
  gcc_assert(term.p);
  if( term.p[0] == '+' || term.p[0] == '-' ) term.p++;
  auto p = std::find_if( term.p, term.pend,
                         []( char ch ) {
                           return ! is_numeric_char(ch);
                         } );
  return p == term.pend;
}

struct replacing_term_t {
  bool matched, done;
  span_t leading_trailing, term, stmt;

  explicit replacing_term_t(const char input[])
    : matched(false), done(false), stmt(span_t(input, input))
    {}
};

extern YYLTYPE yylloc;

static const char *
last_newline (const char *p, const char *pend ) {
  size_t len = pend - p;
  return static_cast<const char *>( memrchr( p, '\n', len ) );
}
/*
 * For some statement parsed with regex_search, set yyloc to indicate the line
 * and column spans of the term.  Assume stmt begins at the start of a line.
 */
static void
update_yylloc( const csub_match& stmt, const csub_match& term ) {
  gcc_assert(stmt.first <= term.first && term.second <= stmt.second);

  class dump_loc_on_exit {
   public:
    dump_loc_on_exit() {
      if( gcobol_getenv( "update_yylloc" ) )
        location_dump( "update_yylloc", __LINE__, "begin", yylloc);
    }
    ~dump_loc_on_exit() {
      if( gcobol_getenv( "update_yylloc" ) )
        location_dump( "update_yylloc", __LINE__, "end  ", yylloc);
    }
  } dloe;

  size_t nline = std::count( stmt.first, term.second, '\n' );
  size_t n = std::count( term.first, term.second, '\n' );

  if( nline ) {
    yylloc.last_line += nline;
    yylloc.first_line = yylloc.last_line - n;
  }

  /*
   * Set the column span for the term.
   */
  const char *p = last_newline(stmt.first, stmt.second);
  if( !p ) { // no newlines in entire statement
    yylloc.first_column = (term.first - stmt.first) + 1;
    yylloc.last_column =  (term.second - stmt.first) + 1;
    return;
  }

  p = last_newline(stmt.first, term.first);
  if( !p ) { // no newlines before term
    yylloc.first_column = (term.first - stmt.first) + 1;
    p = last_newline(term.first, term.second);
    gcc_assert(p); // newline must be in term
    yylloc.last_column = (term.second - p) + 1;
    return;
  }

  const char *bol = p; // bol points to last newline before term

  yylloc.first_column = term.first - bol;
  p = last_newline(term.first, term.second);
  if( p ) { // term has newlines, too
    yylloc.last_column =  (p - term.first);
  } else {
    yylloc.last_column = yylloc.first_column + term.length();
  }
}

static replacing_term_t
parse_replacing_term( const char *stmt, const char *estmt ) {
  gcc_assert(stmt); gcc_assert(estmt); gcc_assert(stmt <= estmt);
  replacing_term_t output(stmt);

  static const char pattern[] =
    "^([[:space:];,]+(LEADING|TRAILING|BY))?"    // 1, 2
    "[[:space:];,]+" // leading space between pairs
    "("                                        // 3
            "(\""  "([\"]{2}|[^\"])*" "\")"    // 4, 5
        "|" "('"   "([']{2}|[^'])*"   "')"     // 6, 7
        "|" "("    "[+-]?[[:alnum:]]+([_-]+[[:alnum:]]+)*" ")" // 8, 9
        "|" "(==(" "(=?[^=]+)*" ")==)"         // 10, 11, 12
    ")"
    "(([[:space:];,]+[\"'=[:alnum:]+-]{1,2})" "|" "[[:space:];,]*([.]))?"  // 13, 14, 15
    ;

  static regex re(pattern, extended_icase);
  cmatch cm;

  if( ! regex_search( stmt, estmt, cm, re) ) return output;

  bool replacing_term = cm[2].matched && TOUPPER(cm[2].first[0]) == 'B';

  if( cm[2].matched && ! replacing_term ) {
    output.leading_trailing = cm[2];
  }

  // Apply such that quoted matches supersede word matches.
  if( cm[11].matched ) output.term = cm[11];
  if( cm[ 8].matched ) output.term = cm[ 8];
  if( cm[ 6].matched ) output.term = cm[ 6];
  if( cm[ 4].matched ) output.term = cm[ 4];

  // The matched segment extends to the end of the matched term, or to
  // the dot at end of statement.  Include the pseudotext ==, if found.
  output.stmt = span_t(cm[0].first, output.term.pend);
  if( cm[10].matched ) output.stmt.pend = cm[10].second;

  if( cm[15].matched && ISSPACE(cm[15].second[0]) ) { // matched end of statement
    output.done = output.matched = true;
    output.stmt = cm[0];
    gcc_assert(output.stmt.pend[-1] == '.');
    dbgmsg("%s:%d: done at '%.*s'", __func__, __LINE__,
           output.term.size(), output.term.p);
    return output;
  }

  if( is_numeric_term(output.term) ) {
    output.matched = output.stmt.p < output.term.p;
    gcc_assert(output.matched);
    // look for fractional part
    if( is_numeric_char(*output.term.pend) && ISDIGIT(output.term.pend[1]) ) {
      gcc_assert(!ISDIGIT(*output.term.pend));
      auto p = std::find_if(++output.term.pend, estmt,
                            []( char ch ) { return !ISDIGIT(ch); } );
      output.stmt.pend = output.term.pend = p;
      output.done = '.' == output.stmt.pend[0] && ISSPACE(output.stmt.pend[1]);
      if( output.done ) output.stmt.pend++;
    }
    dbgmsg("%s:%d: %s '%.*s'", __func__, __LINE__,
           output.done? "done at" : "term is",
           output.term.size(), output.term.p);
    return output;
  }

  if( yy_flex_debug ) { // should be looking only for words
    dbgmsg("%s:%d: not done, working with '%.*s'", __func__, __LINE__,
          cm[0].length(), cm[0].first);
    int i=0;
    for( auto m : cm ) {
      if( m.matched )
        dbgmsg("%4d) '%.*s'", i, m.length(), m.first);
      i++;
    }
  }

  if( !cm[8].matched ) {
    output.matched = output.stmt.p < output.term.p;
    gcc_assert(output.matched);
    dbgmsg("%s:%d: term is '%.*s'", __func__, __LINE__,
           output.term.size(), output.term.p);
    return output;
  }

  bool extraneous_replacing = 'R' == TOUPPER(cm[8].first[0]); // maybe
  if( extraneous_replacing ) { // prove it
    static const char replacing[] = "REPLACING";
    for( size_t i=0; i < strlen(replacing); i++ ) {
      if( replacing[i] != TOUPPER(cm[8].first[i]) ) {
        extraneous_replacing = false;
        break;
      }
    }
    if( extraneous_replacing ) {
      update_yylloc( cm[0], cm[8] );
      cbl_message(LexReplaceE, "syntax error: invalid '%.*s'",
                  cm[8].length(), cm[8].first);
      output.matched = false;
      return output;
    }
  }

  gcc_assert(cm[8].matched);
  gcc_assert(0 < output.term.size());

  dbgmsg("%s:%d: more words starting at '%.80s'", __func__, __LINE__,
         output.term.pend);

  static const char term_pattern[] =
    "^[[:space:]]+"
    "(" "(IN|OF)[[:space:]]+" ")"                            // 1, 2
    "(" "[+-]?[[:alnum:]]+([$_-]+[[:alnum:]]+)*" ")"         // 3, 4
    "(" "[[:space:]]*[(]" ")?"                               // 5
    "(([[:space:]]+[\"'=[:alnum:]+-]{1,2})" "|" "[[:space:]]*([.]))?"  // 6, 7, 8
    ;
  static const char paren_pattern[] =
    "^[[:space:]]*"
    "(" "[()][^()]*[()]" ")"    // 1
    "(([[:space:]]+[\"'=[:alnum:]+-]{1,2})" "|" "[[:space:]]*([.]))?"  // 2, 3, 4
    ;

  regex term_re(term_pattern, extended_icase);
  regex paren_re(paren_pattern, extended_icase);
  ssize_t nsub = 0;

  while( regex_search( output.term.pend, estmt, cm, term_re) ) {
    output.stmt.pend = output.term.pend = cm[3].second; // found a word
    if( cm[5].matched ) break; // found left parenthesis
    const csub_match& done(cm[8]);
    if( done.matched ) {
      output.done = output.matched = output.stmt.p < output.term.p;
      gcc_assert(output.done);
      goto matched;
    }
  }

  // match subscripts, if any
  while( regex_search( output.term.pend, estmt, cm, paren_re) ) {
    output.stmt.pend = output.term.pend = cm[1].second;
    if( cm[1].first[0]   == '(' ) nsub++;
    if( cm[1].first[0]   == ')' ) nsub--;
    if( cm[1].second[-1] == '(' ) nsub++;
    if( cm[1].second[-1] == ')' ) nsub--;

    const csub_match& done(cm[4]);
    if( done.matched ) {
      output.matched = output.stmt.p < output.term.p;
      output.stmt.pend = done.second;
      output.done = output.stmt.pend[-1] == '.';
      goto matched;
    }

    if( nsub == 0 ) break;
  }

 matched:
  output.matched = output.stmt.p < output.term.p;

  if( yy_flex_debug ) {
    const char *status = "unmatched";
    if( output.matched ) status = output.done? "done" : "matched";
    dbgmsg("%s:%d: %s term is '%.*s'", __func__, __LINE__, status,
           output.term.size(), output.term.p? output.term.p : "");
  }
  return output;
}

struct replacing_pair_t {
  span_t leading_trailing, stmt;
  replace_t replace;

  bool matched() const { return 0 < stmt.size(); }
  bool done() const { return matched() && stmt.pend[-1] == '.'; }
};
static replacing_pair_t
parse_replacing_pair( const char *stmt, const char *estmt ) {
  replacing_pair_t pair;

  pair.replace = replace_t();
  auto parsed = parse_replacing_term( stmt, estmt ); // before
  if( parsed.matched ) {
    if( parsed.term.size() == 0 ) return pair; // failure: empty before string
    pair.leading_trailing = parsed.leading_trailing;
    pair.stmt = parsed.stmt;
    pair.replace.before = parsed.term;

    if( !parsed.done ) {
      parsed = parse_replacing_term( pair.stmt.pend, estmt ); // after
      if( parsed.matched ) {
        pair.stmt.pend = parsed.stmt.pend;
        pair.replace.after = parsed.term;
      } else {
        dbgmsg("%s:%d: not matched '%.*s'", __func__, __LINE__,
               pair.stmt.size(), pair.stmt.p);
      }
    }
    if( yy_flex_debug ) {
      const char *status = "unmatched";
      if( pair.matched() ) status = pair.done()? "done" : "matched";
      dbgmsg("%s:%d: [%s] replacing '%.*s' with '%.*s'", __func__, __LINE__,
             status,
             pair.replace.before.size(), pair.replace.before.p,
             pair.replace.after.size(), pair.replace.after.p);
    }
  } else {
    for( auto p = stmt; (p = std::find(p, estmt, '.')) < estmt; p++ ) {
      if( ISSPACE(p[1]) ) {
        pair.stmt = span_t(stmt, ++p);
        break;
      }
    }
    if( pair.stmt.p ) {
      cbl_message(LexReplaceE, "LEX syntax error '%.*s'", (int)pair.stmt.size(), pair.stmt.p);
    }
    else {
      // This eliminated a compiler warning about "format-overflow"
      cbl_message(LexReplaceE, "LEX syntax error");
    }
    pair.stmt = span_t(size_t(0), stmt);
    pair.replace = replace_t();
  }
  return pair;
}

static std::pair<std::list<replace_t>, char *>
parse_replace_pairs( const char *stmt, const char *estmt, bool is_copy_stmt ) {
  std::list<replace_t> pairs ;

  static const char     any_ch[] = ".";
  static const char    word_ch[] = "[[:alnum:]$_-]";
  static const char nonword_ch[] = "[^[:alnum:]\"'$_-]";

  // Pattern to find one REPLACE pseudo-text pair
  static const char replace_pattern[] =
    "([[:space:]]+(LEADING|TRAILING))?"    // 1, 2
    "[[:space:]]+"
    "==(" "(=?[^=]+)+" ")=="               // 3, 4
    "[[:space:]]+BY[[:space:]]+"
    "==(" "(=?[^=]+)*" ")=="               // 5, 6
    "(([[:space:]]+[\"'=[:alnum:]+-]{1,2})" "|" "[[:space:]]*([.]))?"  // 7, 8, 9
    ;

  regex pair_re(replace_pattern, extended_icase);
  cmatch cm;
  replacing_pair_t parsed;
  bool end_of_stmt = false;

  for( auto p = stmt; p < estmt && !end_of_stmt; p = parsed.stmt.pend ) {
    if( is_copy_stmt ) {
      parsed = parse_replacing_pair(p, estmt);
      if( parsed.replace.before.size() == 0 ) break; // empty before
      if( parsed.replace.after.p == NULL ) break;    // invalid after
      end_of_stmt = parsed.done();
    } else {
      if( ! regex_search( p, estmt, cm, pair_re) ) break;
      // Report findings.
      if( false && yy_flex_debug ) {
        for( size_t i=0; i < cm.size(); i++ ) {
          dbgmsg("%s: %s " HOST_SIZE_T_PRINT_UNSIGNED ": '%.*s'", __func__,
                 cm[i].matched? "Pair" : "pair",
                 (fmt_size_t)i,
                 cm[i].matched? int(cm[i].length()) : 0,
                 cm[i].matched? cm[i].first : "");
        }
      }
      gcc_assert(cm[3].matched);
      gcc_assert(cm[5].matched);
      parsed.leading_trailing = cm[2];
      parsed.replace.before   = cm[3];
      parsed.replace.after    = cm[5];

      parsed.stmt = cm[0];
      // If not done, exclude trailing portion from statement match.
      if( !parsed.done() && cm[8].matched ) {
        gcc_assert(!cm[9].matched);
        parsed.stmt.pend = cm[8].first;
      }
    }

    span_t& before(parsed.replace.before);
    const span_t& after(parsed.replace.after);

    const char *befter[2] = { nonword_ch, nonword_ch };
    gcc_assert(before.p < before.pend);
    if( !is_word_char(before.p[0]) )     befter[0] = any_ch;
    if( !is_word_char(before.pend[-1]) ) befter[1] = any_ch;

    const char *src = esc(before.size(), before.p);

    if( parsed.leading_trailing.size() > 0 ) {
      switch( TOUPPER(parsed.leading_trailing.p[0]) ) {
      case 'L': // leading
        befter[1] = word_ch;
        break;
      case 'T': // trailing
        befter[0] = word_ch;
        break;
      default:
        gcc_unreachable();
      }
      dbgmsg("%s:%d: dealing with %.*s", __func__, __LINE__,
             int(parsed.leading_trailing.size()), parsed.leading_trailing.p);
    }

    src = xasprintf("%s(%s)%s", befter[0], src, befter[1]);

    struct {  span_t before, after; }  output;
    output.before = span_t(strlen(src), src);
    output.after = after.dup();

    gcc_assert(!before.has_nul());
    pairs.push_back( replace_t( output.before, output.after ) );

    // COPY REPLACING matches end-of-statment here
    // REPLACE matched end-of-statement in caller, and estmt[-1] == '.'
    if( is_copy_stmt && parsed.stmt.pend[-1] == '.' ) break;
  }

  if( yy_flex_debug ) {
    dbgmsg( "%s:%d: %s: " HOST_SIZE_T_PRINT_UNSIGNED " pairs parsed from  '%.*s'",
            __func__, __LINE__,
            parsed.done() ? "done" : "not done",
            (fmt_size_t)pairs.size(), parsed.stmt.size(),
            parsed.stmt.size() ? parsed.stmt.p : "" );
    int i = 0;
    for( const auto& replace : pairs ) {
      dbgmsg("%s:%d:%4d: '%s' => '%s'", __func__, __LINE__,
             ++i, replace.before.p, replace.after.p);
    }
  }
  if( !parsed.done() ) {
    pairs.clear();
    return std::make_pair(pairs, const_cast<char*>(stmt));
  }

  return std::make_pair(pairs, const_cast<char*>(parsed.stmt.pend));
}

struct copy_descr_t {
  bool parsed;
  int fd;
  size_t nreplace;
  span_t partial_line, erased_lines;

  copy_descr_t( const char *line, const char *eol)
    : parsed(false), fd(-1), nreplace(0), partial_line(line, eol) {}
};

static YYLTYPE
location_in( const filespan_t& mfile, const csub_match& cm ) {
  YYLTYPE loc {
    int(mfile.lineno() + 1), int(mfile.colno() + 1),
    int(mfile.lineno() + 1), int(mfile.colno() + 1)
  };
  gcc_assert(mfile.cur <= cm.first && cm.second <= mfile.eodata);
  auto nline = std::count(cm.first, cm.second, '\n');
  if( nline ) {
    gcc_assert(nline < loc.first_line);
    loc.first_line -= nline;
    auto p = static_cast<const char*>(memrchr(cm.first, '\n', cm.length()));
    loc.last_column = (cm.second) - p;
  }
  location_dump(__func__, __LINE__, "copy?", loc);
  return loc;
}

static copy_descr_t
parse_copy_directive( filespan_t& mfile ) {
  static const char *most_recent_buffer;
  static span_t copy_stmt(mfile.eodata, mfile.eodata);

  static const char pattern[] =
    "COPY" "[[:space:]]+"
    /* 1 */    "("
    /*2,3*/            "\"("  "([\"]{2}|[^\"])+" ")\""
    /*4,5*/        "|" "'("   "([']{2}|[^'])+"   ")[']"
    /*6,7*/        "|" "("    "[[:alnum:]]+([_-]+[[:alnum:]]+)*" ")"
    /*   */    ")"
    /* 8 */    "("
    /* 9 */        "[[:space:]]+(OF|IN)[[:space:]]+"
    /* 10*/        "("
    /*11,12*/              "(\""  "([\"]{2}|[^\"])+" "\")"
    /*13,14*/          "|" "('"   "([']{2}|[^'])+"   "')"
    /*15,16*/          "|" "("    "[[:alnum:]]+([_-]+[[:alnum:]]+)*" ")"
    /*   */        ")"
    /*   */    ")?"
    /*17,18*/  "([[:space:]]+SUPPRESS([[:space:]]+PRINTING)?)?"
    /*19,20 */ "(" "([[:space:]]*[.])" "|" "[[:space:]]+REPLACING" ")"
    ;

  static regex re(pattern, extended_icase);
  cmatch cm;
  copy_descr_t outcome(mfile.cur, mfile.cur);

  // COPY appears in current buffer?
  if( most_recent_buffer != mfile.data || copy_stmt.p < mfile.cur ) {
    most_recent_buffer = mfile.data;
    copy_stmt.p = mfile.eodata;

    if( regex_search(mfile.ccur(),
                     const_cast<const char *>(mfile.eodata), cm, re) ) {
      copy_stmt = span_t( cm[0].first, cm[0].second );
      if( yy_flex_debug ) {
        size_t nnl = 1 + count_newlines(mfile.data, copy_stmt.p);
        size_t nst = 1 + count_newlines(copy_stmt.p, copy_stmt.pend);
        dbgmsg("%s:%d: line " HOST_SIZE_T_PRINT_UNSIGNED
               ": COPY directive is " HOST_SIZE_T_PRINT_UNSIGNED " lines '%.*s'",
               __func__, __LINE__,
               (fmt_size_t)nnl, (fmt_size_t)nst, copy_stmt.size(), copy_stmt.p);
      }
    }
  }

  // If COPY appears on the current line, parse it completely this time.
  if( mfile.cur <= copy_stmt.p &&
                   copy_stmt.p < mfile.eol ) {
    outcome.parsed = regex_search(copy_stmt.p, copy_stmt.pend, cm, re);
    gcc_assert(outcome.parsed);
    outcome.partial_line = span_t(mfile.cur, copy_stmt.p);

    if( yy_flex_debug ) {
      dbgmsg(HOST_SIZE_T_PRINT_UNSIGNED " expressions",
             (fmt_size_t)std::count(pattern, pattern + sizeof(pattern), '('));
      int i = 0;
      for( const auto& m : cm ) {
        if( m.matched )
          dbgmsg("%s:%d: %2d: '%.*s'", __func__, __LINE__,
                i, int(m.length()), m.first);
        i++;
      }
    }

    auto& copybook_name = cm[1];
    auto& library_name = cm[10];

    bool replacing = !cm[20].matched;

    if( library_name.matched ) {
      YYLTYPE loc = location_in( mfile, library_name );
      copybook.library( loc, xstrndup(library_name.first, library_name.length()) );
    }
    YYLTYPE loc = location_in( mfile, copybook_name );
    outcome.fd = copybook.open( loc, xstrndup(copybook_name.first,
                                              copybook_name.length()) );
    if( outcome.fd == -1 ) { // let parser report missing copybook
      dbgmsg("%s:%d: (no copybook '%s' found)", __func__, __LINE__, copybook.source());
      return outcome;
    }

    if( replacing ) {
      std::pair<std::list<replace_t>, char*>
        result = parse_replace_pairs( cm[0].second, mfile.eodata, true );

      const std::list<replace_t>& replacements(result.first);
      outcome.parsed = (outcome.nreplace = replacements.size()) > 0;
      if( outcome.parsed ) {
        replace_directives.push(replacements);
      }
      copy_stmt.pend = result.second;

      // Maybe we don't need these.  We'll see.
      for( const auto& r : replacements ) {
        copybook.replacement(pseudo_e, r.before.dup().p, r.after.dup().p);
      }
    }

    // If the parse failed, pass it through to the parser for analysis.
    if( outcome.parsed ) {
      erase_line( const_cast<char*>(copy_stmt.p),
                  const_cast<char*>(copy_stmt.pend));
      outcome.erased_lines = copy_stmt;
    }

    mfile.eol = const_cast<char*>(copy_stmt.pend);
    mfile.next_line();
  }
  return outcome;
}

static char *
parse_replace_last_off( const filespan_t& mfile ) {
  static const char pattern[] =
    "REPLACE" "[[:space:]]+"
    "(LAST[[:space:]]+)?OFF[[:space:]]*[.]"
    ;
  static regex re(pattern, extended_icase);
  cmatch cm;

  // REPLACE [LAST] OFF?
  bool found = regex_search(mfile.ccur(),
                            const_cast<const char *>(mfile.eodata), cm, re);
  gcc_assert(found); // caller ensures

  gcc_assert(cm.size() == 2);
  // LAST OFF removes most recent REPLACE
  if( cm[1].matched ) {
    gcc_assert(TOUPPER(cm[1].first[0]) == 'L');
    if( ! replace_directives.empty() ) {
      replace_directives.pop();
    }
  } else { // OFF clears the REPLACE stack
    while( ! replace_directives.empty() ) {
      replace_directives.pop();
    }
  }

  dbgmsg( "%s:%d: line " HOST_SIZE_T_PRINT_UNSIGNED ": parsed '%.*s', ",
          __func__, __LINE__,
          (fmt_size_t)mfile.lineno(), int(cm[0].length()), cm[0].first );

  // Remove statement from input
  erase_line(const_cast<char*>(cm[0].first),
             const_cast<char*>(cm[0].second));

  return const_cast<char*>(cm[0].second);
}

static span_t
parse_replace_text( filespan_t& mfile ) {
  static const char pattern[] =
    /* 0 */    "REPLACE"
    /* 1 */    "([[:space:]]+ALSO)?"
    /* 2 */    "("
    /*3,4*/      "([[:space:]]+(LEADING|TRAILING))?"
    /* 5 */      "([[:space:]]+"
    /* 6 */         "==" "(=?[^=]+)+" "=="
    /*   */         "[[:space:]]+BY[[:space:]]+"
    /* 7 */         "==" "(=?[^=]+)*" "=="
    /*   */      ")"
    /*   */    ")+[[:space:]]*[.]"
    ;
  static regex re(pattern, extended_icase);
  cmatch cm;
  const size_t current_lineno(mfile.lineno());

  if( false && yy_flex_debug ) {
    auto pend = mfile.eol;
    gcc_assert(mfile.line_length() > 2);
    if( pend[-1] == '\n' ) pend -= 2;
    auto len = int(pend - mfile.cur);
    dbgmsg("%s:%d: line " HOST_SIZE_T_PRINT_UNSIGNED ": parsing '%.*s",
           __func__, __LINE__,
           (fmt_size_t)current_lineno, len, mfile.cur);
  }

  if( ! regex_search(mfile.ccur(), mfile.eodata, cm, re) ) {
    dbgmsg( "%s:%d: line " HOST_SIZE_T_PRINT_UNSIGNED
            ": not a REPLACE statement:\n'%.*s'",
            __func__, __LINE__, (fmt_size_t)current_lineno,
            int(mfile.line_length()), mfile.cur );
    return span_t();
  }

  // Report findings.
    if( yy_flex_debug ) {
      dbgmsg(HOST_SIZE_T_PRINT_UNSIGNED " expressions",
             (fmt_size_t)std::count(pattern, pattern + sizeof(pattern), '('));
      int i = 0;
      for( const auto& m : cm ) {
        if( m.matched )
          dbgmsg("%s:%d: %2d: '%.*s'", __func__, __LINE__,
                i, int(m.length()), m.first);
        i++;
      }
    }

  gcc_assert(cm.size() > 7);

  // Update active REPLACE stack
  if( ! cm[1].matched ) { // ALSO pushes, else clear stack and push one.
    while( !replace_directives.empty() ) {
      replace_directives.pop();
    }
  } else {
    gcc_assert(TOUPPER(cm[1].first[0]) == 'A');
  }

  span_t replace_stmt(cm[0].first, cm[0].second);

  std::pair<std::list<replace_t>, char*>
        result = parse_replace_pairs(replace_stmt.p, replace_stmt.pend, false);
  const std::list<replace_t>& replacements(result.first);
  replace_directives.push( replacements );

  if( yy_flex_debug ) {
    dbgmsg( "%s:%d: line " HOST_SIZE_T_PRINT_UNSIGNED ": " HOST_SIZE_T_PRINT_UNSIGNED
            " pairs parsed from  '%.*s'", __func__, __LINE__,
            (fmt_size_t)current_lineno, (fmt_size_t)replacements.size(),
            int(replace_stmt.size()), replace_stmt.p );
    for( const auto& replace : replacements ) {
      int i = 0;
      dbgmsg("%s:%d:%4d: '%s' => '%s'", __func__, __LINE__,
            ++i, replace.before.p, replace.after.p);
    }
  }

  // Remove statement from input
  erase_line(const_cast<char*>(replace_stmt.p),
             const_cast<char*>(replace_stmt.pend));

  return replace_stmt;
}

static span_t
parse_replace_directive( filespan_t& mfile ) {
  static const char *most_recent_buffer, *next_directive;
  static bool off_coming_up;
  static const char pattern[] =
    "REPLACE" "[[:space:]]+" "(LAST|OFF|ALSO|LEADING|TRAILING|==)";

  static regex re(pattern, extended_icase);
  cmatch cm;

  // REPLACE appears in current buffer?
  if( most_recent_buffer != mfile.data || next_directive < mfile.cur ) {
    most_recent_buffer = mfile.data;
    next_directive = mfile.eodata;

    if( regex_search(mfile.ccur(),
                     const_cast<const char *>(mfile.eodata), cm, re) ) {
      gcc_assert(cm[1].matched);
      next_directive = cm[0].first;

      switch( TOUPPER(cm[1].first[0]) ) {
      case 'L':
        off_coming_up = 'A' == TOUPPER(cm[1].first[1]); // LAST OFF, else LEADING
        break;
      case 'O': // OFF
        off_coming_up = true;
        break;
      case 'A': case 'T': case '=': // [ALSO] [ eading/Trailing] == ...
        off_coming_up = false;
        break;
      default:
        gcc_unreachable();
      }
    }
  }

  span_t erased;
  // REPLACE appears on current line?
  if( mfile.cur <= next_directive &&
                   next_directive < mfile.eol ) {
    if( off_coming_up ) {
      parse_replace_last_off(mfile);
    } else {
      erased = parse_replace_text(mfile);
    }
  }
  return erased;
}

/*
 * Maintain the number of newlines by counting those that will be
 * overwritten, and appending them to the appended line.  Return the
 * new EOL pointer.
 *
 * The newlines accumulate past eodata, at the start of the blank
 * lines created by the caller.
 */
char *
bytespan_t::append( const char *input, const char *eoinput ) {
  gcc_assert(data < eodata);

#define LEXIO 0
#if LEXIO
  auto nq = std::count_if(data, eodata, isquote);
  dbgmsg("%s:%3d:  input ------ '%.*s'", __func__, __LINE__, int(eoinput - input), input);
  dbgmsg("%s:%3d:  precondition '%.*s' (" HOST_SIZE_T_PRINT_UNSIGNED ": %s)",
         __func__, __LINE__,
         int(size()), data, (fmt_size_t)nq, in_string()? "in string" : "not in string");
#endif
  if( !in_string() ) { // Remove trailing space unless it's part of a literal.
    while(data < eodata && ISSPACE(eodata[-1])) eodata--;
    gcc_assert(ISSPACE(eodata[0]));
    gcc_assert(data == eodata || !ISSPACE(eodata[-1]));
  }
  // skip leading blanks
  while( input < eoinput && ISSPACE(*input) ) input++;
  if( isquote(*input) ) input++;

  size_t len = eoinput - input;
  char * pend = eodata + len;

  int nnl =   std::count(eodata,   pend, '\n');   // newlines to be overwritten
  gcc_assert(0 == std::count(input, eoinput, '\n'));  // newlines in input

  memmove(eodata, input, len);
  nnl += std::count(pend, pend + nnl, '\n'); // other newlines to be overwritten
  std::fill(pend, pend + nnl, '\n');

  eodata = pend;

#if LEXIO
  dbgmsg("%s:%3d: postcondition '%.*s'", __func__, __LINE__, int(size() + len) - 1, data);
#endif

  return eodata;
}

const char * cobol_filename();

static filespan_t&
mapped_file( FILE *input ) {
  static std::map<int, filespan_t> inputs;

  int fd = fileno(input);
  gcc_assert(fd > 0);

  filespan_t& mfile = inputs[fd];
  if( mfile.data ) {
    return mfile;
  }

  struct stat sb;
  if( 0 != fstat(fd, &sb) ) {
    cbl_err( "%s: could not stat fd %d", __func__, fd );
  }

  mfile.use_nada();

  if( sb.st_size > 0 ) {
    static const int flags = MAP_PRIVATE;

    void *p = mmap(0, sb.st_size, PROT_READ|PROT_WRITE, flags, fd, 0);
    if( p == MAP_FAILED ) {
      cbl_err( "%s: could not map fd %d", __func__, fd );
    }

    mfile.lineno_reset();
    mfile.data = mfile.cur = mfile.eol = mfile.eodata = static_cast<char*>(p);
    mfile.eodata += sb.st_size;
  }
  return mfile;
}

char filespan_t::empty_file[8] = "      \n";

static void unmap_file( filespan_t& mfile  ) {
  if( ! mfile.nada() ) {
    munmap(mfile.data, mfile.size() - 1);
  }
  mfile = filespan_t();
}

extern int yylineno;

static void
print_lexer_input( const char *buf, const char *ebuf ) {
  const char *eol, *lexio = getenv("lexio");
  int i;
  static int nbuf = 1;
  static FILE *output = NULL;

  if( !lexio ) return;
  if( !output ) {
    output = fopen( lexio, "w" );
    if( !output ) output = stderr;
  }

  fprintf( output, "*> buffer %d\n", nbuf );
  for( i = 0, eol = std::find(buf, ebuf, '\n');
       eol != ebuf; buf = eol, eol = std::find(buf, ebuf, '\n'), i++ ) {
    eol++;
    fprintf( output, "%5d %.*s", yylineno + i, int(eol - buf), buf );
  }
  if( buf < ebuf ) {
    fprintf( output, "%5d %.*s", yylineno + i, int(eol - buf), buf );
  }
  fprintf( output, "*> endbuf %d\n", nbuf++ );
  fflush(output);
}

/*
 * Fill about as much of the lexer's buffer as possible, except skip
 * leading blanks on blank lines.
 */
int
lexer_input( char buf[], int max_size, FILE *input ) {
  filespan_t& mfile( mapped_file(input) );

  if( mfile.cur == mfile.eodata ) {
    if( mfile.cur ) unmap_file(mfile);
    return 0;
  }

  gcc_assert( mfile.data <= mfile.cur && mfile.cur < mfile.eodata );

  char *next = std::min(mfile.eodata, mfile.cur + max_size);
  buffer_t output(buf, buf + max_size); // initializes pos

  // Fill output, keeping only NL for blank lines.
  for( auto p = mfile.cur; p < next; *output.pos++ = *p++ ) {
    static bool at_bol = false;
    if( at_bol ) {
      auto nonblank_l = std::find_if( p, next,
                                    []( char ch ) {
                                      return !isblank(ch); } );
      if( nonblank_l + 1 < next ) {
        if( *nonblank_l == '\r' ) nonblank_l++; // Windows
        if( *nonblank_l == '\n' ) {
          p = nonblank_l;
          continue;
        }
      }
    }
    at_bol = *p == '\n';
  }

  gcc_assert(  output.pos <= output.eodata );
  output.eodata = output.pos;

  mfile.cur = next;
  gcc_assert(mfile.cur <= mfile.eodata);

  // Buffer full or input exhausted.
  print_lexer_input(output.data, output.eodata);

  return output.size();
}

static const char *
find_filter( const char filter[] ) {

  if( 0 == access(filter, X_OK) ) {
    return filter;
  }

  const char *path = getenv("PATH");
  if( ! path ) return NULL;
  char *p = xstrdup(path), *eopath = p + strlen(p);

  while( *p != '\0' ) {
    auto pend = std::find( p, eopath, ':' );
    if( *pend == ':' ) *pend++ = '\0';

    char *name = xasprintf( "%s/%s", p, filter );

    if( 0 == access(name, X_OK) ) {
      return name;
    }
    p = pend;
  }
  return NULL;
}

bool verbose_file_reader = false;

typedef std::pair <char *, std::list<std::string> > preprocessor_filter_t;
static std::list<preprocessor_filter_t> preprocessor_filters;
static std::list<const char *> included_files;

/*
 * Keep a list of files added with -include on the command line.
 */
bool
include_file_add(const char filename[]) {
  struct stat sb;
  if( -1 == stat(filename, &sb) ) return false;
  included_files.push_back(filename);
  return true;
}

bool
preprocess_filter_add( const char input[] ) {
  std::list <std::string> options;
  std::string filter(input);
  size_t pos = filter.find(",");

  if( pos != filter.npos ) {
    std::vector<char> others( filter.size() - pos, '\0' );
    std::copy( filter.begin() + pos + 1, filter.end(), others.begin() );
    filter.resize(pos);
    char *optstr = others.data();
    for( char *opt = optstr + 1; (opt = strtok(opt, ",")); opt = NULL ) {
      options.push_back(opt);
    }
  }

  auto filename = find_filter(filter.c_str());
  if( !filename ) {
    cbl_message(LexPreprocessE, "preprocessor '%s/%s' not found",
                getcwd(NULL, 0), filter.c_str());
    return false;
  }
  preprocessor_filters.push_back( std::make_pair(xstrdup(filename), options) );
  return true;
}

void
cdftext::echo_input( int input, const char filename[] ) {
  int fd;
  if( -1 == (fd = dup(input)) ) {
      cbl_message(LexPreprocessE, "could not open preprocessed file "
                                 "%s to echo to standard output", filename );
      return;
  }

  auto mfile = map_file(fd);

  if( -1 == write(STDOUT_FILENO, mfile.data, mfile.size()) ) {
    cbl_message(LexPreprocessE, "could not write preprocessed file %s to standard output",
          filename );
  }
  if( -1 == munmap(mfile.data, mfile.size()) ) {
    cbl_message(LexPreprocessE, "could not release mapped file" );
  }
  if( -1 == close(fd) ) {
    cbl_message(LexPreprocessE, "could not close mapped file" );
  }
}

static inline ino_t
inode_of( int fd ) {
  struct stat sb;
  if( -1 == fstat(fd, &sb) ) {
    cbl_err( "could not stat fd %d", fd);
  }
  return sb.st_ino;
}

FILE *
cdftext::lex_open( const char filename[] ) {
  int input = open_input( filename );
  if( input == -1 ) return NULL;

  int output = open_output();

  // Process any files supplied by the -include command-line option.
  for( auto name : included_files ) {
    if( -1 == (input = open(name, O_RDONLY)) ) {
      cbl_message(LexIncludeE, "cannot open %<-include%> file %qs", name);
      continue;
    }
    cobol_filename(name, inode_of(input));
    filespan_t mfile( free_form_reference_format( input ) );

    process_file( mfile, output );

    cobol_filename_restore(); // process_file restores only for COPY
  }
  included_files.clear();

  cobol_filename(filename, inode_of(input));
  filespan_t mfile( free_form_reference_format( input ) );

  process_file( mfile, output );

  if( lexer_echo() ) {
    echo_input(output, filename);
  }

  for( auto filter_pair : preprocessor_filters ) {
    input  = output;
    output = open_output();

    char *filter = filter_pair.first;
    std::list<std::string>& options = filter_pair.second;

    std::vector <char*> argv(2 + options.size(), NULL);
    argv[0] = filter;

    auto last_argv = std::transform( options.begin(), options.end(), argv.begin() + 1,
                                     []( const std::string& opt ) {
                                       return xstrdup(opt.c_str());
                                     } );
    *last_argv = NULL;

    pid_t pid = fork();

    switch(pid){
    case -1: cbl_err( "%s", __func__);
      break;
    case 0: // child
      if( -1 == dup2(input, STDIN_FILENO) ) {
        cbl_err( "%s: could not dup input", __func__);
      }
      if( -1 == dup2(output, STDOUT_FILENO) ) {
        cbl_err( "%s: could not dup output", __func__);
      }
      if( -1 == lseek(STDIN_FILENO, SEEK_SET, 0) ) {
        cbl_err( "%s: could not seek to start of file", __func__);
      }
      int erc;
      if( -1 == (erc = execv(filter, argv.data())) ) {
        cbl_message(LexPreprocessE, "could not execute %s", filter);
      }
      _exit(erc);
    }
    int status;
    auto kid = wait(&status);
    gcc_assert(pid == kid);
    if( kid == -1 ) cbl_err( "failed waiting for pid %ld", static_cast<long>(pid));

    if( WIFSIGNALED(status) ) {
      cbl_errx( "%s pid %ld terminated by %s",
                filter, static_cast<long>(kid), strsignal(WTERMSIG(status)) );
    }
    if( WIFEXITED(status) ) {
      if( (status = WEXITSTATUS(status)) != 0 ) {
        cbl_errx( "%s exited with status %d",
             filter, status);
      }
    }
    cbl_message(LexIncludeOkN, "applied %s", filter );
  }

  return fdopen( output, "r");
}

int
cdftext::open_input( const char filename[] ) {
  int fd = open(filename, O_RDONLY);
  if( fd == -1 ) {
    dbgmsg( "could not open '%s': %s", filename, xstrerror(errno) );
  }

  verbose_file_reader = NULL != getenv("GCOBOL_TEMPDIR");

  if( verbose_file_reader ) {
    cbl_message(LexInputN, "verbose: opening %s for input", filename);
  }
  return fd;
}

int
cdftext::open_output() {
  char *name = getenv("GCOBOL_TEMPDIR");

  if( name && 0 != strcmp(name, "/") ) {
    int fd;
    char * stem = xasprintf("%sXXXXXX", name);
    if( -1 == (fd = mkstemp(stem)) ) {
      cbl_err( "could not open temporary file '%s' (%s)",
               name, realpath(name, stem));
    }
    return fd;
  }

  FILE *fh = tmpfile();
  if( !fh ) {
    cbl_err("could not create temporary file");
  }

  return fileno(fh);
}

filespan_t
cdftext::map_file( int fd ) {
  gcc_assert(fd > 0);

  filespan_t mfile;
  mfile.use_nada();

  struct stat sb;
  do {
    if( 0 != fstat(fd, &sb) ) {
      cbl_err( "%s: could not stat fd %d", __func__, fd );
    }
    if( S_ISFIFO(sb.st_mode) ) {
      // Copy FIFO to regular file that can be mapped.
      int input = open_output();
      std::swap(fd, input); // fd will continue to be the input
      static char block[4096 * 4];
      ssize_t n;
      while( (n = read(input, block, sizeof(block))) != 0 ) {
        ssize_t nout = write(fd, block, n);
        if( nout != n ) {
          cbl_err( "%s: could not prepare map file from FIFO %d",
              __func__, input);
        }
        if( false ) dbgmsg("%s: copied " HOST_SIZE_T_PRINT_DEC " bytes from FIFO",
                                __func__, (fmt_size_t)nout);
      }
    }
  } while( S_ISFIFO(sb.st_mode) );

  if( sb.st_size > 0 ) {
    static const int flags = MAP_PRIVATE;

    void *p = mmap(0, sb.st_size, PROT_READ|PROT_WRITE, flags, fd, 0);
    if( p == MAP_FAILED ) {
      cbl_err( "%s: could not map fd %d", __func__, fd );
    }

    mfile.lineno_reset();
    mfile.data = mfile.cur = mfile.eol = mfile.eodata = static_cast<char*>(p);
    mfile.eodata += sb.st_size;
  }

  return mfile;
}

bool lexio_dialect_mf();

/*
 * A valid sequence area is 6 digits or blanks at the begining of the line that
 * contains PROGRAM-ID. Return NULL if no valid sequence area, else return
 * pointer to BOL.
 */
static const char *
valid_sequence_area( const char *data, const char *eodata ) {

  for( const char *p = data;
       (p = std::find_if(p, eodata, is_p)) != eodata;
       p++ )
  {
    auto eol = std::find(p, eodata, '\n');
    if( p == data || ISSPACE(p[-1]) ) {
      if( is_program_id(p, eol) ) {  // found program-id token
	const char *bol = p;
	for( ; data <= bol-1 && bol[-1] != '\n'; --bol )
	  ;
	if( 6 < p - bol ) {
	  if( std::all_of(bol, bol+6, ::isdigit) ) {
	    return bol;
	  }
	  if( std::all_of(bol, bol+6, ::isblank) ) {
	    return bol;
	  }
	  break;
	}
      }
    }
  }
  return nullptr;
}

/*
 * Reference Format -- valid COBOL between columns 8 and 72 -- has data after
 * column 72 on the PROGRAM-ID line. Extended Reference Format (that allows
 * longer lines) has no reason to follow the PROGRAM-ID with more stuff.
 */
static bool
infer_reference_format( const char *bol, const char *eodata ) {
  assert(bol);
  auto eol = std::find(bol, eodata, '\n');
  if( 72 < eol - bol ) {
    return ! std::all_of(bol + 72, eol, ::isspace);
  }
  return false;
}

filespan_t
cdftext::free_form_reference_format( int input ) {
  filespan_t source_buffer = map_file(input);
  filespan_t mfile(source_buffer);

  /*
   * current_line_t describes the segment of mapped file that is the
   * "current line" being processed.  Its only use is for line
   * continuation, whether string literals or not.
   */
  struct current_line_t {
    size_t lineno;
    bytespan_t line;
    // construct with length zero
    explicit current_line_t( char data[] ) : lineno(0), line(data, data) {}
  } current( mfile.data );

  /*
   * Infer source code format.
   */
  if( cdf_source_format().inference_pending()  ) {
    const char *bol = valid_sequence_area(mfile.data, mfile.eodata);
    if( bol ) {
      cdf_source_format().infer( bol, infer_reference_format(bol, mfile.eodata) );
    }
  }

  while( mfile.next_line() ) {
    check_push_pop_directive(mfile);
    check_source_format_directive(mfile);
    remove_inline_comment(mfile.cur, mfile.eol);

    if( mfile.is_blank_line() ) continue;

    char *indcol = indicated(mfile.cur, mfile.eol); // true only for fixed
    //                                              // format

    if( is_fixed_format() && !indcol ) { // short line
      erase_source(mfile.cur, mfile.eol);
    }

    if( indcol ) {
      // Set to blank columns 1-6 and anything past the right margin.
      erase_source(mfile.cur, indcol);
      if( is_reference_format() ) {
        if( mfile.cur + right_margin() <  mfile.eol ) {
          auto p = std::find(mfile.cur + right_margin(), mfile.eol, '\n');
          erase_source(mfile.cur + right_margin(), p);
        }
      }

      switch( TOUPPER(*indcol) ) {
      case '-':
        gcc_assert(0 < current.line.size());
        /*
         * The "current line" -- the line being continued -- may be many
         * lines earlier (with many intervening newlines) or may intrude
         * on its succeeding line.  Erase the continuation line.
         */
        {
          char *pend = mfile.eol;
          if( right_margin() ) {
            pend = std::min(mfile.cur + right_margin(), mfile.eol);
          }
          // The appended segment has no newline because the erased line retains
          // one.
          pend = std::find(indcol + 1, pend, '\n');
          char *p = current.line.append(indcol + 1, pend );
          if( (p = std::max(p, mfile.cur)) < mfile.eol ) {
            erase_source(p, mfile.eol);
          }
        }
        continue;
      case SPACE:
        break;
      case 'D':
        /*
         * Pass the D to the lexer, because WITH DEBUGGING MODE is
         * parsed in the parser.  This assumes too strict a rule: that
         * all the source is in one format. In fact, DEBUGGING MODE
         * could be set on, and >>SOURCE-FORMAT can switch back and
         * forth. To solve that, we'd have to parse WITH DEBUGGING MODE
         * in free_form_reference_format(), which is a lot of work for
         * an obsolete feature.
         */
        break;
      case '*':
      case '/':
        if( indcol < mfile.eol - 1 ) {
          erase_source(indcol, mfile.eol);
        }
        continue;
      case '$':
        if( lexio_dialect_mf() ) {
          break;
        }
        __attribute__ ((fallthrough));
      default: // flag other characters in indicator area
        if( ! ISSPACE(indcol[0]) ) {
          yyerrorvl( mfile.lineno(), cobol_filename(),
                     "error: stray indicator '%c' (0x%x): \"%.*s\"",
                     indcol[0], indcol[0],
                     int(mfile.line_length() - 1), mfile.cur );
          *indcol = SPACE;
        }
        break;
      }
    }
    current.line.update(mfile.cur, mfile.eol, right_margin());
    current.lineno = mfile.lineno();
  } // next line

  return source_buffer;
}

/*
 * process_file is a recursive routine that opens and processes
 * included files.  It uses the input file stack in two ways: to check
 * copybook uniqueness, and (via the lexer) to keep track filenames
 * and line numbers.
 *
 * When reading copybook files, the copybook object enforces the rule
 * that no copybook may include itself, even indirectly. It does that
 * by relying on the unique_stack to deny a push.  Because the reader
 * makes no attempt to count lines, line numbers in the input stack
 * are all 1 at this point.
 *
 * When returning from the top-level recursion, the input stack has
 * the original file's name on top, with depth 1.  At that point, the
 * lexer begins tokenizing the input.
 *
 * The input stream sent to the lexer is delimited by #FILE tokens
 * denoting the source filename.  As far as the lexer is concerned,
 * there's only ever one file: the name passed to lex_open() when we
 * kicked things off.  But messages and the debugger need to know
 * which file and line each statment appeared in.
 *
 * The lexer uses the input stack to keep track of names and
 * numbers. The top of the input file stack is the current file
 * context, initially set to line 1. When the lexer sees a push, it
 * updates the top-of-stack with the current line number, yylineno,
 * and then pushes the copybook filename with line 1. When it sees a
 * pop, the current file is popped, of course; its line number no
 * longer matters. Then the top-of-stack is used to update the current
 * cobol filename and yylineno.
 */
void
cdftext::process_file( filespan_t mfile, int output, bool second_pass ) {
  static size_t nfiles = 0;

  __gnu_cxx::stdio_filebuf<char> outbuf(fdopen(output, "a"), std::ios::out);
  std::ostream out(&outbuf);
  std::ostream_iterator<char> ofs(out);

  // indicate current file
  static const char file_push[] = "\f#FILE PUSH ", file_pop[] = "\f#FILE POP\f";

  if( !included_files.empty() ) { ++nfiles; }; // force push/pop of included filename
  if( !second_pass && nfiles++ ) {
    static const char delimiter[] = "\f";
    const char *filename = cobol_filename();
    std::copy(file_push, file_push + strlen(file_push), ofs);
    std::copy(filename, filename + strlen(filename), ofs);
    std::copy(delimiter, delimiter + strlen(delimiter), ofs);
    out.flush();
  }

  // parse CDF directives
  while( mfile.next_line() ) {
    yylloc = mfile.as_location();
    auto copied = parse_copy_directive(mfile);
    if( copied.parsed && copied.fd != -1 ) {
      gcc_assert(copied.erased_lines.p);
      std::copy_if(copied.erased_lines.p, copied.erased_lines.pend, ofs,
                   []( char ch ) { return ch == '\n'; } );
      struct { int in, out; filespan_t mfile; } copy;
      dbgmsg("%s:%d: line " HOST_SIZE_T_PRINT_UNSIGNED ", opening %s on fd %d",
             __func__, __LINE__, (fmt_size_t)mfile.lineno(),
             copybook.source(), copybook.current()->fd);
      copy.in = copybook.current()->fd;
      copy.mfile = free_form_reference_format( copy.in );

      if( copied.partial_line.size() ) {
        std::copy(copied.partial_line.p, copied.partial_line.pend, ofs);
      }
      out.flush();

      if( copied.nreplace == 0 ) {
        // process with extant REPLACE directive
        process_file(copy.mfile, output);
      } else {
        copy.out = open_output();
        // process to intermediate, applying COPY ... REPLACING
        process_file(copy.mfile, copy.out);
        copy.mfile = map_file(copy.out);
        replace_directives.pop();
        // process intermediate with extant REPLACE directive
        process_file(copy.mfile, output, true);
        // COPY statement is erased from input if processed successfully
      }
      cobol_filename_restore();
    }

    auto erased = parse_replace_directive(mfile);
    if( erased.p ) {
      std::copy_if( erased.p, erased.pend, ofs,
                    []( char ch ) { return ch == '\n'; } );
    }
    if( replace_directives.empty() ) {
      std::copy(mfile.cur, mfile.eol, ofs);
      continue; // No active REPLACE directive.
    }

    std::list<span_t> segments = segment_line(mfile);

    for( const auto& segment : segments ) {
      std::copy(segment.p, segment.pend, ofs);
    }

    out.flush();
  }
  // end of file
  if( !second_pass && --nfiles ) {
    std::copy(file_pop, file_pop + strlen(file_pop), ofs);
    out.flush();
  }
  if( !included_files.empty() ) { --nfiles; };
}

std::list<span_t>
cdftext::segment_line( filespan_t& mfile ) {
  std::list<span_t> output;

  gcc_assert( ! replace_directives.empty() );
  std::list<replace_t> pending;
  recognize_replacements( mfile, pending );

  if( pending.empty() ) {
    output.push_back( span_t(mfile.cur, mfile.eol) );
    return output;
  }

  /*
   * If the replacement changes the number of lines in the replaced text, we
   * need to reset the line number, because the next statement is on a
   * different line in the manipulated text than in the original.  Before each
   * replacement, set the original line number.  After each replacement, set
   * the line number after the elided text on the next line.
   */
  for( const replace_t& segment : pending ) {
    gcc_assert(mfile.cur <= segment.before.p);
    gcc_assert(segment.before.pend <= mfile.eodata);

    struct { unsigned long ante, post; } lineno = {
      gb4(mfile.lineno()), gb4(mfile.lineno() + segment.after.nlines())
    };
    const char *directive = lineno.ante == lineno.post?
      nullptr : xasprintf("\n#line %lu \"%s\"\n",
                          lineno.ante, cobol_filename());

    if( directive )
      output.push_back( span_t(strlen(directive), directive) );
    output.push_back( span_t(mfile.cur, segment.before.p) );
    output.push_back( span_t(segment.after.p, segment.after.pend ) );
    if( directive )
      output.push_back( span_t(strlen(directive), directive) );

    mfile.cur = const_cast<char*>(segment.before.pend);
  }

  if( mfile.eol < mfile.cur ) {
    if( (mfile.eol = std::find(mfile.cur, mfile.eodata, '\n')) < mfile.eodata ) {
      mfile.eol++;
    }
  }

  // last segment takes to EOL
  output.push_back( span_t(mfile.cur, mfile.eol) );

  return output;
}