1 files changed, 151 insertions, 106 deletions
diff --git a/gcc/cobol/lexio.cc b/gcc/cobol/lexio.cc
index a992166..2d9fb72 100644
--- a/gcc/cobol/lexio.cc
+++ b/gcc/cobol/lexio.cc
@@ -46,8 +46,22 @@ static struct {
     first_file = false;
     return tf;
   }
+  inline bool is_fixed() const { return column == 7; }
+  inline bool is_reffmt() const { return is_fixed() && right_margin == 73; }
+  inline bool is_free() const { return ! is_fixed(); }
+  
+  // Name of the source format in effect, most specific first.
+  const char * description() const {
+    if( is_reffmt() ) return "REFERENCE";  // fixed, right margin 73
+    const bool fixed = is_fixed();         // otherwise FIXED or FREE
+    return fixed? "FIXED" : "FREE";
+  }
 } indicator = { true, false, 0, 0 };
 
+// public source format test functions
+bool is_fixed_format() { return indicator.is_fixed(); }  // indicator in column 7
+bool is_reference_format() { return indicator.is_reffmt(); }  // fixed, right margin 73
+
 static bool debug_mode = false;
 
 /*
@@ -86,10 +100,6 @@ cobol_set_indicator_column( int column )
   indicator.column = column;
 }
 
-bool is_fixed_format() { return indicator.column == 7; }
-bool is_reference_format() {
-  return indicator.column == 7 && indicator.right_margin == 73;
-}
 bool include_debug()      { return indicator.column == 7 && debug_mode; }
 bool set_debug( bool tf ) { return debug_mode = tf && is_fixed_format(); }
 
@@ -113,7 +123,7 @@ continues_at( char *bol, char *eol ) {
 // Return pointer to indicator column. Test ch if provided.
 // NULL means no indicator column or tested value not present.
 static inline char *
-indicated( char *bol, char *eol, char ch = '\0' ) {
+indicated( char *bol, const char *eol, char ch = '\0' ) {
   if( indicator.column == 0 && *bol != '*' ) {
     return NULL;  // no indicator column in free format, except for comments
   }
@@ -130,10 +140,10 @@ indicated( char *bol, char *eol, char ch = '\0' ) {
 
 static char *
 remove_inline_comment( char *bol, char *eol ) {
-  static char ends = '\0';
   char *nl = std::find(bol, eol, '\n');
 
   if( bol < nl ) {
+    static char ends = '\0';
     std::swap(*nl, ends);
     char *comment = strstr(bol, "*>");
     if( comment ) {
@@ -198,10 +208,10 @@ maybe_add_space(const span_t& pattern, replace_t& recognized) {
   }
 
   if( befter[0] == blank || befter[1] == blank ) {
-    char *s = xasprintf( "%s%.*s%s",
-                         befter[0],
-                         recognized.after.size(), recognized.after.p,
-                         befter[1] );
+    const char *s = xasprintf( "%s%.*s%s",
+                               befter[0],
+                               recognized.after.size(), recognized.after.p,
+                               befter[1] );
     recognized.after = span_t(s, s + strlen(s));
   }
 }
@@ -256,7 +266,9 @@ recognize_replacements( filespan_t mfile, std::list<replace_t>& pending_replacem
 
       span_t found(mfile.eodata, mfile.eodata);
 
-      if( regex_search( mfile.ccur(), (const char *)mfile.eodata, cm, re) ) {
+      if( regex_search( mfile.ccur(),
+                        const_cast<const char *>(mfile.eodata),
+                        cm, re) ) {
         gcc_assert(cm[1].matched);
         found = span_t( cm[1].first, cm[1].second );
         if( yy_flex_debug ) {
@@ -291,7 +303,8 @@ recognize_replacements( filespan_t mfile, std::list<replace_t>& pending_replacem
     bol = next.found.pend;
 
     if( yy_flex_debug ) {
-      size_t n = std::count((const char *)mfile.data, recognized.before.p, '\n');
+      size_t n = std::count(const_cast<const char *>(mfile.data),
+                            recognized.before.p, '\n');
       dbgmsg( "%s:%d: line " HOST_SIZE_T_PRINT_UNSIGNED
               " @ " HOST_SIZE_T_PRINT_UNSIGNED ": '%s'\n/%.*s/%.*s/",
               __func__, __LINE__,
@@ -307,10 +320,11 @@ recognize_replacements( filespan_t mfile, std::list<replace_t>& pending_replacem
     next.found = span_t(mfile.eodata, mfile.eodata);
 
     regex re(next.directive.before.p, extended_icase);
-    if( regex_search(bol, (const char *)mfile.eodata, cm, re) ) {
+    if( regex_search(bol, const_cast<const char *>(mfile.eodata), cm, re) ) {
       gcc_assert(cm[1].matched);
       next.found = span_t( cm[1].first, cm[1].second );
-      size_t n = std::count((const char *)mfile.data, next.found.p, '\n');
+      size_t n = std::count(const_cast<const char *>(mfile.data),
+                            next.found.p, '\n');
       if( false )
         dbgmsg("%s:%d next '%.*s' will be on line " HOST_SIZE_T_PRINT_UNSIGNED
                " (offset " HOST_SIZE_T_PRINT_UNSIGNED ")", __func__, __LINE__,
@@ -335,7 +349,7 @@ check_source_format_directive( filespan_t& mfile ) {
 
   // show contents of marked subexpressions within each match
   cmatch cm;
-  if( regex_search(p, (const char *)mfile.eol, cm, re) ) {
+  if( regex_search(p, const_cast<const char *>(mfile.eol), cm, re) ) {
     gcc_assert(cm.size() > 1);
     switch( cm[3].length() ) {
     case 4:
@@ -348,13 +362,14 @@ check_source_format_directive( filespan_t& mfile ) {
       gcc_assert(cm[3].length() == 4 || cm[3].length() == 5);
       break;
     }
-    mfile.cur = const_cast<char*>(cm[0].second);
+
     dbgmsg( "%s:%d: %s format set, on line " HOST_SIZE_T_PRINT_UNSIGNED,
             __func__, __LINE__,
             indicator.column == 7? "FIXED" : "FREE",
             (fmt_size_t)mfile.lineno() );
-    erase_line(const_cast<char*>(cm[0].first),
-               const_cast<char*>(cm[0].second));
+    char *bol = indicator.is_fixed()? mfile.cur : const_cast<char*>(cm[0].first);
+    erase_line(bol, const_cast<char*>(cm[0].second));
+    mfile.cur = const_cast<char*>(cm[0].second);
   }
 }
 
@@ -393,33 +408,22 @@ struct buffer_t : public bytespan_t {
   }
 };
 
-static bool
-valid_sequence_area( const char *p, const char *eodata ) {
-  const char *pend = p + 6;
-  if ( eodata < pend ) return false;
+static inline bool is_p( char ch ) { return TOUPPER(ch) == 'P'; }  // 'P' or 'p'
 
-  for( ; p < pend; p++ ) {
-    if( ! (ISDIGIT(*p) || *p == SPACE) ) {
-      return false;
+static bool
+is_program_id( const char *p, const char *eol ) {  // PROGRAM-ID at p, any case?
+  static const std::string program_id("PROGRAM-ID");
+  auto eop = p + program_id.size();  // end of the candidate token
+  if( eop < eol ) {
+    // PROGRAM-ID must be followed by a dot, perhaps with intervening whitespace.
+    for( const char *dot=eop; dot < eol && *dot != '.'; dot++ ) {
+      if( !ISSPACE(*dot) ) return false;
     }
+    // Case-insensitive match in place; TOUPPER is defined for every char.
+    return std::equal( p, eop, program_id.begin(),
+                       []( char ch, char id ) { return TOUPPER(ch) == id; } );
   }
-  return true; // characters either digits or blanks
-}
-
-// Inspect the 2nd line for telltale signs of a NIST file.
-// If true, caller sets right margin to 73, indicating Reference Format
-static bool
-likely_nist_file( const char *p, const char *eodata ) {
-  if( (p = std::find(p, eodata, '\n')) == eodata ) return false;
-  if ( eodata < ++p + 80 ) return false;
-  p += 72;
-
-  return
-    ISALPHA(p[0]) && ISALPHA(p[1]) && 
-    ISDIGIT(p[2]) && ISDIGIT(p[3]) && ISDIGIT(p[4]) &&
-    p[5] == '4' &&
-    p[6] == '.' &&
-    p[7] == '2';
+  return false;
 }
 
 const char * esc( size_t len, const char input[] );
@@ -459,9 +463,9 @@ struct replacing_term_t {
   bool matched, done;
   span_t leading_trailing, term, stmt;
 
-  replacing_term_t(const char input[]) : matched(false), done(false) {
-    stmt = span_t(input, input);
-  }
+  explicit replacing_term_t(const char input[])
+    : matched(false), done(false), stmt(span_t(input, input))
+    {}
 };
 
 extern YYLTYPE yylloc;
@@ -531,7 +535,7 @@ update_yylloc( const csub_match& stmt, const csub_match& term ) {
 
 static replacing_term_t
 parse_replacing_term( const char *stmt, const char *estmt ) {
-  gcc_assert(stmt); gcc_assert(estmt); gcc_assert(stmt < estmt);
+  gcc_assert(stmt); gcc_assert(estmt); gcc_assert(stmt <= estmt);
   replacing_term_t output(stmt);
 
   static const char pattern[] =
@@ -741,7 +745,7 @@ parse_replacing_pair( const char *stmt, const char *estmt ) {
       }
     }
     if( pair.stmt.p ) {
-      yywarn("CDF syntax error '%*s'", (int)pair.stmt.size(), pair.stmt.p);
+      yywarn("CDF syntax error '%.*s'", (int)pair.stmt.size(), pair.stmt.p);
     }
     else {
       // This eliminated a compiler warning about "format-overflow"
@@ -809,7 +813,7 @@ parse_replace_pairs( const char *stmt, const char *estmt, bool is_copy_stmt ) {
     }
 
     span_t& before(parsed.replace.before);
-    span_t& after(parsed.replace.after);
+    const span_t& after(parsed.replace.after);
 
     const char *befter[2] = { nonword_ch, nonword_ch };
     gcc_assert(before.p < before.pend);
@@ -877,7 +881,7 @@ struct copy_descr_t {
 };
 
 static YYLTYPE
-location_in( const filespan_t& mfile, const csub_match cm ) {
+location_in( const filespan_t& mfile, const csub_match& cm ) {
   YYLTYPE loc {
     int(mfile.lineno() + 1), int(mfile.colno() + 1),
     int(mfile.lineno() + 1), int(mfile.colno() + 1)
@@ -928,7 +932,7 @@ parse_copy_directive( filespan_t& mfile ) {
     copy_stmt.p = mfile.eodata;
 
     if( regex_search(mfile.ccur(),
-                          (const char *)mfile.eodata, cm, re) ) {
+                     const_cast<const char *>(mfile.eodata), cm, re) ) {
       copy_stmt = span_t( cm[0].first, cm[0].second );
       if( yy_flex_debug ) {
         size_t nnl = 1 + count_newlines(mfile.data, copy_stmt.p);
@@ -981,7 +985,7 @@ parse_copy_directive( filespan_t& mfile ) {
       std::pair<std::list<replace_t>, char*>
         result = parse_replace_pairs( cm[0].second, mfile.eodata, true );
 
-      std::list<replace_t>& replacements(result.first);
+      const std::list<replace_t>& replacements(result.first);
       outcome.parsed = (outcome.nreplace = replacements.size()) > 0;
       if( outcome.parsed ) {
         replace_directives.push(replacements);
@@ -1008,7 +1012,7 @@ parse_copy_directive( filespan_t& mfile ) {
 }
 
 static char *
-parse_replace_last_off( filespan_t& mfile ) {
+parse_replace_last_off( const filespan_t& mfile ) {
   static const char pattern[] =
     "REPLACE" "[[:space:]]+"
     "(LAST[[:space:]]+)?OFF[[:space:]]*[.]"
@@ -1018,7 +1022,7 @@ parse_replace_last_off( filespan_t& mfile ) {
 
   // REPLACE [LAST] OFF?
   bool found = regex_search(mfile.ccur(),
-                                 (const char *)mfile.eodata, cm, re);
+                            const_cast<const char *>(mfile.eodata), cm, re);
   gcc_assert(found); // caller ensures
 
   gcc_assert(cm.size() == 2);
@@ -1073,7 +1077,7 @@ parse_replace_text( filespan_t& mfile ) {
            (fmt_size_t)current_lineno, len, mfile.cur);
   }
 
-  if( ! regex_search(mfile.ccur(), (const char *)mfile.eodata, cm, re) ) {
+  if( ! regex_search(mfile.ccur(), mfile.eodata, cm, re) ) {
     dbgmsg( "%s:%d: line " HOST_SIZE_T_PRINT_UNSIGNED
             ": not a REPLACE statement:\n'%.*s'",
             __func__, __LINE__, (fmt_size_t)current_lineno,
@@ -1109,7 +1113,7 @@ parse_replace_text( filespan_t& mfile ) {
 
   std::pair<std::list<replace_t>, char*>
         result = parse_replace_pairs(replace_stmt.p, replace_stmt.pend, false);
-  std::list<replace_t>& replacements(result.first);
+  const std::list<replace_t>& replacements(result.first);
   replace_directives.push( replacements );
 
   if( yy_flex_debug ) {
@@ -1147,7 +1151,7 @@ parse_replace_directive( filespan_t& mfile ) {
     next_directive = mfile.eodata;
 
     if( regex_search(mfile.ccur(),
-                          (const char *)mfile.eodata, cm, re) ) {
+                     const_cast<const char *>(mfile.eodata), cm, re) ) {
       gcc_assert(cm[1].matched);
       next_directive = cm[0].first;
 
@@ -1409,7 +1413,7 @@ preprocess_filter_add( const char input[] ) {
 
   auto filename = find_filter(filter.c_str());
   if( !filename ) {
-    yywarn("preprocessor '%s/%s' not found", getcwd(NULL, 0), filter);
+    yywarn("preprocessor '%s/%s' not found", getcwd(NULL, 0), filter.c_str());
     return false;
   }
   preprocessor_filters.push_back( std::make_pair(xstrdup(filename), options) );
@@ -1455,7 +1459,7 @@ cdftext::lex_open( const char filename[] ) {
 
   int output = open_output();
 
-  // Process any files supplied by the -include comamnd-line option.
+  // Process any files supplied by the -include command-line option.
   for( auto name : included_files ) {
     int input;
     if( -1 == (input = open(name, O_RDONLY)) ) {
@@ -1466,7 +1470,10 @@ cdftext::lex_open( const char filename[] ) {
     filespan_t mfile( free_form_reference_format( input ) );
 
     process_file( mfile, output );
+
+    cobol_filename_restore(); // process_file restores only for COPY
   }
+  included_files.clear();
 
   cobol_filename(filename, inode_of(input));
   filespan_t mfile( free_form_reference_format( input ) );
@@ -1488,7 +1495,7 @@ cdftext::lex_open( const char filename[] ) {
     argv[0] = filter;
 
     auto last_argv = std::transform( options.begin(), options.end(), argv.begin() + 1,
-                                     []( std::string& opt ) {
+                                     []( const std::string& opt ) {
                                        return xstrdup(opt.c_str());
                                      } );
     *last_argv = NULL;
@@ -1517,11 +1524,11 @@ cdftext::lex_open( const char filename[] ) {
     int status;
     auto kid = wait(&status);
     gcc_assert(pid == kid);
-    if( kid == -1 ) cbl_err( "failed waiting for pid %d", pid);
+    if( kid == -1 ) cbl_err( "failed waiting for pid %ld", static_cast<long>(pid));
 
     if( WIFSIGNALED(status) ) {
-      cbl_errx( "%s pid %d terminated by %s",
-           filter, kid, strsignal(WTERMSIG(status)) );
+      cbl_errx( "%s pid %ld terminated by %s",
+                filter, static_cast<long>(kid), strsignal(WTERMSIG(status)) );
     }
     if( WIFEXITED(status) ) {
       if( (status = WEXITSTATUS(status)) != 0 ) {
@@ -1539,7 +1546,7 @@ int
 cdftext::open_input( const char filename[] ) {
   int fd = open(filename, O_RDONLY);
   if( fd == -1 ) {
-    dbgmsg( "could not open '%s': %m", filename );
+    dbgmsg( "could not open '%s': %s", filename, xstrerror(errno) );
   }
 
   verbose_file_reader = NULL != getenv("GCOBOL_TEMPDIR");
@@ -1553,9 +1560,9 @@ cdftext::open_input( const char filename[] ) {
 int
 cdftext::open_output() {
   char *name = getenv("GCOBOL_TEMPDIR");
-  int fd;
 
   if( name && 0 != strcmp(name, "/") ) {
+    int fd;
     char * stem = xasprintf("%sXXXXXX", name);
     if( -1 == (fd = mkstemp(stem)) ) {
       cbl_err( "could not open temporary file '%s' (%s)",
@@ -1620,6 +1627,54 @@ cdftext::map_file( int fd ) {
 
 bool lexio_dialect_mf();
 
+/*
+ * A valid sequence area is 6 digits or 6 blanks at the beginning of the line
+ * that contains PROGRAM-ID, which must start more than 6 bytes into the line.
+ * Return NULL if no valid sequence area, else return pointer to BOL.
+ */
+static const char *
+valid_sequence_area( const char *data, const char *eodata ) {
+  // safe-ctype tests: the <cctype> functions are undefined for negative char
+  const auto digit = []( char ch ) { return bool(ISDIGIT(ch)); };
+  const auto blank = []( char ch ) { return bool(ISBLANK(ch)); };
+
+  for( const char *p = data;
+       (p = std::find_if(p, eodata, is_p)) != eodata;
+       p++ )
+  {
+    // a candidate starts a word: first byte of data, or follows whitespace
+    if( ! (p == data || ISSPACE(p[-1])) ) continue;
+
+    const char *eol = std::find(p, eodata, '\n');
+    if( ! is_program_id(p, eol) ) continue;
+    const char *bol = p;  // back up to the start of the PROGRAM-ID line
+    while( data < bol && bol[-1] != '\n' ) --bol;
+
+    if( 6 < p - bol ) {
+      // the first PROGRAM-ID found past column 7 decides the outcome
+      const bool valid = std::all_of(bol, bol+6, digit)
+                      || std::all_of(bol, bol+6, blank);
+      return valid? bol : nullptr;
+    }
+  }
+  return nullptr;
+}
+
+/*
+ * Reference Format -- valid COBOL between columns 8 and 72 -- has data after
+ * column 72 on the PROGRAM-ID line. Extended Reference Format (that allows
+ * longer lines) has no reason to follow the PROGRAM-ID with more stuff.
+ * Return true if the line at bol has non-whitespace beyond its 72nd byte.
+ */
+static bool
+infer_reference_format( const char *bol, const char *eodata ) {
+  gcc_assert(bol);
+  const char *eol = std::find(bol, eodata, '\n');
+  if( eol - bol <= 72 ) return false;  // nothing beyond column 72
+  // ISSPACE is safe for negative char values, unlike ::isspace
+  return std::any_of(bol + 72, eol, []( char ch ) { return !ISSPACE(ch); });
+}
+
 filespan_t
 cdftext::free_form_reference_format( int input ) {
   filespan_t source_buffer = map_file(input);
@@ -1634,31 +1689,23 @@ cdftext::free_form_reference_format( int input ) {
     size_t lineno;
     bytespan_t line;
     // construct with length zero
-    current_line_t( char data[] ) : lineno(0), line(data, data) {}
+    explicit current_line_t( char data[] ) : lineno(0), line(data, data) {}
   } current( mfile.data );
 
   /*
-   * If the format is not explicitly set on the command line, test the
-   * first 6 bytes of the first file to determine the format
-   * heuristically. If the first 6 characters are only digits or
-   * blanks, then the file is in fixed format.
+   * Infer source code format from the line that contains PROGRAM-ID.
    */
-
   if( indicator.inference_pending()  ) {
-    const char *p = mfile.data;
-    while( p < mfile.eodata ) {
-      const char * pend =
-        std::find(p, const_cast<const char *>(mfile.eodata), '\n');
-      if( 6 < pend - p ) break;
-      p = pend;
-      if( p < mfile.eodata) p++;
+    const char *bol = valid_sequence_area(mfile.data, mfile.eodata);
+    if( bol ) {
+      indicator.column = 7;
+      if( infer_reference_format(bol, mfile.eodata) ) {
+	indicator.right_margin = 73;
+      }
     }
-    if( valid_sequence_area(p, mfile.eodata) ) indicator.column = 7;
-    if( likely_nist_file(p, mfile.eodata) )    indicator.right_margin = 73;
 
-    dbgmsg("%s:%d: %s%s format detected", __func__, __LINE__,
-           indicator.column == 7? "FIXED" : "FREE",
-           indicator.right_margin == 73? "" : "-extended");
+    dbgmsg("%s:%d: %s format detected", __func__, __LINE__,
+           indicator.description());
   }
 
   while( mfile.next_line() ) {
@@ -1782,15 +1829,15 @@ cdftext::free_form_reference_format( int input ) {
 void
 cdftext::process_file( filespan_t mfile, int output, bool second_pass ) {
   static size_t nfiles = 0;
-  std::list<replace_t> replacements;
 
-  __gnu_cxx::stdio_filebuf<char> outbuf(fdopen(output, "w"), std::ios::out);
+  __gnu_cxx::stdio_filebuf<char> outbuf(fdopen(output, "a"), std::ios::out);
   std::ostream out(&outbuf);
   std::ostream_iterator<char> ofs(out);
 
   // indicate current file
   static const char file_push[] = "\f#FILE PUSH ", file_pop[] = "\f#FILE POP\f";
 
+  if( !included_files.empty() ) { ++nfiles; }; // force push/pop of included filename
   if( !second_pass && nfiles++ ) {
     static const char delimiter[] = "\f";
     const char *filename = cobol_filename();
@@ -1846,31 +1893,12 @@ cdftext::process_file( filespan_t mfile, int output, bool second_pass ) {
       continue; // No active REPLACE directive.
     }
 
-    std::list<span_t> segments = segment_line(mfile); // no replace yields
-    //                                                // 1 segment
+    std::list<span_t> segments = segment_line(mfile);
 
     for( const auto& segment : segments ) {
       std::copy(segment.p, segment.pend, ofs);
     }
 
-    if( segments.size() == 2 ) {
-      struct {
-        size_t before, after;
-        int delta() const { return before - after; } } nlines;
-      nlines.before = std::count(segments.front().p,
-                                 segments.front().pend, '\n');
-      nlines.after  = std::count(segments.back().p, segments.back().pend, '\n');
-      if( nlines.delta() < 0 ) {
-        yywarn("line %zu: REPLACED %zu lines with %zu lines, "
-              "line count off by %d", mfile.lineno(),
-              nlines.before, nlines.after, nlines.delta());
-      }
-      int nnl = nlines.delta();
-      while( nnl-- > 0 ) {
-        static const char nl[] = "\n";
-        std::copy(nl, nl + 1, ofs);
-      }
-    }
     out.flush();
   }
   // end of file
@@ -1878,6 +1906,7 @@ cdftext::process_file( filespan_t mfile, int output, bool second_pass ) {
     std::copy(file_pop, file_pop + strlen(file_pop), ofs);
     out.flush();
   }
+  if( !included_files.empty() ) { --nfiles; };
 }
 
 std::list<span_t>
@@ -1893,12 +1922,30 @@ cdftext::segment_line( filespan_t& mfile ) {
     return output;
   }
 
+  /*
+   * If the replacement changes the number of lines in the replaced text, we
+   * need to reset the line number, because the next statement is on a
+   * different line in the manipulated text than in the original.  Before each
+   * replacement, set the original line number.  After each replacement, set
+   * the line number after the elided text on the next line.
+   */
   for( const replace_t& segment : pending ) {
     gcc_assert(mfile.cur <= segment.before.p);
     gcc_assert(segment.before.pend <= mfile.eodata);
 
+    struct { unsigned long ante, post; } lineno = {
+      gb4(mfile.lineno()), gb4(mfile.lineno() + segment.after.nlines())
+    };
+    char *directive = lineno.ante == lineno.post?
+      nullptr : xasprintf("\n#line %lu \"%s\"\n",
+                          lineno.ante, cobol_filename());
+
+    if( directive ) 
+      output.push_back( span_t(strlen(directive), directive) );
     output.push_back( span_t(mfile.cur, segment.before.p) );
     output.push_back( span_t(segment.after.p, segment.after.pend ) );
+    if( directive ) 
+      output.push_back( span_t(strlen(directive), directive) );
 
     mfile.cur = const_cast<char*>(segment.before.pend);
   }
@@ -1914,5 +1961,3 @@ cdftext::segment_line( filespan_t& mfile ) {
 
   return output;
 }
-
-//////// End of the cdf_text.h file