diff options
Diffstat (limited to 'gcc/cobol/symbols.cc')
| -rw-r--r-- | gcc/cobol/symbols.cc | 292 |
1 files changed, 200 insertions, 92 deletions
diff --git a/gcc/cobol/symbols.cc b/gcc/cobol/symbols.cc index 2a299ce..07dc0e6 100644 --- a/gcc/cobol/symbols.cc +++ b/gcc/cobol/symbols.cc @@ -293,7 +293,7 @@ elementize( const cbl_field_t& field ) { // Dubner did the following because he didn't feel like creating yet another // cbl_field_t constructor that included the hardcoded encoding for the // global special registers. - sym.elem.field.codeset.encoding = iconv_CP1252_e; + sym.elem.field.codeset.set(); return sym; } @@ -511,6 +511,9 @@ symbol_elem_cmp( const void *K, const void *E ) case SymSpecial: return special_pair_cmp(k->elem.special, e->elem.special)? 0 : 1; break; + case SymLocale: + return strcasecmp(k->elem.locale.name, e->elem.locale.name); + break; case SymAlphabet: return strcasecmp(k->elem.alphabet.name, e->elem.alphabet.name); break; @@ -677,6 +680,22 @@ symbol_special( size_t program, const char name[] ) } struct symbol_elem_t * +symbol_locale( size_t program, const char name[] ) +{ + cbl_locale_t locale(name); + assert(strlen(name) < sizeof locale.name); + strcpy(locale.name, name); + + struct symbol_elem_t key(SymLocale, program), *e; + key.elem.locale = locale; + + e = static_cast<struct symbol_elem_t *>(lfind( &key, symbols.elems, + &symbols.nelem, sizeof(key), + symbol_elem_cmp ) ); + return e; +} + +struct symbol_elem_t * symbol_alphabet( size_t program, const char name[] ) { cbl_alphabet_t alphabet(YYLTYPE(), custom_encoding_e); // cppcheck-suppress syntaxError @@ -1510,11 +1529,11 @@ field_str( const cbl_field_t *field ) { { // Apparently we need to trace back the meaning of data.literal for // field::type == FldNumericDisplay - enc_from = DEFAULT_CHARMAP_SOURCE; + enc_from = DEFAULT_SOURCE_ENCODING; } init = __gg__iconverter(enc_from, - DEFAULT_CHARMAP_SOURCE, + DEFAULT_SOURCE_ENCODING, false_data, field->data.capacity, &charsout); @@ -1522,12 +1541,12 @@ field_str( const cbl_field_t *field ) { auto eoinit = init + strlen(init); char *s = xasprintf("'%s'", init); - // No NUL within the initial data. + // No NUL within the initial data. auto ok = std::none_of( init, eoinit, []( char ch ) { return ch == '\0'; } ); assert(ok); - // If any of the init are unprintable, provide a hex version. + // If any of the init are unprintable, provide a hex version. if( ! std::all_of(init, eoinit, fisprint) ) { if( is_elementary(field->type) && field->type != FldPointer ) { const size_t len = strlen(s) + 8 + 2 * field->data.capacity; @@ -1663,7 +1682,7 @@ symbols_alphabet_set( size_t program, const char name[]) { //// // Define alphabets for codegen. //// const cbl_alphabet_t *alphabet = nullptr; //// bool supported = true; -//// +//// //// std::for_each( symbols_begin(program), symbols_end(), //// [&alphabet, &supported]( const auto& sym ) { //// if( sym.type == SymAlphabet ) { @@ -1679,7 +1698,7 @@ symbols_alphabet_set( size_t program, const char name[]) { //// cbl_unimplemented("alphabet %qs (as %qs)", alphabet->name, encoding); //// return false; //// } -//// +//// //// // Set collation sequence before parser_symbol_add.` //// if( name ) { //// symbol_elem_t *e = symbol_alphabet(program, name); @@ -1906,38 +1925,46 @@ symbols_update( size_t first, bool parsed_ok ) { } } - if( ! field->codeset.valid() ) { - switch(field->type) { - case FldForward: - case FldInvalid: - gcc_unreachable(); - case FldAlphaEdited: - case FldAlphanumeric: - case FldClass: - case FldDisplay: - case FldGroup: - case FldLiteralA: - case FldNumericDisplay: - case FldNumericEdited: + if( ! field->codeset.consistent() ) { + if( ! field->codeset.valid() ) { + switch(field->type) { + case FldForward: + case FldInvalid: + gcc_unreachable(); + case FldAlphaEdited: + case FldAlphanumeric: + case FldClass: + case FldDisplay: + case FldGroup: + case FldLiteralA: + case FldLiteralN: + case FldNumericDisplay: + case FldNumericEdited: + if( ! (field->has_attr(register_e) || field->has_attr(hex_encoded_e)) ) { + error_msg(symbol_field_location(field_index(field)), + "internal: %qs encoding not defined", field->name); + } + break; + case FldConditional: + case FldFloat: + case FldIndex: + case FldNumericBin5: + case FldNumericBinary: + case FldPacked: + case FldPointer: + case FldSwitch: + break; + } + } else { if( ! (field->has_attr(register_e) || field->has_attr(hex_encoded_e)) ) { error_msg(symbol_field_location(field_index(field)), - "internal: %qs encoding not defined", field->name); + "internal: %qs encoding %qs inconsistent", + field->name, + cbl_alphabet_t::encoding_str(field->codeset.encoding) ); } - break; - case FldConditional: - case FldFloat: - case FldIndex: - case FldLiteralN: - case FldNumericBin5: - case FldNumericBinary: - case FldPacked: - case FldPointer: - case FldSwitch: - break; } } - assert( ! field->is_typedef() ); if( parsed_ok ) parser_symbol_add(field); @@ -2542,6 +2569,13 @@ symbol_file_add( size_t program, cbl_file_t *file ) { } symbol_elem_t * +symbol_locale_add( size_t program, const cbl_locale_t *locale ) { + symbol_elem_t sym{ SymLocale, program }; + sym.elem.locale = *locale; + return symbol_add(&sym); +} + +symbol_elem_t * symbol_alphabet_add( size_t program, const cbl_alphabet_t *alphabet ) { symbol_elem_t sym{ SymAlphabet, program }; sym.elem.alphabet = *alphabet; @@ -3202,19 +3236,56 @@ constant_of( size_t isym ) return field; } +cbl_locale_t::cbl_locale_t( const cbl_name_t name, const char iconv_name[] ) { + gcc_assert(strlen(name) < sizeof this->name); + strcpy(this->name, name); + + if( iconv_name ) { + encoding = __gg__encoding_iconv_type(iconv_name); + + strcpy(collation, "C"); + // If the iconv_name is prefixed by langauge_COUNTRY (e.g. en_US), capture that. + auto pend = iconv_name + strlen(iconv_name); + auto p = std::find(iconv_name, pend, '.'); + if( p < pend ) { + auto pend2 = std::copy(iconv_name, p, collation); + std::fill(pend2, collation + sizeof(collation), '\0'); + iconv_name = ++p; + } + encoding = __gg__encoding_iconv_type(iconv_name); + } +} + +cbl_alphabet_t::cbl_alphabet_t(const YYLTYPE& loc, size_t locale, cbl_name_t name ) + : loc(loc) + , locale(locale) + , low_index(0) + , high_index(255) + , last_index(0) +{ + if( locale > 0 ) { + encoding = cbl_locale_of(symbol_at(locale))->encoding; + } + memset(collation_sequence, 0xFF, sizeof(collation_sequence)); + if( name ) { // from Special-Names collation_sequence + assert(strlen(name) < sizeof(cbl_name_t)); + strcpy(this->name, name); + } +} + /* * As parsed, the alphabet reflects the encoding of the source code. If the * program uses a different encoding for alphanumeric, convert the alphabet to - * that. - * + * that. + * * Because a custom alphabet is rare and occurs at most only once per program, * we don't attempt to avoid re-encoding. "Conversion" of ASCII to ASCII is at - * most 256 calls to iconv(3). + * most 256 calls to iconv(3). */ void cbl_alphabet_t::reencode() { - const unsigned char * const pend = alphabet + sizeof(alphabet); + const unsigned char * const pend = collation_sequence + sizeof(collation_sequence); std::vector<char> tgt(256, (char)0xFF); /* Keep copies of low_index and last_index for use in run-time as LOW-VALUE @@ -3230,13 +3301,14 @@ cbl_alphabet_t::reencode() { * a custom alphabet are from NIST, which of course are ASCII. */ const char *fromcode = __gg__encoding_iconv_name(CP1252_e); - const char *tocode = __gg__encoding_iconv_name(current_encoding('A')); + const char *tocode = + __gg__encoding_iconv_name(current_encoding(display_encoding_e)); iconv_t cd = iconv_open(tocode, fromcode); - + #if optimal_reencode if( fromcode == tocode ) { // semantically tgt.resize(0); - return tgt; // Return empty vector; caller copies zero bytes. + return tgt; // Return empty vector; caller copies zero bytes. } #endif @@ -3247,14 +3319,14 @@ cbl_alphabet_t::reencode() { * that letter in the alphanumeric encoding, and set its collation position * in that alphabet. */ - for( const unsigned char *p = alphabet; p < pend; p++ ) { + for( const unsigned char *p = collation_sequence; p < pend; p++ ) { if( *p == 0xFF ) continue; - unsigned char ch = p - alphabet; + unsigned char ch = p - collation_sequence; unsigned char pos[8] = {}; size_t inbytesleft = 1, outbytesleft = sizeof(pos); char *inbuf = reinterpret_cast<char*>(&ch), *outbuf = reinterpret_cast<char*>(pos); - + size_t n = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); if( n == size_t(-1) ) { @@ -3273,7 +3345,7 @@ cbl_alphabet_t::reencode() { fromcode, ch, ch, n, tocode); continue; } - + if( ch == low_index ) { low_index = pos[0]; } @@ -3283,21 +3355,21 @@ cbl_alphabet_t::reencode() { if( ch == high_index ) { high_index = pos[0]; } - + tgt.at(pos[0]) = *p; } - - std::copy(tgt.begin(), tgt.end(), alphabet); + + std::copy(tgt.begin(), tgt.end(), collation_sequence); } bool cbl_alphabet_t::assign( const YYLTYPE& loc, unsigned char ch, unsigned char high_value ) { - if( alphabet[ch] == 0xFF || alphabet[ch] == high_value) { - alphabet[ch] = high_value; + if( collation_sequence[ch] == 0xFF || collation_sequence[ch] == high_value) { + collation_sequence[ch] = high_value; last_index = ch; return true; } - auto taken = alphabet[ch]; + auto taken = collation_sequence[ch]; error_msg(loc, "ALPHABET %s, character %<%c%> (X%'%x%') " "in position %d already defined at position %d", name, @@ -3310,7 +3382,7 @@ cbl_alphabet_t::assign( const YYLTYPE& loc, unsigned char ch, unsigned char high void cbl_alphabet_t::also( const YYLTYPE& loc, size_t ch ) { if( ch < 256 ) { - alphabet[ch] = alphabet[last_index]; + collation_sequence[ch] = collation_sequence[last_index]; if( ch == high_index ) high_index--; return; } // else it's a figurative constant ... @@ -3323,20 +3395,20 @@ cbl_alphabet_t::also( const YYLTYPE& loc, size_t ch ) { // last_index is already set; use it as the "last value before ALSO" if( attr & low_value_e ) { - alphabet[0] = alphabet[last_index]; + collation_sequence[0] = collation_sequence[last_index]; return; } if( attr & high_value_e ) { - alphabet[high_index--] = alphabet[last_index]; + collation_sequence[high_index--] = collation_sequence[last_index]; return; } if( attr & (space_value_e|quote_value_e) ) { ch = field->data.initial[0]; - alphabet[ch] = alphabet[last_index]; + collation_sequence[ch] = collation_sequence[last_index]; return; } if( attr & (zero_value_e) ) { - alphabet[0] = alphabet[last_index]; + collation_sequence[0] = collation_sequence[last_index]; error_msg(loc, "ALSO value '%s' is unknown", field->name); return; } @@ -3448,18 +3520,33 @@ new_literal_add( const char initial[], uint32_t len, } else { - static char empty[2] = "\0"; field = new_temporary_impl(FldLiteralA); field->attr |= attr; - field->data.initial = len > 0? initial : empty; + + if(len == 0) + { + // This will cover UTF-32, should that arise. + size_t nbytes = 4; + char *init = static_cast<char *>(xmalloc(nbytes)); + memset(init, 0, nbytes); + field->data.initial = init; + } + if(len) + { + char *init = static_cast<char *>(xmalloc(len+4)); + memcpy(init, initial, len); + memset(init+len, 0, 4); + field->data.initial = init; + } field->data.capacity = len; } if( ! field->has_attr(hex_encoded_e) ) { - field->codeset.set(encoding); - if( ! field->internalize() ) { - ERROR_FIELD(field, "inconsistent string literal encoding for '%s'", initial); + // If the literal bore a prefix, set the encoding, + if( encoding != cbl_field_t::codeset_t::source_encoding->type ) { + field->codeset.set(encoding); } + field->internalize(); } static size_t literal_count = 1; @@ -3595,6 +3682,14 @@ new_alphanumeric( size_t capacity, const cbl_name_t name = nullptr ) { extern os_locale_t os_locale; +const encodings_t cbl_field_t::codeset_t::source_encodings[2] = { + { false, iconv_UTF_8_e, "UTF-8" }, + { true, iconv_CP1252_e, "CP1252" }, +}; +const encodings_t * cbl_field_t::codeset_t::source_encoding = { + cbl_field_t::codeset_t::source_encodings +}; + const encodings_t cbl_field_t::codeset_t::standard_internal = { true, iconv_CP1252_e, "CP1252" }; @@ -3603,7 +3698,7 @@ const encodings_t cbl_field_t::codeset_t::standard_internal = { cbl_field_t * new_temporary( enum cbl_field_type_t type, const char *initial, bool is_signed ) { const bool force_unsigned = type == FldNumericBin5 && ! is_signed; - + if( ! initial && ! force_unsigned ) { assert( ! is_literal(type) ); // Literal type must have literal value. return temporaries.acquire(type, initial); @@ -3719,29 +3814,26 @@ cbl_field_t::is_ascii() const { * never reverts. */ -static const char * -guess_encoding() { - static const char *fromcode; - - if( ! fromcode ) { - return fromcode = os_locale.assumed; - } - - if( fromcode == os_locale.assumed ) { - fromcode = os_locale.codeset; - if( 0 != strcmp(fromcode, "C") ) { // anything but that - return fromcode; - } - } - - return standard_internal.name; -} - const char * cbl_field_t::internalize() { - static const char *fromcode = guess_encoding(); + /* The purpose of this routine is to return a nul-terminated string which + is data.initial converted from the source-code characters to the + codeset.encoding characters. + + The contract between this routine and the routines that call it is that + for alphanumeric types, data.initial shall have the same number of + characters as will be needed to fill data.capacity. + + Be aware that for PIC X(32) Z"foo", there are the characters "foo", + followed by a NUL, and then 28 spaces to fill it out. It turns out that + iconv, given a character count of 32, converts all 32, including the + embedded NUL. So, that case works even through strlen(initial) is + smaller than the length of initial, which is the same as capacity. + */ + + static const char *fromcode = codeset.source_encodings[0].name; static const size_t noconv = size_t(-1); - static std::map<std::string, iconv_t> tocodes; + static std::unordered_map<std::string, iconv_t> tocodes; if( ! codeset.valid() ) { dbgmsg("%s:%d: not converting %s", __func__, __LINE__, data.initial); @@ -3769,20 +3861,33 @@ cbl_field_t::internalize() { assert(0 == strlen(data.initial)); return data.initial; } - if( holds_ascii() && is_ascii() ) return data.initial; + if( holds_ascii() && is_ascii() ) { + if( type != FldNumericEdited ) { + if( ! data.initial_within_capacity() ) { + ERROR_FIELD(this, "%s %s VALUE %qs of %zu exceeds size %u", + cbl_field_t::level_str(level), name, data.initial, + strlen(data.initial), data.capacity ); + } + } + return data.initial; + } assert(data.capacity > 0); // The final 2 bytes of the output are "!\0". It's a debugging sentinel. size_t n; size_t inbytesleft = data.capacity; size_t outbytesleft = inbytesleft; - char *in = const_cast<char*>(data.initial); - char *out = static_cast<char*>( xcalloc(1, outbytesleft + 2) ), *output = out; if( !is_literal(this) && inbytesleft < strlen(data.initial) ) { inbytesleft = strlen(data.initial); } + if( type == FldNumericEdited ) { + outbytesleft = inbytesleft; + } const unsigned int in_len = inbytesleft; + char *in = const_cast<char*>(data.initial); + char *out = static_cast<char*>( xcalloc(1, outbytesleft + 2) ), *output = out; + assert(fromcode != tocode); /* @@ -3799,8 +3904,9 @@ cbl_field_t::internalize() { do { if( (n = iconv( cd, &in, &inbytesleft, &out, &outbytesleft)) == noconv ) { - if( fromcode == os_locale.assumed ) { - fromcode = standard_internal.name; + if( fromcode == codeset.source_encodings[0].name ) { + codeset.source_encoding = &codeset.source_encodings[1]; + fromcode = codeset.source_encoding->name; tocodes.clear(); cd = tocodes[toname] = iconv_open(tocode, fromcode); dbgmsg("%s: trying input encoding %s", __func__, fromcode); @@ -3813,7 +3919,7 @@ cbl_field_t::internalize() { if( n == noconv ) { size_t i = in_len - inbytesleft; - yywarn("failed to encode %s %qs as %s (%zu of %u bytes left)", + yyerror("failed to encode %s %qs as %s (%zu of %u bytes left)", fromcode, data.initial + i, tocode, inbytesleft, in_len); if( false ) return NULL; return data.initial; @@ -3821,7 +3927,7 @@ cbl_field_t::internalize() { if( 0 < inbytesleft ) { // data.capacity + inbytesleft is not correct if the remaining portion has - // multibyte characters. But the fact reamins that the VALUE is too big. + // multibyte characters. But the fact remains that the VALUE is too big. ERROR_FIELD(this, "%s %s VALUE '%s' requires %zu bytes for size %u", cbl_field_t::level_str(level), name, data.initial, data.capacity + inbytesleft, data.capacity ); @@ -3829,7 +3935,7 @@ cbl_field_t::internalize() { // Replace data.initial only if iconv output differs. if( 0 != memcmp(data.initial, output, out - output) ) { - assert(out <= output + data.capacity); + assert(out <= output + data.capacity || type == FldNumericEdited); dbgmsg("%s: converted '%.*s' to %s", __func__, data.capacity, data.initial, tocode); struct localspace_t { @@ -3858,14 +3964,16 @@ cbl_field_t::internalize() { data.capacity = out - output; // trailing '!' will be overwritten } // Pad with trailing blanks, tacking a '!' on the end. - for( const char *eout = output + data.capacity; + for( const char *eout = output + data.capacity; out < eout; out += spc.len ) { memcpy(out, spc.space, spc.len); } - out[0] = '!'; + // Numeric literal strings may have leading zeros, making their length + // longer than their capacity. + out[0] = type == FldLiteralN? '\0' : '!'; assert(out[1] == '\0'); - free(const_cast<char*>(data.initial)); + data.orig = data.initial; data.initial = output; } else { free(output); |
