aboutsummaryrefslogtreecommitdiff
path: root/gcc/cobol/symbols.cc
diff options
context:
space:
mode:
Diffstat (limited to 'gcc/cobol/symbols.cc')
-rw-r--r--gcc/cobol/symbols.cc292
1 files changed, 200 insertions, 92 deletions
diff --git a/gcc/cobol/symbols.cc b/gcc/cobol/symbols.cc
index 2a299ce..07dc0e6 100644
--- a/gcc/cobol/symbols.cc
+++ b/gcc/cobol/symbols.cc
@@ -293,7 +293,7 @@ elementize( const cbl_field_t& field ) {
// Dubner did the following because he didn't feel like creating yet another
// cbl_field_t constructor that included the hardcoded encoding for the
// global special registers.
- sym.elem.field.codeset.encoding = iconv_CP1252_e;
+ sym.elem.field.codeset.set();
return sym;
}
@@ -511,6 +511,9 @@ symbol_elem_cmp( const void *K, const void *E )
case SymSpecial:
return special_pair_cmp(k->elem.special, e->elem.special)? 0 : 1;
break;
+ case SymLocale:
+ return strcasecmp(k->elem.locale.name, e->elem.locale.name);
+ break;
case SymAlphabet:
return strcasecmp(k->elem.alphabet.name, e->elem.alphabet.name);
break;
@@ -677,6 +680,22 @@ symbol_special( size_t program, const char name[] )
}
struct symbol_elem_t *
+symbol_locale( size_t program, const char name[] )
+{
+ cbl_locale_t locale(name);
+ assert(strlen(name) < sizeof locale.name);
+ strcpy(locale.name, name);
+
+ struct symbol_elem_t key(SymLocale, program), *e;
+ key.elem.locale = locale;
+
+ e = static_cast<struct symbol_elem_t *>(lfind( &key, symbols.elems,
+ &symbols.nelem, sizeof(key),
+ symbol_elem_cmp ) );
+ return e;
+}
+
+struct symbol_elem_t *
symbol_alphabet( size_t program, const char name[] )
{
cbl_alphabet_t alphabet(YYLTYPE(), custom_encoding_e); // cppcheck-suppress syntaxError
@@ -1510,11 +1529,11 @@ field_str( const cbl_field_t *field ) {
{
// Apparently we need to trace back the meaning of data.literal for
// field::type == FldNumericDisplay
- enc_from = DEFAULT_CHARMAP_SOURCE;
+ enc_from = DEFAULT_SOURCE_ENCODING;
}
init = __gg__iconverter(enc_from,
- DEFAULT_CHARMAP_SOURCE,
+ DEFAULT_SOURCE_ENCODING,
false_data,
field->data.capacity,
&charsout);
@@ -1522,12 +1541,12 @@ field_str( const cbl_field_t *field ) {
auto eoinit = init + strlen(init);
char *s = xasprintf("'%s'", init);
- // No NUL within the initial data.
+ // No NUL within the initial data.
auto ok = std::none_of( init, eoinit,
[]( char ch ) { return ch == '\0'; } );
assert(ok);
- // If any of the init are unprintable, provide a hex version.
+ // If any of the init are unprintable, provide a hex version.
if( ! std::all_of(init, eoinit, fisprint) ) {
if( is_elementary(field->type) && field->type != FldPointer ) {
const size_t len = strlen(s) + 8 + 2 * field->data.capacity;
@@ -1663,7 +1682,7 @@ symbols_alphabet_set( size_t program, const char name[]) {
//// // Define alphabets for codegen.
//// const cbl_alphabet_t *alphabet = nullptr;
//// bool supported = true;
-////
+////
//// std::for_each( symbols_begin(program), symbols_end(),
//// [&alphabet, &supported]( const auto& sym ) {
//// if( sym.type == SymAlphabet ) {
@@ -1679,7 +1698,7 @@ symbols_alphabet_set( size_t program, const char name[]) {
//// cbl_unimplemented("alphabet %qs (as %qs)", alphabet->name, encoding);
//// return false;
//// }
-////
+////
//// // Set collation sequence before parser_symbol_add.`
//// if( name ) {
//// symbol_elem_t *e = symbol_alphabet(program, name);
@@ -1906,38 +1925,46 @@ symbols_update( size_t first, bool parsed_ok ) {
}
}
- if( ! field->codeset.valid() ) {
- switch(field->type) {
- case FldForward:
- case FldInvalid:
- gcc_unreachable();
- case FldAlphaEdited:
- case FldAlphanumeric:
- case FldClass:
- case FldDisplay:
- case FldGroup:
- case FldLiteralA:
- case FldNumericDisplay:
- case FldNumericEdited:
+ if( ! field->codeset.consistent() ) {
+ if( ! field->codeset.valid() ) {
+ switch(field->type) {
+ case FldForward:
+ case FldInvalid:
+ gcc_unreachable();
+ case FldAlphaEdited:
+ case FldAlphanumeric:
+ case FldClass:
+ case FldDisplay:
+ case FldGroup:
+ case FldLiteralA:
+ case FldLiteralN:
+ case FldNumericDisplay:
+ case FldNumericEdited:
+ if( ! (field->has_attr(register_e) || field->has_attr(hex_encoded_e)) ) {
+ error_msg(symbol_field_location(field_index(field)),
+ "internal: %qs encoding not defined", field->name);
+ }
+ break;
+ case FldConditional:
+ case FldFloat:
+ case FldIndex:
+ case FldNumericBin5:
+ case FldNumericBinary:
+ case FldPacked:
+ case FldPointer:
+ case FldSwitch:
+ break;
+ }
+ } else {
if( ! (field->has_attr(register_e) || field->has_attr(hex_encoded_e)) ) {
error_msg(symbol_field_location(field_index(field)),
- "internal: %qs encoding not defined", field->name);
+ "internal: %qs encoding %qs inconsistent",
+ field->name,
+ cbl_alphabet_t::encoding_str(field->codeset.encoding) );
}
- break;
- case FldConditional:
- case FldFloat:
- case FldIndex:
- case FldLiteralN:
- case FldNumericBin5:
- case FldNumericBinary:
- case FldPacked:
- case FldPointer:
- case FldSwitch:
- break;
}
}
-
assert( ! field->is_typedef() );
if( parsed_ok ) parser_symbol_add(field);
@@ -2542,6 +2569,13 @@ symbol_file_add( size_t program, cbl_file_t *file ) {
}
symbol_elem_t *
+symbol_locale_add( size_t program, const cbl_locale_t *locale ) {
+ symbol_elem_t sym{ SymLocale, program };
+ sym.elem.locale = *locale;
+ return symbol_add(&sym);
+}
+
+symbol_elem_t *
symbol_alphabet_add( size_t program, const cbl_alphabet_t *alphabet ) {
symbol_elem_t sym{ SymAlphabet, program };
sym.elem.alphabet = *alphabet;
@@ -3202,19 +3236,56 @@ constant_of( size_t isym )
return field;
}
+cbl_locale_t::cbl_locale_t( const cbl_name_t name, const char iconv_name[] ) {
+ gcc_assert(strlen(name) < sizeof this->name);
+ strcpy(this->name, name);
+
+ if( iconv_name ) {
+ encoding = __gg__encoding_iconv_type(iconv_name);
+
+ strcpy(collation, "C");
+ // If the iconv_name is prefixed by langauge_COUNTRY (e.g. en_US), capture that.
+ auto pend = iconv_name + strlen(iconv_name);
+ auto p = std::find(iconv_name, pend, '.');
+ if( p < pend ) {
+ auto pend2 = std::copy(iconv_name, p, collation);
+ std::fill(pend2, collation + sizeof(collation), '\0');
+ iconv_name = ++p;
+ }
+ encoding = __gg__encoding_iconv_type(iconv_name);
+ }
+}
+
+cbl_alphabet_t::cbl_alphabet_t(const YYLTYPE& loc, size_t locale, cbl_name_t name )
+ : loc(loc)
+ , locale(locale)
+ , low_index(0)
+ , high_index(255)
+ , last_index(0)
+{
+ if( locale > 0 ) {
+ encoding = cbl_locale_of(symbol_at(locale))->encoding;
+ }
+ memset(collation_sequence, 0xFF, sizeof(collation_sequence));
+ if( name ) { // from Special-Names collation_sequence
+ assert(strlen(name) < sizeof(cbl_name_t));
+ strcpy(this->name, name);
+ }
+}
+
/*
* As parsed, the alphabet reflects the encoding of the source code. If the
* program uses a different encoding for alphanumeric, convert the alphabet to
- * that.
- *
+ * that.
+ *
* Because a custom alphabet is rare and occurs at most only once per program,
* we don't attempt to avoid re-encoding. "Conversion" of ASCII to ASCII is at
- * most 256 calls to iconv(3).
+ * most 256 calls to iconv(3).
*/
void
cbl_alphabet_t::reencode() {
- const unsigned char * const pend = alphabet + sizeof(alphabet);
+ const unsigned char * const pend = collation_sequence + sizeof(collation_sequence);
std::vector<char> tgt(256, (char)0xFF);
/* Keep copies of low_index and last_index for use in run-time as LOW-VALUE
@@ -3230,13 +3301,14 @@ cbl_alphabet_t::reencode() {
* a custom alphabet are from NIST, which of course are ASCII.
*/
const char *fromcode = __gg__encoding_iconv_name(CP1252_e);
- const char *tocode = __gg__encoding_iconv_name(current_encoding('A'));
+ const char *tocode =
+ __gg__encoding_iconv_name(current_encoding(display_encoding_e));
iconv_t cd = iconv_open(tocode, fromcode);
-
+
#if optimal_reencode
if( fromcode == tocode ) { // semantically
tgt.resize(0);
- return tgt; // Return empty vector; caller copies zero bytes.
+ return tgt; // Return empty vector; caller copies zero bytes.
}
#endif
@@ -3247,14 +3319,14 @@ cbl_alphabet_t::reencode() {
* that letter in the alphanumeric encoding, and set its collation position
* in that alphabet.
*/
- for( const unsigned char *p = alphabet; p < pend; p++ ) {
+ for( const unsigned char *p = collation_sequence; p < pend; p++ ) {
if( *p == 0xFF ) continue;
- unsigned char ch = p - alphabet;
+ unsigned char ch = p - collation_sequence;
unsigned char pos[8] = {};
size_t inbytesleft = 1, outbytesleft = sizeof(pos);
char *inbuf = reinterpret_cast<char*>(&ch),
*outbuf = reinterpret_cast<char*>(pos);
-
+
size_t n = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
if( n == size_t(-1) ) {
@@ -3273,7 +3345,7 @@ cbl_alphabet_t::reencode() {
fromcode, ch, ch, n, tocode);
continue;
}
-
+
if( ch == low_index ) {
low_index = pos[0];
}
@@ -3283,21 +3355,21 @@ cbl_alphabet_t::reencode() {
if( ch == high_index ) {
high_index = pos[0];
}
-
+
tgt.at(pos[0]) = *p;
}
-
- std::copy(tgt.begin(), tgt.end(), alphabet);
+
+ std::copy(tgt.begin(), tgt.end(), collation_sequence);
}
bool
cbl_alphabet_t::assign( const YYLTYPE& loc, unsigned char ch, unsigned char high_value ) {
- if( alphabet[ch] == 0xFF || alphabet[ch] == high_value) {
- alphabet[ch] = high_value;
+ if( collation_sequence[ch] == 0xFF || collation_sequence[ch] == high_value) {
+ collation_sequence[ch] = high_value;
last_index = ch;
return true;
}
- auto taken = alphabet[ch];
+ auto taken = collation_sequence[ch];
error_msg(loc, "ALPHABET %s, character %<%c%> (X%'%x%') "
"in position %d already defined at position %d",
name,
@@ -3310,7 +3382,7 @@ cbl_alphabet_t::assign( const YYLTYPE& loc, unsigned char ch, unsigned char high
void
cbl_alphabet_t::also( const YYLTYPE& loc, size_t ch ) {
if( ch < 256 ) {
- alphabet[ch] = alphabet[last_index];
+ collation_sequence[ch] = collation_sequence[last_index];
if( ch == high_index ) high_index--;
return;
} // else it's a figurative constant ...
@@ -3323,20 +3395,20 @@ cbl_alphabet_t::also( const YYLTYPE& loc, size_t ch ) {
// last_index is already set; use it as the "last value before ALSO"
if( attr & low_value_e ) {
- alphabet[0] = alphabet[last_index];
+ collation_sequence[0] = collation_sequence[last_index];
return;
}
if( attr & high_value_e ) {
- alphabet[high_index--] = alphabet[last_index];
+ collation_sequence[high_index--] = collation_sequence[last_index];
return;
}
if( attr & (space_value_e|quote_value_e) ) {
ch = field->data.initial[0];
- alphabet[ch] = alphabet[last_index];
+ collation_sequence[ch] = collation_sequence[last_index];
return;
}
if( attr & (zero_value_e) ) {
- alphabet[0] = alphabet[last_index];
+ collation_sequence[0] = collation_sequence[last_index];
error_msg(loc, "ALSO value '%s' is unknown", field->name);
return;
}
@@ -3448,18 +3520,33 @@ new_literal_add( const char initial[], uint32_t len,
}
else
{
- static char empty[2] = "\0";
field = new_temporary_impl(FldLiteralA);
field->attr |= attr;
- field->data.initial = len > 0? initial : empty;
+
+ if(len == 0)
+ {
+ // This will cover UTF-32, should that arise.
+ size_t nbytes = 4;
+ char *init = static_cast<char *>(xmalloc(nbytes));
+ memset(init, 0, nbytes);
+ field->data.initial = init;
+ }
+ if(len)
+ {
+ char *init = static_cast<char *>(xmalloc(len+4));
+ memcpy(init, initial, len);
+ memset(init+len, 0, 4);
+ field->data.initial = init;
+ }
field->data.capacity = len;
}
if( ! field->has_attr(hex_encoded_e) ) {
- field->codeset.set(encoding);
- if( ! field->internalize() ) {
- ERROR_FIELD(field, "inconsistent string literal encoding for '%s'", initial);
+ // If the literal bore a prefix, set the encoding,
+ if( encoding != cbl_field_t::codeset_t::source_encoding->type ) {
+ field->codeset.set(encoding);
}
+ field->internalize();
}
static size_t literal_count = 1;
@@ -3595,6 +3682,14 @@ new_alphanumeric( size_t capacity, const cbl_name_t name = nullptr ) {
extern os_locale_t os_locale;
+const encodings_t cbl_field_t::codeset_t::source_encodings[2] = {
+ { false, iconv_UTF_8_e, "UTF-8" },
+ { true, iconv_CP1252_e, "CP1252" },
+};
+const encodings_t * cbl_field_t::codeset_t::source_encoding = {
+ cbl_field_t::codeset_t::source_encodings
+};
+
const encodings_t cbl_field_t::codeset_t::standard_internal = {
true, iconv_CP1252_e, "CP1252"
};
@@ -3603,7 +3698,7 @@ const encodings_t cbl_field_t::codeset_t::standard_internal = {
cbl_field_t *
new_temporary( enum cbl_field_type_t type, const char *initial, bool is_signed ) {
const bool force_unsigned = type == FldNumericBin5 && ! is_signed;
-
+
if( ! initial && ! force_unsigned ) {
assert( ! is_literal(type) ); // Literal type must have literal value.
return temporaries.acquire(type, initial);
@@ -3719,29 +3814,26 @@ cbl_field_t::is_ascii() const {
* never reverts.
*/
-static const char *
-guess_encoding() {
- static const char *fromcode;
-
- if( ! fromcode ) {
- return fromcode = os_locale.assumed;
- }
-
- if( fromcode == os_locale.assumed ) {
- fromcode = os_locale.codeset;
- if( 0 != strcmp(fromcode, "C") ) { // anything but that
- return fromcode;
- }
- }
-
- return standard_internal.name;
-}
-
const char *
cbl_field_t::internalize() {
- static const char *fromcode = guess_encoding();
+ /* The purpose of this routine is to return a nul-terminated string which
+ is data.initial converted from the source-code characters to the
+ codeset.encoding characters.
+
+ The contract between this routine and the routines that call it is that
+ for alphanumeric types, data.initial shall have the same number of
+ characters as will be needed to fill data.capacity.
+
+ Be aware that for PIC X(32) Z"foo", there are the characters "foo",
+ followed by a NUL, and then 28 spaces to fill it out. It turns out that
+ iconv, given a character count of 32, converts all 32, including the
+ embedded NUL. So, that case works even through strlen(initial) is
+ smaller than the length of initial, which is the same as capacity.
+ */
+
+ static const char *fromcode = codeset.source_encodings[0].name;
static const size_t noconv = size_t(-1);
- static std::map<std::string, iconv_t> tocodes;
+ static std::unordered_map<std::string, iconv_t> tocodes;
if( ! codeset.valid() ) {
dbgmsg("%s:%d: not converting %s", __func__, __LINE__, data.initial);
@@ -3769,20 +3861,33 @@ cbl_field_t::internalize() {
assert(0 == strlen(data.initial));
return data.initial;
}
- if( holds_ascii() && is_ascii() ) return data.initial;
+ if( holds_ascii() && is_ascii() ) {
+ if( type != FldNumericEdited ) {
+ if( ! data.initial_within_capacity() ) {
+ ERROR_FIELD(this, "%s %s VALUE %qs of %zu exceeds size %u",
+ cbl_field_t::level_str(level), name, data.initial,
+ strlen(data.initial), data.capacity );
+ }
+ }
+ return data.initial;
+ }
assert(data.capacity > 0);
// The final 2 bytes of the output are "!\0". It's a debugging sentinel.
size_t n;
size_t inbytesleft = data.capacity;
size_t outbytesleft = inbytesleft;
- char *in = const_cast<char*>(data.initial);
- char *out = static_cast<char*>( xcalloc(1, outbytesleft + 2) ), *output = out;
if( !is_literal(this) && inbytesleft < strlen(data.initial) ) {
inbytesleft = strlen(data.initial);
}
+ if( type == FldNumericEdited ) {
+ outbytesleft = inbytesleft;
+ }
const unsigned int in_len = inbytesleft;
+ char *in = const_cast<char*>(data.initial);
+ char *out = static_cast<char*>( xcalloc(1, outbytesleft + 2) ), *output = out;
+
assert(fromcode != tocode);
/*
@@ -3799,8 +3904,9 @@ cbl_field_t::internalize() {
do {
if( (n = iconv( cd, &in, &inbytesleft, &out, &outbytesleft)) == noconv ) {
- if( fromcode == os_locale.assumed ) {
- fromcode = standard_internal.name;
+ if( fromcode == codeset.source_encodings[0].name ) {
+ codeset.source_encoding = &codeset.source_encodings[1];
+ fromcode = codeset.source_encoding->name;
tocodes.clear();
cd = tocodes[toname] = iconv_open(tocode, fromcode);
dbgmsg("%s: trying input encoding %s", __func__, fromcode);
@@ -3813,7 +3919,7 @@ cbl_field_t::internalize() {
if( n == noconv ) {
size_t i = in_len - inbytesleft;
- yywarn("failed to encode %s %qs as %s (%zu of %u bytes left)",
+ yyerror("failed to encode %s %qs as %s (%zu of %u bytes left)",
fromcode, data.initial + i, tocode, inbytesleft, in_len);
if( false ) return NULL;
return data.initial;
@@ -3821,7 +3927,7 @@ cbl_field_t::internalize() {
if( 0 < inbytesleft ) {
// data.capacity + inbytesleft is not correct if the remaining portion has
- // multibyte characters. But the fact reamins that the VALUE is too big.
+ // multibyte characters. But the fact remains that the VALUE is too big.
ERROR_FIELD(this, "%s %s VALUE '%s' requires %zu bytes for size %u",
cbl_field_t::level_str(level), name, data.initial,
data.capacity + inbytesleft, data.capacity );
@@ -3829,7 +3935,7 @@ cbl_field_t::internalize() {
// Replace data.initial only if iconv output differs.
if( 0 != memcmp(data.initial, output, out - output) ) {
- assert(out <= output + data.capacity);
+ assert(out <= output + data.capacity || type == FldNumericEdited);
dbgmsg("%s: converted '%.*s' to %s",
__func__, data.capacity, data.initial, tocode);
struct localspace_t {
@@ -3858,14 +3964,16 @@ cbl_field_t::internalize() {
data.capacity = out - output; // trailing '!' will be overwritten
}
// Pad with trailing blanks, tacking a '!' on the end.
- for( const char *eout = output + data.capacity;
+ for( const char *eout = output + data.capacity;
out < eout;
out += spc.len ) {
memcpy(out, spc.space, spc.len);
}
- out[0] = '!';
+ // Numeric literal strings may have leading zeros, making their length
+ // longer than their capacity.
+ out[0] = type == FldLiteralN? '\0' : '!';
assert(out[1] == '\0');
- free(const_cast<char*>(data.initial));
+ data.orig = data.initial;
data.initial = output;
} else {
free(output);