diff options
author | Corinna Vinschen <corinna@vinschen.de> | 2009-03-24 12:18:34 +0000 |
---|---|---|
committer | Corinna Vinschen <corinna@vinschen.de> | 2009-03-24 12:18:34 +0000 |
commit | 161211d186a16e4f090b8b3c63040f0b9aee25d4 (patch) | |
tree | 4ac0e1154417f3b0119ba79407a8c5687d96bf83 /winsup/cygwin/strfuncs.cc | |
parent | 6a32d500a9d601b4f25cee0e1ec6b2ac5195a7e9 (diff) | |
download | newlib-161211d186a16e4f090b8b3c63040f0b9aee25d4.zip newlib-161211d186a16e4f090b8b3c63040f0b9aee25d4.tar.gz newlib-161211d186a16e4f090b8b3c63040f0b9aee25d4.tar.bz2 |
* ctype.cc (_CTYPE_DATA_0_127): Add _B class to TAB character.
(__ctype_default): New character class array for default ASCII
character set.
(__ctype_iso): New array of character class arrays for ISO charsets.
(__ctype_cp): Ditto for singlebyte Windows codepages.
(tolower): Implement as distinct function to support any singlebyte
charset.
(toupper): Ditto.
(__set_ctype): New function to copy singlebyte character classes
corresponding to current charset to ctype_b array.
Align copyright text to upstream.
* dcrt0.cc (dll_crt0_1): Reset current locale to "C" per POSIX.
* environ.cc (set_file_api_mode): Remove.
(codepage_init): Remove.
(parse_thing): Remove "codepage" setting.
(environ_init): Set locale according to environment settings, or
to current codepage, before converting environment to multibyte.
* fhandler.h (fhandler_console::write_replacement_char): Drop argument.
* fhandler_console.cc (dev_console::str_to_con): Call sys_cp_mbstowcs
rather than MultiByteToWideChar.
(fhandler_console::write_replacement_char): Always print a funny
half filled square if a character isn't in the current charset.
(fhandler_console::write_normal): Convert to using __mbtowc
rather than next_char.
* fork.cc (frok::child): Drop call to set_file_api_mode.
* globals.cc (enum codepage_type): Remove.
(current_codepage): Remove.
* miscfuncs.cc (cygwin_wcslwr): Unused, dangerous. Remove.
(cygwin_wcsupr): Ditto.
(is_cp_multibyte): Remove.
(next_char): Remove.
* miscfuncs.h (is_cp_multibyte): Drop declaration.
(next_char): Ditto.
* strfuncs.cc (get_cp): Remove.
(__db_wctomb): New function to implement _wctomb_r functionality for
doublebyte charsets using WideCharToMultiByte.
(__sjis_wctomb): New function to replace unusable newlib function.
(__jis_wctomb): Ditto.
(__eucjp_wctomb): Ditto.
(__gbk_wctomb): New function.
(__kr_wctomb): Ditto.
(__big5_wctomb): Ditto.
(__db_mbtowc): New function to implement _mbtowc_r functionality for
doublebyte charsets using MultiByteToWideChar.
(__sjis_mbtowc): New function to replace unusable newlib function.
(__jis_mbtowc): Ditto.
(__eucjp_mbtowc): Ditto.
(__gbk_mbtowc): New function.
(__kr_mbtowc): New function.
(__big5_mbtowc): New function.
(__set_charset_from_codepage): New function.
(sys_wcstombs): Reimplement, basically using same wide char to multibyte
conversion as newlib's application level functions. Plus extras.
Add lengthy comment to explain. Change return type to size_t.
(sys_wcstombs_alloc): Just use sys_wcstombs. Change return type to
size_t.
(sys_cp_mbstowcs): Replace sys_mbstowcs, take additional codepage
argument. Explain why. Change return type to size_t.
(sys_mbstowcs_alloc): Just use sys_mbstowcs. Change return type to
size_t.
* wchar.h: Declare internal functions implemented in strfuncs.cc.
(wcscasecmp): Remove.
(wcsncasecmp): Remove.
(wcslwr): Remove.
(wcsupr): Remove.
* winsup.h (codepage_init): Remove declaration.
(get_cp): Ditto.
(sys_wcstombs): Align declaration to new implementation.
(sys_wcstombs_alloc): Ditto.
(sys_cp_mbstowcs): Add declaration.
(sys_mbstowcs): Define as inline function.
(sys_mbstowcs_alloc): Align declaration to new implementation.
(set_file_api_mode): Remove declaration.
* include/ctype.h (isblank): Redefine to use _B character class.
(toupper): Remove ASCII-only definition.
(tolower): Ditto.
Diffstat (limited to 'winsup/cygwin/strfuncs.cc')
-rw-r--r-- | winsup/cygwin/strfuncs.cc | 489 |
1 files changed, 436 insertions, 53 deletions
diff --git a/winsup/cygwin/strfuncs.cc b/winsup/cygwin/strfuncs.cc index e051cd7..5a9dd7d 100644 --- a/winsup/cygwin/strfuncs.cc +++ b/winsup/cygwin/strfuncs.cc @@ -20,45 +20,356 @@ details. */ #include "fhandler.h" #include "dtable.h" #include "cygheap.h" +#include "tls_pbuf.h" -UINT -get_cp () +/* The SJIS, JIS and EUCJP conversion in newlib does not use UTF as + wchar_t character representation. That's unfortunate for us since + we require UTF for the OS. What we do here is to have our own + implementation of the base functions for the conversion using + the MulitByteToWideChar/WideCharToMultiByte functions. */ + +/* GBK, CP949, and Big5 conversions are not available so far in newlib. */ + +static int +__db_wctomb (struct _reent *r, char *s, wchar_t wchar, UINT cp) +{ + if (s == NULL) + return 0; + + if (wchar < 0x80) + { + *s = (char) wchar; + return 1; + } + + BOOL def_used = false; + int ret = WideCharToMultiByte (cp, cp > 50000 ? 0 : WC_NO_BEST_FIT_CHARS, + &wchar, 1, s, MB_CUR_MAX, NULL, &def_used); + if (ret > 0 && !def_used) + return ret; + + r->_errno = EILSEQ; + return -1; +} + +extern "C" int +__sjis_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset, + mbstate_t *state) { - if (!active_codepage) - codepage_init ("ansi"); - return active_codepage; + return __db_wctomb (r,s, wchar, 932); } -/* tlen is always treated as the maximum buffer size, including the '\0' - character. sys_wcstombs will always return a 0-terminated result, no - matter what. 
*/ -int __stdcall -sys_wcstombs (char *tgt, int tlen, const PWCHAR src, int slen) +extern "C" int +__jis_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset, + mbstate_t *state) { + return __db_wctomb (r,s, wchar, 50220); +} + +extern "C" int +__eucjp_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset, + mbstate_t *state) +{ + return __db_wctomb (r,s, wchar, 51932); +} + +extern "C" int +__gbk_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset, + mbstate_t *state) +{ + return __db_wctomb (r,s, wchar, 936); +} + +extern "C" int +__kr_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset, + mbstate_t *state) +{ + return __db_wctomb (r,s, wchar, 949); +} + +extern "C" int +__big5_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset, + mbstate_t *state) +{ + return __db_wctomb (r,s, wchar, 950); +} + +static int +__db_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n, + UINT cp, mbstate_t *state) +{ + wchar_t dummy; + char buf[2]; int ret; + + if (pwc == NULL) + pwc = &dummy; - /* Convert UNICODE private use area. Reverse functionality (only for - path names) is transform_chars in path.cc. */ - if (slen < 0) - slen = wcslen (src) + 1; - WCHAR sbuf[slen]; - memcpy (sbuf, src, slen * sizeof (WCHAR)); - const unsigned char *end = (unsigned char *) (sbuf + slen); - for (unsigned char *s = ((unsigned char *) sbuf) + 1; s < end; - s += sizeof (WCHAR)) - if (*s == 0xf0) - *s = 0; - ret = WideCharToMultiByte (get_cp (), 0, sbuf, slen, tgt, tlen, NULL, NULL); - if (ret && tgt) + if (s == NULL) + return 0; /* not state-dependent */ + + if (n == 0) + return -2; + + if (state->__count == 0) + { + if (*(unsigned char *) s < 0x80) + { + *pwc = *(unsigned char *) s; + return *s ? 1 : 0; + } + ret = MultiByteToWideChar (cp, cp > 50000 ? 0 : MB_ERR_INVALID_CHARS, + s, 2, pwc, 1); + if (ret) + return *s ? 
2 : 0; + if (n == 1) + { + state->__count = 1; + state->__value.__wchb[0] = *s; + return -2; + } + else + { + /* These Win32 functions are really crappy. Assuming n is 2 + but the first byte is a singlebyte charcode, the function + does not convert that byte and return 1, rather it just + returns 0. So, what we do here is to check if the first + byte returns a valid value... */ + ret = MultiByteToWideChar (cp, + cp > 50000 ? 0 : MB_ERR_INVALID_CHARS, + s, 1, pwc, 1); + if (ret) + return *s ? 1 : 0; + } + r->_errno = EILSEQ; + return -1; + } + if (!*s) + return -2; + buf[0] = state->__value.__wchb[0]; + buf[1] = *s; + ret = MultiByteToWideChar (cp, cp > 50000 ? 0 : MB_ERR_INVALID_CHARS, + buf, 2, pwc, 1); + if (!ret) { - ret = (ret < tlen) ? ret : tlen - 1; - tgt[ret] = '\0'; + r->_errno = EILSEQ; + return -1; } return ret; } +extern "C" int +__sjis_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n, + const char *charset, mbstate_t *state) +{ + return __db_mbtowc (r, pwc, s, n, 932, state); +} + +extern "C" int +__jis_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n, + const char *charset, mbstate_t *state) +{ + return __db_mbtowc (r, pwc, s, n, 50220, state); +} + +extern "C" int +__eucjp_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n, + const char *charset, mbstate_t *state) +{ + return __db_mbtowc (r, pwc, s, n, 51932, state); +} + +extern "C" int +__gbk_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n, + const char *charset, mbstate_t *state) +{ + return __db_mbtowc (r, pwc, s, n, 936, state); +} + +extern "C" int +__kr_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n, + const char *charset, mbstate_t *state) +{ + return __db_mbtowc (r, pwc, s, n, 949, state); +} + +extern "C" int +__big5_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n, + const char *charset, mbstate_t *state) +{ + return __db_mbtowc (r, pwc, s, n, 950, state); +} + +/* Convert Windows codepage to a 
setlocale compatible character set code. + Called from newlib's setlocale() with the current ANSI codepage, if the + charset isn't given explicitely in the POSIX compatible locale specifier. + The function also returns a pointer to the corresponding _mbtowc_r + function. This is used below in the sys_cp_mbstowcs function which + is called directly from fhandler_console if the "Alternate Charset" has + been switched on by an escape sequence. */ +extern "C" mbtowc_p +__set_charset_from_codepage (UINT cp, char *charset) +{ + switch (cp) + { + case 437: + case 720: + case 737: + case 775: + case 850: + case 852: + case 855: + case 857: + case 858: + case 862: + case 866: + case 874: + case 1125: + case 1250: + case 1251: + case 1252: + case 1253: + case 1254: + case 1255: + case 1256: + case 1257: + case 1258: + __small_sprintf (charset, "CP%u", cp); + return __cp_mbtowc; + case 28591: + case 28592: + case 28593: + case 28594: + case 28595: + case 28596: + case 28597: + case 28598: + case 28599: + case 28603: + case 28605: + __small_sprintf (charset, "ISO-8859-%u", cp - 28590); + return __iso_mbtowc; + case 932: + strcpy (charset, "SJIS"); + return __sjis_mbtowc; + case 936: + strcpy (charset, "GBK"); + return __gbk_mbtowc; + case 949: + strcpy (charset, "CP949"); + return __kr_mbtowc; + case 950: + strcpy (charset, "BIG5"); + return __big5_mbtowc; + case 50220: + strcpy (charset, "JIS"); + return __jis_mbtowc; + case 51932: + strcpy (charset, "EUCJP"); + return __eucjp_mbtowc; + case 65001: + strcpy (charset, "UTF-8"); + return __utf8_mbtowc; + default: + break; + } + strcpy (charset, "ASCII"); + return __ascii_mbtowc; +} + +/* Our own sys_wcstombs/sys_mbstowcs functions differ from the + wcstombs/mbstowcs API in three ways: + + - The UNICODE private use area is used in filenames to specify + characters not allowed in Windows filenames ('*', '?', etc). + The sys_wcstombs converts characters in the private use area + back to the corresponding ASCII chars. 
+ + - If a wide character in a filename has no representation in the current + multibyte charset, then usually you wouldn't be able to access the + file. To fix this problem, sys_wcstombs creates a replacement multibyte + sequences for the non-representable wide-char. The sequence starts with + an ASCII SO (0x0e, Ctrl-N), followed by the UTF-8 representation of the + character. The sys_(cp_)mbstowcs function detects ASCII SO characters + in the input multibyte string and converts the following multibyte + sequence in by treating it as an UTF-8 char. If that fails, the ASCII + SO was probably standalone and it gets just copied over as ASCII SO. + + - The functions always create 0-terminated results, no matter what. + If the result is truncated due to buffer size, it's a bug in Cygwin + and the buffer in the calling function should be raised. */ +size_t __stdcall +sys_wcstombs (char *dst, size_t len, const PWCHAR src, size_t nwc) +{ + char buf[10]; + char *ptr = dst; + wchar_t *pwcs = (wchar_t *) src; + size_t n = 0; + mbstate_t ps; + + memset (&ps, 0, sizeof ps); + if (dst == NULL) + len = (size_t) -1; + while (n < len && nwc-- > 0) + { + wchar_t pw = *pwcs; + /* Convert UNICODE private use area. Reverse functionality (only for + path names) is transform_chars in path.cc. */ + if ((pw & 0xff00) == 0xf000) + pw &= 0xff; + int bytes = _wctomb_r (_REENT, buf, pw, &ps); + /* Convert chars invalid in the current codepage to a sequence + ASCII SO; UTF-8 representation of invalid char. + Do the same for ASCII SO itself. */ + if ((bytes == -1 || pw == 0x0e) && *__locale_charset () != 'U'/*TF-8*/) + { + buf[0] = 0x0e; /* ASCII SO */ + bytes = __utf8_wctomb (_REENT, buf + 1, pw, __locale_charset (), &ps); + if (bytes == -1) + { + ++pwcs; + ps.__count = 0; + continue; + } + ++bytes; /* Add the ASCII SO to the byte count. */ + if (ps.__count == -4) /* First half of a surrogate pair. */ + { + ++pwcs; + if ((*pwcs & 0xfc00) != 0xdc00) /* Invalid second half. 
*/ + { + ++pwcs; + ps.__count = 0; + continue; + } + bytes += __utf8_wctomb (_REENT, buf + bytes, *pwcs, + __locale_charset (), &ps); + } + } + if (n + bytes <= len) + { + n += bytes; + if (dst) + { + for (int i = 0; i < bytes; ++i) + *ptr++ = buf[i]; + } + if (*pwcs++ == 0x00) + break; + } + else + break; + } + if (n && dst) + { + n = (n < len) ? n : len - 1; + dst[n] = '\0'; + } + + return n; +} + /* Allocate a buffer big enough for the string, always including the - terminating '\0'. The buffer pointer is returned in *tgt_p, the return + terminating '\0'. The buffer pointer is returned in *dst_p, the return value is the number of bytes written to the buffer, as usual. The "type" argument determines where the resulting buffer is stored. It's either one of the cygheap_types values, or it's "HEAP_NOTHEAP". @@ -67,57 +378,129 @@ sys_wcstombs (char *tgt, int tlen, const PWCHAR src, int slen) Note that this code is shared by cygserver (which requires it via __small_vsprintf) and so when built there plain calloc is the only choice. */ -int __stdcall -sys_wcstombs_alloc (char **tgt_p, int type, const PWCHAR src, int slen) +size_t __stdcall +sys_wcstombs_alloc (char **dst_p, int type, const PWCHAR src, size_t nwc) { - int ret; + size_t ret; - ret = WideCharToMultiByte (get_cp (), 0, src, slen, NULL, 0 ,NULL, NULL); - if (ret) + ret = sys_wcstombs (NULL, (size_t) -1, src, nwc); + if (ret > 0) { - size_t tlen = (slen == -1) ? 
ret : ret + 1; + size_t dlen = ret + 1; if (type == HEAP_NOTHEAP) - *tgt_p = (char *) calloc (tlen, sizeof (char)); + *dst_p = (char *) calloc (dlen, sizeof (char)); else - *tgt_p = (char *) ccalloc ((cygheap_types) type, tlen, sizeof (char)); - if (!*tgt_p) + *dst_p = (char *) ccalloc ((cygheap_types) type, dlen, sizeof (char)); + if (!*dst_p) return 0; - ret = sys_wcstombs (*tgt_p, tlen, src, slen); + ret = sys_wcstombs (*dst_p, dlen, src, nwc); } return ret; } -int __stdcall -sys_mbstowcs (PWCHAR tgt, int tlen, const char *src, int slen) +/* sys_cp_mbstowcs is actually most of the time called as sys_mbstowcs with + a 0 codepage. If cp is not 0, the codepage is evaluated and used for the + conversion. This is so that fhandler_console can switch to an alternate + charset, which is the charset returned by GetConsoleCP (). Most of the + time this is used for box and line drawing characters. */ +size_t __stdcall +sys_cp_mbstowcs (UINT cp, PWCHAR dst, size_t dlen, const char *src, size_t nms) { - int ret = MultiByteToWideChar (get_cp (), 0, src, slen, tgt, tlen); - if (ret && tgt) + wchar_t *ptr = dst; + char *pmbs = (char *) src; + size_t count = 0; + size_t len = dlen; + int bytes; + mbstate_t ps; + char charsetbuf[32]; + char *charset = __locale_charset (); + mbtowc_p f_mbtowc = __mbtowc; + + if (cp) + f_mbtowc = __set_charset_from_codepage (cp, charset = charsetbuf); + + memset (&ps, 0, sizeof ps); + if (dst == NULL) + len = (size_t)-1; + while (len > 0) { - ret = (ret < tlen) ? ret : tlen - 1; - tgt[ret] = L'\0'; + /* ASCII SO. Convert following UTF-8 sequence (if not UTF-8 anyway). */ + if (*pmbs == 0x0e && *charset != 'U'/*TF-8*/) + { + pmbs++; + bytes = __utf8_mbtowc (_REENT, ptr, pmbs, nms, charset, &ps); + if (bytes < 0) + { + /* Invalid UTF-8 sequence? Treat the ASCII SO character as + stand-alone ASCII SO char. 
*/ + bytes = 1; + if (dst) + *ptr = 0x0e; + memset (&ps, 0, sizeof ps); + break; + } + if (bytes == 0) + break; + if (ps.__count == 4) /* First half of a surrogate. */ + { + wchar_t *ptr2 = dst ? ptr + 1 : NULL; + int bytes2 = __utf8_mbtowc (_REENT, ptr2, pmbs + bytes, + nms - bytes, charset, &ps); + if (bytes2 < 0) + break; + pmbs += bytes2; + nms -= bytes2; + ++count; + ptr = dst ? ptr + 1 : NULL; + --len; + } + } + else + bytes = f_mbtowc (_REENT, ptr, pmbs, nms, charset, &ps); + if (bytes > 0) + { + pmbs += bytes; + nms -= bytes; + ++count; + ptr = dst ? ptr + 1 : NULL; + --len; + } + else + { + if (bytes == 0) + ++count; + break; + } } - return ret; + + if (count && dst) + { + count = (count < dlen) ? count : dlen - 1; + dst[count] = L'\0'; + } + + return count; } /* Same as sys_wcstombs_alloc, just backwards. */ -int __stdcall -sys_mbstowcs_alloc (PWCHAR *tgt_p, int type, const char *src, int slen) +size_t __stdcall +sys_mbstowcs_alloc (PWCHAR *dst_p, int type, const char *src, size_t nms) { - int ret; + size_t ret; - ret = MultiByteToWideChar (get_cp (), 0, src, slen, NULL, 0); - if (ret) + ret = sys_mbstowcs (NULL, (size_t) -1, src, nms); + if (ret > 0) { - size_t tlen = (slen == -1 ? ret : ret + 1); + size_t dlen = ret + 1; if (type == HEAP_NOTHEAP) - *tgt_p = (PWCHAR) calloc (tlen, sizeof (WCHAR)); + *dst_p = (PWCHAR) calloc (dlen, sizeof (WCHAR)); else - *tgt_p = (PWCHAR) ccalloc ((cygheap_types) type, tlen, sizeof (WCHAR)); - if (!*tgt_p) + *dst_p = (PWCHAR) ccalloc ((cygheap_types) type, dlen, sizeof (WCHAR)); + if (!*dst_p) return 0; - ret = sys_mbstowcs (*tgt_p, tlen, src, slen); + ret = sys_mbstowcs (*dst_p, dlen, src, nms); } return ret; } |