diff options
author | Christopher Faylor <me@cgf.cx> | 2009-05-31 03:59:38 +0000 |
---|---|---|
committer | Christopher Faylor <me@cgf.cx> | 2009-05-31 03:59:38 +0000 |
commit | e44b9069cd227e8566888b85ec9fd2163d935e71 (patch) | |
tree | 50f098f82e3be4f97287f3958704dfdef44d3549 /winsup/cygwin | |
parent | 025e4859affbadde459fb2bf42ad0946e5009d4c (diff) | |
download | newlib-e44b9069cd227e8566888b85ec9fd2163d935e71.zip newlib-e44b9069cd227e8566888b85ec9fd2163d935e71.tar.gz newlib-e44b9069cd227e8566888b85ec9fd2163d935e71.tar.bz2 |
* strfuncs.cc (sys_cp_mbstowcs): Treat src as unsigned char *. Convert failure
of f_mbtowc into a single malformed utf-16 value.
Diffstat (limited to 'winsup/cygwin')
-rw-r--r-- | winsup/cygwin/ChangeLog | 5 | ||||
-rw-r--r-- | winsup/cygwin/strfuncs.cc | 32 |
2 files changed, 30 insertions, 7 deletions
diff --git a/winsup/cygwin/ChangeLog b/winsup/cygwin/ChangeLog index e64651f..b7ea755 100644 --- a/winsup/cygwin/ChangeLog +++ b/winsup/cygwin/ChangeLog @@ -1,5 +1,10 @@ 2009-05-30 Christopher Faylor <me+cygwin@cgf.cx> + * strfuncs.cc (sys_cp_mbstowcs): Treat src as unsigned char *. Convert + failure of f_mbtowc into a single malformed utf-16 value. + +2009-05-30 Christopher Faylor <me+cygwin@cgf.cx> + * cygwin/include/sys/termios.h: Make default erase character "^?". 2009-05-30 Christopher Faylor <me+cygwin@cgf.cx> diff --git a/winsup/cygwin/strfuncs.cc b/winsup/cygwin/strfuncs.cc index 965bdee..4563f98 100644 --- a/winsup/cygwin/strfuncs.cc +++ b/winsup/cygwin/strfuncs.cc @@ -523,7 +523,7 @@ sys_cp_mbstowcs (mbtowc_p f_mbtowc, char *charset, wchar_t *dst, size_t dlen, const char *src, size_t nms) { wchar_t *ptr = dst; - char *pmbs = (char *) src; + unsigned const char *pmbs = (unsigned const char *) src; size_t count = 0; size_t len = dlen; int bytes; @@ -539,12 +539,13 @@ sys_cp_mbstowcs (mbtowc_p f_mbtowc, char *charset, wchar_t *dst, size_t dlen, UTF-8 sequence, there must be at least two more bytes left, and the next byte must be a valid UTF-8 start byte. If the charset isn't UTF-8 anyway, try to convert the following bytes as UTF-8 sequence. */ - if (*pmbs == 0x0e && nms > 2 && *(unsigned char *) (pmbs + 1) >= 0xc2 - && *(unsigned char *) (pmbs + 1) <= 0xf4 && *charset != 'U'/*TF-8*/) + if (*pmbs == 0x0e && nms > 2 && pmbs[1] >= 0xc2 + && pmbs[1] <= 0xf4 && *charset != 'U'/*TF-8*/) { pmbs++; --nms; - bytes = __utf8_mbtowc (_REENT, ptr, pmbs, nms, charset, &ps); + bytes = __utf8_mbtowc (_REENT, ptr, (const char *) pmbs, nms, + charset, &ps); if (bytes < 0) { /* Invalid UTF-8 sequence? Treat the ASCII SO character as @@ -560,7 +561,7 @@ sys_cp_mbstowcs (mbtowc_p f_mbtowc, char *charset, wchar_t *dst, size_t dlen, if (ps.__count == 4) /* First half of a surrogate. */ { wchar_t *ptr2 = dst ? ptr + 1 : NULL; - int bytes2 = __utf8_mbtowc (_REENT, ptr2, pmbs + bytes, + int bytes2 = __utf8_mbtowc (_REENT, ptr2, (const char *) pmbs + bytes, nms - bytes, charset, &ps); if (bytes2 < 0) break; @@ -571,8 +572,25 @@ sys_cp_mbstowcs (mbtowc_p f_mbtowc, char *charset, wchar_t *dst, size_t dlen, --len; } } - else - bytes = f_mbtowc (_REENT, ptr, pmbs, nms, charset, &ps); + else if ((bytes = f_mbtowc (_REENT, ptr, (const char *) pmbs, nms, charset, &ps)) < 0 && *pmbs > '\x80') + { + /* This should probably be handled in f_mbtowc which can operate + on sequences rather than individual characters. + The technique is based on a discussion here: + + http://www.mail-archive.com/linux-utf8@nl.linux.org/msg00080.html + + This is hardly perfect. Windows doesn't do anything sensical with + characters converted to this format and (currently) we don't convert + them back into their original single byte form. It does allow + processing of src to continue, however, which, since there is no + way to signal decoding errors, seems like the best we can do. + + */ + *ptr = L'\xdc80' | *pmbs; + bytes = 1; + } + if (bytes > 0) { pmbs += bytes; |