diff options
author | Steve Bennett <steveb@workware.net.au> | 2020-05-04 15:50:07 +1000 |
---|---|---|
committer | Steve Bennett <steveb@workware.net.au> | 2020-05-04 21:57:37 +1000 |
commit | a8a65e7637a4fc82777f2b0d822c902f333cd779 (patch) | |
tree | a926fe0bc6164fb0a7d7b1d3191126f51572957d | |
parent | a942eef12415e0b53e04279259dd92cca4d444f0 (diff) | |
download | jimtcl-a8a65e7637a4fc82777f2b0d822c902f333cd779.zip jimtcl-a8a65e7637a4fc82777f2b0d822c902f333cd779.tar.gz jimtcl-a8a65e7637a4fc82777f2b0d822c902f333cd779.tar.bz2 |
scan: Fix a utf-8 bug for string length
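The string length was being checked in chars instead of bytes, even though it is compared against byte offsets into the string; this could end scanning early on multi-byte UTF-8 input.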
Signed-off-by: Steve Bennett <steveb@workware.net.au>
-rw-r--r-- | jim.c | 25 | ||||
-rw-r--r-- | tests/utf8.test | 7 |
2 files changed, 20 insertions, 12 deletions
@@ -9690,7 +9690,7 @@ static Jim_Obj *JimScanAString(Jim_Interp *interp, const char *sdescr, const cha * returned of -1 in case of no conversion tool place and string was * already scanned thru */ -static int ScanOneEntry(Jim_Interp *interp, const char *str, int pos, int strLen, +static int ScanOneEntry(Jim_Interp *interp, const char *str, int pos, int str_bytelen, ScanFmtStringObj * fmtObj, long idx, Jim_Obj **valObjPtr) { const char *tok; @@ -9705,17 +9705,17 @@ static int ScanOneEntry(Jim_Interp *interp, const char *str, int pos, int strLen if (descr->prefix) { /* There was a prefix given before the conversion, skip it and adjust * the string-to-be-parsed accordingly */ - for (i = 0; pos < strLen && descr->prefix[i]; ++i) { + for (i = 0; pos < str_bytelen && descr->prefix[i]; ++i) { /* If prefix require, skip WS */ if (isspace(UCHAR(descr->prefix[i]))) - while (pos < strLen && isspace(UCHAR(str[pos]))) + while (pos < str_bytelen && isspace(UCHAR(str[pos]))) ++pos; else if (descr->prefix[i] != str[pos]) break; /* Prefix do not match here, leave the loop */ else ++pos; /* Prefix matched so far, next round */ } - if (pos >= strLen) { + if (pos >= str_bytelen) { return -1; /* All of str consumed: EOF condition */ } else if (descr->prefix[i] != 0) @@ -9725,6 +9725,7 @@ static int ScanOneEntry(Jim_Interp *interp, const char *str, int pos, int strLen if (descr->type != 'c' && descr->type != '[' && descr->type != 'n') while (isspace(UCHAR(str[pos]))) ++pos; + /* Determine how much skipped/scanned so far */ scanned = pos - anchor; @@ -9733,22 +9734,22 @@ static int ScanOneEntry(Jim_Interp *interp, const char *str, int pos, int strLen /* Return pseudo conversion means: how much scanned so far? 
*/ *valObjPtr = Jim_NewIntObj(interp, anchor + scanned); } - else if (pos >= strLen) { + else if (pos >= str_bytelen) { /* Cannot scan anything, as str is totally consumed */ return -1; } else if (descr->type == 'c') { - int c; - scanned += utf8_tounicode(&str[pos], &c); - *valObjPtr = Jim_NewIntObj(interp, c); - return scanned; + int c; + scanned += utf8_tounicode(&str[pos], &c); + *valObjPtr = Jim_NewIntObj(interp, c); + return scanned; } else { /* Processing of conversions follows ... */ if (descr->width > 0) { /* Do not try to scan as fas as possible but only the given width. * To ensure this, we copy the part that should be scanned. */ - size_t sLen = utf8_strlen(&str[pos], strLen - pos); + size_t sLen = utf8_strlen(&str[pos], str_bytelen - pos); size_t tLen = descr->width > sLen ? sLen : descr->width; tmpObj = Jim_NewStringObjUtf8(interp, str + pos, tLen); @@ -9838,7 +9839,7 @@ Jim_Obj *Jim_ScanString(Jim_Interp *interp, Jim_Obj *strObjPtr, Jim_Obj *fmtObjP size_t i, pos; int scanned = 1; const char *str = Jim_String(strObjPtr); - int strLen = Jim_Utf8Length(interp, strObjPtr); + int str_bytelen = Jim_Length(strObjPtr); Jim_Obj *resultList = 0; Jim_Obj **resultVec = 0; int resultc; @@ -9875,7 +9876,7 @@ Jim_Obj *Jim_ScanString(Jim_Interp *interp, Jim_Obj *strObjPtr, Jim_Obj *fmtObjP continue; /* As long as any conversion could be done, we will proceed */ if (scanned > 0) - scanned = ScanOneEntry(interp, str, pos, strLen, fmtObj, i, &value); + scanned = ScanOneEntry(interp, str, pos, str_bytelen, fmtObj, i, &value); /* In case our first try results in EOF, we will leave */ if (scanned == -1 && i == 0) goto eof; diff --git a/tests/utf8.test b/tests/utf8.test index 7b655da..5178d8b 100644 --- a/tests/utf8.test +++ b/tests/utf8.test @@ -169,4 +169,11 @@ test utf8-9.3 {string totitle} { string totitle abc-\u01c4 } "Abc-\u01c6" +# Previously scan was using char length instead of byte length +# when iterating over the string +test utf8-10.1 {scan with utf-8} { + 
scan ab\u0300c %c%c%c%c a b c d + list $a $b $c $d +} {97 98 768 99} + testreport |