diff options
author | Steve Bennett <steveb@workware.net.au> | 2011-04-21 17:47:53 +1000 |
---|---|---|
committer | Steve Bennett <steveb@workware.net.au> | 2011-04-21 17:47:53 +1000 |
commit | 827c2764f9a4bebff0947ad1fd3cd5fb1f787c4a (patch) | |
tree | b014bc7e08d060fad77a4333be08c91896b233fd | |
parent | b9e7448cc5560d4f945d648f099be3219dfb2d81 (diff) | |
download | jimtcl-827c2764f9a4bebff0947ad1fd3cd5fb1f787c4a.zip jimtcl-827c2764f9a4bebff0947ad1fd3cd5fb1f787c4a.tar.gz jimtcl-827c2764f9a4bebff0947ad1fd3cd5fb1f787c4a.tar.bz2 |
string trim is now 8-bit clean,
i.e. it supports embedded nulls.
Also now supports UTF-8 trim chars.
Signed-off-by: Steve Bennett <steveb@workware.net.au>
-rw-r--r-- | jim.c | 145 | ||||
-rw-r--r-- | tests/string.test | 15 |
2 files changed, 126 insertions, 34 deletions
@@ -2408,77 +2408,154 @@ static Jim_Obj *JimStringToUpper(Jim_Interp *interp, Jim_Obj *strObjPtr) return Jim_NewStringObjNoAlloc(interp, buf, len); } -static const char *trim_left(const char *str, const char *trimchars) +/* Similar to strchr() except searches a UTF-8 string 'str' of byte length 'len' + * for unicode character 'c'. + * Returns 1 if found or 0 if not + */ +static int utf8_strchr(const char *str, int len, int c) { - return str + strspn(str, trimchars); + while (len) { + int sc; + int n = utf8_tounicode(str, &sc); + if (sc == c) { + return 1; + } + str += n; + len -= n; + } + return 0; } -/* Note that trim_right() always trims null characters */ -static void trim_right(char *str, const char *trimchars) +/** + * Searches for the first non-trim char in string (str, len) + * + * If none is found, returns just past the last char. + * + * Lengths are in bytes. + */ +static const char *JimFindTrimLeft(const char *str, int len, const char *trimchars, int trimlen) { - char *p = str + strlen(str) - 1; - char *end = str - 1; + while (len) { + int c; + int n = utf8_tounicode(str, &c); - while (p != end) { - if (*p && strchr(trimchars, *p) == NULL) { + if (utf8_strchr(trimchars, trimlen, c) == 0) { + /* Not a trim char, so stop */ break; } - p--; + str += n; + len -= n; } - p[1] = 0; + return str; } -static const char default_trim_chars[] = " \t\n\r"; - -static Jim_Obj *JimStringTrim(Jim_Interp *interp, Jim_Obj *strObjPtr, Jim_Obj *trimcharsObjPtr) +/** + * Searches backwards for a non-trim char in string (str, len). + * + * Returns a pointer to just after the non-trim char, or NULL if not found. + * + * Lengths are in bytes. 
+ */ +static const char *JimFindTrimRight(const char *str, int len, const char *trimchars, int trimlen) { - char *buf; - const char *trimchars = default_trim_chars; + /* It is too hard to search backwards with utf-8, so just examine every char + * of the string and remember the point just after the last non-trim char + */ + const char *nontrim = NULL; - if (strObjPtr->typePtr != &stringObjType) { - SetStringFromAny(interp, strObjPtr); - } - if (trimcharsObjPtr) { - trimchars = Jim_GetString(trimcharsObjPtr, NULL); - } + /* XXX: Could optimize this for non-utf-8 by searching backwards */ - buf = Jim_Alloc(strObjPtr->length + 1); - strcpy(buf, trim_left(strObjPtr->bytes, trimchars)); - trim_right(buf, trimchars); + while (len) { + int c; + int n = utf8_tounicode(str, &c); + + str += n; + len -= n; - return Jim_NewStringObjNoAlloc(interp, buf, -1); + if (utf8_strchr(trimchars, trimlen, c) == 0) { + nontrim = str; + } + } + + return nontrim; } +static const char default_trim_chars[] = " \t\n\r"; +/* sizeof() here includes the null byte */ +static int default_trim_chars_len = sizeof(default_trim_chars); + static Jim_Obj *JimStringTrimLeft(Jim_Interp *interp, Jim_Obj *strObjPtr, Jim_Obj *trimcharsObjPtr) { - const char *str = Jim_GetString(strObjPtr, NULL); + int len; + const char *str = Jim_GetString(strObjPtr, &len); const char *trimchars = default_trim_chars; + int trimcharslen = default_trim_chars_len; + const char *newstr; if (trimcharsObjPtr) { - trimchars = Jim_GetString(trimcharsObjPtr, NULL); + trimchars = Jim_GetString(trimcharsObjPtr, &trimcharslen); + } + + newstr = JimFindTrimLeft(str, len, trimchars, trimcharslen); + if (newstr == str) { + return strObjPtr; } - return Jim_NewStringObj(interp, trim_left(str, trimchars), -1); + return Jim_NewStringObj(interp, newstr, len - (newstr - str)); } static Jim_Obj *JimStringTrimRight(Jim_Interp *interp, Jim_Obj *strObjPtr, Jim_Obj *trimcharsObjPtr) { - char *buf; + int len; const char *trimchars = default_trim_chars; 
+ int trimcharslen = default_trim_chars_len; + const char *nontrim; if (trimcharsObjPtr) { - trimchars = Jim_GetString(trimcharsObjPtr, NULL); + trimchars = Jim_GetString(trimcharsObjPtr, &trimcharslen); } - if (strObjPtr->typePtr != &stringObjType) { SetStringFromAny(interp, strObjPtr); } + Jim_GetString(strObjPtr, &len); + nontrim = JimFindTrimRight(strObjPtr->bytes, len, trimchars, trimcharslen); + + if (nontrim == NULL) { + /* All trim, so return a zero-length string */ + return Jim_NewEmptyStringObj(interp); + } + if (nontrim == strObjPtr->bytes + len) { + return strObjPtr; + } - buf = Jim_StrDup(strObjPtr->bytes); - trim_right(buf, trimchars); + if (Jim_IsShared(strObjPtr)) { + strObjPtr = Jim_NewStringObj(interp, strObjPtr->bytes, (nontrim - strObjPtr->bytes)); + } + else { + /* Can modify this string in place */ + strObjPtr->bytes[nontrim - strObjPtr->bytes] = 0; + strObjPtr->length = (nontrim - strObjPtr->bytes); + } + + return strObjPtr; +} + +static Jim_Obj *JimStringTrim(Jim_Interp *interp, Jim_Obj *strObjPtr, Jim_Obj *trimcharsObjPtr) +{ + /* First trim left. 
*/ + Jim_Obj *objPtr = JimStringTrimLeft(interp, strObjPtr, trimcharsObjPtr); + + /* Now trim right */ + strObjPtr = JimStringTrimRight(interp, objPtr, trimcharsObjPtr); + + if (objPtr != strObjPtr) { + /* Note that we don't want this object to be leaked */ + Jim_IncrRefCount(objPtr); + Jim_DecrRefCount(interp, objPtr); + } - return Jim_NewStringObjNoAlloc(interp, buf, -1); + return strObjPtr; } diff --git a/tests/string.test b/tests/string.test index c2d5f10..6a70b79 100644 --- a/tests/string.test +++ b/tests/string.test @@ -809,4 +809,19 @@ test string-20.5 {string trimright} { string trimright "" } {} +# Test for 8-bit clean and utf-8 trim chars +test string-21.1 {string trim embedded nulls} { + string trim " abc\x00def " +} "abc\x00def" +test string-21.2 {string trimleft embedded nulls} { + string trimleft " abc\x00def " +} "abc\x00def " +test string-21.3 {string trimright embedded nulls} { + string trimright " abc\x00def " +} " abc\x00def" +test string-21.4 {string trim utf-8} { + string trim "\u00b5\u00b6abc\x00def\u00b5\u00b5" "\u00b5\u00b6" +} "abc\x00def" + + testreport |