aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSteve Bennett <steveb@workware.net.au>2011-04-21 17:47:53 +1000
committerSteve Bennett <steveb@workware.net.au>2011-04-21 17:47:53 +1000
commit827c2764f9a4bebff0947ad1fd3cd5fb1f787c4a (patch)
treeb014bc7e08d060fad77a4333be08c91896b233fd
parentb9e7448cc5560d4f945d648f099be3219dfb2d81 (diff)
downloadjimtcl-827c2764f9a4bebff0947ad1fd3cd5fb1f787c4a.zip
jimtcl-827c2764f9a4bebff0947ad1fd3cd5fb1f787c4a.tar.gz
jimtcl-827c2764f9a4bebff0947ad1fd3cd5fb1f787c4a.tar.bz2
string trim is now 8 bit clean
i.e. supports embedded nulls. Also now supports utf-8 trim chars. Signed-off-by: Steve Bennett <steveb@workware.net.au>
-rw-r--r--jim.c145
-rw-r--r--tests/string.test15
2 files changed, 126 insertions, 34 deletions
diff --git a/jim.c b/jim.c
index f1d2605..70e9967 100644
--- a/jim.c
+++ b/jim.c
@@ -2408,77 +2408,154 @@ static Jim_Obj *JimStringToUpper(Jim_Interp *interp, Jim_Obj *strObjPtr)
return Jim_NewStringObjNoAlloc(interp, buf, len);
}
-static const char *trim_left(const char *str, const char *trimchars)
+/* Similar to strchr() except searches a UTF-8 string 'str' of byte length 'len'
+ * for unicode character 'c'.
+ *
+ * The string is walked one decoded character at a time: utf8_tounicode()
+ * yields the character and the number of bytes to advance. The search is
+ * bounded by 'len' rather than by a terminating null byte.
+ *
+ * Returns 1 if found or 0 if not
+ *
+ * NOTE(review): the loop stops only when 'len' reaches exactly 0, so this
+ * assumes utf8_tounicode() never reports more bytes than remain in the
+ * string - TODO confirm against its implementation.
+ */
+static int utf8_strchr(const char *str, int len, int c)
{
- return str + strspn(str, trimchars);
+ while (len) {
+ int sc;
+ int n = utf8_tounicode(str, &sc);
+ if (sc == c) {
+ return 1;
+ }
+ str += n;
+ len -= n;
+ }
+ return 0;
}
-/* Note that trim_right() always trims null characters */
-static void trim_right(char *str, const char *trimchars)
+/**
+ * Searches for the first non-trim char in string (str, len)
+ *
+ * A char is a trim char when utf8_strchr() finds it in (trimchars, trimlen).
+ * The string is decoded one character at a time with utf8_tounicode() and
+ * the scan is bounded by 'len', not by a terminating null byte.
+ *
+ * Returns a pointer into 'str' at the first non-trim char.
+ * If none is found, returns just past the last char.
+ *
+ * Lengths are in bytes.
+ *
+ * NOTE(review): the loop ends only when 'len' reaches exactly 0; this assumes
+ * utf8_tounicode() never advances past the end of the string - TODO confirm.
+ */
+static const char *JimFindTrimLeft(const char *str, int len, const char *trimchars, int trimlen)
{
- char *p = str + strlen(str) - 1;
- char *end = str - 1;
+ while (len) {
+ int c;
+ int n = utf8_tounicode(str, &c);
- while (p != end) {
- if (*p && strchr(trimchars, *p) == NULL) {
+ if (utf8_strchr(trimchars, trimlen, c) == 0) {
+ /* Not a trim char, so stop. 'str' has not been advanced yet, so it
+ * still points at this char */
break;
}
- p--;
+ str += n;
+ len -= n;
}
- p[1] = 0;
+ return str;
}
-static const char default_trim_chars[] = " \t\n\r";
-
-static Jim_Obj *JimStringTrim(Jim_Interp *interp, Jim_Obj *strObjPtr, Jim_Obj *trimcharsObjPtr)
+/**
+ * Finds the end of the last non-trim char in string (str, len).
+ *
+ * Although this serves the "trim right" case, the string is scanned forwards
+ * from the start (see the comment in the body for why).
+ *
+ * A char is a trim char when utf8_strchr() finds it in (trimchars, trimlen).
+ *
+ * Returns a pointer to just after the last non-trim char, or NULL if no
+ * non-trim char is found (every char is a trim char, or len is 0).
+ *
+ * Lengths are in bytes.
+ */
+static const char *JimFindTrimRight(const char *str, int len, const char *trimchars, int trimlen)
{
- char *buf;
- const char *trimchars = default_trim_chars;
+ /* It is too hard to search backwards with utf-8, so just examine every char
+ * of the string and remember the point just after the last non-trim char.
+ * 'nontrim' stays NULL until the first non-trim char has been seen.
+ */
+ const char *nontrim = NULL;
- if (strObjPtr->typePtr != &stringObjType) {
- SetStringFromAny(interp, strObjPtr);
- }
- if (trimcharsObjPtr) {
- trimchars = Jim_GetString(trimcharsObjPtr, NULL);
- }
+ /* XXX: Could optimize this for non-utf-8 by searching backwards */
- buf = Jim_Alloc(strObjPtr->length + 1);
- strcpy(buf, trim_left(strObjPtr->bytes, trimchars));
- trim_right(buf, trimchars);
+ while (len) {
+ int c;
+ int n = utf8_tounicode(str, &c);
+
+ str += n; /* advance first, so 'str' is just past the char being tested */
+ len -= n;
- return Jim_NewStringObjNoAlloc(interp, buf, -1);
+ if (utf8_strchr(trimchars, trimlen, c) == 0) {
+ nontrim = str;
+ }
+ }
+
+ return nontrim;
}
+static const char default_trim_chars[] = " \t\n\r";
+/* sizeof() here includes the null byte, so the default trim set handed to
+ * the find helpers is the four characters above plus a trailing 0 byte.
+ * NOTE(review): presumably deliberate, to keep trimming null characters by
+ * default as the old trim_right() always did - confirm that leading nulls
+ * should be trimmed by default as well.
+ */
+static int default_trim_chars_len = sizeof(default_trim_chars);
+/* Returns strObjPtr with leading trim chars removed.
+ *
+ * The trim set is the string rep of trimcharsObjPtr, or default_trim_chars
+ * when trimcharsObjPtr is NULL. If nothing needs trimming, strObjPtr itself
+ * is returned; otherwise a new string object holding the remaining bytes is
+ * returned.
+ */
static Jim_Obj *JimStringTrimLeft(Jim_Interp *interp, Jim_Obj *strObjPtr, Jim_Obj *trimcharsObjPtr)
{
- const char *str = Jim_GetString(strObjPtr, NULL);
+ int len;
+ const char *str = Jim_GetString(strObjPtr, &len);
const char *trimchars = default_trim_chars;
+ int trimcharslen = default_trim_chars_len;
+ const char *newstr;
if (trimcharsObjPtr) {
- trimchars = Jim_GetString(trimcharsObjPtr, NULL);
+ trimchars = Jim_GetString(trimcharsObjPtr, &trimcharslen);
+ }
+
+ newstr = JimFindTrimLeft(str, len, trimchars, trimcharslen);
+ if (newstr == str) {
+ return strObjPtr; /* first char is not a trim char (or string is empty): nothing to do */
}
- return Jim_NewStringObj(interp, trim_left(str, trimchars), -1);
+ return Jim_NewStringObj(interp, newstr, len - (newstr - str)); /* explicit byte length, not -1, so the result does not depend on strlen() */
}
static Jim_Obj *JimStringTrimRight(Jim_Interp *interp, Jim_Obj *strObjPtr, Jim_Obj *trimcharsObjPtr)
{ /* Returns strObjPtr with trailing trim chars removed; the result may be strObjPtr itself (possibly truncated in place), a new copy, or a new empty string */
- char *buf;
+ int len;
const char *trimchars = default_trim_chars;
+ int trimcharslen = default_trim_chars_len;
+ const char *nontrim;
if (trimcharsObjPtr) {
- trimchars = Jim_GetString(trimcharsObjPtr, NULL);
+ trimchars = Jim_GetString(trimcharsObjPtr, &trimcharslen);
}
-
if (strObjPtr->typePtr != &stringObjType) {
SetStringFromAny(interp, strObjPtr);
}
+ Jim_GetString(strObjPtr, &len); /* called only for the byte length; the bytes are read via strObjPtr->bytes below */
+ nontrim = JimFindTrimRight(strObjPtr->bytes, len, trimchars, trimcharslen);
+
+ if (nontrim == NULL) {
+ /* All trim (or empty input), so return a zero-length string */
+ return Jim_NewEmptyStringObj(interp);
+ }
+ if (nontrim == strObjPtr->bytes + len) {
+ return strObjPtr; /* last char is not a trim char: nothing to remove */
+ }
- buf = Jim_StrDup(strObjPtr->bytes);
- trim_right(buf, trimchars);
+ if (Jim_IsShared(strObjPtr)) {
+ strObjPtr = Jim_NewStringObj(interp, strObjPtr->bytes, (nontrim - strObjPtr->bytes)); /* leave the shared object untouched; copy only the kept prefix */
+ }
+ else {
+ /* Can modify this string in place: write a terminator at the trim
+ * point and shorten the recorded byte length.
+ * NOTE(review): only bytes/length are updated here; if the string
+ * type caches anything else derived from the length, confirm it
+ * remains valid after this truncation.
+ */
+ strObjPtr->bytes[nontrim - strObjPtr->bytes] = 0;
+ strObjPtr->length = (nontrim - strObjPtr->bytes);
+ }
+
+ return strObjPtr;
+}
+/* Returns strObjPtr with trim chars removed from both ends, by applying
+ * JimStringTrimLeft() and then JimStringTrimRight() to its result.
+ * The trim set is chosen by those helpers from trimcharsObjPtr.
+ */
+static Jim_Obj *JimStringTrim(Jim_Interp *interp, Jim_Obj *strObjPtr, Jim_Obj *trimcharsObjPtr)
+{
+ /* First trim left. */
+ Jim_Obj *objPtr = JimStringTrimLeft(interp, strObjPtr, trimcharsObjPtr);
+
+ /* Now trim right */
+ strObjPtr = JimStringTrimRight(interp, objPtr, trimcharsObjPtr);
+
+ if (objPtr != strObjPtr) {
+ /* The right trim produced a different object, so the left-trim result
+ * is no longer needed. Note that we don't want this object to be
+ * leaked: take a reference and drop it again.
+ * NOTE(review): presumably the decrement frees objPtr when nothing
+ * else references it and is harmless when objPtr is the caller's
+ * original object - verify against Jim_DecrRefCount.
+ */
+ Jim_IncrRefCount(objPtr);
+ Jim_DecrRefCount(interp, objPtr);
+ }
- return Jim_NewStringObjNoAlloc(interp, buf, -1);
+ return strObjPtr;
}
diff --git a/tests/string.test b/tests/string.test
index c2d5f10..6a70b79 100644
--- a/tests/string.test
+++ b/tests/string.test
@@ -809,4 +809,19 @@ test string-20.5 {string trimright} {
string trimright ""
} {}
+# Tests that string trim/trimleft/trimright are 8-bit clean (an embedded null inside the string is kept) and accept utf-8 trim chars
+test string-21.1 {string trim embedded nulls} {
+ string trim " abc\x00def "
+} "abc\x00def"
+test string-21.2 {string trimleft embedded nulls} {
+ string trimleft " abc\x00def "
+} "abc\x00def "
+test string-21.3 {string trimright embedded nulls} {
+ string trimright " abc\x00def "
+} " abc\x00def"
+test string-21.4 {string trim utf-8} {
+ string trim "\u00b5\u00b6abc\x00def\u00b5\u00b5" "\u00b5\u00b6"
+} "abc\x00def"
+
+
testreport