aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSteve Bennett <steveb@workware.net.au>2011-04-21 20:00:24 +1000
committerSteve Bennett <steveb@workware.net.au>2011-04-21 20:46:07 +1000
commit9a98f12331c336ad7433f51a2edd8531be2086c6 (patch)
treebcab9a5b126063f726c4fd26bd5a3a22bdc45221
parent827c2764f9a4bebff0947ad1fd3cd5fb1f787c4a (diff)
downloadjimtcl-9a98f12331c336ad7433f51a2edd8531be2086c6.zip
jimtcl-9a98f12331c336ad7433f51a2edd8531be2086c6.tar.gz
jimtcl-9a98f12331c336ad7433f51a2edd8531be2086c6.tar.bz2
More efficient [string trimright] and [string last]
By searching backwards in [string trimright], even in utf-8 strings, and by using binary comparison in [string last]. Signed-off-by: Steve Bennett <steveb@workware.net.au>
-rw-r--r--jim.c72
-rw-r--r--utf8.c22
-rw-r--r--utf8.h10
3 files changed, 71 insertions, 33 deletions
diff --git a/jim.c b/jim.c
index 70e9967..a7d7022 100644
--- a/jim.c
+++ b/jim.c
@@ -354,23 +354,11 @@ static int JimStringFirst(const char *s1, int l1, const char *s2, int l2, int id
return -1;
}
+/**
+ * Note: Lengths and return value are in bytes, not chars.
+ */
static int JimStringLast(const char *s1, int l1, const char *s2, int l2)
{
-#ifdef JIM_UTF8
- int i = 0;
- /* It is too hard to search backwards with utf-8, so just keep using JimStringFirst()
- * until we find the last instance
- */
- int result = -1;
- /* Search is inclusive of l2 */
- l2++;
- while ((i = JimStringFirst(s1, l1, s2, l2, i)) >= 0) {
- int c;
- result = i;
- i += utf8_tounicode(s2 + i, &c);
- }
- return result;
-#else
const char *p;
if (!l1 || !l2 || l1 > l2)
@@ -383,9 +371,22 @@ static int JimStringLast(const char *s1, int l1, const char *s2, int l2)
}
}
return -1;
-#endif
}
+#ifdef JIM_UTF8
+/**
+ * Note: Lengths and return value are in chars.
+ */
+static int JimStringLastUtf8(const char *s1, int l1, const char *s2, int l2)
+{
+ int n = JimStringLast(s1, utf8_index(s1, l1), s2, utf8_index(s2, l2));
+ if (n > 0) {
+ n = utf8_strlen(s2, n);
+ }
+ return n;
+}
+#endif
+
int Jim_WideToString(char *buf, jim_wide wideValue)
{
const char *fmt = "%" JIM_WIDE_MODIFIER;
@@ -2408,22 +2409,26 @@ static Jim_Obj *JimStringToUpper(Jim_Interp *interp, Jim_Obj *strObjPtr)
return Jim_NewStringObjNoAlloc(interp, buf, len);
}
-/* Similar to strchr() except searches a UTF-8 string 'str' of byte length 'len'
+/* Similar to memchr() except searches a UTF-8 string 'str' of byte length 'len'
* for unicode character 'c'.
- * Returns 1 if found or 0 if not
+ * Returns the position if found or NULL if not
*/
-static int utf8_strchr(const char *str, int len, int c)
+static const char *utf8_memchr(const char *str, int len, int c)
{
+#ifdef JIM_UTF8
while (len) {
int sc;
int n = utf8_tounicode(str, &sc);
if (sc == c) {
- return 1;
+ return str;
}
str += n;
len -= n;
}
- return 0;
+ return NULL;
+#else
+ return memchr(str, c, len);
+#endif
}
/**
@@ -2439,7 +2444,7 @@ static const char *JimFindTrimLeft(const char *str, int len, const char *trimcha
int c;
int n = utf8_tounicode(str, &c);
- if (utf8_strchr(trimchars, trimlen, c) == 0) {
+ if (utf8_memchr(trimchars, trimlen, c) == NULL) {
/* Not a trim char, so stop */
break;
}
@@ -2458,26 +2463,23 @@ static const char *JimFindTrimLeft(const char *str, int len, const char *trimcha
*/
static const char *JimFindTrimRight(const char *str, int len, const char *trimchars, int trimlen)
{
- /* It is too hard to search backwards with utf-8, so just examine every char
- * of the string and remember the point just after the last non-trim char
- */
- const char *nontrim = NULL;
-
- /* XXX: Could optimize this for non-utf-8 by searching backwards */
+ str += len;
while (len) {
int c;
- int n = utf8_tounicode(str, &c);
+ int n = utf8_prev_len(str, len);
- str += n;
len -= n;
+ str -= n;
+
+ n = utf8_tounicode(str, &c);
- if (utf8_strchr(trimchars, trimlen, c) == 0) {
- nontrim = str;
+ if (utf8_memchr(trimchars, trimlen, c) == NULL) {
+ return str + n;
}
}
- return nontrim;
+ return NULL;
}
static const char default_trim_chars[] = " \t\n\r";
@@ -12536,7 +12538,11 @@ static int Jim_StringCoreCommand(Jim_Interp *interp, int argc, Jim_Obj *const *a
Jim_SetResultInt(interp, JimStringFirst(s1, l1, s2, l2, idx));
}
else {
+#ifdef JIM_UTF8
+ Jim_SetResultInt(interp, JimStringLastUtf8(s1, l1, s2, idx));
+#else
Jim_SetResultInt(interp, JimStringLast(s1, l1, s2, idx));
+#endif
}
return JIM_OK;
}
diff --git a/utf8.c b/utf8.c
index 3be9899..3b32a7f 100644
--- a/utf8.c
+++ b/utf8.c
@@ -10,6 +10,7 @@
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
+#include <assert.h>
#include "utf8.h"
/* This one is always implemented */
@@ -87,6 +88,27 @@ int utf8_charequal(const char *s1, const char *s2)
return c1 == c2;
}
+int utf8_prev_len(const char *str, int len)
+{
+ int n = 1;
+
+ assert(len > 0);
+
+ /* Look up to len bytes backward for a start-of-char byte */
+ while (--len) {
+ if ((str[-n] & 0x80) == 0) {
+ /* Start of a 1-byte char */
+ break;
+ }
+ if ((str[-n] & 0xc0) == 0xc0) {
+ /* Start of a multi-byte char */
+ break;
+ }
+ n++;
+ }
+ return n;
+}
+
int utf8_tounicode(const char *str, int *uc)
{
unsigned const char *s = (unsigned const char *)str;
diff --git a/utf8.h b/utf8.h
index 18aebfd..cbce8de 100644
--- a/utf8.h
+++ b/utf8.h
@@ -27,6 +27,7 @@ int utf8_fromunicode(char *p, unsigned short uc);
#define utf8_lower(C) tolower(C)
#define utf8_index(C, I) (I)
#define utf8_charlen(C) 1
+#define utf8_prev_len(S, L) 1
#else
/**
@@ -78,6 +79,15 @@ int utf8_index(const char *str, int charindex);
int utf8_tounicode(const char *str, int *uc);
/**
+ * Returns the number of bytes before 'str' that the previous
+ * utf-8 character sequence starts (which may be the middle of a sequence).
+ *
+ * Looks back at most 'len' bytes, which must be > 0.
+ * If no start char is found, returns 'len'.
+ */
+int utf8_prev_len(const char *str, int len);
+
+/**
* Returns the upper-case variant of the given unicode codepoint.
*
* Does not support unicode code points > \uffff