aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-xconfigure2
-rwxr-xr-xconfigure.ac3
-rw-r--r--jimregexp.c872
-rw-r--r--jimregexp.h14
-rw-r--r--jimsh.c3
-rw-r--r--tests/Makefile2
-rw-r--r--tests/regcount.test35
-rw-r--r--tests/regexp2.test909
-rw-r--r--tests/utf8.test207
-rw-r--r--tests/utftcl.test290
10 files changed, 1871 insertions, 466 deletions
diff --git a/configure b/configure
index 17ce442..a0ce540 100755
--- a/configure
+++ b/configure
@@ -3645,7 +3645,7 @@ fi
if test "${enable_utf8+set}" = set; then :
enableval=$enable_utf8;
if test "x$enableval" = "xyes" ; then
- EXTRA_CFLAGS="$EXTRA_CFLAGS -DJIM_UTF8"
+ EXTRA_CFLAGS="$EXTRA_CFLAGS -DJIM_UTF8 -DJIM_REGEXP"
fi
diff --git a/configure.ac b/configure.ac
index 7913dce..5c3a27a 100755
--- a/configure.ac
+++ b/configure.ac
@@ -78,7 +78,8 @@ AC_ARG_ENABLE(utf8,
[ --enable-utf8 include support for utf8-encoded strings],
[
if test "x$enableval" = "xyes" ; then
- EXTRA_CFLAGS="$EXTRA_CFLAGS -DJIM_UTF8"
+ dnl Note that utf-8 support requires the built-in regexp
+ EXTRA_CFLAGS="$EXTRA_CFLAGS -DJIM_UTF8 -DJIM_REGEXP"
fi
]
)
diff --git a/jimregexp.c b/jimregexp.c
index 90c8dd7..d6a8723 100644
--- a/jimregexp.c
+++ b/jimregexp.c
@@ -40,6 +40,9 @@
*** seiwald@perforce.com, on 05 November 2002, to const string literals.
*** THIS IS AN ALTERED VERSION. It was altered by Steve Bennett <steveb@workware.net.au>
*** on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
+ *** This includes counted repetitions, UTF-8 support, character classes,
+ *** shorthand character classes, increased number of parentheses to 100,
+ *** backslash escape sequences.
*
* Beware that some of this code is subtly aware of the way operator
* precedence is structured in regular expressions. Serious changes in
@@ -52,6 +55,7 @@
#include "jim.h"
#include "jimregexp.h"
+#include "utf8.h"
#if !defined(HAVE_REGCOMP) || defined(JIM_REGEXP)
@@ -98,7 +102,7 @@
* The first byte of the regexp internal "program" is actually this magic
* number; the start node begins in the second byte.
*/
-#define REG_MAGIC 0234
+#define REG_MAGIC 0xFADED00D
/*
* Opcode notes:
@@ -132,9 +136,9 @@
* Using two bytes for the "next" pointer is vast overkill for most things,
* but allows patterns to get big without disasters.
*/
-#define OP(p) ((unsigned char)*(p))
-#define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
-#define OPERAND(p) ((p) + 3)
+#define OP(p) ((p)[0])
+#define NEXT(p) ((p)[1])
+#define OPERAND(p) ((p) + 2)
/*
* See regmagic.h for one further detail of program structure.
@@ -144,14 +148,11 @@
/*
* Utility definitions.
*/
-#ifndef CHARBITS
-#define UCHARAT(p) ((int)*(unsigned char *)(p))
-#else
-#define UCHARAT(p) ((int)*(p)&CHARBITS)
-#endif
+//#define UCHARAT(p) (*(p))
#define FAIL(R,M) { (R)->err = (M); return (M); }
-#define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
+#define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
+#define META "^$.[()|?{+*"
/*
* Flags to be passed up and down.
@@ -164,28 +165,42 @@
/*
* Forward declarations for regcomp()'s friends.
*/
-static char *reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
-static char *regpiece(regex_t *preg, int *flagp );
-static char *regbranch(regex_t *preg, int *flagp );
-static char *regatom(regex_t *preg, int *flagp );
-static char *regnode(regex_t *preg, int op );
-static const char *regnext(regex_t *preg, const char *p );
+static int *reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
+static int *regpiece(regex_t *preg, int *flagp );
+static int *regbranch(regex_t *preg, int *flagp );
+static int *regatom(regex_t *preg, int *flagp );
+static int *regnode(regex_t *preg, int op );
+static const int *regnext(regex_t *preg, const int *p );
static void regc(regex_t *preg, int b );
-static void reginsert(regex_t *preg, char op, char *opnd );
-static void regtail(regex_t *preg, char *p, const char *val );
-static void regoptail(regex_t *preg, char *p, const char *val );
+static int *reginsert(regex_t *preg, int op, int *opnd );
+static void regtail(regex_t *preg, int *p, const int *val );
+static void regoptail(regex_t *preg, int *p, const int *val );
-static const char *str_find(const char *string, char c, int nocase);
+static int reg_range_find(const int *string, int c, int nocase);
+static const char *str_find(const char *string, int c, int nocase);
+static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase);
/*#define DEBUG*/
#ifdef DEBUG
int regnarrate = 0;
static void regdump(regex_t *preg);
-static const char *regprop( const char *op );
+static const char *regprop( const int *op );
#endif
-static char regdummy;
+static int regdummy;
+
+/**
+ * Returns the length of the null-terminated integer sequence.
+ */
+static int str_int_len(const int *seq)
+{
+ int n = 0;
+ while (*seq++) {
+ n++;
+ }
+ return n;
+}
/*
- regcomp - compile a regular expression into internal code
@@ -204,8 +219,8 @@ static char regdummy;
*/
int regcomp(regex_t *preg, const char *exp, int cflags)
{
- const char *scan;
- const char *longest;
+ const int *scan;
+ const int *longest;
unsigned len;
int flags;
@@ -215,12 +230,9 @@ int regcomp(regex_t *preg, const char *exp, int cflags)
FAIL(preg, REG_ERR_NULL_ARGUMENT);
/* First pass: determine size, legality. */
-#ifdef notdef
- if (exp[0] == '.' && exp[1] == '*') exp += 2; /* aid grep */
-#endif
preg->cflags = cflags;
- preg->regparse = (char *)exp;
- preg->re_nsub = 1;
+ preg->regparse = exp;
+ preg->re_nsub = 0;
preg->regsize = 0L;
preg->regcode = &regdummy;
regc(preg, REG_MAGIC);
@@ -232,20 +244,21 @@ int regcomp(regex_t *preg, const char *exp, int cflags)
FAIL(preg,REG_ERR_TOO_BIG);
/* Allocate space. */
- preg->program = malloc(preg->regsize);
+ preg->program = malloc(preg->regsize * sizeof(*preg->program));
if (preg->program == NULL)
FAIL(preg, REG_ERR_NOMEM);
/* Second pass: emit code. */
- preg->regparse = (char *)exp;
- preg->re_nsub = 1;
+ preg->regparse = exp;
+ preg->re_nsub = 0;
+ preg->regsize = 0L;
preg->regcode = preg->program;
regc(preg, REG_MAGIC);
if (reg(preg, 0, &flags) == NULL)
return preg->err;
/* Dig out information for optimizations. */
- preg->regstart = '\0'; /* Worst-case defaults. */
+ preg->regstart = 0; /* Worst-case defaults. */
preg->reganch = 0;
preg->regmust = NULL;
preg->regmlen = 0;
@@ -270,18 +283,22 @@ int regcomp(regex_t *preg, const char *exp, int cflags)
if (flags&SPSTART) {
longest = NULL;
len = 0;
- for (; scan != NULL; scan = regnext(preg, scan))
- if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
- longest = OPERAND(scan);
- len = strlen(OPERAND(scan));
+ for (; scan != NULL; scan = regnext(preg, scan)) {
+ if (OP(scan) == EXACTLY) {
+ int plen = str_int_len(OPERAND(scan));
+ if (plen >= len) {
+ longest = OPERAND(scan);
+ len = plen;
+ }
}
+ }
preg->regmust = longest;
preg->regmlen = len;
}
}
#ifdef DEBUG
- /*regdump(preg);*/
+ regdump(preg);
#endif
return 0;
@@ -296,11 +313,11 @@ int regcomp(regex_t *preg, const char *exp, int cflags)
* is a trifle forced, but the need to tie the tails of the branches to what
* follows makes it hard to avoid.
*/
-static char *reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
+static int *reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
{
- char *ret;
- char *br;
- const char *ender;
+ int *ret;
+ int *br;
+ const int *ender;
int parno = 0;
int flags;
@@ -308,7 +325,7 @@ static char *reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
/* Make an OPEN node, if parenthesized. */
if (paren) {
- parno = preg->re_nsub++;
+ parno = ++preg->re_nsub;
ret = regnode(preg, OPEN+parno);
} else
ret = NULL;
@@ -340,7 +357,7 @@ static char *reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
regtail(preg, ret, ender);
/* Hook the tails of the branches to the closing node. */
- for (br = ret; br != NULL; br = (char *)regnext(preg, br))
+ for (br = ret; br != NULL; br = (int *)regnext(preg, br))
regoptail(preg, br, ender);
/* Check for proper termination. */
@@ -365,11 +382,11 @@ static char *reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
*
* Implements the concatenation operator.
*/
-static char *regbranch(regex_t *preg, int *flagp )
+static int *regbranch(regex_t *preg, int *flagp )
{
- char *ret;
- char *chain;
- char *latest;
+ int *ret;
+ int *chain;
+ int *latest;
int flags;
*flagp = WORST; /* Tentatively. */
@@ -382,10 +399,12 @@ static char *regbranch(regex_t *preg, int *flagp )
if (latest == NULL)
return(NULL);
*flagp |= flags&HASWIDTH;
- if (chain == NULL) /* First piece. */
+ if (chain == NULL) {/* First piece. */
*flagp |= flags&SPSTART;
- else
+ }
+ else {
regtail(preg, chain, latest);
+ }
chain = latest;
}
if (chain == NULL) /* Loop ran zero times. */
@@ -394,6 +413,31 @@ static char *regbranch(regex_t *preg, int *flagp )
return(ret);
}
+/**
+ * Duplicates the program at 'pos' of length 'len' at the end of the program.
+ *
+ * If 'maketail' is set, the next point for 'pos' is set to skip to the next
+ * part of the program after 'pos'.
+ */
+static int *regdup(regex_t *preg, int *pos, int len, int maketail)
+{
+ int i;
+
+ preg->regsize += len;
+
+ if (preg->regcode == &regdummy) {
+ return pos;
+ }
+
+ for (i = 0; i < len; i++) {
+ regc(preg, pos[i]);
+ }
+ if (maketail) {
+ regtail(preg, pos, pos + len);
+ }
+ return preg->regcode - len;
+}
+
/*
- regpiece - something followed by possible [*+?]
*
@@ -403,17 +447,21 @@ static char *regbranch(regex_t *preg, int *flagp )
* It might seem that this node could be dispensed with entirely, but the
* endmarker role is not redundant.
*/
-static char *regpiece(regex_t *preg, int *flagp )
+static int *regpiece(regex_t *preg, int *flagp)
{
- char *ret;
+ int *ret;
char op;
- char *next;
+ int *next;
int flags;
+ int size = preg->regsize;
+ int *chain = NULL;
ret = regatom(preg, &flags);
if (ret == NULL)
return(NULL);
+ size = preg->regsize - size;
+
op = *preg->regparse;
if (!ISMULT(op)) {
*flagp = flags;
@@ -426,6 +474,103 @@ static char *regpiece(regex_t *preg, int *flagp )
}
*flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
+ /* Handle braces (counted repetition) by expansion */
+ if (op == '{') {
+ int min = 0;
+ int max = 0;
+ char *end;
+
+ min = strtoul(preg->regparse + 1, &end, 10);
+ if (end == preg->regparse + 1) {
+ if (*end == ',') {
+ min = 0;
+ }
+ else {
+ preg->err = REG_ERR_BAD_COUNT;
+ return NULL;
+ }
+ }
+ preg->regparse = end;
+ max = strtoul(preg->regparse + 1, &end, 10);
+ if (*end != '}') {
+ preg->err = REG_ERR_UNMATCHED_BRACES;
+ return NULL;
+ }
+ if (end == preg->regparse + 1) {
+ max = -1;
+ }
+ else if (max < min || max >= 100) {
+ preg->err = REG_ERR_BAD_COUNT;
+ return NULL;
+ }
+ if (min >= 100) {
+ preg->err = REG_ERR_BAD_COUNT;
+ return NULL;
+ }
+
+ preg->regparse = strchr(preg->regparse, '}');
+
+ /* By default, chain to the start of the sequence */
+ chain = ret;
+
+ if (max < 0 || max == min) {
+ /* Simple case */
+ if (max == min) {
+ if (min == 0) {
+ /* {0,0} so do nothing at all */
+ reginsert(preg, NOTHING, ret);
+ preg->regparse++;
+ return ret;
+ }
+ /* Output 'min - 1' instances of 'x' */
+ min--;
+ op = 0;
+ }
+ else {
+ /* {n,} is just xxxx* */
+ op = '*';
+ /* No - chain to the tail of the sequence */
+ chain = NULL;
+ }
+
+ /* We need to duplicate the arg 'min' times */
+ while (min--) {
+ ret = regdup(preg, ret, size, 1);
+ }
+ }
+ else {
+ /* Complex case */
+ int i;
+
+ /* Chaining is needed */
+
+ /* Need to emit some min args first */
+ for (i = 0; i < min; i++) {
+ ret = regdup(preg, ret, size, 1);
+ }
+
+ for (i = min; i < max; i++) {
+ /* Emit x */
+ /* There is already one instance of 'reg' at the end */
+ /* Add another 'reg' at the end */
+ int *prog;
+
+ /* Convert to (x|), just like ? */
+ prog = reginsert(preg, BRANCH, ret); /* Either x */
+ regtail(preg, ret, regnode(preg, BRANCH)); /* or */
+ next = regnode(preg, NOTHING); /* null. */
+ regtail(preg, ret, next);
+ regoptail(preg, ret, next);
+
+ /* Now grab a copy ready for the next iteration */
+ if (i != max - 1) {
+ ret = regdup(preg, prog, size, 0);
+ }
+ }
+ op = 0;
+ }
+ }
+
if (op == '*' && (flags&SIMPLE))
reginsert(preg, STAR, ret);
else if (op == '*') {
@@ -458,24 +603,127 @@ static char *regpiece(regex_t *preg, int *flagp )
return NULL;
}
- return(ret);
+ return chain ? chain : ret;
}
+/**
+ * Add all characters in the inclusive range between lower and upper.
+ *
+ * Handles a swapped range (upper < lower).
+ */
static void reg_addrange(regex_t *preg, int lower, int upper)
{
if (lower > upper) {
reg_addrange(preg, upper, lower);
}
- while (lower <= upper) {
- regc(preg, lower++);
- }
+ /* Add a range as length, start */
+ regc(preg, upper - lower + 1);
+ regc(preg, lower);
}
-static void reg_addstr(regex_t *preg, const char *str)
+/**
+ * Add a null-terminated literal string as a set of ranges.
+ */
+static void reg_addrange_str(regex_t *preg, const char *str)
{
while (*str) {
- regc(preg, *str++);
+ reg_addrange(preg, *str, *str);
+ str++;
+ }
+}
+
+/**
+ * Extracts the next unicode char from utf8.
+ *
+ * If 'upper' is set, converts the char to uppercase.
+ */
+static int utf8_tounicode_case(const char *s, int *uc, int upper)
+{
+ int l = utf8_tounicode(s, uc);
+ if (upper) {
+ *uc = utf8_upper(*uc);
+ }
+ return l;
+}
+
+/**
+ * Converts a hex digit to decimal.
+ *
+ * Returns -1 for an invalid hex digit.
+ */
+static int xdigitval(int c)
+{
+ if (c >= '0' && c <= '9')
+ return c - '0';
+ if (c >= 'a' && c <= 'f')
+ return c - 'a' + 10;
+ if (c >= 'A' && c <= 'F')
+ return c - 'A' + 10;
+ return -1;
+}
+
+/**
+ * Parses up to 'n' hex digits at 's' and stores the result in *uc.
+ *
+ * Returns the number of hex digits parsed.
+ * If there are no hex digits, returns 0 and stores nothing.
+ */
+static int parse_hex(const char *s, int n, int *uc)
+{
+ int val = 0;
+ int k;
+
+ for (k = 0; k < n; k++) {
+ int c = xdigitval(*s++);
+ if (c == -1) {
+ break;
+ }
+ val = (val << 4) | c;
+ }
+ if (k) {
+ *uc = val;
}
+ return k;
+}
+
+/**
+ * Call for chars after a backlash to decode the escape sequence.
+ *
+ * Stores the result in *ch.
+ *
+ * Returns the number of bytes consumed.
+ */
+static int reg_decode_escape(const char *s, int *ch)
+{
+ int n;
+ const char *s0 = s;
+
+ *ch = *s++;
+
+ switch (*ch) {
+ case 'b': *ch = '\b'; break;
+ case 'e': *ch = 27; break;
+ case 'f': *ch = '\f'; break;
+ case 'n': *ch = '\n'; break;
+ case 'r': *ch = '\r'; break;
+ case 't': *ch = '\t'; break;
+ case 'v': *ch = '\v'; break;
+ case 'u':
+ if ((n = parse_hex(s, 4, ch)) > 0) {
+ s += n;
+ }
+ break;
+ case 'x':
+ if ((n = parse_hex(s, 2, ch)) > 0) {
+ s += n;
+ }
+ break;
+ case '\0':
+ s--;
+ *ch = '\\';
+ break;
+ }
+ return s - s0;
}
/*
@@ -486,14 +734,19 @@ static void reg_addstr(regex_t *preg, const char *str)
* faster to run. Backslashed characters are exceptions, each becoming a
* separate node; the code is simpler that way and it's not worth fixing.
*/
-static char *regatom(regex_t *preg, int *flagp )
+static int *regatom(regex_t *preg, int *flagp)
{
- char *ret;
+ int *ret;
int flags;
+ int nocase = (preg->cflags & REG_ICASE);
+
+ int ch;
+ int n = utf8_tounicode_case(preg->regparse, &ch, nocase);
*flagp = WORST; /* Tentatively. */
- switch (*preg->regparse++) {
+ preg->regparse += n;
+ switch (ch) {
/* FIXME: these chars only have meaning at beg/end of pat? */
case '^':
ret = regnode(preg, BOL);
@@ -506,41 +759,82 @@ static char *regatom(regex_t *preg, int *flagp )
*flagp |= HASWIDTH|SIMPLE;
break;
case '[': {
- if (*preg->regparse == '^') { /* Complement of range. */
+ const char *pattern = preg->regparse;
+
+ if (*pattern == '^') { /* Complement of range. */
ret = regnode(preg, ANYBUT);
- preg->regparse++;
+ pattern++;
} else
ret = regnode(preg, ANYOF);
- if (*preg->regparse == ']' || *preg->regparse == '-')
- regc(preg, *preg->regparse++);
- while (*preg->regparse != '\0' && *preg->regparse != ']') {
- if (*preg->regparse == '-') {
- preg->regparse++;
- if (*preg->regparse == ']' || *preg->regparse == '\0')
- regc(preg, '-');
- else {
- reg_addrange(preg, UCHARAT(preg->regparse-2), UCHARAT(preg->regparse));
- preg->regparse++;
+
+ /* Special case. If the first char is ']' or '-', it is part of the set */
+ if (*pattern == ']' || *pattern == '-') {
+ reg_addrange(preg, *pattern, *pattern);
+ pattern++;
+ }
+
+ while (*pattern && *pattern != ']') {
+ /* Is this a range? a-z */
+ int start;
+ int end;
+
+ pattern += utf8_tounicode_case(pattern, &start, nocase);
+ if (start == '\\') {
+ pattern += reg_decode_escape(pattern, &start);
+ if (start == 0) {
+ preg->err = REG_ERR_NULL_CHAR;
+ return NULL;
}
- } else if (strncmp(preg->regparse, "[:alpha:]", 9) == 0) {
- reg_addrange(preg,'a','z');
- reg_addrange(preg,'A','Z');
- preg->regparse += 9;
- } else if (strncmp(preg->regparse, "[:alnum:]", 9) == 0) {
- reg_addrange(preg,'a','z');
- reg_addrange(preg,'A','Z');
- reg_addrange(preg,'0','9');
- preg->regparse += 9;
- } else if (strncmp(preg->regparse, "[:space:]", 9) == 0) {
- reg_addstr(preg," \t\r\n\f\v");
- preg->regparse += 9;
- } else
- regc(preg, *preg->regparse++);
+ }
+ if (pattern[0] == '-' && pattern[1]) {
+ /* skip '-' */
+ pattern += utf8_tounicode(pattern, &end);
+ pattern += utf8_tounicode_case(pattern, &end, nocase);
+ if (end == '\\') {
+ pattern += reg_decode_escape(pattern, &end);
+ if (end == 0) {
+ preg->err = REG_ERR_NULL_CHAR;
+ return NULL;
+ }
+ }
+
+ reg_addrange(preg, start, end);
+ continue;
+ }
+ if (start == '[') {
+ if (strncmp(pattern, ":alpha:]", 8) == 0) {
+ if ((preg->cflags & REG_ICASE) == 0) {
+ reg_addrange(preg, 'a', 'z');
+ }
+ reg_addrange(preg, 'A', 'Z');
+ pattern += 8;
+ continue;
+ }
+ if (strncmp(pattern, ":alnum:]", 8) == 0) {
+ if ((preg->cflags & REG_ICASE) == 0) {
+ reg_addrange(preg, 'a', 'z');
+ }
+ reg_addrange(preg, 'A', 'Z');
+ reg_addrange(preg, '0', '9');
+ pattern += 8;
+ continue;
+ }
+ if (strncmp(pattern, ":space:]", 8) == 0) {
+ reg_addrange_str(preg, " \t\r\n\f\v");
+ pattern += 8;
+ continue;
+ }
+ }
+ /* Not a range, so just add the char */
+ reg_addrange(preg, start, start);
}
regc(preg, '\0');
- if (*preg->regparse == ']') {
- preg->regparse++;
+
+ if (*pattern) {
+ pattern++;
}
+ preg->regparse = pattern;
+
*flagp |= HASWIDTH|SIMPLE;
}
break;
@@ -559,10 +853,8 @@ static char *regatom(regex_t *preg, int *flagp )
case '?':
case '+':
case '*':
- preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
- return NULL;
case '{':
- preg->err = REG_ERR_COUNT_UNSUPPORTED;
+ preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
return NULL;
case '\\':
switch (*preg->regparse++) {
@@ -571,9 +863,11 @@ static char *regatom(regex_t *preg, int *flagp )
return NULL;
break;
case '<':
+ case 'm':
ret = regnode(preg, WORDA);
break;
case '>':
+ case 'M':
ret = regnode(preg, WORDZ);
break;
case 'd':
@@ -584,103 +878,97 @@ static char *regatom(regex_t *preg, int *flagp )
break;
case 'w':
ret = regnode(preg, ANYOF);
- reg_addrange(preg, 'a', 'z');
+ if ((preg->cflags & REG_ICASE) == 0) {
+ reg_addrange(preg, 'a', 'z');
+ }
reg_addrange(preg, 'A', 'Z');
reg_addrange(preg, '0', '9');
- regc(preg, '_');
+ reg_addrange(preg, '_', '_');
regc(preg, '\0');
*flagp |= HASWIDTH|SIMPLE;
break;
case 's':
ret = regnode(preg, ANYOF);
- reg_addstr(preg," \t\r\n\f\v");
+ reg_addrange_str(preg," \t\r\n\f\v");
regc(preg, '\0');
*flagp |= HASWIDTH|SIMPLE;
break;
/* FIXME: Someday handle \1, \2, ... */
default:
/* Handle general quoted chars in exact-match routine */
+ /* Back up to include the backslash */
+ preg->regparse--;
goto de_fault;
}
break;
de_fault:
- default:
- /*
- * Encode a string of characters to be matched exactly.
- *
- * This is a bit tricky due to quoted chars and due to
- * '*', '+', and '?' taking the SINGLE char previous
- * as their operand.
- *
- * On entry, the char at regparse[-1] is going to go
- * into the string, no matter what it is. (It could be
- * following a \ if we are entered from the '\' case.)
- *
- * Basic idea is to pick up a good char in ch and
- * examine the next char. If it's *+? then we twiddle.
- * If it's \ then we frozzle. If it's other magic char
- * we push ch and terminate the string. If none of the
- * above, we push ch on the string and go around again.
- *
- * regprev is used to remember where "the current char"
- * starts in the string, if due to a *+? we need to back
- * up and put the current char in a separate, 1-char, string.
- * When regprev is NULL, ch is the only char in the
- * string; this is used in *+? handling, and in setting
- * flags |= SIMPLE at the end.
- */
- {
- const char *regprev;
- char ch;
+ default: {
+ /*
+ * Encode a string of characters to be matched exactly.
+ */
+ int added = 0;
+
+ /* Back up to pick up the first char of interest */
+ preg->regparse -= n;
- preg->regparse--; /* Look at cur char */
ret = regnode(preg, EXACTLY);
- for ( regprev = 0 ; ; ) {
- ch = *preg->regparse++; /* Get current char */
- switch (*preg->regparse) { /* look at next one */
- default:
- regc(preg, ch); /* Add cur to string */
- break;
+ /* Note that a META operator such as ? or * consumes the
+ * preceding char.
+ * Thus we must be careful to look ahead by 2 and add the
+ * last char as it's own EXACTLY if necessary
+ */
+
+ /* Until end of string or a META char is reached */
+ while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
+ n = utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
+ if (ch == '\\' && preg->regparse[n]) {
+ /* Non-trailing backslash.
+ * Is this a special escape, or a regular escape?
+ */
+ if (strchr("<>mMwds", preg->regparse[n])) {
+ /* A special escape. All done with EXACTLY */
+ break;
+ }
+ /* Decode it. Note that we add the length for the escape
+ * sequence to the length for the backlash so we can skip
+ * the entire sequence, or not as required.
+ */
+ n += reg_decode_escape(preg->regparse + n, &ch);
+ if (ch == 0) {
+ preg->err = REG_ERR_NULL_CHAR;
+ return NULL;
+ }
+ }
+
+ /* Now we have one char 'ch' of length 'n'.
+ * Check to see if the following char is a MULT
+ */
- case '.': case '[': case '(':
- case ')': case '|': case '\n':
- case '$': case '^':
- case '\0':
- /* FIXME, $ and ^ should not always be magic */
- magic:
- regc(preg, ch); /* dump cur char */
- goto done; /* and we are done */
-
- case '?': case '+': case '*':
- if (!regprev) /* If just ch in str, */
- goto magic; /* use it */
- /* End mult-char string one early */
- preg->regparse = regprev; /* Back up parse */
- goto done;
-
- case '\\':
- regc(preg, ch); /* Cur char OK */
- switch (preg->regparse[1]){ /* Look after \ */
- case '\0':
- case '<':
- case '>':
- /* FIXME: Someday handle \1, \2, ... */
- goto done; /* Not quoted */
- default:
- /* Backup point is \, scan * point is after it. */
- regprev = preg->regparse;
- preg->regparse++;
- continue; /* NOT break; */
+ if (ISMULT(preg->regparse[n])) {
+ /* Yes. But do we already have some EXACTLY chars? */
+ if (added) {
+ /* Yes, so return what we have and pick up the current char next time around */
+ break;
}
+ /* No, so add this single char and finish */
+ regc(preg, ch);
+ added++;
+ preg->regparse += n;
+ break;
}
- regprev = preg->regparse; /* Set backup point */
+
+ /* No, so just add this char normally */
+ regc(preg, ch);
+ added++;
+ preg->regparse += n;
}
- done:
regc(preg, '\0');
+
*flagp |= HASWIDTH;
- if (!regprev) /* One char? */
+ if (added == 1)
*flagp |= SIMPLE;
+ break;
}
break;
}
@@ -692,21 +980,20 @@ static char *regatom(regex_t *preg, int *flagp )
- regnode - emit a node
*/
/* Location. */
-static char *regnode(regex_t *preg, int op)
+static int *regnode(regex_t *preg, int op)
{
- char *ret;
- char *ptr;
+ int *ret;
+ int *ptr;
+ preg->regsize += 2;
ret = preg->regcode;
if (ret == &regdummy) {
- preg->regsize += 3;
return(ret);
}
ptr = ret;
*ptr++ = op;
- *ptr++ = '\0'; /* Null "next" pointer. */
- *ptr++ = '\0';
+ *ptr++ = 0; /* Null "next" pointer. */
preg->regcode = ptr;
return(ret);
@@ -717,47 +1004,49 @@ static char *regnode(regex_t *preg, int op)
*/
static void regc(regex_t *preg, int b )
{
+ preg->regsize++;
if (preg->regcode != &regdummy)
*preg->regcode++ = b;
- else
- preg->regsize++;
}
/*
- reginsert - insert an operator in front of already-emitted operand
*
* Means relocating the operand.
+ * Returns the new location of the original operand.
*/
-static void reginsert(regex_t *preg, char op, char *opnd )
+static int *reginsert(regex_t *preg, int op, int *opnd )
{
- char *src;
- char *dst;
- char *place;
+ int *src;
+ int *dst;
+ int *place;
+
+ preg->regsize += 2;
if (preg->regcode == &regdummy) {
- preg->regsize += 3;
- return;
+ return opnd;
}
src = preg->regcode;
- preg->regcode += 3;
+ preg->regcode += 2;
dst = preg->regcode;
while (src > opnd)
*--dst = *--src;
place = opnd; /* Op node, where operand used to be. */
*place++ = op;
- *place++ = '\0';
- *place = '\0';
+ *place++ = 0;
+
+ return place;
}
/*
- regtail - set the next-pointer at the end of a node chain
*/
-static void regtail(regex_t *preg, char *p, const char *val )
+static void regtail(regex_t *preg, int *p, const int *val )
{
- char *scan;
- char *temp;
+ int *scan;
+ int *temp;
int offset;
if (p == &regdummy)
@@ -766,7 +1055,7 @@ static void regtail(regex_t *preg, char *p, const char *val )
/* Find last node. */
scan = p;
for (;;) {
- temp = (char *)regnext(preg, scan);
+ temp = (int *)regnext(preg, scan);
if (temp == NULL)
break;
scan = temp;
@@ -776,15 +1065,15 @@ static void regtail(regex_t *preg, char *p, const char *val )
offset = scan - val;
else
offset = val - scan;
- *(scan+1) = (offset>>8)&0377;
- *(scan+2) = offset&0377;
+
+ scan[1] = offset;
}
/*
- regoptail - regtail on operand of first argument; nop if operandless
*/
-static void regoptail(regex_t *preg, char *p, const char *val )
+static void regoptail(regex_t *preg, int *p, const int *val )
{
/* "Operandless" and "op != BRANCH" are synonymous in practice. */
if (p == NULL || p == &regdummy || OP(p) != BRANCH)
@@ -800,8 +1089,8 @@ static void regoptail(regex_t *preg, char *p, const char *val )
* Forwards.
*/
static int regtry(regex_t *preg, const char *string );
-static int regmatch(regex_t *preg, const char *prog);
-static int regrepeat(regex_t *preg, const char *p );
+static int regmatch(regex_t *preg, const int *prog);
+static int regrepeat(regex_t *preg, const int *p );
/*
- regexec - match a regexp against a string
@@ -816,7 +1105,7 @@ int regexec(regex_t *preg, const char *string, size_t nmatch, regmatch_t pmat
}
/* Check validity of program. */
- if (UCHARAT(preg->program) != REG_MAGIC) {
+ if (*preg->program != REG_MAGIC) {
return REG_ERR_CORRUPTED;
}
@@ -833,8 +1122,9 @@ int regexec(regex_t *preg, const char *string, size_t nmatch, regmatch_t pmat
if (preg->regmust != NULL) {
s = string;
while ((s = str_find(s, preg->regmust[0], preg->cflags & REG_ICASE)) != NULL) {
- if (strncmp(s, preg->regmust, preg->regmlen) == 0)
- break; /* Found it. */
+ if (prefix_cmp(preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
+ break;
+ }
s++;
}
if (s == NULL) /* Not present. */
@@ -914,48 +1204,86 @@ static int regtry( regex_t *preg, const char *string )
}
/**
- * Returns 1 if 'pattern' is a prefix of 'string'.
+ * Returns bytes matched if 'pattern' is a prefix of 'string'.
*
* If 'nocase' is non-zero, does a case-insensitive match.
+ *
+ * Returns -1 on not found.
*/
-static int prefix_cmp(const char *pattern, const char *string, int nocase)
+static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase)
{
- if (nocase) {
- while (*pattern && *string) {
- if (toupper(*pattern) != toupper(*string)) {
- break;
- }
- pattern++;
- string++;
+ const char *s = string;
+ while (proglen && *s) {
+ int ch;
+ int n = utf8_tounicode_case(s, &ch, nocase);
+ if (ch != *prog) {
+ return -1;
}
- return *pattern == 0;
+ prog++;
+ s += n;
+ proglen--;
}
- else {
- if (*pattern == *string) {
- int len = strlen(pattern);
- return strncmp(pattern, string, len) == 0;
- }
- return 0;
+ if (proglen == 0) {
+ return s - string;
}
+ return -1;
}
-static const char *str_find(const char *string, char c, int nocase)
+/**
+ * Searchs for 'c' in the range 'range'.
+ *
+ * If 'nocase' is set, the range is assumed to be uppercase
+ * and 'c' is converted to uppercase before matching.
+ *
+ * Returns 1 if found, or 0 if not.
+ */
+static int reg_range_find(const int *range, int c, int nocase)
{
if (nocase) {
- c = toupper(c);
- while (*string) {
- if (toupper(*string) == c) {
- return string;
- }
- string++;
+ /* The "string" should already be converted to uppercase */
+ c = utf8_upper(c);
+ }
+ while (*range) {
+ if (c >= range[1] && c <= (range[0] + range[1] - 1)) {
+ return 1;
}
- return NULL;
+ range += 2;
}
- else {
- return strchr(string, c);
+ return 0;
+}
+
+/**
+ * Search for the character 'c' in the utf-8 string 'string'.
+ *
+ * If 'nocase' is set, the 'string' is assumed to be uppercase
+ * and 'c' is converted to uppercase before matching.
+ *
+ * Returns the byte position in the string where the 'c' was found, or
+ * NULL if not found.
+ */
+static const char *str_find(const char *string, int c, int nocase)
+{
+ if (nocase) {
+ /* The "string" should already be converted to uppercase */
+ c = utf8_upper(c);
}
+ while (*string) {
+ int ch;
+ int n = utf8_tounicode_case(string, &ch, nocase);
+ if (c == ch) {
+ return string;
+ }
+ string += n;
+ }
+ return NULL;
}
+/**
+ * Returns true if 'ch' is an end-of-line char.
+ *
+ * In REG_NEWLINE mode, \n is considered EOL in
+ * addition to \0
+ */
static int reg_iseol(regex_t *preg, int ch)
{
if (preg->cflags & REG_NEWLINE) {
@@ -977,10 +1305,10 @@ static int reg_iseol(regex_t *preg, int ch)
* by recursion.
*/
/* 0 failure, 1 success */
-static int regmatch(regex_t *preg, const char *prog)
+static int regmatch(regex_t *preg, const int *prog)
{
- const char *scan; /* Current node. */
- const char *next; /* Next node. */
+ const int *scan; /* Current node. */
+ const int *next; /* Next node. */
scan = prog;
#ifdef DEBUG
@@ -1025,27 +1353,31 @@ static int regmatch(regex_t *preg, const char *prog)
preg->reginput++;
break;
case EXACTLY: {
- const char *opnd;
+ const int *opnd;
+ int len;
+ int slen;
opnd = OPERAND(scan);
+ len = str_int_len(opnd);
- if (prefix_cmp(opnd, preg->reginput, preg->cflags & REG_ICASE) == 0) {
+ slen = prefix_cmp(opnd, len, preg->reginput, preg->cflags & REG_ICASE);
+ if (slen < 0) {
return(0);
}
- preg->reginput += strlen(opnd);
+ preg->reginput += slen;
}
break;
case ANYOF:
if (reg_iseol(preg, *preg->reginput))
return 0;
- if (str_find(OPERAND(scan), *preg->reginput, preg->cflags & REG_ICASE) == NULL)
+ if (reg_range_find(OPERAND(scan), *preg->reginput, preg->cflags & REG_ICASE) == 0)
return(0);
preg->reginput++;
break;
case ANYBUT:
if (reg_iseol(preg, *preg->reginput))
return 0;
- if (str_find(OPERAND(scan), *preg->reginput, preg->cflags & REG_ICASE) != NULL)
+ if (reg_range_find(OPERAND(scan), *preg->reginput, preg->cflags & REG_ICASE) != 0)
return(0);
preg->reginput++;
break;
@@ -1089,8 +1421,10 @@ static int regmatch(regex_t *preg, const char *prog)
save = preg->reginput;
no = regrepeat(preg, OPERAND(scan));
while (no >= min) {
+ int ch;
+ utf8_tounicode_case(preg->reginput, &ch, (preg->cflags & REG_ICASE));
/* If it could work, try it. */
- if (reg_iseol(preg, nextch) || *preg->reginput == nextch)
+ if (reg_iseol(preg, nextch) || ch == nextch)
if (regmatch(preg, next))
return(1);
/* Couldn't or didn't -- back up. */
@@ -1148,11 +1482,11 @@ static int regmatch(regex_t *preg, const char *prog)
/*
- regrepeat - repeatedly match something simple, report how many
*/
-static int regrepeat(regex_t *preg, const char *p )
+static int regrepeat(regex_t *preg, const int *p )
{
int count = 0;
const char *scan;
- const char *opnd;
+ const int *opnd;
scan = preg->reginput;
opnd = OPERAND(p);
@@ -1165,9 +1499,14 @@ static int regrepeat(regex_t *preg, const char *p )
break;
case EXACTLY:
if (preg->cflags & REG_ICASE) {
- while (toupper(*opnd) == toupper(*scan)) {
+ while (1) {
+ int ch;
+ int n = utf8_tounicode_case(scan, &ch, 1);
+ if (*opnd != ch) {
+ break;
+ }
count++;
- scan++;
+ scan += n;
}
}
else {
@@ -1178,13 +1517,13 @@ static int regrepeat(regex_t *preg, const char *p )
}
break;
case ANYOF:
- while (!reg_iseol(preg, *scan) && str_find(opnd, *scan, preg->cflags & REG_ICASE) != NULL) {
+ while (!reg_iseol(preg, *scan) && reg_range_find(opnd, *scan, preg->cflags & REG_ICASE) != 0) {
count++;
scan++;
}
break;
case ANYBUT:
- while (!reg_iseol(preg, *scan) && str_find(opnd, *scan, preg->cflags & REG_ICASE) == NULL) {
+ while (!reg_iseol(preg, *scan) && reg_range_find(opnd, *scan, preg->cflags & REG_ICASE) == 0) {
count++;
scan++;
}
@@ -1202,7 +1541,7 @@ static int regrepeat(regex_t *preg, const char *p )
/*
- regnext - dig the "next" pointer out of a node
*/
-static const char *regnext(regex_t *preg, const char *p )
+static const int *regnext(regex_t *preg, const int *p )
{
int offset;
@@ -1226,25 +1565,45 @@ static const char *regnext(regex_t *preg, const char *p )
*/
static void regdump(regex_t *preg)
{
- const char *s;
+ const int *s;
char op = EXACTLY; /* Arbitrary non-END op. */
- const char *next;
+ const int *next;
+ char buf[4];
+ if (preg->regcode == &regdummy)
+ return;
s = preg->program + 1;
- while (op != END) { /* While that wasn't END last time... */
+ while (op != END && s < preg->regcode) { /* While that wasn't END last time... */
op = OP(s);
- printf("%2d%s", (int)(s-preg->program), regprop(s)); /* Where, what. */
+ printf("%2d{%02x}%s", (int)(s-preg->program), op, regprop(s)); /* Where, what. */
next = regnext(preg, s);
if (next == NULL) /* Next ptr. */
printf("(0)");
else
printf("(%d)", (int)((s-preg->program)+(next-s)));
- s += 3;
- if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
+ s += 2;
+ if (op == ANYOF || op == ANYBUT) {
+ /* set of ranges */
+
+ while (*s) {
+ int len = *s++;
+ int first = *s++;
+ buf[utf8_fromunicode(buf, first)] = 0;
+ printf("%s", buf);
+ if (len > 1) {
+ buf[utf8_fromunicode(buf, first + len - 1)] = 0;
+ printf("-%s", buf);
+ }
+ }
+ s++;
+ }
+ else if (op == EXACTLY) {
/* Literal string, where present. */
- while (*s != '\0') {
- putchar(*s);
+
+ while (*s) {
+ buf[utf8_fromunicode(buf, *s)] = 0;
+ printf("%s", buf);
s++;
}
s++;
@@ -1252,20 +1611,29 @@ static void regdump(regex_t *preg)
putchar('\n');
}
- /* Header fields of interest. */
- if (preg->regstart != '\0')
- printf("start `%c' ", preg->regstart);
- if (preg->reganch)
- printf("anchored ");
- if (preg->regmust != NULL)
- printf("must have \"%s\"", preg->regmust);
+ if (op == END) {
+ /* Header fields of interest. */
+ if (preg->regstart != '\0')
+ buf[utf8_fromunicode(buf, preg->regstart)] = 0;
+ printf("start '%s' ", buf);
+ if (preg->reganch)
+ printf("anchored ");
+ if (preg->regmust != NULL) {
+ int i;
+ printf("must have:");
+ for (i = 0; i < preg->regmlen; i++) {
+ putchar(preg->regmust[i]);
+ }
+ putchar('\n');
+ }
+ }
printf("\n");
}
/*
- regprop - printable representation of opcode
*/
-static const char *regprop( const char *op )
+static const char *regprop( const int *op )
{
char *p;
static char buf[50];
@@ -1346,6 +1714,8 @@ size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_s
"out of memory",
"too many ()",
"parentheses () not balanced",
+ "braces {} not balanced",
+ "invalid repetition count(s)",
"extra characters",
"*+ of empty atom",
"nested count",
@@ -1353,7 +1723,7 @@ size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_s
"count follows nothing",
"trailing backslash",
"corrupted program",
- "braces {} not supported"
+ "contains null char",
};
const char *err;
diff --git a/jimregexp.h b/jimregexp.h
index 751bbf7..5d7dc94 100644
--- a/jimregexp.h
+++ b/jimregexp.h
@@ -50,15 +50,15 @@ typedef struct regexp {
/* -- private -- */
int cflags; /* Flags used when compiling */
int err; /* Any error which occurred during compile */
- char regstart; /* Internal use only. */
- char reganch; /* Internal use only. */
- const char *regmust; /* Internal use only. */
+ int regstart; /* Internal use only. */
+ int reganch; /* Internal use only. */
+ const int *regmust; /* Internal use only. */
int regmlen; /* Internal use only. */
- char *program; /* Allocated */
+ int *program; /* Allocated */
/* working state - compile */
const char *regparse; /* Input-scan pointer. */
- char *regcode; /* Code-emit pointer; &regdummy = don't. */
+ int *regcode; /* Code-emit pointer; &regdummy = don't. */
long regsize; /* Code size. */
/* working state - exec */
@@ -90,6 +90,8 @@ enum {
REG_ERR_NOMEM,
REG_ERR_TOO_MANY_PAREN,
REG_ERR_UNMATCHED_PAREN,
+ REG_ERR_UNMATCHED_BRACES,
+ REG_ERR_BAD_COUNT,
REG_ERR_JUNK_ON_END,
REG_ERR_OPERAND_COULD_BE_EMPTY,
REG_ERR_NESTED_COUNT,
@@ -97,7 +99,7 @@ enum {
REG_ERR_COUNT_FOLLOWS_NOTHING,
REG_ERR_TRAILING_BACKSLASH,
REG_ERR_CORRUPTED,
- REG_ERR_COUNT_UNSUPPORTED,
+ REG_ERR_NULL_CHAR,
REG_ERR_NUM
};
diff --git a/jimsh.c b/jimsh.c
index 9b69def..1f161c4 100644
--- a/jimsh.c
+++ b/jimsh.c
@@ -94,6 +94,9 @@ int main(int argc, char *const argv[])
if (argc > 2 && strcmp(argv[1], "-e") == 0) {
JimSetArgv(interp, argc - 3, argv + 3);
retcode = Jim_Eval(interp, argv[2]);
+ if (retcode != JIM_ERR) {
+ printf("%s\n", Jim_GetString(Jim_GetResult(interp), NULL));
+ }
}
else {
Jim_SetVariableStr(interp, "argv0", Jim_NewStringObj(interp, argv[1], -1));
diff --git a/tests/Makefile b/tests/Makefile
index 285fb9e..02c6c1e 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -9,4 +9,4 @@ test:
make -C .. all
clean:
- rm -f gorp.file2 cat gorp.file sleep exit wc sh echo
+ rm -f gorp.file2 cat gorp.file sleep exit wc sh echo test1
diff --git a/tests/regcount.test b/tests/regcount.test
new file mode 100644
index 0000000..4b4d106
--- /dev/null
+++ b/tests/regcount.test
@@ -0,0 +1,35 @@
+source testing.tcl
+
+# Test regexp counted repetitions
+
+set n 0
+foreach {pat str exp} {
+ a+ bac 1
+ a{1,} bac 1
+ a* bac 1
+ a{0,} bac 1
+ aa+ bac 0
+ a{2,} bac 0
+ a{2,} bacaad 1
+ a{3,} bacaad 0
+ a{2,}$ bacaad 0
+ a{2,}$ bacaa 1
+ a{2,}$ ba 0
+ a{2,}$ aa 1
+ a{0,0}b$ b 1
+ a{1,1}b$ b 0
+ a{1,1}b$ cab 1
+ a{2,2}b$ cab 0
+ a{2,2}b$ cabaabx 0
+ a{2,2}b$ cacaab 1
+ ca{2,4}b cacaab 1
+ ca{2,3}b cacaab 1
+ ca{2,3}b cacaaab 1
+ c(a|b){2,3}d xcbad 1
+ c(a|b){2,3}d xcabbd 1
+ c(a|b){2,3}d xcbaaad 0
+} {
+ test regcount-1.[incr n] "Test: regexp $pat" [list regexp -- $pat $str] $exp
+}
+
+testreport
diff --git a/tests/regexp2.test b/tests/regexp2.test
new file mode 100644
index 0000000..a061e8b
--- /dev/null
+++ b/tests/regexp2.test
@@ -0,0 +1,909 @@
+# Commands covered: regexp, regsub
+#
+# This file contains a collection of tests for one or more of the Tcl
+# built-in commands. Sourcing this file into Tcl runs the tests and
+# generates output for errors. No output means no errors were found.
+#
+# Copyright (c) 1991-1993 The Regents of the University of California.
+# Copyright (c) 1998 Sun Microsystems, Inc.
+# Copyright (c) 1998-1999 by Scriptics Corporation.
+#
+# See the file "license.terms" for information on usage and redistribution
+# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
+#
+# RCS: @(#) $Id$
+
+source testing.tcl
+
+catch {package require regexp}
+if {[info commands regexp] eq "" || [regexp {\d} 1] == 0} {
+ # No regexp, or not using a sufficiently capable regexp implementation
+ puts " --- skipped"
+ exit 0
+}
+
+
+# Procedure to evaluate a script within a proc, to test compilation
+# functionality
+
+proc evalInProc { script } {
+ proc testProc {} $script
+ set status [catch {
+ testProc
+ } result]
+ rename testProc {}
+ return $result
+ #return [list $status $result]
+}
+
+catch {unset foo}
+test regexpComp-1.1 {basic regexp operation} {
+ evalInProc {
+ regexp ab*c abbbc
+ }
+} 1
+test regexpComp-1.2 {basic regexp operation} {
+ evalInProc {
+ regexp ab*c ac
+ }
+} 1
+test regexpComp-1.3 {basic regexp operation} {
+ evalInProc {
+ regexp ab*c ab
+ }
+} 0
+test regexpComp-1.4 {basic regexp operation} {
+ evalInProc {
+ regexp -- -gorp abc-gorpxxx
+ }
+} 1
+test regexpComp-1.5 {basic regexp operation} {
+ evalInProc {
+ regexp {^([^ ]*)[ ]*([^ ]*)} "" a
+ }
+} 1
+test regexpComp-1.6 {basic regexp operation} {
+ list [catch {regexp {} abc} msg] $msg
+} {0 1}
+test regexpComp-1.7 {regexp utf compliance} {
+ # if not UTF-8 aware, result is "0 1"
+ evalInProc {
+ set foo "\u4e4eb q"
+ regexp "\u4e4eb q" "a\u4e4eb qw\u5e4e\x4e wq" bar
+ list [string compare $foo $bar] [regexp 4 $bar]
+ }
+} {0 0}
+
+test regexpComp-2.1 {getting substrings back from regexp} {
+ evalInProc {
+ set foo {}
+ list [regexp ab*c abbbbc foo] $foo
+ }
+} {1 abbbbc}
+test regexpComp-2.2 {getting substrings back from regexp} {
+ evalInProc {
+ set foo {}
+ set f2 {}
+ list [regexp a(b*)c abbbbc foo f2] $foo $f2
+ }
+} {1 abbbbc bbbb}
+test regexpComp-2.3 {getting substrings back from regexp} {
+ evalInProc {
+ set foo {}
+ set f2 {}
+ list [regexp a(b*)(c) abbbbc foo f2] $foo $f2
+ }
+} {1 abbbbc bbbb}
+test regexpComp-2.4 {getting substrings back from regexp} {
+ evalInProc {
+ set foo {}
+ set f2 {}
+ set f3 {}
+ list [regexp a(b*)(c) abbbbc foo f2 f3] $foo $f2 $f3
+ }
+} {1 abbbbc bbbb c}
+test regexpComp-2.5 {getting substrings back from regexp} {
+ evalInProc {
+ set foo {}; set f1 {}; set f2 {}; set f3 {}; set f4 {}; set f5 {};
+ set f6 {}; set f7 {}; set f8 {}; set f9 {}; set fa {}; set fb {};
+ list [regexp (1*)(2*)(3*)(4*)(5*)(6*)(7*)(8*)(9*)(a*)(b*) \
+ 12223345556789999aabbb \
+ foo f1 f2 f3 f4 f5 f6 f7 f8 f9 fa fb] $foo $f1 $f2 $f3 $f4 $f5 \
+ $f6 $f7 $f8 $f9 $fa $fb
+ }
+} {1 12223345556789999aabbb 1 222 33 4 555 6 7 8 9999 aa bbb}
+test regexpComp-2.6 {getting substrings back from regexp} {
+ evalInProc {
+ set foo 2; set f2 2; set f3 2; set f4 2
+ list [regexp (a)(b)? xay foo f2 f3 f4] $foo $f2 $f3 $f4
+ }
+} {1 a a {} {}}
+test regexpComp-2.7 {getting substrings back from regexp} {
+ evalInProc {
+ set foo 1; set f2 1; set f3 1; set f4 1
+ list [regexp (a)(b)?(c) xacy foo f2 f3 f4] $foo $f2 $f3 $f4
+ }
+} {1 ac a {} c}
+test regexpComp-2.8 {getting substrings back from regexp} {
+ evalInProc {
+ set match {}
+ list [regexp {^a*b} aaaab match] $match
+ }
+} {1 aaaab}
+
+test regexpComp-3.1 {-indices option to regexp} {
+ evalInProc {
+ set foo {}
+ list [regexp -indices ab*c abbbbc foo] $foo
+ }
+} {1 {0 5}}
+test regexpComp-3.2 {-indices option to regexp} {
+ evalInProc {
+ set foo {}
+ set f2 {}
+ list [regexp -indices a(b*)c abbbbc foo f2] $foo $f2
+ }
+} {1 {0 5} {1 4}}
+test regexpComp-3.3 {-indices option to regexp} {
+ evalInProc {
+ set foo {}
+ set f2 {}
+ list [regexp -indices a(b*)(c) abbbbc foo f2] $foo $f2
+ }
+} {1 {0 5} {1 4}}
+test regexpComp-3.4 {-indices option to regexp} {
+ evalInProc {
+ set foo {}
+ set f2 {}
+ set f3 {}
+ list [regexp -indices a(b*)(c) abbbbc foo f2 f3] $foo $f2 $f3
+ }
+} {1 {0 5} {1 4} {5 5}}
+test regexpComp-3.5 {-indices option to regexp} {
+ evalInProc {
+ set foo {}; set f1 {}; set f2 {}; set f3 {}; set f4 {}; set f5 {};
+ set f6 {}; set f7 {}; set f8 {}; set f9 {}
+ list [regexp -indices (1*)(2*)(3*)(4*)(5*)(6*)(7*)(8*)(9*) \
+ 12223345556789999 \
+ foo f1 f2 f3 f4 f5 f6 f7 f8 f9] $foo $f1 $f2 $f3 $f4 $f5 \
+ $f6 $f7 $f8 $f9
+ }
+} {1 {0 16} {0 0} {1 3} {4 5} {6 6} {7 9} {10 10} {11 11} {12 12} {13 16}}
+test regexpComp-3.6 {getting substrings back from regexp} {
+ evalInProc {
+ set foo 2; set f2 2; set f3 2; set f4 2
+ list [regexp -indices (a)(b)? xay foo f2 f3 f4] $foo $f2 $f3 $f4
+ }
+} {1 {1 1} {1 1} {-1 -1} {-1 -1}}
+test regexpComp-3.7 {getting substrings back from regexp} {
+ evalInProc {
+ set foo 1; set f2 1; set f3 1; set f4 1
+ list [regexp -indices (a)(b)?(c) xacy foo f2 f3 f4] $foo $f2 $f3 $f4
+ }
+} {1 {1 2} {1 1} {-1 -1} {2 2}}
+
+test regexpComp-4.1 {-nocase option to regexp} {
+ evalInProc {
+ regexp -nocase foo abcFOo
+ }
+} 1
+test regexpComp-4.2 {-nocase option to regexp} {
+ evalInProc {
+ set f1 22
+ set f2 33
+ set f3 44
+ list [regexp -nocase {a(b*)([xy]*)z} aBbbxYXxxZ22 f1 f2 f3] $f1 $f2 $f3
+ }
+} {1 aBbbxYXxxZ Bbb xYXxx}
+test regexpComp-4.3 {-nocase option to regexp} {
+ evalInProc {
+ regexp -nocase FOo abcFOo
+ }
+} 1
+set ::x abcdefghijklmnopqrstuvwxyz1234567890
+set ::x $x$x$x$x$x$x$x$x$x$x$x$x
+test regexpComp-4.4 {case conversion in regexp} {
+ evalInProc {
+ list [regexp -nocase $::x $::x foo] $foo
+ }
+} "1 $x"
+catch {unset ::x}
+
+test regexpComp-5.1 {exercise cache of compiled expressions} {
+ evalInProc {
+ regexp .*a b
+ regexp .*b c
+ regexp .*c d
+ regexp .*d e
+ regexp .*e f
+ regexp .*a bbba
+ }
+} 1
+test regexpComp-5.2 {exercise cache of compiled expressions} {
+ evalInProc {
+ regexp .*a b
+ regexp .*b c
+ regexp .*c d
+ regexp .*d e
+ regexp .*e f
+ regexp .*b xxxb
+ }
+} 1
+test regexpComp-5.3 {exercise cache of compiled expressions} {
+ evalInProc {
+ regexp .*a b
+ regexp .*b c
+ regexp .*c d
+ regexp .*d e
+ regexp .*e f
+ regexp .*c yyyc
+ }
+} 1
+test regexpComp-5.4 {exercise cache of compiled expressions} {
+ evalInProc {
+ regexp .*a b
+ regexp .*b c
+ regexp .*c d
+ regexp .*d e
+ regexp .*e f
+ regexp .*d 1d
+ }
+} 1
+test regexpComp-5.5 {exercise cache of compiled expressions} {
+ evalInProc {
+ regexp .*a b
+ regexp .*b c
+ regexp .*c d
+ regexp .*d e
+ regexp .*e f
+ regexp .*e xe
+ }
+} 1
+
+test regexpComp-6.4 {regexp errors} {
+ evalInProc {
+ list [catch {regexp a( b} msg] $msg
+ }
+} {1 {couldn't compile regular expression pattern: parentheses () not balanced}}
+test regexpComp-6.5 {regexp errors} {
+ evalInProc {
+ list [catch {regexp a( b} msg] $msg
+ }
+} {1 {couldn't compile regular expression pattern: parentheses () not balanced}}
+test regexpComp-6.6 {regexp errors} {
+ evalInProc {
+ list [catch {regexp a a f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1} msg] $msg
+ }
+} {0 1}
+test regexpComp-6.7 {regexp errors} {
+ evalInProc {
+ list [catch {regexp (x)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.) xyzzy} msg] $msg
+ }
+} {0 0}
+test regexpComp-6.8 {regexp errors} {
+ evalInProc {
+ catch {unset f1}
+ set f1 44
+ catch {regexp abc abc f1(f2)} msg
+ }
+} {1}
+test regexpComp-6.9 {regexp errors, -start bad int check} {
+ evalInProc {
+ list [catch {regexp -start bogus {^$} {}} msg] $msg
+ }
+} {1 {bad index "bogus": must be integer?[+-]integer? or end?[+-]integer?}}
+
+test regexpComp-7.1 {basic regsub operation} {
+ evalInProc {
+ list [regsub aa+ xaxaaaxaa 111&222 foo] $foo
+ }
+} {1 xax111aaa222xaa}
+test regexpComp-7.2 {basic regsub operation} {
+ evalInProc {
+ list [regsub aa+ aaaxaa &111 foo] $foo
+ }
+} {1 aaa111xaa}
+test regexpComp-7.3 {basic regsub operation} {
+ evalInProc {
+ list [regsub aa+ xaxaaa 111& foo] $foo
+ }
+} {1 xax111aaa}
+test regexpComp-7.4 {basic regsub operation} {
+ evalInProc {
+ list [regsub aa+ aaa 11&2&333 foo] $foo
+ }
+} {1 11aaa2aaa333}
+test regexpComp-7.5 {basic regsub operation} {
+ evalInProc {
+ list [regsub aa+ xaxaaaxaa &2&333 foo] $foo
+ }
+} {1 xaxaaa2aaa333xaa}
+test regexpComp-7.6 {basic regsub operation} {
+ evalInProc {
+ list [regsub aa+ xaxaaaxaa 1&22& foo] $foo
+ }
+} {1 xax1aaa22aaaxaa}
+test regexpComp-7.7 {basic regsub operation} {
+ evalInProc {
+ list [regsub a(a+) xaxaaaxaa {1\122\1} foo] $foo
+ }
+} {1 xax1aa22aaxaa}
+test regexpComp-7.8 {basic regsub operation} {
+ evalInProc {
+ list [regsub a(a+) xaxaaaxaa {1\\\122\1} foo] $foo
+ }
+} "1 {xax1\\aa22aaxaa}"
+test regexpComp-7.9 {basic regsub operation} {
+ evalInProc {
+ list [regsub a(a+) xaxaaaxaa {1\\122\1} foo] $foo
+ }
+} "1 {xax1\\122aaxaa}"
+test regexpComp-7.10 {basic regsub operation} {
+ evalInProc {
+ list [regsub a(a+) xaxaaaxaa {1\\&\1} foo] $foo
+ }
+} "1 {xax1\\aaaaaxaa}"
+test regexpComp-7.11 {basic regsub operation} {
+ evalInProc {
+ list [regsub a(a+) xaxaaaxaa {1\&\1} foo] $foo
+ }
+} {1 xax1&aaxaa}
+test regexpComp-7.12 {basic regsub operation} {
+ evalInProc {
+ list [regsub a(a+) xaxaaaxaa {\1\1\1\1&&} foo] $foo
+ }
+} {1 xaxaaaaaaaaaaaaaaxaa}
+test regexpComp-7.13 {basic regsub operation} {
+ evalInProc {
+ set foo xxx
+ list [regsub abc xyz 111 foo] $foo
+ }
+} {0 xyz}
+test regexpComp-7.14 {basic regsub operation} {
+ evalInProc {
+ set foo xxx
+ list [regsub ^ xyz "111 " foo] $foo
+ }
+} {1 {111 xyz}}
+test regexpComp-7.15 {basic regsub operation} {
+ evalInProc {
+ set foo xxx
+ list [regsub -- -foo abc-foodef "111 " foo] $foo
+ }
+} {1 {abc111 def}}
+test regexpComp-7.16 {basic regsub operation} {
+ evalInProc {
+ set foo xxx
+ list [regsub x "" y foo] $foo
+ }
+} {0 {}}
+test regexpComp-7.17 {regsub utf compliance} {
+ evalInProc {
+ # if not UTF-8 aware, result is "0 1"
+ set foo "xyz555ijka\u4e4ebpqr"
+ regsub a\u4e4eb xyza\u4e4ebijka\u4e4ebpqr 555 bar
+ list [string compare $foo $bar] [regexp 4 $bar]
+ }
+} {0 0}
+
+test regexpComp-8.1 {case conversion in regsub} {
+ evalInProc {
+ list [regsub -nocase a(a+) xaAAaAAay & foo] $foo
+ }
+} {1 xaAAaAAay}
+test regexpComp-8.2 {case conversion in regsub} {
+ evalInProc {
+ list [regsub -nocase a(a+) xaAAaAAay & foo] $foo
+ }
+} {1 xaAAaAAay}
+test regexpComp-8.3 {case conversion in regsub} {
+ evalInProc {
+ set foo 123
+ list [regsub a(a+) xaAAaAAay & foo] $foo
+ }
+} {0 xaAAaAAay}
+test regexpComp-8.4 {case conversion in regsub} {
+ evalInProc {
+ set foo 123
+ list [regsub -nocase a CaDE b foo] $foo
+ }
+} {1 CbDE}
+test regexpComp-8.5 {case conversion in regsub} {
+ evalInProc {
+ set foo 123
+ list [regsub -nocase XYZ CxYzD b foo] $foo
+ }
+} {1 CbD}
+test regexpComp-8.6 {case conversion in regsub} {
+ evalInProc {
+ set x abcdefghijklmnopqrstuvwxyz1234567890
+ set x $x$x$x$x$x$x$x$x$x$x$x$x
+ set foo 123
+ list [regsub -nocase $x $x b foo] $foo
+ }
+} {1 b}
+
+test regexpComp-9.1 {-all option to regsub} {
+ evalInProc {
+ set foo 86
+ list [regsub -all x+ axxxbxxcxdx |&| foo] $foo
+ }
+} {4 a|xxx|b|xx|c|x|d|x|}
+test regexpComp-9.2 {-all option to regsub} {
+ evalInProc {
+ set foo 86
+ list [regsub -nocase -all x+ aXxXbxxcXdx |&| foo] $foo
+ }
+} {4 a|XxX|b|xx|c|X|d|x|}
+test regexpComp-9.3 {-all option to regsub} {
+ evalInProc {
+ set foo 86
+ list [regsub x+ axxxbxxcxdx |&| foo] $foo
+ }
+} {1 a|xxx|bxxcxdx}
+test regexpComp-9.4 {-all option to regsub} {
+ evalInProc {
+ set foo 86
+ list [regsub -all bc axxxbxxcxdx |&| foo] $foo
+ }
+} {0 axxxbxxcxdx}
+test regexpComp-9.5 {-all option to regsub} {
+ evalInProc {
+ set foo xxx
+ list [regsub -all node "node node more" yy foo] $foo
+ }
+} {2 {yy yy more}}
+test regexpComp-9.6 {-all option to regsub} {
+ evalInProc {
+ set foo xxx
+ list [regsub -all ^ xxx 123 foo] $foo
+ }
+} {1 123xxx}
+
+#test regexpComp-10.1 {expanded syntax in regsub} {
+# evalInProc {
+# set foo xxx
+# list [regsub -expanded ". \#comment\n . \#comment2" abc def foo] $foo
+# }
+#} {1 defc}
+test regexpComp-10.2 {newline sensitivity in regsub} {
+ evalInProc {
+ set foo xxx
+ list [regsub -line {^a.*b$} "dabc\naxyb\n" 123 foo] $foo
+ }
+} "1 {dabc\n123\n}"
+test regexpComp-10.3 {newline sensitivity in regsub} {
+ evalInProc {
+ set foo xxx
+ list [regsub -line {^a.*b$} "dabc\naxyb\nxb" 123 foo] $foo
+ }
+} "1 {dabc\n123\nxb}"
+#test regexpComp-10.4 {partial newline sensitivity in regsub} {
+# evalInProc {
+# set foo xxx
+# list [regsub -lineanchor {^a.*b$} "da\naxyb\nxb" 123 foo] $foo
+# }
+#} "1 {da\n123}"
+#test regexpComp-10.5 {inverse partial newline sensitivity in regsub} {
+# evalInProc {
+# set foo xxx
+# list [regsub -linestop {a.*b} "da\nbaxyb\nxb" 123 foo] $foo
+# }
+#} "1 {da\nb123\nxb}"
+
+#test regexpComp-11.1 {regsub errors} {
+# evalInProc {
+# list [catch {regsub a b} msg] $msg
+# }
+#} {1 {wrong # args: should be "regsub ?-switch ...? exp string subSpec ?varName?"}}
+#test regexpComp-11.2 {regsub errors} {
+# evalInProc {
+# list [catch {regsub -nocase a b} msg] $msg
+# }
+#} {1 {wrong # args: should be "regsub ?-switch ...? exp string subSpec ?varName?"}}
+#test regexpComp-11.3 {regsub errors} {
+# evalInProc {
+# list [catch {regsub -nocase -all a b} msg] $msg
+# }
+#} {1 {wrong # args: should be "regsub ?-switch ...? exp string subSpec ?varName?"}}
+#test regexpComp-11.4 {regsub errors} {
+# evalInProc {
+# list [catch {regsub a b c d e f} msg] $msg
+# }
+#} {1 {wrong # args: should be "regsub ?-switch ...? exp string subSpec ?varName?"}}
+#test regexpComp-11.5 {regsub errors} {
+# evalInProc {
+# list [catch {regsub -gorp a b c} msg] $msg
+# }
+#} {1 {bad switch "-gorp": must be -all, -nocase, -expanded, -line, -linestop, -lineanchor, -start, or --}}
+#test regexpComp-11.6 {regsub errors} {
+# evalInProc {
+# list [catch {regsub -nocase a( b c d} msg] $msg
+# }
+#} {1 {couldn't compile regular expression pattern: parentheses () not balanced}}
+test regexpComp-11.7 {regsub errors} {
+ evalInProc {
+ catch {unset f1}
+ set f1 44
+ catch {regsub -nocase aaa aaa xxx f1(f2)} msg
+ }
+} {1}
+test regexpComp-11.8 {regsub errors, -start bad int check} {
+ evalInProc {
+ list [catch {regsub -start bogus pattern string rep var} msg] $msg
+ }
+} {1 {bad index "bogus": must be integer?[+-]integer? or end?[+-]integer?}}
+
+# This test crashes on the Mac unless you increase the Stack Space to about 1
+# Meg. This is probably bigger than most users want...
+# 8.2.3 regexp reduced stack space requirements, but this should be
+# tested again
+test regexpComp-12.1 {Tcl_RegExpExec: large number of subexpressions} {
+ evalInProc {
+ list [regexp (.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.) abcdefghijklmnopqrstuvwxyz all a b c d e f g h i j k l m n o p q r s t u v w x y z] $all $a $b $c $d $e $f $g $h $i $j $k $l $m $n $o $p $q $r $s $t $u $v $w $x $y $z
+ }
+} {1 abcdefghijklmnopqrstuvwxyz a b c d e f g h i j k l m n o p q r s t u v w x y z}
+
+test regexpComp-13.1 {regsub of a very large string} {
+ # This test is designed to stress the memory subsystem in order
+ # to catch Bug #933. It only fails if the Tcl memory allocator
+ # is in use.
+
+ set line {BEGIN_TABLE ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; END_TABLE}
+ set filedata [string repeat $line 200]
+ for {set i 1} {$i<10} {incr i} {
+ regsub -all "BEGIN_TABLE " $filedata "" newfiledata
+ }
+ set x done
+} {done}
+
+test regexpComp-14.1 {CompileRegexp: regexp cache} {
+ evalInProc {
+ regexp .*a b
+ regexp .*b c
+ regexp .*c d
+ regexp .*d e
+ regexp .*e f
+ set x .
+ append x *a
+ regexp $x bbba
+ }
+} 1
+test regexpComp-14.2 {CompileRegexp: regexp cache, different flags} {
+ evalInProc {
+ regexp .*a b
+ regexp .*b c
+ regexp .*c d
+ regexp .*d e
+ regexp .*e f
+ set x .
+ append x *a
+ regexp -nocase $x bbba
+ }
+} 1
+
+test regexpComp-15.1 {regexp -start} {
+ catch {unset x}
+ list [regexp -start -10 {\d} 1abc2de3 x] $x
+} {1 1}
+test regexpComp-15.2 {regexp -start} {
+ catch {unset x}
+ list [regexp -start 2 {\d} 1abc2de3 x] $x
+} {1 2}
+test regexpComp-15.3 {regexp -start} {
+ catch {unset x}
+ list [regexp -start 4 {\d} 1abc2de3 x] $x
+} {1 2}
+test regexpComp-15.4 {regexp -start} {
+ catch {unset x}
+ list [regexp -start 5 {\d} 1abc2de3 x] $x
+} {1 3}
+test regexpComp-15.5 {regexp -start, over end of string} {
+ catch {unset x}
+ list [regexp -start [string length 1abc2de3] {\d} 1abc2de3 x] [info exists x]
+} {0 0}
+test regexpComp-15.6 {regexp -start, loss of ^$ behavior} {
+ list [regexp -start 2 {^$} {}]
+} {0}
+
+test regexpComp-16.1 {regsub -start} {
+ catch {unset x}
+ list [regsub -all -start 2 {\d} a1b2c3d4e5 {/&} x] $x
+} {4 a1b/2c/3d/4e/5}
+test regexpComp-16.2 {regsub -start} {
+ catch {unset x}
+ list [regsub -all -start -25 {z} hello {/&} x] $x
+} {0 hello}
+test regexpComp-16.3 {regsub -start} {
+ catch {unset x}
+ list [regsub -all -start 3 {z} hello {/&} x] $x
+} {0 hello}
+#test regexpComp-16.4 {regsub -start, \A behavior} {
+# set out {}
+# lappend out [regsub -start 0 -all {\A(\w)} {abcde} {/\1} x] $x
+# lappend out [regsub -start 2 -all {\A(\w)} {abcde} {/\1} x] $x
+#} {5 /a/b/c/d/e 3 ab/c/d/e}
+
+test regexpComp-17.1 {regexp -inline} {
+ regexp -inline b ababa
+} {b}
+test regexpComp-17.2 {regexp -inline} {
+ regexp -inline (b) ababa
+} {b b}
+test regexpComp-17.3 {regexp -inline -indices} {
+ regexp -inline -indices (b) ababa
+} {{1 1} {1 1}}
+test regexpComp-17.4 {regexp -inline} {
+ regexp -inline {\w(\d+)\w} " hello 23 there456def "
+} {e456d 456}
+test regexpComp-17.5 {regexp -inline no matches} {
+ regexp -inline {\w(\d+)\w} ""
+} {}
+test regexpComp-17.6 {regexp -inline no matches} {
+ regexp -inline hello goodbye
+} {}
+test regexpComp-17.7 {regexp -inline, no matchvars allowed} {
+ list [catch {regexp -inline b abc match} msg] $msg
+} {1 {regexp match variables not allowed when using -inline}}
+
+test regexpComp-18.1 {regexp -all} {
+ regexp -all b bbbbb
+} {5}
+test regexpComp-18.2 {regexp -all} {
+ regexp -all b abababbabaaaaaaaaaab
+} {6}
+test regexpComp-18.3 {regexp -all -inline} {
+ regexp -all -inline b abababbabaaaaaaaaaab
+} {b b b b b b}
+test regexpComp-18.4 {regexp -all -inline} {
+ regexp -all -inline {\w(\w)} abcdefg
+} {ab b cd d ef f}
+test regexpComp-18.5 {regexp -all -inline} {
+ regexp -all -inline {\w(\w)$} abcdefg
+} {fg g}
+test regexpComp-18.6 {regexp -all -inline} {
+ regexp -all -inline {\d+} 10:20:30:40
+} {10 20 30 40}
+test regexpComp-18.7 {regexp -all -inline} {
+ list [catch {regexp -all -inline b abc match} msg] $msg
+} {1 {regexp match variables not allowed when using -inline}}
+test regexpComp-18.8 {regexp -all} {
+ # This should not cause an infinite loop
+ regexp -all -inline {a*} a
+} {a}
+test regexpComp-18.9 {regexp -all} {
+ # Yes, the expected result is {a {}}. Here's why:
+ # Start at index 0; a* matches the "a" there then stops.
+ # Go to index 1; a* matches the lambda (or {}) there then stops. Recall
+ # that a* matches zero or more "a"'s; thus it matches the string "b", as
+ # there are zero or more "a"'s there.
+ # Go to index 2; this is past the end of the string, so stop.
+ regexp -all -inline {a*} ab
+} {a {}}
+test regexpComp-18.10 {regexp -all} {
+ # Yes, the expected result is {a {} a}. Here's why:
+ # Start at index 0; a* matches the "a" there then stops.
+ # Go to index 1; a* matches the lambda (or {}) there then stops. Recall
+ # that a* matches zero or more "a"'s; thus it matches the string "b", as
+ # there are zero or more "a"'s there.
+ # Go to index 2; a* matches the "a" there then stops.
+ # Go to index 3; this is past the end of the string, so stop.
+ regexp -all -inline {a*} aba
+} {a {} a}
+test regexpComp-18.11 {regexp -all} {
+ evalInProc {
+ regexp -all -inline {^a} aaaa
+ }
+} {a}
+test regexpComp-18.12 {regexp -all -inline -indices} {
+ evalInProc {
+ regexp -all -inline -indices a(b(c)d|e(f)g)h abcdhaefgh
+ }
+} {{0 4} {1 3} {2 2} {-1 -1} {5 9} {6 8} {-1 -1} {7 7}}
+
+test regexpComp-19.1 {regsub null replacement} {
+ evalInProc {
+ regsub -all {@} {@hel@lo@} "\0a\0" result
+ list $result [string length $result]
+ }
+} "\0a\0hel\0a\0lo\0a\0 14"
+
+test regexpComp-20.1 {regsub shared object shimmering} {
+ evalInProc {
+ # Bug #461322
+ set a abcdefghijklmnopqurstuvwxyz
+ set b $a
+ set c abcdefghijklmnopqurstuvwxyz0123456789
+ regsub $a $c $b d
+ list $d [string length $d] [string bytelength $d]
+ }
+} [list abcdefghijklmnopqurstuvwxyz0123456789 37 37]
+#test regexpComp-20.2 {regsub shared object shimmering with -about} {
+# evalInProc {
+# eval regexp -about abc
+# }
+#} {0 {}}
+
+test regexpComp-21.1 {regexp command compiling tests} {
+ evalInProc {
+ regexp foo bar
+ }
+} 0
+test regexpComp-21.2 {regexp command compiling tests} {
+ evalInProc {
+ regexp {^foo$} dogfood
+ }
+} 0
+test regexpComp-21.3 {regexp command compiling tests} {
+ evalInProc {
+ set a foo
+ regexp {^foo$} $a
+ }
+} 1
+test regexpComp-21.4 {regexp command compiling tests} {
+ evalInProc {
+ regexp foo dogfood
+ }
+} 1
+test regexpComp-21.5 {regexp command compiling tests} {
+ evalInProc {
+ regexp -nocase FOO dogfod
+ }
+} 0
+test regexpComp-21.6 {regexp command compiling tests} {
+ evalInProc {
+ regexp -n foo dogfoOd
+ }
+} 1
+test regexpComp-21.7 {regexp command compiling tests} {
+ evalInProc {
+ regexp -no -- FoO dogfood
+ }
+} 1
+test regexpComp-21.8 {regexp command compiling tests} {
+ evalInProc {
+ regexp -- foo dogfod
+ }
+} 0
+test regexpComp-21.9 {regexp command compiling tests} {
+ evalInProc {
+ list [catch {regexp -- -nocase foo dogfod} msg] $msg
+ }
+} {0 0}
+test regexpComp-21.10 {regexp command compiling tests} {
+ evalInProc {
+ list [regsub -all "" foo bar str] $str
+ }
+} {3 barfbarobaro}
+# This useless expression fails. Jim returns "bar"
+#test regexpComp-21.11 {regexp command compiling tests} {
+# evalInProc {
+# list [regsub -all "" "" bar str] $str
+# }
+#} {0 {}}
+
+# We can forgive the underlying regexp engine for not supporting this.
+# Why not use this instead? "((^X)*|\$)"
+#test regexpComp-22.0.1 {Bug 1810038} {
+# evalInProc {
+# regexp ($|^X)* {}
+# }
+#} 1
+
+set i 0
+foreach {str exp result} {
+ foo ^foo 1
+ foobar ^foobar$ 1
+ foobar bar$ 1
+ foobar ^$ 0
+ "" ^$ 1
+ anything $ 1
+ anything ^.*$ 1
+ anything ^.*a$ 0
+ anything ^.*a.*$ 1
+ anything ^.*.*$ 1
+ anything ^.*..*$ 1
+ anything ^.*b$ 0
+ anything ^a.*$ 1
+} {
+ test regexpComp-22.[incr i] {regexp command compiling tests} \
+ [subst {evalInProc {set a "$str"; regexp {$exp} \$a}}] $result
+}
+
+set i 0
+foreach {str exp result} {
+ foo ^foo 1
+ foobar ^foobar$ 1
+ foobar bar$ 1
+ foobar ^$ 0
+ "" ^$ 1
+ anything $ 1
+ anything ^.*$ 1
+ anything ^.*a$ 0
+ anything ^.*a.*$ 1
+ anything ^.*.*$ 1
+ anything ^.*..*$ 1
+ anything ^.*b$ 0
+ anything ^a.*$ 1
+} {
+ test regexpComp-23.[incr i] {regexp command compiling tests INST_REGEXP} \
+ [list regexp $exp $str] $result
+}
+
+test regexpComp-24.1 {regexp command compiling tests} {
+ evalInProc {
+ set re foo
+ regexp -nocase $re bar
+ }
+} 0
+test regexpComp-24.2 {regexp command compiling tests} {
+ evalInProc {
+ set re {^foo$}
+ regexp $re dogfood
+ }
+} 0
+test regexpComp-24.3 {regexp command compiling tests} {
+ evalInProc {
+ set a foo
+ set re {^foo$}
+ regexp $re $a
+ }
+} 1
+test regexpComp-24.4 {regexp command compiling tests} {
+ evalInProc {
+ set re foo
+ regexp $re dogfood
+ }
+} 1
+test regexpComp-24.5 {regexp command compiling tests} {
+ evalInProc {
+ set re FOO
+ regexp -nocase $re dogfod
+ }
+} 0
+test regexpComp-24.6 {regexp command compiling tests} {
+ evalInProc {
+ set re foo
+ regexp -n $re dogfoOd
+ }
+} 1
+test regexpComp-24.7 {regexp command compiling tests} {
+ evalInProc {
+ set re FoO
+ regexp -no -- $re dogfood
+ }
+} 1
+test regexpComp-24.8 {regexp command compiling tests} {
+ evalInProc {
+ set re foo
+ regexp -- $re dogfod
+ }
+} 0
+test regexpComp-24.9 {regexp command compiling tests} {
+ evalInProc {
+ set re "("
+ list [catch {regexp -- $re dogfod} msg] $msg
+ }
+} {1 {couldn't compile regular expression pattern: parentheses () not balanced}}
+test regexpComp-24.10 {regexp command compiling tests} {
+ # Bug 1902436 - last * escaped
+ evalInProc {
+ set text {this is *bold* !}
+ set re {\*bold\*}
+ regexp -- $re $text
+ }
+} 1
+test regexpComp-24.11 {regexp command compiling tests} {
+ # Bug 1902436 - last * escaped
+ evalInProc {
+ set text {this is *bold* !}
+ set re {\*bold\*.*!}
+ regexp -- $re $text
+ }
+} 1
+
+test regexp-25.1 {Repeat on escaped char} {
+ regexp {\x41\x42*} bc
+} 0
+
+testreport
diff --git a/tests/utf8.test b/tests/utf8.test
index ba39128..ecb5111 100644
--- a/tests/utf8.test
+++ b/tests/utf8.test
@@ -121,212 +121,7 @@ test utf8-7.1 "string reverse" {
string reverse \ub5Test\u2702
} \u2702tseT\ub5
-# This file contains a collection of tests for tclUtf.c
-# Sourcing this file into Tcl runs the tests and generates output for
-# errors. No output means no errors were found.
-#
-# Copyright (c) 1997 Sun Microsystems, Inc.
-# Copyright (c) 1998-1999 by Scriptics Corporation.
-#
-# See the file "license.terms" for information on usage and redistribution
-# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
-#
-# RCS: @(#) $Id: utf.test,v 1.7 2000/04/10 17:19:05 ericm Exp $
-
-catch {unset x}
-
-section "utf tests"
-
-test utf-1.1 {Tcl_UniCharToUtf: 1 byte sequences} {
- set x \u01
-} [bytestring "\x01"]
-test utf-1.2 {Tcl_UniCharToUtf: 2 byte sequences} {
- set x "\u80"
-} [bytestring "\xc2\x80"]
-test utf-1.3 {Tcl_UniCharToUtf: 2 byte sequences} {
- set x "\ue0"
-} [bytestring "\xc3\xa0"]
-test utf-1.4 {Tcl_UniCharToUtf: 3 byte sequences} {
- set x "\u4e4e"
-} [bytestring "\xe4\xb9\x8e"]
-
-test utf-2.1 {Tcl_UtfToUniChar: low ascii} {
- string length "abc"
-} {3}
-test utf-2.4 {Tcl_UtfToUniChar: lead (2-byte) followed by trail} {
- string length [bytestring "\xC2\xa2"]
-} {1}
-test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} {
- string length [bytestring "\xE4\xb9\x8e"]
-} {1}
-
-test utf-3.1 {Tcl_UtfCharComplete} {
-} {}
-
-test utf-4.1 {Tcl_NumUtfChars: zero length} {
- string length ""
-} {0}
-test utf-4.2 {Tcl_NumUtfChars: length 1} {
- string length [bytestring "\xC2\xA2"]
-} {1}
-test utf-4.3 {Tcl_NumUtfChars: long string} {
- string length [bytestring "abc\xC2\xA2\xe4\xb9\x8e\uA2\u4e4e"]
-} {7}
-
-test utf-5.1 {Tcl_UtfFindFirsts} {
-} {}
-
-test utf-6.1 {Tcl_UtfNext} {
-} {}
-
-test utf-7.1 {Tcl_UtfPrev} {
-} {}
-
-test utf-8.1 {Tcl_UniCharAtIndex: index = 0} {
- string index abcd 0
-} {a}
-test utf-8.2 {Tcl_UniCharAtIndex: index = 0} {
- string index \u4e4e\u25a 0
-} "\u4e4e"
-test utf-8.3 {Tcl_UniCharAtIndex: index > 0} {
- string index abcd 2
-} {c}
-test utf-8.4 {Tcl_UniCharAtIndex: index > 0} {
- string index \u4e4e\u25a\uff\u543 2
-} "\uff"
-
-test utf-9.1 {Tcl_UtfAtIndex: index = 0} {
- string range abcd 0 2
-} {abc}
-test utf-9.2 {Tcl_UtfAtIndex: index > 0} {
- string range \u4e4e\u25a\uff\u543klmnop 1 5
-} "\u25a\uff\u543kl"
-
-
-test utf-10.1 {Tcl_UtfBackslash: dst == NULL} {
- set x \n
-} {
-}
-test utf-10.2 {Tcl_UtfBackslash: \u subst} {
- set x \ua2
-} [bytestring "\xc2\xa2"]
-test utf-10.3 {Tcl_UtfBackslash: longer \u subst} {
- set x \u4e21
-} [bytestring "\xe4\xb8\xa1"]
-test utf-10.4 {Tcl_UtfBackslash: stops at first non-hex} {
- set x \u4e2k
-} "[bytestring \xd3\xa2]k"
-test utf-10.5 {Tcl_UtfBackslash: stops after 4 hex chars} {
- set x \u4e216
-} "[bytestring \xe4\xb8\xa1]6"
-proc bsCheck {char num} {
- global errNum
- test utf-10.$errNum "backslash substitution ($num)" {
- scan $char %c value
- set value
- } $num
- incr errNum
-}
-set errNum 6
-bsCheck \b 8
-bsCheck \e 101
-bsCheck \f 12
-bsCheck \n 10
-bsCheck \r 13
-bsCheck \t 9
-bsCheck \v 11
-bsCheck \{ 123
-bsCheck \} 125
-bsCheck \[ 91
-bsCheck \] 93
-bsCheck \$ 36
-bsCheck \ 32
-bsCheck \; 59
-bsCheck \\ 92
-bsCheck \Ca 67
-bsCheck \Ma 77
-bsCheck \CMa 67
-# prior to 8.3, this returned 8, as \8 as accepted as an
-# octal value - but it isn't! [Bug: 3975]
-bsCheck \8a 56
-bsCheck \14 12
-bsCheck \141 97
-bsCheck b\0 98
-bsCheck \x 120
-bsCheck \xa 10
-bsCheck \xA 10
-bsCheck \x41 65
-#bsCheck \x541 65
-bsCheck \u 117
-bsCheck \uk 117
-bsCheck \u41 65
-bsCheck \ua 10
-bsCheck \uA 10
-bsCheck \ue0 224
-bsCheck \ua1 161
-bsCheck \u4e21 20001
-
-test utf-11.1 {Tcl_UtfToUpper} {
- string toupper {}
-} {}
-test utf-11.2 {Tcl_UtfToUpper} {
- string toupper abc
-} ABC
-test utf-11.3 {Tcl_UtfToUpper} {
- string toupper \u00e3ab
-} \u00c3AB
-test utf-11.4 {Tcl_UtfToUpper} {
- string toupper \u01e3ab
-} \u01e2AB
-
-test utf-12.1 {Tcl_UtfToLower} {
- string tolower {}
-} {}
-test utf-12.2 {Tcl_UtfToLower} {
- string tolower ABC
-} abc
-test utf-12.3 {Tcl_UtfToLower} {
- string tolower \u00c3AB
-} \u00e3ab
-test utf-12.4 {Tcl_UtfToLower} {
- string tolower \u01e2AB
-} \u01e3ab
-
-
-test utf-14.1 {Tcl_UtfNcasecmp} {
- string compare -nocase a b
-} -1
-test utf-14.2 {Tcl_UtfNcasecmp} {
- string compare -nocase b a
-} 1
-test utf-14.3 {Tcl_UtfNcasecmp} {
- string compare -nocase B a
-} 1
-test utf-14.4 {Tcl_UtfNcasecmp} {
- string compare -nocase aBcB abca
-} 1
-
-test utf-15.1 {Tcl_UniCharToUpper, negative delta} {
- string toupper aA
-} AA
-test utf-15.2 {Tcl_UniCharToUpper, positive delta} {
- string toupper \u0178\u00ff
-} \u0178\u0178
-test utf-15.3 {Tcl_UniCharToUpper, no delta} {
- string toupper !
-} !
-
-test utf-16.1 {Tcl_UniCharToLower, negative delta} {
- string tolower aA
-} aa
-test utf-16.2 {Tcl_UniCharToLower, positive delta} {
- string tolower \u0178\u00ff
-} \u00ff\u00ff
-test utf-17.1 {Tcl_UniCharToLower, no delta} {
- string tolower !
-} !
-
-test utf-18.1 {append counts correctly} {
+test utf8-7.2 {append counts correctly} {
set x \u2702XYZ
append x \u2702XYZ
list [string length $x] [string bytelength $x]
diff --git a/tests/utftcl.test b/tests/utftcl.test
new file mode 100644
index 0000000..468cff1
--- /dev/null
+++ b/tests/utftcl.test
@@ -0,0 +1,290 @@
+# This file contains a collection of tests for tclUtf.c
+# Sourcing this file into Tcl runs the tests and generates output for
+# errors. No output means no errors were found.
+#
+# Copyright (c) 1997 Sun Microsystems, Inc.
+# Copyright (c) 1998-1999 by Scriptics Corporation.
+#
+# See the file "license.terms" for information on usage and redistribution
+# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
+#
+# RCS: @(#) $Id: utf.test,v 1.14 2007/05/02 01:37:28 kennykb Exp $
+
+source testing.tcl
+
+ifutf8 {
+
+catch {unset x}
+
+test utf-1.1 {Tcl_UniCharToUtf: 1 byte sequences} {
+ set x \x01
+} [bytestring "\x01"]
+test utf-1.2 {Tcl_UniCharToUtf: 2 byte sequences} {
+ set x "\u80"
+} [bytestring "\xc2\x80"]
+test utf-1.3 {Tcl_UniCharToUtf: 2 byte sequences} {
+ set x "\ue0"
+} [bytestring "\xc3\xa0"]
+test utf-1.4 {Tcl_UniCharToUtf: 3 byte sequences} {
+ set x "\u4e4e"
+} [bytestring "\xe4\xb9\x8e"]
+test utf-1.5 {Tcl_UniCharToUtf: negative Tcl_UniChar} {
+ string length [format %c -1]
+} 1
+
+test utf-2.1 {Tcl_UtfToUniChar: low ascii} {
+ string length "abc"
+} {3}
+test utf-2.2 {Tcl_UtfToUniChar: naked trail bytes} {
+ string length [bytestring "\x82\x83\x84"]
+} {3}
+test utf-2.3 {Tcl_UtfToUniChar: lead (2-byte) followed by non-trail} {
+ string length [bytestring "\xC2"]
+} {1}
+test utf-2.4 {Tcl_UtfToUniChar: lead (2-byte) followed by trail} {
+ string length [bytestring "\xC2\xa2"]
+} {1}
+test utf-2.5 {Tcl_UtfToUniChar: lead (3-byte) followed by non-trail} {
+ string length [bytestring "\xE2"]
+} {1}
+test utf-2.6 {Tcl_UtfToUniChar: lead (3-byte) followed by 1 trail} {
+ string length [bytestring "\xE2\xA2"]
+} {2}
+test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} {
+ string length [bytestring "\xE4\xb9\x8e"]
+} {1}
+test utf-2.8 {Tcl_UtfToUniChar: longer UTF sequences not supported} {
+ string length [bytestring "\xF4\xA2\xA2\xA2"]
+} {4}
+
+test utf-3.1 {Tcl_UtfCharComplete} {
+} {}
+
+proc testnumutfchars {a {n ""}} {
+ string length $a
+}
+
+test utf-4.1 {Tcl_NumUtfChars: zero length} {
+ testnumutfchars ""
+} {0}
+test utf-4.2 {Tcl_NumUtfChars: length 1} {
+ testnumutfchars [bytestring "\xC2\xA2"]
+} {1}
+test utf-4.3 {Tcl_NumUtfChars: long string} {
+ testnumutfchars [bytestring "abc\xC2\xA2\xe4\xb9\x8e\uA2\u4e4e"]
+} {7}
+test utf-4.4 {Tcl_NumUtfChars: #u0000} {
+ testnumutfchars [bytestring "\xC0\x80"]
+} {1}
+test utf-4.5 {Tcl_NumUtfChars: zero length, calc len} {
+ testnumutfchars "" 1
+} {0}
+test utf-4.6 {Tcl_NumUtfChars: length 1, calc len} {
+ testnumutfchars [bytestring "\xC2\xA2"] 1
+} {1}
+test utf-4.7 {Tcl_NumUtfChars: long string, calc len} {
+ testnumutfchars [bytestring "abc\xC2\xA2\xe4\xb9\x8e\uA2\u4e4e"] 1
+} {7}
+test utf-4.8 {Tcl_NumUtfChars: #u0000, calc len} {
+ testnumutfchars [bytestring "\xC0\x80"] 1
+} {1}
+
+test utf-5.1 {Tcl_UtfFindFirsts} {
+} {}
+
+test utf-6.1 {Tcl_UtfNext} {
+} {}
+
+test utf-7.1 {Tcl_UtfPrev} {
+} {}
+
+test utf-8.1 {Tcl_UniCharAtIndex: index = 0} {
+ string index abcd 0
+} {a}
+test utf-8.2 {Tcl_UniCharAtIndex: index = 0} {
+ string index \u4e4e\u25a 0
+} "\u4e4e"
+test utf-8.3 {Tcl_UniCharAtIndex: index > 0} {
+ string index abcd 2
+} {c}
+test utf-8.4 {Tcl_UniCharAtIndex: index > 0} {
+ string index \u4e4e\u25a\uff\u543 2
+} "\uff"
+
+test utf-9.1 {Tcl_UtfAtIndex: index = 0} {
+ string range abcd 0 2
+} {abc}
+test utf-9.2 {Tcl_UtfAtIndex: index > 0} {
+ string range \u4e4e\u25a\xff\u543klmnop 1 5
+} "\u25a\xff\u543kl"
+
+
+test utf-10.1 {Tcl_UtfBackslash: dst == NULL} {
+ set x \n
+} {
+}
+test utf-10.2 {Tcl_UtfBackslash: \u subst} {
+ set x \ua2
+} [bytestring "\xc2\xa2"]
+test utf-10.3 {Tcl_UtfBackslash: longer \u subst} {
+ set x \u4e21
+} [bytestring "\xe4\xb8\xa1"]
+test utf-10.4 {Tcl_UtfBackslash: stops at first non-hex} {
+ set x \u4e2k
+} "[bytestring \xd3\xa2]k"
+test utf-10.5 {Tcl_UtfBackslash: stops after 4 hex chars} {
+ set x \u4e216
+} "[bytestring \xe4\xb8\xa1]6"
+proc bsCheck {char num} {
+ global errNum
+ test utf-10.$errNum {backslash substitution} {
+ scan $char %c value
+ set value
+ } $num
+ incr errNum
+}
+set errNum 6
+bsCheck \b 8
+bsCheck \e 101
+bsCheck \f 12
+bsCheck \n 10
+bsCheck \r 13
+bsCheck \t 9
+bsCheck \v 11
+bsCheck \{ 123
+bsCheck \} 125
+bsCheck \[ 91
+bsCheck \] 93
+bsCheck \$ 36
+bsCheck \ 32
+bsCheck \; 59
+bsCheck \\ 92
+bsCheck \Ca 67
+bsCheck \Ma 77
+bsCheck \CMa 67
+# prior to 8.3, this returned 8, as \8 as accepted as an
+# octal value - but it isn't! [Bug: 3975]
+bsCheck \8a 56
+bsCheck \14 12
+bsCheck \141 97
+bsCheck b\0 98
+bsCheck \x 120
+bsCheck \ua 10
+bsCheck \uA 10
+bsCheck \u41 65
+bsCheck \u 117
+bsCheck \uk 117
+bsCheck \u41 65
+bsCheck \ua 10
+bsCheck \uA 10
+bsCheck \340 224
+bsCheck \ua1 161
+bsCheck \u4e21 20001
+
+test utf-11.1 {Tcl_UtfToUpper} {
+ string toupper {}
+} {}
+test utf-11.2 {Tcl_UtfToUpper} {
+ string toupper abc
+} ABC
+test utf-11.3 {Tcl_UtfToUpper} {
+ string toupper \u00e3ab
+} \u00c3AB
+test utf-11.4 {Tcl_UtfToUpper} {
+ string toupper \u01e3ab
+} \u01e2AB
+
+test utf-12.1 {Tcl_UtfToLower} {
+ string tolower {}
+} {}
+test utf-12.2 {Tcl_UtfToLower} {
+ string tolower ABC
+} abc
+test utf-12.3 {Tcl_UtfToLower} {
+ string tolower \u00c3AB
+} \u00e3ab
+test utf-12.4 {Tcl_UtfToLower} {
+ string tolower \u01e2AB
+} \u01e3ab
+
+
+test utf-14.1 {Tcl_UtfNcasecmp} {
+ string compare -nocase a b
+} -1
+test utf-14.2 {Tcl_UtfNcasecmp} {
+ string compare -nocase b a
+} 1
+test utf-14.3 {Tcl_UtfNcasecmp} {
+ string compare -nocase B a
+} 1
+test utf-14.4 {Tcl_UtfNcasecmp} {
+ string compare -nocase aBcB abca
+} 1
+
+test utf-15.1 {Tcl_UniCharToUpper, negative delta} {
+ string toupper aA
+} AA
+test utf-15.2 {Tcl_UniCharToUpper, positive delta} {
+ string toupper \u0178\u00ff
+} \u0178\u0178
+test utf-15.3 {Tcl_UniCharToUpper, no delta} {
+ string toupper !
+} !
+
+test utf-16.1 {Tcl_UniCharToLower, negative delta} {
+ string tolower aA
+} aa
+test utf-16.2 {Tcl_UniCharToLower, positive delta} {
+ string tolower \u0178\u00ff
+} \u00ff\u00ff
+test utf-17.1 {Tcl_UniCharToLower, no delta} {
+ string tolower !
+} !
+
+
+#test utf-21.1 {TclUniCharIsAlnum} {
+# # this returns 1 with Unicode 3 compliance
+# string is alnum \u1040\u021f
+#} {1}
+#test utf-21.2 {unicode alnum char in regc_locale.c} {
+# # this returns 1 with Unicode 3 compliance
+# list [regexp {^[[:alnum:]]+$} \u1040\u021f] [regexp {^\w+$} \u1040\u021f]
+#} {1 1}
+
+#test utf-22.1 {TclUniCharIsWordChar} {
+# string wordend "xyz123_bar fg" 0
+#} 10
+#test utf-22.2 {TclUniCharIsWordChar} {
+# string wordend "x\u5080z123_bar\u203c fg" 0
+#} 10
+
+#test utf-23.1 {TclUniCharIsAlpha} {
+# # this returns 1 with Unicode 3 compliance
+# string is alpha \u021f
+#} {1}
+#test utf-23.2 {unicode alpha char in regc_locale.c} {
+# # this returns 1 with Unicode 3 compliance
+# regexp {^[[:alpha:]]+$} \u021f
+#} {1}
+#
+#test utf-24.1 {TclUniCharIsDigit} {
+# # this returns 1 with Unicode 3 compliance
+# string is digit \u1040
+#} {1}
+#test utf-24.2 {unicode digit char in regc_locale.c} {
+# # this returns 1 with Unicode 3 compliance
+# list [regexp {^[[:digit:]]+$} \u1040] [regexp {^\d+$} \u1040]
+#} {1 1}
+#
+#test utf-24.3 {TclUniCharIsSpace} {
+# # this returns 1 with Unicode 3 compliance
+# string is space \u1680
+#} {1}
+#test utf-24.4 {unicode space char in regc_locale.c} {
+# # this returns 1 with Unicode 3 compliance
+# list [regexp {^[[:space:]]+$} \u1680] [regexp {^\s+$} \u1680]
+#} {1 1}
+
+testreport
+
+}