diff options
author | Steve Bennett <steveb@workware.net.au> | 2010-10-17 16:58:08 +1000 |
---|---|---|
committer | Steve Bennett <steveb@workware.net.au> | 2010-11-17 07:57:37 +1000 |
commit | f86ed51e9b0f38954519ca21a623d27bc7c80a88 (patch) | |
tree | f7e098e29695bb778e1a722a8e48ced6fa14ab59 | |
parent | b98537c32b9e481fe8b0653efcfeab950f5a8e87 (diff) | |
download | jimtcl-f86ed51e9b0f38954519ca21a623d27bc7c80a88.zip jimtcl-f86ed51e9b0f38954519ca21a623d27bc7c80a88.tar.gz jimtcl-f86ed51e9b0f38954519ca21a623d27bc7c80a88.tar.bz2 |
Add UTF-8 support to regexp
Plus various ARE enhancements and bug fixes
Signed-off-by: Steve Bennett <steveb@workware.net.au>
-rwxr-xr-x | configure | 2 | ||||
-rwxr-xr-x | configure.ac | 3 | ||||
-rw-r--r-- | jimregexp.c | 872 | ||||
-rw-r--r-- | jimregexp.h | 14 | ||||
-rw-r--r-- | jimsh.c | 3 | ||||
-rw-r--r-- | tests/Makefile | 2 | ||||
-rw-r--r-- | tests/regcount.test | 35 | ||||
-rw-r--r-- | tests/regexp2.test | 909 | ||||
-rw-r--r-- | tests/utf8.test | 207 | ||||
-rw-r--r-- | tests/utftcl.test | 290 |
10 files changed, 1871 insertions, 466 deletions
@@ -3645,7 +3645,7 @@ fi if test "${enable_utf8+set}" = set; then : enableval=$enable_utf8; if test "x$enableval" = "xyes" ; then - EXTRA_CFLAGS="$EXTRA_CFLAGS -DJIM_UTF8" + EXTRA_CFLAGS="$EXTRA_CFLAGS -DJIM_UTF8 -DJIM_REGEXP" fi diff --git a/configure.ac b/configure.ac index 7913dce..5c3a27a 100755 --- a/configure.ac +++ b/configure.ac @@ -78,7 +78,8 @@ AC_ARG_ENABLE(utf8, [ --enable-utf8 include support for utf8-encoded strings], [ if test "x$enableval" = "xyes" ; then - EXTRA_CFLAGS="$EXTRA_CFLAGS -DJIM_UTF8" + dnl Note that utf-8 support requires the built-in regexp + EXTRA_CFLAGS="$EXTRA_CFLAGS -DJIM_UTF8 -DJIM_REGEXP" fi ] ) diff --git a/jimregexp.c b/jimregexp.c index 90c8dd7..d6a8723 100644 --- a/jimregexp.c +++ b/jimregexp.c @@ -40,6 +40,9 @@ *** seiwald@perforce.com, on 05 November 2002, to const string literals. *** THIS IS AN ALTERED VERSION. It was altered by Steve Bennett <steveb@workware.net.au> *** on 16 October 2010, to remove static state and add better Tcl ARE compatibility. + *** This includes counted repetitions, UTF-8 support, character classes, + *** shorthand character classes, increased number of parentheses to 100, + *** backslash escape sequences. * * Beware that some of this code is subtly aware of the way operator * precedence is structured in regular expressions. Serious changes in @@ -52,6 +55,7 @@ #include "jim.h" #include "jimregexp.h" +#include "utf8.h" #if !defined(HAVE_REGCOMP) || defined(JIM_REGEXP) @@ -98,7 +102,7 @@ * The first byte of the regexp internal "program" is actually this magic * number; the start node begins in the second byte. */ -#define REG_MAGIC 0234 +#define REG_MAGIC 0xFADED00D /* * Opcode notes: @@ -132,9 +136,9 @@ * Using two bytes for the "next" pointer is vast overkill for most things, * but allows patterns to get big without disasters. */ -#define OP(p) ((unsigned char)*(p)) -#define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) -#define OPERAND(p) ((p) + 3) +#define OP(p) ((p)[0]) +#define NEXT(p) ((p)[1]) +#define OPERAND(p) ((p) + 2) /* * See regmagic.h for one further detail of program structure. @@ -144,14 +148,11 @@ /* * Utility definitions. */ -#ifndef CHARBITS -#define UCHARAT(p) ((int)*(unsigned char *)(p)) -#else -#define UCHARAT(p) ((int)*(p)&CHARBITS) -#endif +//#define UCHARAT(p) (*(p)) #define FAIL(R,M) { (R)->err = (M); return (M); } -#define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') +#define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{') +#define META "^$.[()|?{+*" /* * Flags to be passed up and down. @@ -164,28 +165,42 @@ /* * Forward declarations for regcomp()'s friends. */ -static char *reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp ); -static char *regpiece(regex_t *preg, int *flagp ); -static char *regbranch(regex_t *preg, int *flagp ); -static char *regatom(regex_t *preg, int *flagp ); -static char *regnode(regex_t *preg, int op ); -static const char *regnext(regex_t *preg, const char *p ); +static int *reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp ); +static int *regpiece(regex_t *preg, int *flagp ); +static int *regbranch(regex_t *preg, int *flagp ); +static int *regatom(regex_t *preg, int *flagp ); +static int *regnode(regex_t *preg, int op ); +static const int *regnext(regex_t *preg, const int *p ); static void regc(regex_t *preg, int b ); -static void reginsert(regex_t *preg, char op, char *opnd ); -static void regtail(regex_t *preg, char *p, const char *val ); -static void regoptail(regex_t *preg, char *p, const char *val ); +static int *reginsert(regex_t *preg, int op, int *opnd ); +static void regtail(regex_t *preg, int *p, const int *val ); +static void regoptail(regex_t *preg, int *p, const int *val ); -static const char *str_find(const char *string, char c, int nocase); +static int reg_range_find(const int *string, int c, int nocase); +static const char *str_find(const char *string, int c, int nocase); +static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase); /*#define DEBUG*/ #ifdef DEBUG int regnarrate = 0; static void regdump(regex_t *preg); -static const char *regprop( const char *op ); +static const char *regprop( const int *op ); #endif -static char regdummy; +static int regdummy; + +/** + * Returns the length of the null-terminated integer sequence. + */ +static int str_int_len(const int *seq) +{ + int n = 0; + while (*seq++) { + n++; + } + return n; +} /* - regcomp - compile a regular expression into internal code @@ -204,8 +219,8 @@ static char regdummy; */ int regcomp(regex_t *preg, const char *exp, int cflags) { - const char *scan; - const char *longest; + const int *scan; + const int *longest; unsigned len; int flags; @@ -215,12 +230,9 @@ int regcomp(regex_t *preg, const char *exp, int cflags) FAIL(preg, REG_ERR_NULL_ARGUMENT); /* First pass: determine size, legality. */ -#ifdef notdef - if (exp[0] == '.' && exp[1] == '*') exp += 2; /* aid grep */ -#endif preg->cflags = cflags; - preg->regparse = (char *)exp; - preg->re_nsub = 1; + preg->regparse = exp; + preg->re_nsub = 0; preg->regsize = 0L; preg->regcode = ®dummy; regc(preg, REG_MAGIC); @@ -232,20 +244,21 @@ int regcomp(regex_t *preg, const char *exp, int cflags) FAIL(preg,REG_ERR_TOO_BIG); /* Allocate space. */ - preg->program = malloc(preg->regsize); + preg->program = malloc(preg->regsize * sizeof(*preg->program)); if (preg->program == NULL) FAIL(preg, REG_ERR_NOMEM); /* Second pass: emit code. */ - preg->regparse = (char *)exp; - preg->re_nsub = 1; + preg->regparse = exp; + preg->re_nsub = 0; + preg->regsize = 0L; preg->regcode = preg->program; regc(preg, REG_MAGIC); if (reg(preg, 0, &flags) == NULL) return preg->err; /* Dig out information for optimizations. */ - preg->regstart = '\0'; /* Worst-case defaults. */ + preg->regstart = 0; /* Worst-case defaults. */ preg->reganch = 0; preg->regmust = NULL; preg->regmlen = 0; @@ -270,18 +283,22 @@ int regcomp(regex_t *preg, const char *exp, int cflags) if (flags&SPSTART) { longest = NULL; len = 0; - for (; scan != NULL; scan = regnext(preg, scan)) - if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) { - longest = OPERAND(scan); - len = strlen(OPERAND(scan)); + for (; scan != NULL; scan = regnext(preg, scan)) { + if (OP(scan) == EXACTLY) { + int plen = str_int_len(OPERAND(scan)); + if (plen >= len) { + longest = OPERAND(scan); + len = plen; + } } + } preg->regmust = longest; preg->regmlen = len; } } #ifdef DEBUG - /*regdump(preg);*/ + regdump(preg); #endif return 0; @@ -296,11 +313,11 @@ int regcomp(regex_t *preg, const char *exp, int cflags) * is a trifle forced, but the need to tie the tails of the branches to what * follows makes it hard to avoid. */ -static char *reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp ) +static int *reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp ) { - char *ret; - char *br; - const char *ender; + int *ret; + int *br; + const int *ender; int parno = 0; int flags; @@ -308,7 +325,7 @@ static char *reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp ) /* Make an OPEN node, if parenthesized. */ if (paren) { - parno = preg->re_nsub++; + parno = ++preg->re_nsub; ret = regnode(preg, OPEN+parno); } else ret = NULL; @@ -340,7 +357,7 @@ static char *reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp ) regtail(preg, ret, ender); /* Hook the tails of the branches to the closing node. */ - for (br = ret; br != NULL; br = (char *)regnext(preg, br)) + for (br = ret; br != NULL; br = (int *)regnext(preg, br)) regoptail(preg, br, ender); /* Check for proper termination. */ @@ -365,11 +382,11 @@ static char *reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp ) * * Implements the concatenation operator. */ -static char *regbranch(regex_t *preg, int *flagp ) +static int *regbranch(regex_t *preg, int *flagp ) { - char *ret; - char *chain; - char *latest; + int *ret; + int *chain; + int *latest; int flags; *flagp = WORST; /* Tentatively. */ @@ -382,10 +399,12 @@ static char *regbranch(regex_t *preg, int *flagp ) if (latest == NULL) return(NULL); *flagp |= flags&HASWIDTH; - if (chain == NULL) /* First piece. */ + if (chain == NULL) {/* First piece. */ *flagp |= flags&SPSTART; - else + } + else { regtail(preg, chain, latest); + } chain = latest; } if (chain == NULL) /* Loop ran zero times. */ @@ -394,6 +413,31 @@ static char *regbranch(regex_t *preg, int *flagp ) return(ret); } +/** + * Duplicates the program at 'pos' of length 'len' at the end of the program. + * + * If 'maketail' is set, the next point for 'pos' is set to skip to the next + * part of the program after 'pos'. + */ +static int *regdup(regex_t *preg, int *pos, int len, int maketail) +{ + int i; + + preg->regsize += len; + + if (preg->regcode == ®dummy) { + return pos; + } + + for (i = 0; i < len; i++) { + regc(preg, pos[i]); + } + if (maketail) { + regtail(preg, pos, pos + len); + } + return preg->regcode - len; +} + /* - regpiece - something followed by possible [*+?] * @@ -403,17 +447,21 @@ static char *regbranch(regex_t *preg, int *flagp ) * It might seem that this node could be dispensed with entirely, but the * endmarker role is not redundant. */ -static char *regpiece(regex_t *preg, int *flagp ) +static int *regpiece(regex_t *preg, int *flagp) { - char *ret; + int *ret; char op; - char *next; + int *next; int flags; + int size = preg->regsize; + int *chain = NULL; ret = regatom(preg, &flags); if (ret == NULL) return(NULL); + size = preg->regsize - size; + op = *preg->regparse; if (!ISMULT(op)) { *flagp = flags; @@ -426,6 +474,103 @@ static char *regpiece(regex_t *preg, int *flagp ) } *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); + /* Handle braces (counted repetition) by expansion */ + if (op == '{') { + int min = 0; + int max = 0; + char *end; + + min = strtoul(preg->regparse + 1, &end, 10); + if (end == preg->regparse + 1) { + if (*end == ',') { + min = 0; + } + else { + preg->err = REG_ERR_BAD_COUNT; + return NULL; + } + } + preg->regparse = end; + max = strtoul(preg->regparse + 1, &end, 10); + if (*end != '}') { + preg->err = REG_ERR_UNMATCHED_BRACES; + return NULL; + } + if (end == preg->regparse + 1) { + max = -1; + } + else if (max < min || max >= 100) { + preg->err = REG_ERR_BAD_COUNT; + return NULL; + } + if (min >= 100) { + preg->err = REG_ERR_BAD_COUNT; + return NULL; + } + + preg->regparse = strchr(preg->regparse, '}'); + + /* By default, chain to the start of the sequence */ + chain = ret; + + if (max < 0 || max == min) { + /* Simple case */ + if (max == min) { + if (min == 0) { + /* {0,0} so do nothing at all */ + reginsert(preg, NOTHING, ret); + preg->regparse++; + return ret; + } + /* Output 'min - 1' instances of 'x' */ + min--; + op = 0; + } + else { + /* {n,} is just xxxx* */ + op = '*'; + /* No - chain to the tail of the sequence */ + chain = NULL; + } + + /* We need to duplicate the arg 'min' times */ + while (min--) { + ret = regdup(preg, ret, size, 1); + } + } + else { + /* Complex case */ + int i; + + /* Chaining is needed */ + + /* Need to emit some min args first */ + for (i = 0; i < min; i++) { + ret = regdup(preg, ret, size, 1); + } + + for (i = min; i < max; i++) { + /* Emit x */ + /* There is already one instance of 'reg' at the end */ + /* Add another 'reg' at the end */ + int *prog; + + /* Convert to (x|), just like ? */ + prog = reginsert(preg, BRANCH, ret); /* Either x */ + regtail(preg, ret, regnode(preg, BRANCH)); /* or */ + next = regnode(preg, NOTHING); /* null. */ + regtail(preg, ret, next); + regoptail(preg, ret, next); + + /* Now grab a copy ready for the next iteration */ + if (i != max - 1) { + ret = regdup(preg, prog, size, 0); + } + } + op = 0; + } + } + if (op == '*' && (flags&SIMPLE)) reginsert(preg, STAR, ret); else if (op == '*') { @@ -458,24 +603,127 @@ static char *regpiece(regex_t *preg, int *flagp ) return NULL; } - return(ret); + return chain ? chain : ret; } +/** + * Add all characters in the inclusive range between lower and upper. + * + * Handles a swapped range (upper < lower). + */ static void reg_addrange(regex_t *preg, int lower, int upper) { if (lower > upper) { reg_addrange(preg, upper, lower); } - while (lower <= upper) { - regc(preg, lower++); - } + /* Add a range as length, start */ + regc(preg, upper - lower + 1); + regc(preg, lower); } -static void reg_addstr(regex_t *preg, const char *str) +/** + * Add a null-terminated literal string as a set of ranges. + */ +static void reg_addrange_str(regex_t *preg, const char *str) { while (*str) { - regc(preg, *str++); + reg_addrange(preg, *str, *str); + str++; + } +} + +/** + * Extracts the next unicode char from utf8. + * + * If 'upper' is set, converts the char to uppercase. + */ +static int utf8_tounicode_case(const char *s, int *uc, int upper) +{ + int l = utf8_tounicode(s, uc); + if (upper) { + *uc = utf8_upper(*uc); + } + return l; +} + +/** + * Converts a hex digit to decimal. + * + * Returns -1 for an invalid hex digit. + */ +static int xdigitval(int c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + if (c >= 'a' && c <= 'f') + return c - 'a' + 10; + if (c >= 'A' && c <= 'F') + return c - 'A' + 10; + return -1; +} + +/** + * Parses up to 'n' hex digits at 's' and stores the result in *uc. + * + * Returns the number of hex digits parsed. + * If there are no hex digits, returns 0 and stores nothing. + */ +static int parse_hex(const char *s, int n, int *uc) +{ + int val = 0; + int k; + + for (k = 0; k < n; k++) { + int c = xdigitval(*s++); + if (c == -1) { + break; + } + val = (val << 4) | c; + } + if (k) { + *uc = val; } + return k; +} + +/** + * Call for chars after a backlash to decode the escape sequence. + * + * Stores the result in *ch. + * + * Returns the number of bytes consumed. + */ +static int reg_decode_escape(const char *s, int *ch) +{ + int n; + const char *s0 = s; + + *ch = *s++; + + switch (*ch) { + case 'b': *ch = '\b'; break; + case 'e': *ch = 27; break; + case 'f': *ch = '\f'; break; + case 'n': *ch = '\n'; break; + case 'r': *ch = '\r'; break; + case 't': *ch = '\t'; break; + case 'v': *ch = '\v'; break; + case 'u': + if ((n = parse_hex(s, 4, ch)) > 0) { + s += n; + } + break; + case 'x': + if ((n = parse_hex(s, 2, ch)) > 0) { + s += n; + } + break; + case '\0': + s--; + *ch = '\\'; + break; + } + return s - s0; } /* @@ -486,14 +734,19 @@ static void reg_addstr(regex_t *preg, const char *str) * faster to run. Backslashed characters are exceptions, each becoming a * separate node; the code is simpler that way and it's not worth fixing. */ -static char *regatom(regex_t *preg, int *flagp ) +static int *regatom(regex_t *preg, int *flagp) { - char *ret; + int *ret; int flags; + int nocase = (preg->cflags & REG_ICASE); + + int ch; + int n = utf8_tounicode_case(preg->regparse, &ch, nocase); *flagp = WORST; /* Tentatively. */ - switch (*preg->regparse++) { + preg->regparse += n; + switch (ch) { /* FIXME: these chars only have meaning at beg/end of pat? */ case '^': ret = regnode(preg, BOL); @@ -506,41 +759,82 @@ static char *regatom(regex_t *preg, int *flagp ) *flagp |= HASWIDTH|SIMPLE; break; case '[': { - if (*preg->regparse == '^') { /* Complement of range. */ + const char *pattern = preg->regparse; + + if (*pattern == '^') { /* Complement of range. */ ret = regnode(preg, ANYBUT); - preg->regparse++; + pattern++; } else ret = regnode(preg, ANYOF); - if (*preg->regparse == ']' || *preg->regparse == '-') - regc(preg, *preg->regparse++); - while (*preg->regparse != '\0' && *preg->regparse != ']') { - if (*preg->regparse == '-') { - preg->regparse++; - if (*preg->regparse == ']' || *preg->regparse == '\0') - regc(preg, '-'); - else { - reg_addrange(preg, UCHARAT(preg->regparse-2), UCHARAT(preg->regparse)); - preg->regparse++; + + /* Special case. If the first char is ']' or '-', it is part of the set */ + if (*pattern == ']' || *pattern == '-') { + reg_addrange(preg, *pattern, *pattern); + pattern++; + } + + while (*pattern && *pattern != ']') { + /* Is this a range? a-z */ + int start; + int end; + + pattern += utf8_tounicode_case(pattern, &start, nocase); + if (start == '\\') { + pattern += reg_decode_escape(pattern, &start); + if (start == 0) { + preg->err = REG_ERR_NULL_CHAR; + return NULL; } - } else if (strncmp(preg->regparse, "[:alpha:]", 9) == 0) { - reg_addrange(preg,'a','z'); - reg_addrange(preg,'A','Z'); - preg->regparse += 9; - } else if (strncmp(preg->regparse, "[:alnum:]", 9) == 0) { - reg_addrange(preg,'a','z'); - reg_addrange(preg,'A','Z'); - reg_addrange(preg,'0','9'); - preg->regparse += 9; - } else if (strncmp(preg->regparse, "[:space:]", 9) == 0) { - reg_addstr(preg," \t\r\n\f\v"); - preg->regparse += 9; - } else - regc(preg, *preg->regparse++); + } + if (pattern[0] == '-' && pattern[1]) { + /* skip '-' */ + pattern += utf8_tounicode(pattern, &end); + pattern += utf8_tounicode_case(pattern, &end, nocase); + if (end == '\\') { + pattern += reg_decode_escape(pattern, &end); + if (end == 0) { + preg->err = REG_ERR_NULL_CHAR; + return NULL; + } + } + + reg_addrange(preg, start, end); + continue; + } + if (start == '[') { + if (strncmp(pattern, ":alpha:]", 8) == 0) { + if ((preg->cflags & REG_ICASE) == 0) { + reg_addrange(preg, 'a', 'z'); + } + reg_addrange(preg, 'A', 'Z'); + pattern += 8; + continue; + } + if (strncmp(pattern, ":alnum:]", 8) == 0) { + if ((preg->cflags & REG_ICASE) == 0) { + reg_addrange(preg, 'a', 'z'); + } + reg_addrange(preg, 'A', 'Z'); + reg_addrange(preg, '0', '9'); + pattern += 8; + continue; + } + if (strncmp(pattern, ":space:]", 8) == 0) { + reg_addrange_str(preg, " \t\r\n\f\v"); + pattern += 8; + continue; + } + } + /* Not a range, so just add the char */ + reg_addrange(preg, start, start); } regc(preg, '\0'); - if (*preg->regparse == ']') { - preg->regparse++; + + if (*pattern) { + pattern++; } + preg->regparse = pattern; + *flagp |= HASWIDTH|SIMPLE; } break; @@ -559,10 +853,8 @@ static char *regatom(regex_t *preg, int *flagp ) case '?': case '+': case '*': - preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING; - return NULL; case '{': - preg->err = REG_ERR_COUNT_UNSUPPORTED; + preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING; return NULL; case '\\': switch (*preg->regparse++) { @@ -571,9 +863,11 @@ static char *regatom(regex_t *preg, int *flagp ) return NULL; break; case '<': + case 'm': ret = regnode(preg, WORDA); break; case '>': + case 'M': ret = regnode(preg, WORDZ); break; case 'd': @@ -584,103 +878,97 @@ static char *regatom(regex_t *preg, int *flagp ) break; case 'w': ret = regnode(preg, ANYOF); - reg_addrange(preg, 'a', 'z'); + if ((preg->cflags & REG_ICASE) == 0) { + reg_addrange(preg, 'a', 'z'); + } reg_addrange(preg, 'A', 'Z'); reg_addrange(preg, '0', '9'); - regc(preg, '_'); + reg_addrange(preg, '_', '_'); regc(preg, '\0'); *flagp |= HASWIDTH|SIMPLE; break; case 's': ret = regnode(preg, ANYOF); - reg_addstr(preg," \t\r\n\f\v"); + reg_addrange_str(preg," \t\r\n\f\v"); regc(preg, '\0'); *flagp |= HASWIDTH|SIMPLE; break; /* FIXME: Someday handle \1, \2, ... */ default: /* Handle general quoted chars in exact-match routine */ + /* Back up to include the backslash */ + preg->regparse--; goto de_fault; } break; de_fault: - default: - /* - * Encode a string of characters to be matched exactly. - * - * This is a bit tricky due to quoted chars and due to - * '*', '+', and '?' taking the SINGLE char previous - * as their operand. - * - * On entry, the char at regparse[-1] is going to go - * into the string, no matter what it is. (It could be - * following a \ if we are entered from the '\' case.) - * - * Basic idea is to pick up a good char in ch and - * examine the next char. If it's *+? then we twiddle. - * If it's \ then we frozzle. If it's other magic char - * we push ch and terminate the string. If none of the - * above, we push ch on the string and go around again. - * - * regprev is used to remember where "the current char" - * starts in the string, if due to a *+? we need to back - * up and put the current char in a separate, 1-char, string. - * When regprev is NULL, ch is the only char in the - * string; this is used in *+? handling, and in setting - * flags |= SIMPLE at the end. - */ - { - const char *regprev; - char ch; + default: { + /* + * Encode a string of characters to be matched exactly. + */ + int added = 0; + + /* Back up to pick up the first char of interest */ + preg->regparse -= n; - preg->regparse--; /* Look at cur char */ ret = regnode(preg, EXACTLY); - for ( regprev = 0 ; ; ) { - ch = *preg->regparse++; /* Get current char */ - switch (*preg->regparse) { /* look at next one */ - default: - regc(preg, ch); /* Add cur to string */ - break; + /* Note that a META operator such as ? or * consumes the + * preceding char. + * Thus we must be careful to look ahead by 2 and add the + * last char as it's own EXACTLY if necessary + */ + + /* Until end of string or a META char is reached */ + while (*preg->regparse && strchr(META, *preg->regparse) == NULL) { + n = utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE)); + if (ch == '\\' && preg->regparse[n]) { + /* Non-trailing backslash. + * Is this a special escape, or a regular escape? + */ + if (strchr("<>mMwds", preg->regparse[n])) { + /* A special escape. All done with EXACTLY */ + break; + } + /* Decode it. Note that we add the length for the escape + * sequence to the length for the backlash so we can skip + * the entire sequence, or not as required. + */ + n += reg_decode_escape(preg->regparse + n, &ch); + if (ch == 0) { + preg->err = REG_ERR_NULL_CHAR; + return NULL; + } + } + + /* Now we have one char 'ch' of length 'n'. + * Check to see if the following char is a MULT + */ - case '.': case '[': case '(': - case ')': case '|': case '\n': - case '$': case '^': - case '\0': - /* FIXME, $ and ^ should not always be magic */ - magic: - regc(preg, ch); /* dump cur char */ - goto done; /* and we are done */ - - case '?': case '+': case '*': - if (!regprev) /* If just ch in str, */ - goto magic; /* use it */ - /* End mult-char string one early */ - preg->regparse = regprev; /* Back up parse */ - goto done; - - case '\\': - regc(preg, ch); /* Cur char OK */ - switch (preg->regparse[1]){ /* Look after \ */ - case '\0': - case '<': - case '>': - /* FIXME: Someday handle \1, \2, ... */ - goto done; /* Not quoted */ - default: - /* Backup point is \, scan * point is after it. */ - regprev = preg->regparse; - preg->regparse++; - continue; /* NOT break; */ + if (ISMULT(preg->regparse[n])) { + /* Yes. But do we already have some EXACTLY chars? */ + if (added) { + /* Yes, so return what we have and pick up the current char next time around */ + break; } + /* No, so add this single char and finish */ + regc(preg, ch); + added++; + preg->regparse += n; + break; } - regprev = preg->regparse; /* Set backup point */ + + /* No, so just add this char normally */ + regc(preg, ch); + added++; + preg->regparse += n; } - done: regc(preg, '\0'); + *flagp |= HASWIDTH; - if (!regprev) /* One char? */ + if (added == 1) *flagp |= SIMPLE; + break; } break; } @@ -692,21 +980,20 @@ static char *regatom(regex_t *preg, int *flagp ) - regnode - emit a node */ /* Location. */ -static char *regnode(regex_t *preg, int op) +static int *regnode(regex_t *preg, int op) { - char *ret; - char *ptr; + int *ret; + int *ptr; + preg->regsize += 2; ret = preg->regcode; if (ret == ®dummy) { - preg->regsize += 3; return(ret); } ptr = ret; *ptr++ = op; - *ptr++ = '\0'; /* Null "next" pointer. */ - *ptr++ = '\0'; + *ptr++ = 0; /* Null "next" pointer. */ preg->regcode = ptr; return(ret); @@ -717,47 +1004,49 @@ static char *regnode(regex_t *preg, int op) */ static void regc(regex_t *preg, int b ) { + preg->regsize++; if (preg->regcode != ®dummy) *preg->regcode++ = b; - else - preg->regsize++; } /* - reginsert - insert an operator in front of already-emitted operand * * Means relocating the operand. + * Returns the new location of the original operand. */ -static void reginsert(regex_t *preg, char op, char *opnd ) +static int *reginsert(regex_t *preg, int op, int *opnd ) { - char *src; - char *dst; - char *place; + int *src; + int *dst; + int *place; + + preg->regsize += 2; if (preg->regcode == ®dummy) { - preg->regsize += 3; - return; + return opnd; } src = preg->regcode; - preg->regcode += 3; + preg->regcode += 2; dst = preg->regcode; while (src > opnd) *--dst = *--src; place = opnd; /* Op node, where operand used to be. */ *place++ = op; - *place++ = '\0'; - *place = '\0'; + *place++ = 0; + + return place; } /* - regtail - set the next-pointer at the end of a node chain */ -static void regtail(regex_t *preg, char *p, const char *val ) +static void regtail(regex_t *preg, int *p, const int *val ) { - char *scan; - char *temp; + int *scan; + int *temp; int offset; if (p == ®dummy) @@ -766,7 +1055,7 @@ static void regtail(regex_t *preg, char *p, const char *val ) /* Find last node. */ scan = p; for (;;) { - temp = (char *)regnext(preg, scan); + temp = (int *)regnext(preg, scan); if (temp == NULL) break; scan = temp; @@ -776,15 +1065,15 @@ static void regtail(regex_t *preg, char *p, const char *val ) offset = scan - val; else offset = val - scan; - *(scan+1) = (offset>>8)&0377; - *(scan+2) = offset&0377; + + scan[1] = offset; } /* - regoptail - regtail on operand of first argument; nop if operandless */ -static void regoptail(regex_t *preg, char *p, const char *val ) +static void regoptail(regex_t *preg, int *p, const int *val ) { /* "Operandless" and "op != BRANCH" are synonymous in practice. */ if (p == NULL || p == ®dummy || OP(p) != BRANCH) @@ -800,8 +1089,8 @@ static void regoptail(regex_t *preg, char *p, const char *val ) * Forwards. */ static int regtry(regex_t *preg, const char *string ); -static int regmatch(regex_t *preg, const char *prog); -static int regrepeat(regex_t *preg, const char *p ); +static int regmatch(regex_t *preg, const int *prog); +static int regrepeat(regex_t *preg, const int *p ); /* - regexec - match a regexp against a string @@ -816,7 +1105,7 @@ int regexec(regex_t *preg, const char *string, size_t nmatch, regmatch_t pmat } /* Check validity of program. */ - if (UCHARAT(preg->program) != REG_MAGIC) { + if (*preg->program != REG_MAGIC) { return REG_ERR_CORRUPTED; } @@ -833,8 +1122,9 @@ int regexec(regex_t *preg, const char *string, size_t nmatch, regmatch_t pmat if (preg->regmust != NULL) { s = string; while ((s = str_find(s, preg->regmust[0], preg->cflags & REG_ICASE)) != NULL) { - if (strncmp(s, preg->regmust, preg->regmlen) == 0) - break; /* Found it. */ + if (prefix_cmp(preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) { + break; + } s++; } if (s == NULL) /* Not present. */ @@ -914,48 +1204,86 @@ static int regtry( regex_t *preg, const char *string ) } /** - * Returns 1 if 'pattern' is a prefix of 'string'. + * Returns bytes matched if 'pattern' is a prefix of 'string'. * * If 'nocase' is non-zero, does a case-insensitive match. + * + * Returns -1 on not found. */ -static int prefix_cmp(const char *pattern, const char *string, int nocase) +static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase) { - if (nocase) { - while (*pattern && *string) { - if (toupper(*pattern) != toupper(*string)) { - break; - } - pattern++; - string++; + const char *s = string; + while (proglen && *s) { + int ch; + int n = utf8_tounicode_case(s, &ch, nocase); + if (ch != *prog) { + return -1; } - return *pattern == 0; + prog++; + s += n; + proglen--; } - else { - if (*pattern == *string) { - int len = strlen(pattern); - return strncmp(pattern, string, len) == 0; - } - return 0; + if (proglen == 0) { + return s - string; } + return -1; } -static const char *str_find(const char *string, char c, int nocase) +/** + * Searchs for 'c' in the range 'range'. + * + * If 'nocase' is set, the range is assumed to be uppercase + * and 'c' is converted to uppercase before matching. + * + * Returns 1 if found, or 0 if not. + */ +static int reg_range_find(const int *range, int c, int nocase) { if (nocase) { - c = toupper(c); - while (*string) { - if (toupper(*string) == c) { - return string; - } - string++; + /* The "string" should already be converted to uppercase */ + c = utf8_upper(c); + } + while (*range) { + if (c >= range[1] && c <= (range[0] + range[1] - 1)) { + return 1; } - return NULL; + range += 2; } - else { - return strchr(string, c); + return 0; +} + +/** + * Search for the character 'c' in the utf-8 string 'string'. + * + * If 'nocase' is set, the 'string' is assumed to be uppercase + * and 'c' is converted to uppercase before matching. + * + * Returns the byte position in the string where the 'c' was found, or + * NULL if not found. + */ +static const char *str_find(const char *string, int c, int nocase) +{ + if (nocase) { + /* The "string" should already be converted to uppercase */ + c = utf8_upper(c); } + while (*string) { + int ch; + int n = utf8_tounicode_case(string, &ch, nocase); + if (c == ch) { + return string; + } + string += n; + } + return NULL; } +/** + * Returns true if 'ch' is an end-of-line char. + * + * In REG_NEWLINE mode, \n is considered EOL in + * addition to \0 + */ static int reg_iseol(regex_t *preg, int ch) { if (preg->cflags & REG_NEWLINE) { @@ -977,10 +1305,10 @@ static int reg_iseol(regex_t *preg, int ch) * by recursion. */ /* 0 failure, 1 success */ -static int regmatch(regex_t *preg, const char *prog) +static int regmatch(regex_t *preg, const int *prog) { - const char *scan; /* Current node. */ - const char *next; /* Next node. */ + const int *scan; /* Current node. */ + const int *next; /* Next node. */ scan = prog; #ifdef DEBUG @@ -1025,27 +1353,31 @@ static int regmatch(regex_t *preg, const char *prog) preg->reginput++; break; case EXACTLY: { - const char *opnd; + const int *opnd; + int len; + int slen; opnd = OPERAND(scan); + len = str_int_len(opnd); - if (prefix_cmp(opnd, preg->reginput, preg->cflags & REG_ICASE) == 0) { + slen = prefix_cmp(opnd, len, preg->reginput, preg->cflags & REG_ICASE); + if (slen < 0) { return(0); } - preg->reginput += strlen(opnd); + preg->reginput += slen; } break; case ANYOF: if (reg_iseol(preg, *preg->reginput)) return 0; - if (str_find(OPERAND(scan), *preg->reginput, preg->cflags & REG_ICASE) == NULL) + if (reg_range_find(OPERAND(scan), *preg->reginput, preg->cflags & REG_ICASE) == 0) return(0); preg->reginput++; break; case ANYBUT: if (reg_iseol(preg, *preg->reginput)) return 0; - if (str_find(OPERAND(scan), *preg->reginput, preg->cflags & REG_ICASE) != NULL) + if (reg_range_find(OPERAND(scan), *preg->reginput, preg->cflags & REG_ICASE) != 0) return(0); preg->reginput++; break; @@ -1089,8 +1421,10 @@ static int regmatch(regex_t *preg, const char *prog) save = preg->reginput; no = regrepeat(preg, OPERAND(scan)); while (no >= min) { + int ch; + utf8_tounicode_case(preg->reginput, &ch, (preg->cflags & REG_ICASE)); /* If it could work, try it. */ - if (reg_iseol(preg, nextch) || *preg->reginput == nextch) + if (reg_iseol(preg, nextch) || ch == nextch) if (regmatch(preg, next)) return(1); /* Couldn't or didn't -- back up. */ @@ -1148,11 +1482,11 @@ static int regmatch(regex_t *preg, const char *prog) /* - regrepeat - repeatedly match something simple, report how many */ -static int regrepeat(regex_t *preg, const char *p ) +static int regrepeat(regex_t *preg, const int *p ) { int count = 0; const char *scan; - const char *opnd; + const int *opnd; scan = preg->reginput; opnd = OPERAND(p); @@ -1165,9 +1499,14 @@ static int regrepeat(regex_t *preg, const char *p ) break; case EXACTLY: if (preg->cflags & REG_ICASE) { - while (toupper(*opnd) == toupper(*scan)) { + while (1) { + int ch; + int n = utf8_tounicode_case(scan, &ch, 1); + if (*opnd != ch) { + break; + } count++; - scan++; + scan += n; } } else { @@ -1178,13 +1517,13 @@ static int regrepeat(regex_t *preg, const char *p ) } break; case ANYOF: - while (!reg_iseol(preg, *scan) && str_find(opnd, *scan, preg->cflags & REG_ICASE) != NULL) { + while (!reg_iseol(preg, *scan) && reg_range_find(opnd, *scan, preg->cflags & REG_ICASE) != 0) { count++; scan++; } break; case ANYBUT: - while (!reg_iseol(preg, *scan) && str_find(opnd, *scan, preg->cflags & REG_ICASE) == NULL) { + while (!reg_iseol(preg, *scan) && reg_range_find(opnd, *scan, preg->cflags & REG_ICASE) == 0) { count++; scan++; } @@ -1202,7 +1541,7 @@ static int regrepeat(regex_t *preg, const char *p ) /* - regnext - dig the "next" pointer out of a node */ -static const char *regnext(regex_t *preg, const char *p ) +static const int *regnext(regex_t *preg, const int *p ) { int offset; @@ -1226,25 +1565,45 @@ static const char *regnext(regex_t *preg, const char *p ) */ static void regdump(regex_t *preg) { - const char *s; + const int *s; char op = EXACTLY; /* Arbitrary non-END op. */ - const char *next; + const int *next; + char buf[4]; + if (preg->regcode == ®dummy) + return; s = preg->program + 1; - while (op != END) { /* While that wasn't END last time... */ + while (op != END && s < preg->regcode) { /* While that wasn't END last time... */ op = OP(s); - printf("%2d%s", (int)(s-preg->program), regprop(s)); /* Where, what. */ + printf("%2d{%02x}%s", (int)(s-preg->program), op, regprop(s)); /* Where, what. */ next = regnext(preg, s); if (next == NULL) /* Next ptr. */ printf("(0)"); else printf("(%d)", (int)((s-preg->program)+(next-s))); - s += 3; - if (op == ANYOF || op == ANYBUT || op == EXACTLY) { + s += 2; + if (op == ANYOF || op == ANYBUT) { + /* set of ranges */ + + while (*s) { + int len = *s++; + int first = *s++; + buf[utf8_fromunicode(buf, first)] = 0; + printf("%s", buf); + if (len > 1) { + buf[utf8_fromunicode(buf, first + len - 1)] = 0; + printf("-%s", buf); + } + } + s++; + } + else if (op == EXACTLY) { /* Literal string, where present. */ - while (*s != '\0') { - putchar(*s); + + while (*s) { + buf[utf8_fromunicode(buf, *s)] = 0; + printf("%s", buf); s++; } s++; @@ -1252,20 +1611,29 @@ static void regdump(regex_t *preg) putchar('\n'); } - /* Header fields of interest. */ - if (preg->regstart != '\0') - printf("start `%c' ", preg->regstart); - if (preg->reganch) - printf("anchored "); - if (preg->regmust != NULL) - printf("must have \"%s\"", preg->regmust); + if (op == END) { + /* Header fields of interest. */ + if (preg->regstart != '\0') + buf[utf8_fromunicode(buf, preg->regstart)] = 0; + printf("start '%s' ", buf); + if (preg->reganch) + printf("anchored "); + if (preg->regmust != NULL) { + int i; + printf("must have:"); + for (i = 0; i < preg->regmlen; i++) { + putchar(preg->regmust[i]); + } + putchar('\n'); + } + } printf("\n"); } /* - regprop - printable representation of opcode */ -static const char *regprop( const char *op ) +static const char *regprop( const int *op ) { char *p; static char buf[50]; @@ -1346,6 +1714,8 @@ size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_s "out of memory", "too many ()", "parentheses () not balanced", + "braces {} not balanced", + "invalid repetition count(s)", "extra characters", "*+ of empty atom", "nested count", @@ -1353,7 +1723,7 @@ size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_s "count follows nothing", "trailing backslash", "corrupted program", - "braces {} not supported" + "contains null char", }; const char *err; diff --git a/jimregexp.h b/jimregexp.h index 751bbf7..5d7dc94 100644 --- a/jimregexp.h +++ b/jimregexp.h @@ -50,15 +50,15 @@ typedef struct regexp { /* -- private -- */ int cflags; /* Flags used when compiling */ int err; /* Any error which occurred during compile */ - char regstart; /* Internal use only. */ - char reganch; /* Internal use only. */ - const char *regmust; /* Internal use only. */ + int regstart; /* Internal use only. */ + int reganch; /* Internal use only. */ + const int *regmust; /* Internal use only. */ int regmlen; /* Internal use only. */ - char *program; /* Allocated */ + int *program; /* Allocated */ /* working state - compile */ const char *regparse; /* Input-scan pointer. */ - char *regcode; /* Code-emit pointer; ®dummy = don't. */ + int *regcode; /* Code-emit pointer; ®dummy = don't. */ long regsize; /* Code size. */ /* working state - exec */ @@ -90,6 +90,8 @@ enum { REG_ERR_NOMEM, REG_ERR_TOO_MANY_PAREN, REG_ERR_UNMATCHED_PAREN, + REG_ERR_UNMATCHED_BRACES, + REG_ERR_BAD_COUNT, REG_ERR_JUNK_ON_END, REG_ERR_OPERAND_COULD_BE_EMPTY, REG_ERR_NESTED_COUNT, @@ -97,7 +99,7 @@ enum { REG_ERR_COUNT_FOLLOWS_NOTHING, REG_ERR_TRAILING_BACKSLASH, REG_ERR_CORRUPTED, - REG_ERR_COUNT_UNSUPPORTED, + REG_ERR_NULL_CHAR, REG_ERR_NUM }; @@ -94,6 +94,9 @@ int main(int argc, char *const argv[]) if (argc > 2 && strcmp(argv[1], "-e") == 0) { JimSetArgv(interp, argc - 3, argv + 3); retcode = Jim_Eval(interp, argv[2]); + if (retcode != JIM_ERR) { + printf("%s\n", Jim_GetString(Jim_GetResult(interp), NULL)); + } } else { Jim_SetVariableStr(interp, "argv0", Jim_NewStringObj(interp, argv[1], -1)); diff --git a/tests/Makefile b/tests/Makefile index 285fb9e..02c6c1e 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -9,4 +9,4 @@ test: make -C .. all clean: - rm -f gorp.file2 cat gorp.file sleep exit wc sh echo + rm -f gorp.file2 cat gorp.file sleep exit wc sh echo test1 diff --git a/tests/regcount.test b/tests/regcount.test new file mode 100644 index 0000000..4b4d106 --- /dev/null +++ b/tests/regcount.test @@ -0,0 +1,35 @@ +source testing.tcl + +# Test regexp counted repetitions + +set n 0 +foreach {pat str exp} { + a+ bac 1 + a{1,} bac 1 + a* bac 1 + a{0,} bac 1 + aa+ bac 0 + a{2,} bac 0 + a{2,} bacaad 1 + a{3,} bacaad 0 + a{2,}$ bacaad 0 + a{2,}$ bacaa 1 + a{2,}$ ba 0 + a{2,}$ aa 1 + a{0,0}b$ b 1 + a{1,1}b$ b 0 + a{1,1}b$ cab 1 + a{2,2}b$ cab 0 + a{2,2}b$ cabaabx 0 + a{2,2}b$ cacaab 1 + ca{2,4}b cacaab 1 + ca{2,3}b cacaab 1 + ca{2,3}b cacaaab 1 + c(a|b){2,3}d xcbad 1 + c(a|b){2,3}d xcabbd 1 + c(a|b){2,3}d xcbaaad 0 +} { + test regcount-1.[incr n] "Test: regexp $pat" [list regexp -- $pat $str] $exp +} + +testreport diff --git a/tests/regexp2.test b/tests/regexp2.test new file mode 100644 index 0000000..a061e8b --- /dev/null +++ b/tests/regexp2.test @@ -0,0 +1,909 @@ +# Commands covered: regexp, regsub +# +# This file contains a collection of tests for one or more of the Tcl +# built-in commands. Sourcing this file into Tcl runs the tests and +# generates output for errors. No output means no errors were found. +# +# Copyright (c) 1991-1993 The Regents of the University of California. +# Copyright (c) 1998 Sun Microsystems, Inc. +# Copyright (c) 1998-1999 by Scriptics Corporation. +# +# See the file "license.terms" for information on usage and redistribution +# of this file, and for a DISCLAIMER OF ALL WARRANTIES. +# +# RCS: @(#) $Id$ + +source testing.tcl + +catch {package require regexp} +if {[info commands regexp] eq "" || [regexp {\d} 1] == 0} { + # No regexp, or not using a sufficiently capable regexp implementation + puts " --- skipped" + exit 0 +} + + +# Procedure to evaluate a script within a proc, to test compilation +# functionality + +proc evalInProc { script } { + proc testProc {} $script + set status [catch { + testProc + } result] + rename testProc {} + return $result + #return [list $status $result] +} + +catch {unset foo} +test regexpComp-1.1 {basic regexp operation} { + evalInProc { + regexp ab*c abbbc + } +} 1 +test regexpComp-1.2 {basic regexp operation} { + evalInProc { + regexp ab*c ac + } +} 1 +test regexpComp-1.3 {basic regexp operation} { + evalInProc { + regexp ab*c ab + } +} 0 +test regexpComp-1.4 {basic regexp operation} { + evalInProc { + regexp -- -gorp abc-gorpxxx + } +} 1 +test regexpComp-1.5 {basic regexp operation} { + evalInProc { + regexp {^([^ ]*)[ ]*([^ ]*)} "" a + } +} 1 +test regexpComp-1.6 {basic regexp operation} { + list [catch {regexp {} abc} msg] $msg +} {0 1} +test regexpComp-1.7 {regexp utf compliance} { + # if not UTF-8 aware, result is "0 1" + evalInProc { + set foo "\u4e4eb q" + regexp "\u4e4eb q" "a\u4e4eb qw\u5e4e\x4e wq" bar + list [string compare $foo $bar] [regexp 4 $bar] + } +} {0 0} + +test regexpComp-2.1 {getting substrings back from regexp} { + evalInProc { + set foo {} + list [regexp ab*c abbbbc foo] $foo + } +} {1 abbbbc} +test regexpComp-2.2 {getting substrings back from regexp} { + evalInProc { + set foo {} + set f2 {} + list [regexp a(b*)c abbbbc foo f2] $foo $f2 + } +} {1 abbbbc bbbb} +test regexpComp-2.3 {getting substrings back from regexp} { + evalInProc { + set foo {} + set f2 {} + list [regexp a(b*)(c) abbbbc foo f2] $foo $f2 + } +} {1 abbbbc bbbb} +test regexpComp-2.4 {getting substrings back from regexp} { + evalInProc { + set foo {} + set f2 {} + set f3 {} + list [regexp a(b*)(c) abbbbc foo f2 f3] $foo $f2 $f3 + } +} {1 abbbbc bbbb c} +test regexpComp-2.5 {getting substrings back from regexp} { + evalInProc { + set foo {}; set f1 {}; set f2 {}; set f3 {}; set f4 {}; set f5 {}; + set f6 {}; set f7 {}; set f8 {}; set f9 {}; set fa {}; set fb {}; + list [regexp (1*)(2*)(3*)(4*)(5*)(6*)(7*)(8*)(9*)(a*)(b*) \ + 12223345556789999aabbb \ + foo f1 f2 f3 f4 f5 f6 f7 f8 f9 fa fb] $foo $f1 $f2 $f3 $f4 $f5 \ + $f6 $f7 $f8 $f9 $fa $fb + } +} {1 12223345556789999aabbb 1 222 33 4 555 6 7 8 9999 aa bbb} +test regexpComp-2.6 {getting substrings back from regexp} { + evalInProc { + set foo 2; set f2 2; set f3 2; set f4 2 + list [regexp (a)(b)? xay foo f2 f3 f4] $foo $f2 $f3 $f4 + } +} {1 a a {} {}} +test regexpComp-2.7 {getting substrings back from regexp} { + evalInProc { + set foo 1; set f2 1; set f3 1; set f4 1 + list [regexp (a)(b)?(c) xacy foo f2 f3 f4] $foo $f2 $f3 $f4 + } +} {1 ac a {} c} +test regexpComp-2.8 {getting substrings back from regexp} { + evalInProc { + set match {} + list [regexp {^a*b} aaaab match] $match + } +} {1 aaaab} + +test regexpComp-3.1 {-indices option to regexp} { + evalInProc { + set foo {} + list [regexp -indices ab*c abbbbc foo] $foo + } +} {1 {0 5}} +test regexpComp-3.2 {-indices option to regexp} { + evalInProc { + set foo {} + set f2 {} + list [regexp -indices a(b*)c abbbbc foo f2] $foo $f2 + } +} {1 {0 5} {1 4}} +test regexpComp-3.3 {-indices option to regexp} { + evalInProc { + set foo {} + set f2 {} + list [regexp -indices a(b*)(c) abbbbc foo f2] $foo $f2 + } +} {1 {0 5} {1 4}} +test regexpComp-3.4 {-indices option to regexp} { + evalInProc { + set foo {} + set f2 {} + set f3 {} + list [regexp -indices a(b*)(c) abbbbc foo f2 f3] $foo $f2 $f3 + } +} {1 {0 5} {1 4} {5 5}} +test regexpComp-3.5 {-indices option to regexp} { + evalInProc { + set foo {}; set f1 {}; set f2 {}; set f3 {}; set f4 {}; set f5 {}; + set f6 {}; set f7 {}; set f8 {}; set f9 {} + list [regexp -indices (1*)(2*)(3*)(4*)(5*)(6*)(7*)(8*)(9*) \ + 12223345556789999 \ + foo f1 f2 f3 f4 f5 f6 f7 f8 f9] $foo $f1 $f2 $f3 $f4 $f5 \ + $f6 $f7 $f8 $f9 + } +} {1 {0 16} {0 0} {1 3} {4 5} {6 6} {7 9} {10 10} {11 11} {12 12} {13 16}} +test regexpComp-3.6 {getting substrings back from regexp} { + evalInProc { + set foo 2; set f2 2; set f3 2; set f4 2 + list [regexp -indices (a)(b)? xay foo f2 f3 f4] $foo $f2 $f3 $f4 + } +} {1 {1 1} {1 1} {-1 -1} {-1 -1}} +test regexpComp-3.7 {getting substrings back from regexp} { + evalInProc { + set foo 1; set f2 1; set f3 1; set f4 1 + list [regexp -indices (a)(b)?(c) xacy foo f2 f3 f4] $foo $f2 $f3 $f4 + } +} {1 {1 2} {1 1} {-1 -1} {2 2}} + +test regexpComp-4.1 {-nocase option to regexp} { + evalInProc { + regexp -nocase foo abcFOo + } +} 1 +test regexpComp-4.2 {-nocase option to regexp} { + evalInProc { + set f1 22 + set f2 33 + set f3 44 + list [regexp -nocase {a(b*)([xy]*)z} aBbbxYXxxZ22 f1 f2 f3] $f1 $f2 $f3 + } +} {1 aBbbxYXxxZ Bbb xYXxx} +test regexpComp-4.3 {-nocase option to regexp} { + evalInProc { + regexp -nocase FOo abcFOo + } +} 1 +set ::x abcdefghijklmnopqrstuvwxyz1234567890 +set ::x $x$x$x$x$x$x$x$x$x$x$x$x +test regexpComp-4.4 {case conversion in regexp} { + evalInProc { + list [regexp -nocase $::x $::x foo] $foo + } +} "1 $x" +catch {unset ::x} + +test regexpComp-5.1 {exercise cache of compiled expressions} { + evalInProc { + regexp .*a b + regexp .*b c + regexp .*c d + regexp .*d e + regexp .*e f + regexp .*a bbba + } +} 1 +test regexpComp-5.2 {exercise cache of compiled expressions} { + evalInProc { + regexp .*a b + regexp .*b c + regexp .*c d + regexp .*d e + regexp .*e f + regexp .*b xxxb + } +} 1 +test regexpComp-5.3 {exercise cache of compiled expressions} { + evalInProc { + regexp .*a b + regexp .*b c + regexp .*c d + regexp .*d e + regexp .*e f + regexp .*c yyyc + } +} 1 +test regexpComp-5.4 {exercise cache of compiled expressions} { + evalInProc { + regexp .*a b + regexp .*b c + regexp .*c d + regexp .*d e + regexp .*e f + regexp .*d 1d + } +} 1 +test regexpComp-5.5 {exercise cache of compiled expressions} { + evalInProc { + regexp .*a b + regexp .*b c + regexp .*c d + regexp .*d e + regexp .*e f + regexp .*e xe + } +} 1 + +test regexpComp-6.4 {regexp errors} { + evalInProc { + list [catch {regexp a( b} msg] $msg + } +} {1 {couldn't compile regular expression pattern: parentheses () not balanced}} +test regexpComp-6.5 {regexp errors} { + evalInProc { + list [catch {regexp a( b} msg] $msg + } +} {1 {couldn't compile regular expression pattern: parentheses () not balanced}} +test regexpComp-6.6 {regexp errors} { + evalInProc { + list [catch {regexp a a f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1 f1} msg] $msg + } +} {0 1} +test regexpComp-6.7 {regexp errors} { + evalInProc { + list [catch {regexp (x)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.) xyzzy} msg] $msg + } +} {0 0} +test regexpComp-6.8 {regexp errors} { + evalInProc { + catch {unset f1} + set f1 44 + catch {regexp abc abc f1(f2)} msg + } +} {1} +test regexpComp-6.9 {regexp errors, -start bad int check} { + evalInProc { + list [catch {regexp -start bogus {^$} {}} msg] $msg + } +} {1 {bad index "bogus": must be integer?[+-]integer? or end?[+-]integer?}} + +test regexpComp-7.1 {basic regsub operation} { + evalInProc { + list [regsub aa+ xaxaaaxaa 111&222 foo] $foo + } +} {1 xax111aaa222xaa} +test regexpComp-7.2 {basic regsub operation} { + evalInProc { + list [regsub aa+ aaaxaa &111 foo] $foo + } +} {1 aaa111xaa} +test regexpComp-7.3 {basic regsub operation} { + evalInProc { + list [regsub aa+ xaxaaa 111& foo] $foo + } +} {1 xax111aaa} +test regexpComp-7.4 {basic regsub operation} { + evalInProc { + list [regsub aa+ aaa 11&2&333 foo] $foo + } +} {1 11aaa2aaa333} +test regexpComp-7.5 {basic regsub operation} { + evalInProc { + list [regsub aa+ xaxaaaxaa &2&333 foo] $foo + } +} {1 xaxaaa2aaa333xaa} +test regexpComp-7.6 {basic regsub operation} { + evalInProc { + list [regsub aa+ xaxaaaxaa 1&22& foo] $foo + } +} {1 xax1aaa22aaaxaa} +test regexpComp-7.7 {basic regsub operation} { + evalInProc { + list [regsub a(a+) xaxaaaxaa {1\122\1} foo] $foo + } +} {1 xax1aa22aaxaa} +test regexpComp-7.8 {basic regsub operation} { + evalInProc { + list [regsub a(a+) xaxaaaxaa {1\\\122\1} foo] $foo + } +} "1 {xax1\\aa22aaxaa}" +test regexpComp-7.9 {basic regsub operation} { + evalInProc { + list [regsub a(a+) xaxaaaxaa {1\\122\1} foo] $foo + } +} "1 {xax1\\122aaxaa}" +test regexpComp-7.10 {basic regsub operation} { + evalInProc { + list [regsub a(a+) xaxaaaxaa {1\\&\1} foo] $foo + } +} "1 {xax1\\aaaaaxaa}" +test regexpComp-7.11 {basic regsub operation} { + evalInProc { + list [regsub a(a+) xaxaaaxaa {1\&\1} foo] $foo + } +} {1 xax1&aaxaa} +test regexpComp-7.12 {basic regsub operation} { + evalInProc { + list [regsub a(a+) xaxaaaxaa {\1\1\1\1&&} foo] $foo + } +} {1 xaxaaaaaaaaaaaaaaxaa} +test regexpComp-7.13 {basic regsub operation} { + evalInProc { + set foo xxx + list [regsub abc xyz 111 foo] $foo + } +} {0 xyz} +test regexpComp-7.14 {basic regsub operation} { + evalInProc { + set foo xxx + list [regsub ^ xyz "111 " foo] $foo + } +} {1 {111 xyz}} +test regexpComp-7.15 {basic regsub operation} { + evalInProc { + set foo xxx + list [regsub -- -foo abc-foodef "111 " foo] $foo + } +} {1 {abc111 def}} +test regexpComp-7.16 {basic regsub operation} { + evalInProc { + set foo xxx + list [regsub x "" y foo] $foo + } +} {0 {}} +test regexpComp-7.17 {regsub utf compliance} { + evalInProc { + # if not UTF-8 aware, result is "0 1" + set foo "xyz555ijka\u4e4ebpqr" + regsub a\u4e4eb xyza\u4e4ebijka\u4e4ebpqr 555 bar + list [string compare $foo $bar] [regexp 4 $bar] + } +} {0 0} + +test regexpComp-8.1 {case conversion in regsub} { + evalInProc { + list [regsub -nocase a(a+) xaAAaAAay & foo] $foo + } +} {1 xaAAaAAay} +test regexpComp-8.2 {case conversion in regsub} { + evalInProc { + list [regsub -nocase a(a+) xaAAaAAay & foo] $foo + } +} {1 xaAAaAAay} +test regexpComp-8.3 {case conversion in regsub} { + evalInProc { + set foo 123 + list [regsub a(a+) xaAAaAAay & foo] $foo + } +} {0 xaAAaAAay} +test regexpComp-8.4 {case conversion in regsub} { + evalInProc { + set foo 123 + list [regsub -nocase a CaDE b foo] $foo + } +} {1 CbDE} +test regexpComp-8.5 {case conversion in regsub} { + evalInProc { + set foo 123 + list [regsub -nocase XYZ CxYzD b foo] $foo + } +} {1 CbD} +test regexpComp-8.6 {case conversion in regsub} { + evalInProc { + set x abcdefghijklmnopqrstuvwxyz1234567890 + set x $x$x$x$x$x$x$x$x$x$x$x$x + set foo 123 + list [regsub -nocase $x $x b foo] $foo + } +} {1 b} + +test regexpComp-9.1 {-all option to regsub} { + evalInProc { + set foo 86 + list [regsub -all x+ axxxbxxcxdx |&| foo] $foo + } +} {4 a|xxx|b|xx|c|x|d|x|} +test regexpComp-9.2 {-all option to regsub} { + evalInProc { + set foo 86 + list [regsub -nocase -all x+ aXxXbxxcXdx |&| foo] $foo + } +} {4 a|XxX|b|xx|c|X|d|x|} +test regexpComp-9.3 {-all option to regsub} { + evalInProc { + set foo 86 + list [regsub x+ axxxbxxcxdx |&| foo] $foo + } +} {1 a|xxx|bxxcxdx} +test regexpComp-9.4 {-all option to regsub} { + evalInProc { + set foo 86 + list [regsub -all bc axxxbxxcxdx |&| foo] $foo + } +} {0 axxxbxxcxdx} +test regexpComp-9.5 {-all option to regsub} { + evalInProc { + set foo xxx + list [regsub -all node "node node more" yy foo] $foo + } +} {2 {yy yy more}} +test regexpComp-9.6 {-all option to regsub} { + evalInProc { + set foo xxx + list [regsub -all ^ xxx 123 foo] $foo + } +} {1 123xxx} + +#test regexpComp-10.1 {expanded syntax in regsub} { +# evalInProc { +# set foo xxx +# list [regsub -expanded ". \#comment\n . \#comment2" abc def foo] $foo +# } +#} {1 defc} +test regexpComp-10.2 {newline sensitivity in regsub} { + evalInProc { + set foo xxx + list [regsub -line {^a.*b$} "dabc\naxyb\n" 123 foo] $foo + } +} "1 {dabc\n123\n}" +test regexpComp-10.3 {newline sensitivity in regsub} { + evalInProc { + set foo xxx + list [regsub -line {^a.*b$} "dabc\naxyb\nxb" 123 foo] $foo + } +} "1 {dabc\n123\nxb}" +#test regexpComp-10.4 {partial newline sensitivity in regsub} { +# evalInProc { +# set foo xxx +# list [regsub -lineanchor {^a.*b$} "da\naxyb\nxb" 123 foo] $foo +# } +#} "1 {da\n123}" +#test regexpComp-10.5 {inverse partial newline sensitivity in regsub} { +# evalInProc { +# set foo xxx +# list [regsub -linestop {a.*b} "da\nbaxyb\nxb" 123 foo] $foo +# } +#} "1 {da\nb123\nxb}" + +#test regexpComp-11.1 {regsub errors} { +# evalInProc { +# list [catch {regsub a b} msg] $msg +# } +#} {1 {wrong # args: should be "regsub ?-switch ...? exp string subSpec ?varName?"}} +#test regexpComp-11.2 {regsub errors} { +# evalInProc { +# list [catch {regsub -nocase a b} msg] $msg +# } +#} {1 {wrong # args: should be "regsub ?-switch ...? exp string subSpec ?varName?"}} +#test regexpComp-11.3 {regsub errors} { +# evalInProc { +# list [catch {regsub -nocase -all a b} msg] $msg +# } +#} {1 {wrong # args: should be "regsub ?-switch ...? exp string subSpec ?varName?"}} +#test regexpComp-11.4 {regsub errors} { +# evalInProc { +# list [catch {regsub a b c d e f} msg] $msg +# } +#} {1 {wrong # args: should be "regsub ?-switch ...? exp string subSpec ?varName?"}} +#test regexpComp-11.5 {regsub errors} { +# evalInProc { +# list [catch {regsub -gorp a b c} msg] $msg +# } +#} {1 {bad switch "-gorp": must be -all, -nocase, -expanded, -line, -linestop, -lineanchor, -start, or --}} +#test regexpComp-11.6 {regsub errors} { +# evalInProc { +# list [catch {regsub -nocase a( b c d} msg] $msg +# } +#} {1 {couldn't compile regular expression pattern: parentheses () not balanced}} +test regexpComp-11.7 {regsub errors} { + evalInProc { + catch {unset f1} + set f1 44 + catch {regsub -nocase aaa aaa xxx f1(f2)} msg + } +} {1} +test regexpComp-11.8 {regsub errors, -start bad int check} { + evalInProc { + list [catch {regsub -start bogus pattern string rep var} msg] $msg + } +} {1 {bad index "bogus": must be integer?[+-]integer? or end?[+-]integer?}} + +# This test crashes on the Mac unless you increase the Stack Space to about 1 +# Meg. This is probably bigger than most users want... +# 8.2.3 regexp reduced stack space requirements, but this should be +# tested again +test regexpComp-12.1 {Tcl_RegExpExec: large number of subexpressions} { + evalInProc { + list [regexp (.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.) abcdefghijklmnopqrstuvwxyz all a b c d e f g h i j k l m n o p q r s t u v w x y z] $all $a $b $c $d $e $f $g $h $i $j $k $l $m $n $o $p $q $r $s $t $u $v $w $x $y $z + } +} {1 abcdefghijklmnopqrstuvwxyz a b c d e f g h i j k l m n o p q r s t u v w x y z} + +test regexpComp-13.1 {regsub of a very large string} { + # This test is designed to stress the memory subsystem in order + # to catch Bug #933. It only fails if the Tcl memory allocator + # is in use. + + set line {BEGIN_TABLE ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; END_TABLE} + set filedata [string repeat $line 200] + for {set i 1} {$i<10} {incr i} { + regsub -all "BEGIN_TABLE " $filedata "" newfiledata + } + set x done +} {done} + +test regexpComp-14.1 {CompileRegexp: regexp cache} { + evalInProc { + regexp .*a b + regexp .*b c + regexp .*c d + regexp .*d e + regexp .*e f + set x . + append x *a + regexp $x bbba + } +} 1 +test regexpComp-14.2 {CompileRegexp: regexp cache, different flags} { + evalInProc { + regexp .*a b + regexp .*b c + regexp .*c d + regexp .*d e + regexp .*e f + set x . + append x *a + regexp -nocase $x bbba + } +} 1 + +test regexpComp-15.1 {regexp -start} { + catch {unset x} + list [regexp -start -10 {\d} 1abc2de3 x] $x +} {1 1} +test regexpComp-15.2 {regexp -start} { + catch {unset x} + list [regexp -start 2 {\d} 1abc2de3 x] $x +} {1 2} +test regexpComp-15.3 {regexp -start} { + catch {unset x} + list [regexp -start 4 {\d} 1abc2de3 x] $x +} {1 2} +test regexpComp-15.4 {regexp -start} { + catch {unset x} + list [regexp -start 5 {\d} 1abc2de3 x] $x +} {1 3} +test regexpComp-15.5 {regexp -start, over end of string} { + catch {unset x} + list [regexp -start [string length 1abc2de3] {\d} 1abc2de3 x] [info exists x] +} {0 0} +test regexpComp-15.6 {regexp -start, loss of ^$ behavior} { + list [regexp -start 2 {^$} {}] +} {0} + +test regexpComp-16.1 {regsub -start} { + catch {unset x} + list [regsub -all -start 2 {\d} a1b2c3d4e5 {/&} x] $x +} {4 a1b/2c/3d/4e/5} +test regexpComp-16.2 {regsub -start} { + catch {unset x} + list [regsub -all -start -25 {z} hello {/&} x] $x +} {0 hello} +test regexpComp-16.3 {regsub -start} { + catch {unset x} + list [regsub -all -start 3 {z} hello {/&} x] $x +} {0 hello} +#test regexpComp-16.4 {regsub -start, \A behavior} { +# set out {} +# lappend out [regsub -start 0 -all {\A(\w)} {abcde} {/\1} x] $x +# lappend out [regsub -start 2 -all {\A(\w)} {abcde} {/\1} x] $x +#} {5 /a/b/c/d/e 3 ab/c/d/e} + +test regexpComp-17.1 {regexp -inline} { + regexp -inline b ababa +} {b} +test regexpComp-17.2 {regexp -inline} { + regexp -inline (b) ababa +} {b b} +test regexpComp-17.3 {regexp -inline -indices} { + regexp -inline -indices (b) ababa +} {{1 1} {1 1}} +test regexpComp-17.4 {regexp -inline} { + regexp -inline {\w(\d+)\w} " hello 23 there456def " +} {e456d 456} +test regexpComp-17.5 {regexp -inline no matches} { + regexp -inline {\w(\d+)\w} "" +} {} +test regexpComp-17.6 {regexp -inline no matches} { + regexp -inline hello goodbye +} {} +test regexpComp-17.7 {regexp -inline, no matchvars allowed} { + list [catch {regexp -inline b abc match} msg] $msg +} {1 {regexp match variables not allowed when using -inline}} + +test regexpComp-18.1 {regexp -all} { + regexp -all b bbbbb +} {5} +test regexpComp-18.2 {regexp -all} { + regexp -all b abababbabaaaaaaaaaab +} {6} +test regexpComp-18.3 {regexp -all -inline} { + regexp -all -inline b abababbabaaaaaaaaaab +} {b b b b b b} +test regexpComp-18.4 {regexp -all -inline} { + regexp -all -inline {\w(\w)} abcdefg +} {ab b cd d ef f} +test regexpComp-18.5 {regexp -all -inline} { + regexp -all -inline {\w(\w)$} abcdefg +} {fg g} +test regexpComp-18.6 {regexp -all -inline} { + regexp -all -inline {\d+} 10:20:30:40 +} {10 20 30 40} +test regexpComp-18.7 {regexp -all -inline} { + list [catch {regexp -all -inline b abc match} msg] $msg +} {1 {regexp match variables not allowed when using -inline}} +test regexpComp-18.8 {regexp -all} { + # This should not cause an infinite loop + regexp -all -inline {a*} a +} {a} +test regexpComp-18.9 {regexp -all} { + # Yes, the expected result is {a {}}. Here's why: + # Start at index 0; a* matches the "a" there then stops. + # Go to index 1; a* matches the lambda (or {}) there then stops. Recall + # that a* matches zero or more "a"'s; thus it matches the string "b", as + # there are zero or more "a"'s there. + # Go to index 2; this is past the end of the string, so stop. + regexp -all -inline {a*} ab +} {a {}} +test regexpComp-18.10 {regexp -all} { + # Yes, the expected result is {a {} a}. Here's why: + # Start at index 0; a* matches the "a" there then stops. + # Go to index 1; a* matches the lambda (or {}) there then stops. Recall + # that a* matches zero or more "a"'s; thus it matches the string "b", as + # there are zero or more "a"'s there. + # Go to index 2; a* matches the "a" there then stops. + # Go to index 3; this is past the end of the string, so stop. + regexp -all -inline {a*} aba +} {a {} a} +test regexpComp-18.11 {regexp -all} { + evalInProc { + regexp -all -inline {^a} aaaa + } +} {a} +test regexpComp-18.12 {regexp -all -inline -indices} { + evalInProc { + regexp -all -inline -indices a(b(c)d|e(f)g)h abcdhaefgh + } +} {{0 4} {1 3} {2 2} {-1 -1} {5 9} {6 8} {-1 -1} {7 7}} + +test regexpComp-19.1 {regsub null replacement} { + evalInProc { + regsub -all {@} {@hel@lo@} "\0a\0" result + list $result [string length $result] + } +} "\0a\0hel\0a\0lo\0a\0 14" + +test regexpComp-20.1 {regsub shared object shimmering} { + evalInProc { + # Bug #461322 + set a abcdefghijklmnopqurstuvwxyz + set b $a + set c abcdefghijklmnopqurstuvwxyz0123456789 + regsub $a $c $b d + list $d [string length $d] [string bytelength $d] + } +} [list abcdefghijklmnopqurstuvwxyz0123456789 37 37] +#test regexpComp-20.2 {regsub shared object shimmering with -about} { +# evalInProc { +# eval regexp -about abc +# } +#} {0 {}} + +test regexpComp-21.1 {regexp command compiling tests} { + evalInProc { + regexp foo bar + } +} 0 +test regexpComp-21.2 {regexp command compiling tests} { + evalInProc { + regexp {^foo$} dogfood + } +} 0 +test regexpComp-21.3 {regexp command compiling tests} { + evalInProc { + set a foo + regexp {^foo$} $a + } +} 1 +test regexpComp-21.4 {regexp command compiling tests} { + evalInProc { + regexp foo dogfood + } +} 1 +test regexpComp-21.5 {regexp command compiling tests} { + evalInProc { + regexp -nocase FOO dogfod + } +} 0 +test regexpComp-21.6 {regexp command compiling tests} { + evalInProc { + regexp -n foo dogfoOd + } +} 1 +test regexpComp-21.7 {regexp command compiling tests} { + evalInProc { + regexp -no -- FoO dogfood + } +} 1 +test regexpComp-21.8 {regexp command compiling tests} { + evalInProc { + regexp -- foo dogfod + } +} 0 +test regexpComp-21.9 {regexp command compiling tests} { + evalInProc { + list [catch {regexp -- -nocase foo dogfod} msg] $msg + } +} {0 0} +test regexpComp-21.10 {regexp command compiling tests} { + evalInProc { + list [regsub -all "" foo bar str] $str + } +} {3 barfbarobaro} +# This useless expression fails. Jim returns "bar" +#test regexpComp-21.11 {regexp command compiling tests} { +# evalInProc { +# list [regsub -all "" "" bar str] $str +# } +#} {0 {}} + +# We can forgive the underlying regexp engine for not supporting this. +# Why not use this instead? "((^X)*|\$)" +#test regexpComp-22.0.1 {Bug 1810038} { +# evalInProc { +# regexp ($|^X)* {} +# } +#} 1 + +set i 0 +foreach {str exp result} { + foo ^foo 1 + foobar ^foobar$ 1 + foobar bar$ 1 + foobar ^$ 0 + "" ^$ 1 + anything $ 1 + anything ^.*$ 1 + anything ^.*a$ 0 + anything ^.*a.*$ 1 + anything ^.*.*$ 1 + anything ^.*..*$ 1 + anything ^.*b$ 0 + anything ^a.*$ 1 +} { + test regexpComp-22.[incr i] {regexp command compiling tests} \ + [subst {evalInProc {set a "$str"; regexp {$exp} \$a}}] $result +} + +set i 0 +foreach {str exp result} { + foo ^foo 1 + foobar ^foobar$ 1 + foobar bar$ 1 + foobar ^$ 0 + "" ^$ 1 + anything $ 1 + anything ^.*$ 1 + anything ^.*a$ 0 + anything ^.*a.*$ 1 + anything ^.*.*$ 1 + anything ^.*..*$ 1 + anything ^.*b$ 0 + anything ^a.*$ 1 +} { + test regexpComp-23.[incr i] {regexp command compiling tests INST_REGEXP} \ + [list regexp $exp $str] $result +} + +test regexpComp-24.1 {regexp command compiling tests} { + evalInProc { + set re foo + regexp -nocase $re bar + } +} 0 +test regexpComp-24.2 {regexp command compiling tests} { + evalInProc { + set re {^foo$} + regexp $re dogfood + } +} 0 +test regexpComp-24.3 {regexp command compiling tests} { + evalInProc { + set a foo + set re {^foo$} + regexp $re $a + } +} 1 +test regexpComp-24.4 {regexp command compiling tests} { + evalInProc { + set re foo + regexp $re dogfood + } +} 1 +test regexpComp-24.5 {regexp command compiling tests} { + evalInProc { + set re FOO + regexp -nocase $re dogfod + } +} 0 +test regexpComp-24.6 {regexp command compiling tests} { + evalInProc { + set re foo + regexp -n $re dogfoOd + } +} 1 +test regexpComp-24.7 {regexp command compiling tests} { + evalInProc { + set re FoO + regexp -no -- $re dogfood + } +} 1 +test regexpComp-24.8 {regexp command compiling tests} { + evalInProc { + set re foo + regexp -- $re dogfod + } +} 0 +test regexpComp-24.9 {regexp command compiling tests} { + evalInProc { + set re "(" + list [catch {regexp -- $re dogfod} msg] $msg + } +} {1 {couldn't compile regular expression pattern: parentheses () not balanced}} +test regexpComp-24.10 {regexp command compiling tests} { + # Bug 1902436 - last * escaped + evalInProc { + set text {this is *bold* !} + set re {\*bold\*} + regexp -- $re $text + } +} 1 +test regexpComp-24.11 {regexp command compiling tests} { + # Bug 1902436 - last * escaped + evalInProc { + set text {this is *bold* !} + set re {\*bold\*.*!} + regexp -- $re $text + } +} 1 + +test regexp-25.1 {Repeat on escaped char} { + regexp {\x41\x42*} bc +} 0 + +testreport diff --git a/tests/utf8.test b/tests/utf8.test index ba39128..ecb5111 100644 --- a/tests/utf8.test +++ b/tests/utf8.test @@ -121,212 +121,7 @@ test utf8-7.1 "string reverse" { string reverse \ub5Test\u2702 } \u2702tseT\ub5 -# This file contains a collection of tests for tclUtf.c -# Sourcing this file into Tcl runs the tests and generates output for -# errors. No output means no errors were found. -# -# Copyright (c) 1997 Sun Microsystems, Inc. -# Copyright (c) 1998-1999 by Scriptics Corporation. -# -# See the file "license.terms" for information on usage and redistribution -# of this file, and for a DISCLAIMER OF ALL WARRANTIES. -# -# RCS: @(#) $Id: utf.test,v 1.7 2000/04/10 17:19:05 ericm Exp $ - -catch {unset x} - -section "utf tests" - -test utf-1.1 {Tcl_UniCharToUtf: 1 byte sequences} { - set x \u01 -} [bytestring "\x01"] -test utf-1.2 {Tcl_UniCharToUtf: 2 byte sequences} { - set x "\u80" -} [bytestring "\xc2\x80"] -test utf-1.3 {Tcl_UniCharToUtf: 2 byte sequences} { - set x "\ue0" -} [bytestring "\xc3\xa0"] -test utf-1.4 {Tcl_UniCharToUtf: 3 byte sequences} { - set x "\u4e4e" -} [bytestring "\xe4\xb9\x8e"] - -test utf-2.1 {Tcl_UtfToUniChar: low ascii} { - string length "abc" -} {3} -test utf-2.4 {Tcl_UtfToUniChar: lead (2-byte) followed by trail} { - string length [bytestring "\xC2\xa2"] -} {1} -test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} { - string length [bytestring "\xE4\xb9\x8e"] -} {1} - -test utf-3.1 {Tcl_UtfCharComplete} { -} {} - -test utf-4.1 {Tcl_NumUtfChars: zero length} { - string length "" -} {0} -test utf-4.2 {Tcl_NumUtfChars: length 1} { - string length [bytestring "\xC2\xA2"] -} {1} -test utf-4.3 {Tcl_NumUtfChars: long string} { - string length [bytestring "abc\xC2\xA2\xe4\xb9\x8e\uA2\u4e4e"] -} {7} - -test utf-5.1 {Tcl_UtfFindFirsts} { -} {} - -test utf-6.1 {Tcl_UtfNext} { -} {} - -test utf-7.1 {Tcl_UtfPrev} { -} {} - -test utf-8.1 {Tcl_UniCharAtIndex: index = 0} { - string index abcd 0 -} {a} -test utf-8.2 {Tcl_UniCharAtIndex: index = 0} { - string index \u4e4e\u25a 0 -} "\u4e4e" -test utf-8.3 {Tcl_UniCharAtIndex: index > 0} { - string index abcd 2 -} {c} -test utf-8.4 {Tcl_UniCharAtIndex: index > 0} { - string index \u4e4e\u25a\uff\u543 2 -} "\uff" - -test utf-9.1 {Tcl_UtfAtIndex: index = 0} { - string range abcd 0 2 -} {abc} -test utf-9.2 {Tcl_UtfAtIndex: index > 0} { - string range \u4e4e\u25a\uff\u543klmnop 1 5 -} "\u25a\uff\u543kl" - - -test utf-10.1 {Tcl_UtfBackslash: dst == NULL} { - set x \n -} { -} -test utf-10.2 {Tcl_UtfBackslash: \u subst} { - set x \ua2 -} [bytestring "\xc2\xa2"] -test utf-10.3 {Tcl_UtfBackslash: longer \u subst} { - set x \u4e21 -} [bytestring "\xe4\xb8\xa1"] -test utf-10.4 {Tcl_UtfBackslash: stops at first non-hex} { - set x \u4e2k -} "[bytestring \xd3\xa2]k" -test utf-10.5 {Tcl_UtfBackslash: stops after 4 hex chars} { - set x \u4e216 -} "[bytestring \xe4\xb8\xa1]6" -proc bsCheck {char num} { - global errNum - test utf-10.$errNum "backslash substitution ($num)" { - scan $char %c value - set value - } $num - incr errNum -} -set errNum 6 -bsCheck \b 8 -bsCheck \e 101 -bsCheck \f 12 -bsCheck \n 10 -bsCheck \r 13 -bsCheck \t 9 -bsCheck \v 11 -bsCheck \{ 123 -bsCheck \} 125 -bsCheck \[ 91 -bsCheck \] 93 -bsCheck \$ 36 -bsCheck \ 32 -bsCheck \; 59 -bsCheck \\ 92 -bsCheck \Ca 67 -bsCheck \Ma 77 -bsCheck \CMa 67 -# prior to 8.3, this returned 8, as \8 as accepted as an -# octal value - but it isn't! [Bug: 3975] -bsCheck \8a 56 -bsCheck \14 12 -bsCheck \141 97 -bsCheck b\0 98 -bsCheck \x 120 -bsCheck \xa 10 -bsCheck \xA 10 -bsCheck \x41 65 -#bsCheck \x541 65 -bsCheck \u 117 -bsCheck \uk 117 -bsCheck \u41 65 -bsCheck \ua 10 -bsCheck \uA 10 -bsCheck \ue0 224 -bsCheck \ua1 161 -bsCheck \u4e21 20001 - -test utf-11.1 {Tcl_UtfToUpper} { - string toupper {} -} {} -test utf-11.2 {Tcl_UtfToUpper} { - string toupper abc -} ABC -test utf-11.3 {Tcl_UtfToUpper} { - string toupper \u00e3ab -} \u00c3AB -test utf-11.4 {Tcl_UtfToUpper} { - string toupper \u01e3ab -} \u01e2AB - -test utf-12.1 {Tcl_UtfToLower} { - string tolower {} -} {} -test utf-12.2 {Tcl_UtfToLower} { - string tolower ABC -} abc -test utf-12.3 {Tcl_UtfToLower} { - string tolower \u00c3AB -} \u00e3ab -test utf-12.4 {Tcl_UtfToLower} { - string tolower \u01e2AB -} \u01e3ab - - -test utf-14.1 {Tcl_UtfNcasecmp} { - string compare -nocase a b -} -1 -test utf-14.2 {Tcl_UtfNcasecmp} { - string compare -nocase b a -} 1 -test utf-14.3 {Tcl_UtfNcasecmp} { - string compare -nocase B a -} 1 -test utf-14.4 {Tcl_UtfNcasecmp} { - string compare -nocase aBcB abca -} 1 - -test utf-15.1 {Tcl_UniCharToUpper, negative delta} { - string toupper aA -} AA -test utf-15.2 {Tcl_UniCharToUpper, positive delta} { - string toupper \u0178\u00ff -} \u0178\u0178 -test utf-15.3 {Tcl_UniCharToUpper, no delta} { - string toupper ! -} ! - -test utf-16.1 {Tcl_UniCharToLower, negative delta} { - string tolower aA -} aa -test utf-16.2 {Tcl_UniCharToLower, positive delta} { - string tolower \u0178\u00ff -} \u00ff\u00ff -test utf-17.1 {Tcl_UniCharToLower, no delta} { - string tolower ! -} ! - -test utf-18.1 {append counts correctly} { +test utf8-7.2 {append counts correctly} { set x \u2702XYZ append x \u2702XYZ list [string length $x] [string bytelength $x] diff --git a/tests/utftcl.test b/tests/utftcl.test new file mode 100644 index 0000000..468cff1 --- /dev/null +++ b/tests/utftcl.test @@ -0,0 +1,290 @@ +# This file contains a collection of tests for tclUtf.c +# Sourcing this file into Tcl runs the tests and generates output for +# errors. No output means no errors were found. +# +# Copyright (c) 1997 Sun Microsystems, Inc. +# Copyright (c) 1998-1999 by Scriptics Corporation. +# +# See the file "license.terms" for information on usage and redistribution +# of this file, and for a DISCLAIMER OF ALL WARRANTIES. +# +# RCS: @(#) $Id: utf.test,v 1.14 2007/05/02 01:37:28 kennykb Exp $ + +source testing.tcl + +ifutf8 { + +catch {unset x} + +test utf-1.1 {Tcl_UniCharToUtf: 1 byte sequences} { + set x \x01 +} [bytestring "\x01"] +test utf-1.2 {Tcl_UniCharToUtf: 2 byte sequences} { + set x "\u80" +} [bytestring "\xc2\x80"] +test utf-1.3 {Tcl_UniCharToUtf: 2 byte sequences} { + set x "\ue0" +} [bytestring "\xc3\xa0"] +test utf-1.4 {Tcl_UniCharToUtf: 3 byte sequences} { + set x "\u4e4e" +} [bytestring "\xe4\xb9\x8e"] +test utf-1.5 {Tcl_UniCharToUtf: negative Tcl_UniChar} { + string length [format %c -1] +} 1 + +test utf-2.1 {Tcl_UtfToUniChar: low ascii} { + string length "abc" +} {3} +test utf-2.2 {Tcl_UtfToUniChar: naked trail bytes} { + string length [bytestring "\x82\x83\x84"] +} {3} +test utf-2.3 {Tcl_UtfToUniChar: lead (2-byte) followed by non-trail} { + string length [bytestring "\xC2"] +} {1} +test utf-2.4 {Tcl_UtfToUniChar: lead (2-byte) followed by trail} { + string length [bytestring "\xC2\xa2"] +} {1} +test utf-2.5 {Tcl_UtfToUniChar: lead (3-byte) followed by non-trail} { + string length [bytestring "\xE2"] +} {1} +test utf-2.6 {Tcl_UtfToUniChar: lead (3-byte) followed by 1 trail} { + string length [bytestring "\xE2\xA2"] +} {2} +test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} { + string length [bytestring "\xE4\xb9\x8e"] +} {1} +test utf-2.8 {Tcl_UtfToUniChar: longer UTF sequences not supported} { + string length [bytestring "\xF4\xA2\xA2\xA2"] +} {4} + +test utf-3.1 {Tcl_UtfCharComplete} { +} {} + +proc testnumutfchars {a {n ""}} { + string length $a +} + +test utf-4.1 {Tcl_NumUtfChars: zero length} { + testnumutfchars "" +} {0} +test utf-4.2 {Tcl_NumUtfChars: length 1} { + testnumutfchars [bytestring "\xC2\xA2"] +} {1} +test utf-4.3 {Tcl_NumUtfChars: long string} { + testnumutfchars [bytestring "abc\xC2\xA2\xe4\xb9\x8e\uA2\u4e4e"] +} {7} +test utf-4.4 {Tcl_NumUtfChars: #u0000} { + testnumutfchars [bytestring "\xC0\x80"] +} {1} +test utf-4.5 {Tcl_NumUtfChars: zero length, calc len} { + testnumutfchars "" 1 +} {0} +test utf-4.6 {Tcl_NumUtfChars: length 1, calc len} { + testnumutfchars [bytestring "\xC2\xA2"] 1 +} {1} +test utf-4.7 {Tcl_NumUtfChars: long string, calc len} { + testnumutfchars [bytestring "abc\xC2\xA2\xe4\xb9\x8e\uA2\u4e4e"] 1 +} {7} +test utf-4.8 {Tcl_NumUtfChars: #u0000, calc len} { + testnumutfchars [bytestring "\xC0\x80"] 1 +} {1} + +test utf-5.1 {Tcl_UtfFindFirsts} { +} {} + +test utf-6.1 {Tcl_UtfNext} { +} {} + +test utf-7.1 {Tcl_UtfPrev} { +} {} + +test utf-8.1 {Tcl_UniCharAtIndex: index = 0} { + string index abcd 0 +} {a} +test utf-8.2 {Tcl_UniCharAtIndex: index = 0} { + string index \u4e4e\u25a 0 +} "\u4e4e" +test utf-8.3 {Tcl_UniCharAtIndex: index > 0} { + string index abcd 2 +} {c} +test utf-8.4 {Tcl_UniCharAtIndex: index > 0} { + string index \u4e4e\u25a\uff\u543 2 +} "\uff" + +test utf-9.1 {Tcl_UtfAtIndex: index = 0} { + string range abcd 0 2 +} {abc} +test utf-9.2 {Tcl_UtfAtIndex: index > 0} { + string range \u4e4e\u25a\xff\u543klmnop 1 5 +} "\u25a\xff\u543kl" + + +test utf-10.1 {Tcl_UtfBackslash: dst == NULL} { + set x \n +} { +} +test utf-10.2 {Tcl_UtfBackslash: \u subst} { + set x \ua2 +} [bytestring "\xc2\xa2"] +test utf-10.3 {Tcl_UtfBackslash: longer \u subst} { + set x \u4e21 +} [bytestring "\xe4\xb8\xa1"] +test utf-10.4 {Tcl_UtfBackslash: stops at first non-hex} { + set x \u4e2k +} "[bytestring \xd3\xa2]k" +test utf-10.5 {Tcl_UtfBackslash: stops after 4 hex chars} { + set x \u4e216 +} "[bytestring \xe4\xb8\xa1]6" +proc bsCheck {char num} { + global errNum + test utf-10.$errNum {backslash substitution} { + scan $char %c value + set value + } $num + incr errNum +} +set errNum 6 +bsCheck \b 8 +bsCheck \e 101 +bsCheck \f 12 +bsCheck \n 10 +bsCheck \r 13 +bsCheck \t 9 +bsCheck \v 11 +bsCheck \{ 123 +bsCheck \} 125 +bsCheck \[ 91 +bsCheck \] 93 +bsCheck \$ 36 +bsCheck \ 32 +bsCheck \; 59 +bsCheck \\ 92 +bsCheck \Ca 67 +bsCheck \Ma 77 +bsCheck \CMa 67 +# prior to 8.3, this returned 8, as \8 as accepted as an +# octal value - but it isn't! [Bug: 3975] +bsCheck \8a 56 +bsCheck \14 12 +bsCheck \141 97 +bsCheck b\0 98 +bsCheck \x 120 +bsCheck \ua 10 +bsCheck \uA 10 +bsCheck \u41 65 +bsCheck \u 117 +bsCheck \uk 117 +bsCheck \u41 65 +bsCheck \ua 10 +bsCheck \uA 10 +bsCheck \340 224 +bsCheck \ua1 161 +bsCheck \u4e21 20001 + +test utf-11.1 {Tcl_UtfToUpper} { + string toupper {} +} {} +test utf-11.2 {Tcl_UtfToUpper} { + string toupper abc +} ABC +test utf-11.3 {Tcl_UtfToUpper} { + string toupper \u00e3ab +} \u00c3AB +test utf-11.4 {Tcl_UtfToUpper} { + string toupper \u01e3ab +} \u01e2AB + +test utf-12.1 {Tcl_UtfToLower} { + string tolower {} +} {} +test utf-12.2 {Tcl_UtfToLower} { + string tolower ABC +} abc +test utf-12.3 {Tcl_UtfToLower} { + string tolower \u00c3AB +} \u00e3ab +test utf-12.4 {Tcl_UtfToLower} { + string tolower \u01e2AB +} \u01e3ab + + +test utf-14.1 {Tcl_UtfNcasecmp} { + string compare -nocase a b +} -1 +test utf-14.2 {Tcl_UtfNcasecmp} { + string compare -nocase b a +} 1 +test utf-14.3 {Tcl_UtfNcasecmp} { + string compare -nocase B a +} 1 +test utf-14.4 {Tcl_UtfNcasecmp} { + string compare -nocase aBcB abca +} 1 + +test utf-15.1 {Tcl_UniCharToUpper, negative delta} { + string toupper aA +} AA +test utf-15.2 {Tcl_UniCharToUpper, positive delta} { + string toupper \u0178\u00ff +} \u0178\u0178 +test utf-15.3 {Tcl_UniCharToUpper, no delta} { + string toupper ! +} ! + +test utf-16.1 {Tcl_UniCharToLower, negative delta} { + string tolower aA +} aa +test utf-16.2 {Tcl_UniCharToLower, positive delta} { + string tolower \u0178\u00ff +} \u00ff\u00ff +test utf-17.1 {Tcl_UniCharToLower, no delta} { + string tolower ! +} ! + + +#test utf-21.1 {TclUniCharIsAlnum} { +# # this returns 1 with Unicode 3 compliance +# string is alnum \u1040\u021f +#} {1} +#test utf-21.2 {unicode alnum char in regc_locale.c} { +# # this returns 1 with Unicode 3 compliance +# list [regexp {^[[:alnum:]]+$} \u1040\u021f] [regexp {^\w+$} \u1040\u021f] +#} {1 1} + +#test utf-22.1 {TclUniCharIsWordChar} { +# string wordend "xyz123_bar fg" 0 +#} 10 +#test utf-22.2 {TclUniCharIsWordChar} { +# string wordend "x\u5080z123_bar\u203c fg" 0 +#} 10 + +#test utf-23.1 {TclUniCharIsAlpha} { +# # this returns 1 with Unicode 3 compliance +# string is alpha \u021f +#} {1} +#test utf-23.2 {unicode alpha char in regc_locale.c} { +# # this returns 1 with Unicode 3 compliance +# regexp {^[[:alpha:]]+$} \u021f +#} {1} +# +#test utf-24.1 {TclUniCharIsDigit} { +# # this returns 1 with Unicode 3 compliance +# string is digit \u1040 +#} {1} +#test utf-24.2 {unicode digit char in regc_locale.c} { +# # this returns 1 with Unicode 3 compliance +# list [regexp {^[[:digit:]]+$} \u1040] [regexp {^\d+$} \u1040] +#} {1 1} +# +#test utf-24.3 {TclUniCharIsSpace} { +# # this returns 1 with Unicode 3 compliance +# string is space \u1680 +#} {1} +#test utf-24.4 {unicode space char in regc_locale.c} { +# # this returns 1 with Unicode 3 compliance +# list [regexp {^[[:space:]]+$} \u1680] [regexp {^\s+$} \u1680] +#} {1 1} + +testreport + +} |