aboutsummaryrefslogtreecommitdiff
path: root/posix
diff options
context:
space:
mode:
authorUlrich Drepper <drepper@redhat.com>2001-02-02 08:47:28 +0000
committerUlrich Drepper <drepper@redhat.com>2001-02-02 08:47:28 +0000
commite4c785c8d771f50c4c7a059b545976301bd2508c (patch)
treeef058109dcbcefdf4d75c08d6661f1e5874deadc /posix
parent4b1fef8482da6863711fdb64f06413f06424dad7 (diff)
downloadglibc-e4c785c8d771f50c4c7a059b545976301bd2508c.zip
glibc-e4c785c8d771f50c4c7a059b545976301bd2508c.tar.gz
glibc-e4c785c8d771f50c4c7a059b545976301bd2508c.tar.bz2
Update.
* posix/regex.c: Implement multibyte character handling. Patch by Isamu Hasegawa <isamu@yamato.ibm.co.jp>.
Diffstat (limited to 'posix')
-rw-r--r--posix/regex.c2115
1 files changed, 1850 insertions, 265 deletions
diff --git a/posix/regex.c b/posix/regex.c
index 440194a..6a8c899 100644
--- a/posix/regex.c
+++ b/posix/regex.c
@@ -2,7 +2,7 @@
version 0.12.
(Implements POSIX draft P1003.2/D11.2, except for some of the
internationalization features.)
- Copyright (C) 1993-1999, 2000 Free Software Foundation, Inc.
+ Copyright (C) 1993-1999, 2000, 2001 Free Software Foundation, Inc.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
@@ -56,6 +56,23 @@
# include <wctype.h>
#endif
+/* This is for multi byte string support. */
+#ifdef MBS_SUPPORT
+# define CHAR_TYPE wchar_t
+# define US_CHAR_TYPE wchar_t/* unsigned character type */
+# define COMPILED_BUFFER_VAR wc_buffer
+# define OFFSET_ADDRESS_SIZE 1 /* the size which STORE_NUMBER macro use */
+# define PUT_CHAR(c) printf ("%C", c) /* Should we use wide stream?? */
+# define TRUE 1
+# define FALSE 0
+#else
+# define CHAR_TYPE char
+# define US_CHAR_TYPE unsigned char /* unsigned character type */
+# define COMPILED_BUFFER_VAR bufp->buffer
+# define OFFSET_ADDRESS_SIZE 2
+# define PUT_CHAR(c) putchar (c)
+#endif /* MBS_SUPPORT */
+
#ifdef _LIBC
/* We have to keep the namespace clean. */
# define regfree(preg) __regfree (preg)
@@ -84,6 +101,7 @@
# include <locale/localeinfo.h>
# include <locale/elem-hash.h>
# include <langinfo.h>
+# include <locale/coll-lookup.h>
#endif
/* This is for other GNU distributions with internationalized messages. */
@@ -414,6 +432,11 @@ typedef enum
/* Followed by one byte giving n, then by n literal bytes. */
exactn,
+#ifdef MBS_SUPPORT
+ /* Same as exactn, but contains binary data. */
+ exactn_bin,
+#endif
+
/* Matches any (more or less) character. */
anychar,
@@ -423,6 +446,13 @@ typedef enum
are ordered low-bit-first. A character is in the set if its
bit is 1. A character too large to have a bit in the map is
automatically not in the set. */
+ /* ifdef MBS_SUPPORT, following element is length of character
+ classes, length of collating symbols, length of equivalence
+ classes, length of character ranges, and length of characters.
+ Next, character class element, collating symbols elements,
+ equivalence class elements, range elements, and character
+ elements follow.
+ See regex_compile function. */
charset,
/* Same parameters as charset, but match any character that is
@@ -472,6 +502,7 @@ typedef enum
/* Followed by two-byte relative address of place to resume at
in case of failure. */
+ /* ifdef MBS_SUPPORT, the size of address is 1. */
on_failure_jump,
/* Like on_failure_jump, but pushes a placeholder instead of the
@@ -480,6 +511,7 @@ typedef enum
/* Throw away latest failure point and then jump to following
two-byte relative address. */
+ /* ifdef MBS_SUPPORT, the size of address is 1. */
pop_failure_jump,
/* Change to pop_failure_jump if know won't have to backtrack to
@@ -489,6 +521,7 @@ typedef enum
sure that there is no use backtracking out of repetitions
already matched, then we change it to a pop_failure_jump.
Followed by two-byte address. */
+ /* ifdef MBS_SUPPORT, the size of address is 1. */
maybe_pop_jump,
/* Jump to following two-byte address, and push a dummy failure
@@ -496,6 +529,7 @@ typedef enum
is made to use it for a failure. A `+' construct makes this
before the first repeat. Also used as an intermediary kind
of jump when compiling an alternative. */
+ /* ifdef MBS_SUPPORT, the size of address is 1. */
dummy_failure_jump,
/* Push a dummy failure point and continue. Used at the end of
@@ -504,15 +538,18 @@ typedef enum
/* Followed by two-byte relative address and two-byte number n.
After matching N times, jump to the address upon failure. */
+ /* ifdef MBS_SUPPORT, the size of address is 1. */
succeed_n,
/* Followed by two-byte relative address, and two-byte number n.
Jump to the address N times, then fail. */
+ /* ifdef MBS_SUPPORT, the size of address is 1. */
jump_n,
/* Set the following two-byte relative address to the
subsequent two-byte number. The address *includes* the two
bytes of number. */
+ /* ifdef MBS_SUPPORT, the size of address is 1. */
set_number_at,
wordchar, /* Matches any word-constituent character. */
@@ -541,42 +578,63 @@ typedef enum
/* Common operations on the compiled pattern. */
/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
+/* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
-#define STORE_NUMBER(destination, number) \
+#ifdef MBS_SUPPORT
+# define STORE_NUMBER(destination, number) \
+ do { \
+ *(destination) = (US_CHAR_TYPE)(number); \
+ } while (0)
+#else
+# define STORE_NUMBER(destination, number) \
do { \
(destination)[0] = (number) & 0377; \
(destination)[1] = (number) >> 8; \
} while (0)
+#endif /* MBS_SUPPORT */
/* Same as STORE_NUMBER, except increment DESTINATION to
the byte after where the number is stored. Therefore, DESTINATION
must be an lvalue. */
+/* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
#define STORE_NUMBER_AND_INCR(destination, number) \
do { \
STORE_NUMBER (destination, number); \
- (destination) += 2; \
+ (destination) += OFFSET_ADDRESS_SIZE; \
} while (0)
/* Put into DESTINATION a number stored in two contiguous bytes starting
at SOURCE. */
+/* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
-#define EXTRACT_NUMBER(destination, source) \
+#ifdef MBS_SUPPORT
+# define EXTRACT_NUMBER(destination, source) \
+ do { \
+ (destination) = *(source); \
+ } while (0)
+#else
+# define EXTRACT_NUMBER(destination, source) \
do { \
(destination) = *(source) & 0377; \
(destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
} while (0)
+#endif
#ifdef DEBUG
-static void extract_number _RE_ARGS ((int *dest, unsigned char *source));
+static void extract_number _RE_ARGS ((int *dest, US_CHAR_TYPE *source));
static void
extract_number (dest, source)
int *dest;
- unsigned char *source;
+ US_CHAR_TYPE *source;
{
+#ifdef MBS_SUPPORT
+ *dest = *source;
+#else
int temp = SIGN_EXTEND_CHAR (*(source + 1));
*dest = *source & 0377;
*dest += temp << 8;
+#endif
}
# ifndef EXTRACT_MACROS /* To debug the macros. */
@@ -592,19 +650,19 @@ extract_number (dest, source)
#define EXTRACT_NUMBER_AND_INCR(destination, source) \
do { \
EXTRACT_NUMBER (destination, source); \
- (source) += 2; \
+ (source) += OFFSET_ADDRESS_SIZE; \
} while (0)
#ifdef DEBUG
static void extract_number_and_incr _RE_ARGS ((int *destination,
- unsigned char **source));
+ US_CHAR_TYPE **source));
static void
extract_number_and_incr (destination, source)
int *destination;
- unsigned char **source;
+ US_CHAR_TYPE **source;
{
extract_number (destination, *source);
- *source += 2;
+ *source += OFFSET_ADDRESS_SIZE;
}
# ifndef EXTRACT_MACROS
@@ -678,13 +736,13 @@ print_fastmap (fastmap)
void
print_partial_compiled_pattern (start, end)
- unsigned char *start;
- unsigned char *end;
+ US_CHAR_TYPE *start;
+ US_CHAR_TYPE *end;
{
int mcnt, mcnt2;
- unsigned char *p1;
- unsigned char *p = start;
- unsigned char *pend = end;
+ US_CHAR_TYPE *p1;
+ US_CHAR_TYPE *p = start;
+ US_CHAR_TYPE *pend = end;
if (start == NULL)
{
@@ -713,11 +771,23 @@ print_partial_compiled_pattern (start, end)
do
{
putchar ('/');
- putchar (*p++);
+ PUT_CHAR (*p++);
}
while (--mcnt);
break;
+#ifdef MBS_SUPPORT
+ case exactn_bin:
+ mcnt = *p++;
+ printf ("/exactn_bin/%d", mcnt);
+ do
+ {
+ printf("/%x", *p++);
+ }
+ while (--mcnt);
+ break;
+#endif /* MBS_SUPPORT */
+
case start_memory:
mcnt = *p++;
printf ("/start_memory/%d/%d", mcnt, *p++);
@@ -739,6 +809,45 @@ print_partial_compiled_pattern (start, end)
case charset:
case charset_not:
{
+#ifdef MBS_SUPPORT
+ int i, length;
+ wchar_t *workp = p;
+ printf ("/charset [%s",
+ (re_opcode_t) *(workp - 1) == charset_not ? "^" : "");
+ p += 5;
+ length = *workp++; /* the length of char_classes */
+ for (i=0 ; i<length ; i++)
+ printf("[:%x:]", *p++);
+ length = *workp++; /* the length of collating_symbol */
+ for (i=0 ; i<length ;)
+ {
+ printf("[.");
+ while(*p != 0)
+ PUT_CHAR((i++,*p++));
+ i++,p++;
+ printf(".]");
+ }
+ length = *workp++; /* the length of equivalence_class */
+ for (i=0 ; i<length ;)
+ {
+ printf("[=");
+ while(*p != 0)
+ PUT_CHAR((i++,*p++));
+ i++,p++;
+ printf("=]");
+ }
+ length = *workp++; /* the length of char_range */
+ for (i=0 ; i<length ; i++)
+ {
+ wchar_t range_start = *p++;
+ wchar_t range_end = *p++;
+ printf("%C-%C", range_start, range_end);
+ }
+ length = *workp++; /* the length of char */
+ for (i=0 ; i<length ; i++)
+ printf("%C", *p++);
+ putchar (']');
+#else
register int c, last = -100;
register int in_range = 0;
@@ -776,6 +885,7 @@ print_partial_compiled_pattern (start, end)
putchar (']');
p += 1 + *p;
+#endif /* MBS_SUPPORT */
}
break;
@@ -900,6 +1010,7 @@ print_partial_compiled_pattern (start, end)
case wordend:
printf ("/wordend");
+ break;
# ifdef emacs
case before_dot:
@@ -962,9 +1073,10 @@ void
print_compiled_pattern (bufp)
struct re_pattern_buffer *bufp;
{
- unsigned char *buffer = bufp->buffer;
+ US_CHAR_TYPE *buffer = (US_CHAR_TYPE*) bufp->buffer;
- print_partial_compiled_pattern (buffer, buffer + bufp->used);
+ print_partial_compiled_pattern (buffer, buffer
+ + bufp->used / sizeof(US_CHAR_TYPE));
printf ("%ld bytes used/%ld bytes allocated.\n",
bufp->used, bufp->allocated);
@@ -992,9 +1104,9 @@ print_compiled_pattern (bufp)
void
print_double_string (where, string1, size1, string2, size2)
- const char *where;
- const char *string1;
- const char *string2;
+ const CHAR_TYPE *where;
+ const CHAR_TYPE *string1;
+ const CHAR_TYPE *string2;
int size1;
int size2;
{
@@ -1007,13 +1119,13 @@ print_double_string (where, string1, size1, string2, size2)
if (FIRST_STRING_P (where))
{
for (this_char = where - string1; this_char < size1; this_char++)
- putchar (string1[this_char]);
+ PUT_CHAR (string1[this_char]);
where = string2;
}
for (this_char = where - string2; this_char < size2; this_char++)
- putchar (string2[this_char]);
+ PUT_CHAR (string2[this_char]);
}
}
@@ -1039,6 +1151,88 @@ printchar (c)
#endif /* not DEBUG */
+#ifdef MBS_SUPPORT
+/* This convert a multibyte string to a wide character string.
+ And write their correspondances to offset_buffer(see below)
+ and write whether each wchar_t is binary data to is_binary.
+ This assume invalid multibyte sequences as binary data.
+ We assume offset_buffer and is_binary is already allocated
+ enough space. */
+size_t
+convert_mbs_to_wcs (dest, src, len, offset_buffer, is_binary)
+ CHAR_TYPE *dest;
+ const unsigned char* src;
+ size_t len; /* the length of multibyte string. */
+
+ /* It hold correspondances between src(char string) and
+ dest(wchar_t string) for optimization.
+ e.g. src = "xxxyzz"
+ dest = {'X', 'Y', 'Z'}
+ (each "xxx", "y" and "zz" represent one multibyte character
+ corresponding to 'X', 'Y' and 'Z'.)
+ offset_buffer = {0, 0+3("xxx"), 0+3+1("y"), 0+3+1+2("zz")}
+ = {0, 3, 4, 6}
+ */
+ int *offset_buffer;
+ int *is_binary;
+{
+ wchar_t *pdest = dest;
+ const unsigned char *psrc = src;
+ size_t wc_count = 0;
+
+ if (MB_CUR_MAX == 1)
+ { /* We don't need conversion. */
+ for ( ; wc_count < len ; ++wc_count)
+ {
+ *pdest++ = *psrc++;
+ is_binary[wc_count] = FALSE;
+ offset_buffer[wc_count] = wc_count;
+ }
+ offset_buffer[wc_count] = wc_count;
+ }
+ else
+ {
+ /* We need conversion. */
+ mbstate_t mbs;
+ int consumed;
+ size_t mb_remain = len;
+ size_t mb_count = 0;
+
+ /* Initialize the conversion state. */
+ memset (&mbs, 0, sizeof (mbstate_t));
+
+ offset_buffer[0] = 0;
+ for( ; mb_remain > 0 ; ++wc_count, ++pdest, mb_remain -= consumed,
+ psrc += consumed)
+ {
+ consumed = mbrtowc (pdest, psrc, mb_remain, &mbs);
+
+ if (consumed <= 0)
+ /* failed to convert. maybe src contains binary data.
+ So we consume 1 byte manualy. */
+ {
+ *pdest = *psrc;
+ consumed = 1;
+ is_binary[wc_count] = TRUE;
+ }
+ else
+ is_binary[wc_count] = FALSE;
+ /* In sjis encoding, we use yen sign as escape character in
+ place of reverse solidus. So we convert 0x5c(yen sign in
+ sjis) to not 0xa5(yen sign in UCS2) but 0x5c(reverse
+ solidus in UCS2). */
+ if (consumed == 1 && (int) *psrc == 0x5c && (int) *pdest == 0xa5)
+ *pdest = (wchar_t) *psrc;
+
+ offset_buffer[wc_count + 1] = mb_count += consumed;
+ }
+ }
+
+ return wc_count;
+}
+
+#endif /* MBS_SUPPORT */
+
/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
also be assigned to arbitrarily: each pattern buffer stores its own
syntax, so it can be changed between regex compilations. */
@@ -1220,7 +1414,7 @@ long int re_max_failures = 2000;
union fail_stack_elt
{
- unsigned char *pointer;
+ US_CHAR_TYPE *pointer;
long int integer;
};
@@ -1245,7 +1439,7 @@ int re_max_failures = 2000;
union fail_stack_elt
{
- unsigned char *pointer;
+ US_CHAR_TYPE *pointer;
int integer;
};
@@ -1327,7 +1521,7 @@ typedef struct
Assumes the variable `fail_stack'. Probably should only
be called from within `PUSH_FAILURE_POINT'. */
#define PUSH_FAILURE_POINTER(item) \
- fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
+ fail_stack.stack[fail_stack.avail++].pointer = (US_CHAR_TYPE *) (item)
/* This pushes an integer-valued item onto the failure stack.
Assumes the variable `fail_stack'. Probably should only
@@ -1484,12 +1678,11 @@ typedef struct
Also assumes the variables `fail_stack' and (if debugging), `bufp',
`pend', `string1', `size1', `string2', and `size2'. */
-
#define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
{ \
DEBUG_STATEMENT (unsigned failure_id;) \
active_reg_t this_reg; \
- const unsigned char *string_temp; \
+ const US_CHAR_TYPE *string_temp; \
\
assert (!FAIL_STACK_EMPTY ()); \
\
@@ -1508,13 +1701,13 @@ typedef struct
saved NULL, thus retaining our current position in the string. */ \
string_temp = POP_FAILURE_POINTER (); \
if (string_temp != NULL) \
- str = (const char *) string_temp; \
+ str = (const CHAR_TYPE *) string_temp; \
\
DEBUG_PRINT2 (" Popping string %p: `", str); \
DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
DEBUG_PRINT1 ("'\n"); \
\
- pat = (unsigned char *) POP_FAILURE_POINTER (); \
+ pat = (US_CHAR_TYPE *) POP_FAILURE_POINTER (); \
DEBUG_PRINT2 (" Popping pattern %p:\n", pat); \
DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
\
@@ -1534,10 +1727,10 @@ typedef struct
DEBUG_PRINT2 (" info: %p\n", \
reg_info[this_reg].word.pointer); \
\
- regend[this_reg] = (const char *) POP_FAILURE_POINTER (); \
+ regend[this_reg] = (const CHAR_TYPE *) POP_FAILURE_POINTER (); \
DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
\
- regstart[this_reg] = (const char *) POP_FAILURE_POINTER (); \
+ regstart[this_reg] = (const CHAR_TYPE *) POP_FAILURE_POINTER ();\
DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
} \
else \
@@ -1555,7 +1748,6 @@ typedef struct
DEBUG_STATEMENT (nfailure_points_popped++); \
} /* POP_FAILURE_POINT */
-
/* Structure for per-register (a.k.a. per-group) information.
Other register information, such as the
@@ -1613,7 +1805,7 @@ typedef union
while (0)
/* Registers are set to a sentinel when they haven't yet matched. */
-static char reg_unset_dummy;
+static CHAR_TYPE reg_unset_dummy;
#define REG_UNSET_VALUE (&reg_unset_dummy)
#define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
@@ -1622,41 +1814,65 @@ static char reg_unset_dummy;
static reg_errcode_t regex_compile _RE_ARGS ((const char *pattern, size_t size,
reg_syntax_t syntax,
struct re_pattern_buffer *bufp));
-static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg));
-static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
+static void store_op1 _RE_ARGS ((re_opcode_t op, US_CHAR_TYPE *loc, int arg));
+static void store_op2 _RE_ARGS ((re_opcode_t op, US_CHAR_TYPE *loc,
int arg1, int arg2));
-static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
- int arg, unsigned char *end));
-static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
- int arg1, int arg2, unsigned char *end));
-static boolean at_begline_loc_p _RE_ARGS ((const char *pattern, const char *p,
+static void insert_op1 _RE_ARGS ((re_opcode_t op, US_CHAR_TYPE *loc,
+ int arg, US_CHAR_TYPE *end));
+static void insert_op2 _RE_ARGS ((re_opcode_t op, US_CHAR_TYPE *loc,
+ int arg1, int arg2, US_CHAR_TYPE *end));
+static boolean at_begline_loc_p _RE_ARGS ((const CHAR_TYPE *pattern,
+ const CHAR_TYPE *p,
reg_syntax_t syntax));
-static boolean at_endline_loc_p _RE_ARGS ((const char *p, const char *pend,
+static boolean at_endline_loc_p _RE_ARGS ((const CHAR_TYPE *p,
+ const CHAR_TYPE *pend,
reg_syntax_t syntax));
+#ifdef MBS_SUPPORT
+static reg_errcode_t compile_range _RE_ARGS ((CHAR_TYPE range_start,
+ const CHAR_TYPE **p_ptr,
+ const CHAR_TYPE *pend,
+ char *translate,
+ reg_syntax_t syntax,
+ US_CHAR_TYPE *b,
+ CHAR_TYPE *char_set));
+static void insert_space _RE_ARGS ((int num, CHAR_TYPE *loc, CHAR_TYPE *end));
+#else
static reg_errcode_t compile_range _RE_ARGS ((unsigned int range_start,
- const char **p_ptr,
- const char *pend,
+ const CHAR_TYPE **p_ptr,
+ const CHAR_TYPE *pend,
char *translate,
reg_syntax_t syntax,
- unsigned char *b));
+ US_CHAR_TYPE *b));
+#endif /* MBS_SUPPORT */
/* Fetch the next character in the uncompiled pattern---translating it
if necessary. Also cast from a signed character in the constant
string passed to us by the user to an unsigned char that we can use
as an array index (in, e.g., `translate'). */
+/* ifdef MBS_SUPPORT, we translate only if character <= 0xff,
+ because it is impossible to allocate 4GB array for some encodings
+ which have 4 byte character_set like UCS4. */
#ifndef PATFETCH
-# define PATFETCH(c) \
+# ifdef MBS_SUPPORT
+# define PATFETCH(c) \
+ do {if (p == pend) return REG_EEND; \
+ c = (US_CHAR_TYPE) *p++; \
+ if (translate && (c <= 0xff)) c = (US_CHAR_TYPE) translate[c]; \
+ } while (0)
+# else
+# define PATFETCH(c) \
do {if (p == pend) return REG_EEND; \
c = (unsigned char) *p++; \
if (translate) c = (unsigned char) translate[c]; \
} while (0)
+# endif /* MBS_SUPPORT */
#endif
/* Fetch the next character in the uncompiled pattern, with no
translation. */
#define PATFETCH_RAW(c) \
do {if (p == pend) return REG_EEND; \
- c = (unsigned char) *p++; \
+ c = (US_CHAR_TYPE) *p++; \
} while (0)
/* Go backwards one character in the pattern. */
@@ -1667,27 +1883,42 @@ static reg_errcode_t compile_range _RE_ARGS ((unsigned int range_start,
cast the subscript to translate because some data is declared as
`char *', to avoid warnings when a string constant is passed. But
when we use a character as a subscript we must make it unsigned. */
+/* ifdef MBS_SUPPORT, we translate only if character <= 0xff,
+ because it is impossible to allocate 4GB array for some encodings
+ which have 4 byte character_set like UCS4. */
#ifndef TRANSLATE
-# define TRANSLATE(d) \
+# ifdef MBS_SUPPORT
+# define TRANSLATE(d) \
+ (translate && (sizeof(d) <= 1)? (char) translate[(unsigned char) (d)] : (d))
+#else
+# define TRANSLATE(d) \
(translate ? (char) translate[(unsigned char) (d)] : (d))
+# endif /* MBS_SUPPORT */
#endif
/* Macros for outputting the compiled pattern into `buffer'. */
/* If the buffer isn't allocated when it comes in, use this. */
-#define INIT_BUF_SIZE 32
+#define INIT_BUF_SIZE (32 * sizeof(US_CHAR_TYPE))
/* Make sure we have at least N more bytes of space in buffer. */
-#define GET_BUFFER_SPACE(n) \
+#ifdef MBS_SUPPORT
+# define GET_BUFFER_SPACE(n) \
+ while (((unsigned long)b - (unsigned long)COMPILED_BUFFER_VAR \
+ + (n)*sizeof(CHAR_TYPE)) > bufp->allocated) \
+ EXTEND_BUFFER ()
+#else
+# define GET_BUFFER_SPACE(n) \
while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated) \
EXTEND_BUFFER ()
+#endif /* MBS_SUPPORT */
/* Make sure we have one more byte of buffer space and then add C to it. */
#define BUF_PUSH(c) \
do { \
GET_BUFFER_SPACE (1); \
- *b++ = (unsigned char) (c); \
+ *b++ = (US_CHAR_TYPE) (c); \
} while (0)
@@ -1695,8 +1926,8 @@ static reg_errcode_t compile_range _RE_ARGS ((unsigned int range_start,
#define BUF_PUSH_2(c1, c2) \
do { \
GET_BUFFER_SPACE (2); \
- *b++ = (unsigned char) (c1); \
- *b++ = (unsigned char) (c2); \
+ *b++ = (US_CHAR_TYPE) (c1); \
+ *b++ = (US_CHAR_TYPE) (c2); \
} while (0)
@@ -1704,28 +1935,28 @@ static reg_errcode_t compile_range _RE_ARGS ((unsigned int range_start,
#define BUF_PUSH_3(c1, c2, c3) \
do { \
GET_BUFFER_SPACE (3); \
- *b++ = (unsigned char) (c1); \
- *b++ = (unsigned char) (c2); \
- *b++ = (unsigned char) (c3); \
+ *b++ = (US_CHAR_TYPE) (c1); \
+ *b++ = (US_CHAR_TYPE) (c2); \
+ *b++ = (US_CHAR_TYPE) (c3); \
} while (0)
-
/* Store a jump with opcode OP at LOC to location TO. We store a
relative address offset by the three bytes the jump itself occupies. */
#define STORE_JUMP(op, loc, to) \
- store_op1 (op, loc, (int) ((to) - (loc) - 3))
+ store_op1 (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)))
/* Likewise, for a two-argument jump. */
#define STORE_JUMP2(op, loc, to, arg) \
- store_op2 (op, loc, (int) ((to) - (loc) - 3), arg)
+ store_op2 (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), arg)
/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
#define INSERT_JUMP(op, loc, to) \
- insert_op1 (op, loc, (int) ((to) - (loc) - 3), b)
+ insert_op1 (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), b)
/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
#define INSERT_JUMP2(op, loc, to, arg) \
- insert_op2 (op, loc, (int) ((to) - (loc) - 3), arg, b)
+ insert_op2 (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)),\
+ arg, b)
/* This is not an arbitrary limit: the arguments which represent offsets
@@ -1771,21 +2002,58 @@ static reg_errcode_t compile_range _RE_ARGS ((unsigned int range_start,
# define MOVE_BUFFER_POINTER(P) (P) += incr
# define ELSE_EXTEND_BUFFER_HIGH_BOUND
#endif
-#define EXTEND_BUFFER() \
+
+#ifdef MBS_SUPPORT
+# define EXTEND_BUFFER() \
+ do { \
+ US_CHAR_TYPE *old_buffer = COMPILED_BUFFER_VAR; \
+ int wchar_count; \
+ if (bufp->allocated + sizeof(US_CHAR_TYPE) > MAX_BUF_SIZE) \
+ return REG_ESIZE; \
+ bufp->allocated <<= 1; \
+ if (bufp->allocated > MAX_BUF_SIZE) \
+ bufp->allocated = MAX_BUF_SIZE; \
+ /* How many characters the new buffer can have? */ \
+ wchar_count = bufp->allocated / sizeof(US_CHAR_TYPE); \
+ if (wchar_count == 0) wchar_count = 1; \
+ /* Truncate the buffer to CHAR_TYPE align. */ \
+ bufp->allocated = wchar_count * sizeof(US_CHAR_TYPE); \
+ RETALLOC (COMPILED_BUFFER_VAR, wchar_count, US_CHAR_TYPE); \
+ bufp->buffer = (char*)COMPILED_BUFFER_VAR; \
+ if (COMPILED_BUFFER_VAR == NULL) \
+ return REG_ESPACE; \
+ /* If the buffer moved, move all the pointers into it. */ \
+ if (old_buffer != COMPILED_BUFFER_VAR) \
+ { \
+ int incr = COMPILED_BUFFER_VAR - old_buffer; \
+ MOVE_BUFFER_POINTER (b); \
+ MOVE_BUFFER_POINTER (begalt); \
+ if (fixup_alt_jump) \
+ MOVE_BUFFER_POINTER (fixup_alt_jump); \
+ if (laststart) \
+ MOVE_BUFFER_POINTER (laststart); \
+ if (pending_exact) \
+ MOVE_BUFFER_POINTER (pending_exact); \
+ } \
+ ELSE_EXTEND_BUFFER_HIGH_BOUND \
+ } while (0)
+#else
+# define EXTEND_BUFFER() \
do { \
- unsigned char *old_buffer = bufp->buffer; \
+ US_CHAR_TYPE *old_buffer = COMPILED_BUFFER_VAR; \
if (bufp->allocated == MAX_BUF_SIZE) \
return REG_ESIZE; \
bufp->allocated <<= 1; \
if (bufp->allocated > MAX_BUF_SIZE) \
bufp->allocated = MAX_BUF_SIZE; \
- bufp->buffer = (unsigned char *) REALLOC (bufp->buffer, bufp->allocated);\
- if (bufp->buffer == NULL) \
+ bufp->buffer = (US_CHAR_TYPE *) REALLOC (COMPILED_BUFFER_VAR, \
+ bufp->allocated); \
+ if (COMPILED_BUFFER_VAR == NULL) \
return REG_ESPACE; \
/* If the buffer moved, move all the pointers into it. */ \
- if (old_buffer != bufp->buffer) \
+ if (old_buffer != COMPILED_BUFFER_VAR) \
{ \
- int incr = bufp->buffer - old_buffer; \
+ int incr = COMPILED_BUFFER_VAR - old_buffer; \
MOVE_BUFFER_POINTER (b); \
MOVE_BUFFER_POINTER (begalt); \
if (fixup_alt_jump) \
@@ -1797,7 +2065,7 @@ static reg_errcode_t compile_range _RE_ARGS ((unsigned int range_start,
} \
ELSE_EXTEND_BUFFER_HIGH_BOUND \
} while (0)
-
+#endif /* MBS_SUPPORT */
/* Since we have one byte reserved for the register number argument to
{start,stop}_memory, the maximum number of groups we can report
@@ -1965,33 +2233,61 @@ static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
examined nor set. */
/* Return, freeing storage we allocated. */
-#define FREE_STACK_RETURN(value) \
+#ifdef MBS_SUPPORT
+# define FREE_STACK_RETURN(value) \
+ return (free(pattern), free(mbs_offset), free(is_binary), free (compile_stack.stack), value)
+#else
+# define FREE_STACK_RETURN(value) \
return (free (compile_stack.stack), value)
+#endif /* MBS_SUPPORT */
static reg_errcode_t
+#ifdef MBS_SUPPORT
+regex_compile (cpattern, csize, syntax, bufp)
+ const char *cpattern;
+ size_t csize;
+#else
regex_compile (pattern, size, syntax, bufp)
const char *pattern;
size_t size;
+#endif /* MBS_SUPPORT */
reg_syntax_t syntax;
struct re_pattern_buffer *bufp;
{
/* We fetch characters from PATTERN here. Even though PATTERN is
`char *' (i.e., signed), we declare these variables as unsigned, so
they can be reliably used as array indices. */
- register unsigned char c, c1;
+ register US_CHAR_TYPE c, c1;
+
+#ifdef MBS_SUPPORT
+ /* A temporary space to keep wchar_t pattern and compiled pattern. */
+ CHAR_TYPE *pattern, *COMPILED_BUFFER_VAR;
+ size_t size;
+ /* offset buffer for optimizatoin. See convert_mbs_to_wc. */
+ int *mbs_offset = NULL;
+ /* It hold whether each wchar_t is binary data or not. */
+ int *is_binary = NULL;
+ /* A flag whether exactn is handling binary data or not. */
+ int is_exactn_bin = FALSE;
+#endif /* MBS_SUPPORT */
/* A random temporary spot in PATTERN. */
- const char *p1;
+ const CHAR_TYPE *p1;
/* Points to the end of the buffer, where we should append. */
- register unsigned char *b;
+ register US_CHAR_TYPE *b;
/* Keeps track of unclosed groups. */
compile_stack_type compile_stack;
/* Points to the current (ending) position in the pattern. */
- const char *p = pattern;
- const char *pend = pattern + size;
+#ifdef MBS_SUPPORT
+ const CHAR_TYPE *p;
+ const CHAR_TYPE *pend;
+#else
+ const CHAR_TYPE *p = pattern;
+ const CHAR_TYPE *pend = pattern + size;
+#endif /* MBS_SUPPORT */
/* How to translate the characters in the pattern. */
RE_TRANSLATE_TYPE translate = bufp->translate;
@@ -2000,30 +2296,57 @@ regex_compile (pattern, size, syntax, bufp)
command. This makes it possible to tell if a new exact-match
character can be added to that command or if the character requires
a new `exactn' command. */
- unsigned char *pending_exact = 0;
+ US_CHAR_TYPE *pending_exact = 0;
/* Address of start of the most recently finished expression.
This tells, e.g., postfix * where to find the start of its
operand. Reset at the beginning of groups and alternatives. */
- unsigned char *laststart = 0;
+ US_CHAR_TYPE *laststart = 0;
/* Address of beginning of regexp, or inside of last group. */
- unsigned char *begalt;
+ US_CHAR_TYPE *begalt;
/* Place in the uncompiled pattern (i.e., the {) to
which to go back if the interval is invalid. */
+#ifdef MBS_SUPPORT
+ const US_CHAR_TYPE *beg_interval;
+#else
const char *beg_interval;
+#endif /* MBS_SUPPORT */
/* Address of the place where a forward jump should go to the end of
the containing expression. Each alternative of an `or' -- except the
last -- ends with a forward jump of this sort. */
- unsigned char *fixup_alt_jump = 0;
+ US_CHAR_TYPE *fixup_alt_jump = 0;
/* Counts open-groups as they are encountered. Remembered for the
matching close-group on the compile stack, so the same register
number is put in the stop_memory as the start_memory. */
regnum_t regnum = 0;
+#ifdef MBS_SUPPORT
+ /* Initialize the wchar_t PATTERN and offset_buffer. */
+ p = pend = pattern = TALLOC(csize, CHAR_TYPE);
+ mbs_offset = TALLOC(csize + 1, int);
+ is_binary = TALLOC(csize + 1, int);
+ if (pattern == NULL || mbs_offset == NULL || is_binary == NULL)
+ {
+ if (pattern) free(pattern);
+ if (mbs_offset) free(mbs_offset);
+ if (is_binary) free(is_binary);
+ return REG_ESPACE;
+ }
+ size = convert_mbs_to_wcs(pattern, cpattern, csize, mbs_offset, is_binary);
+ pend = p + size;
+ if (size < 0)
+ {
+ if (pattern) free(pattern);
+ if (mbs_offset) free(mbs_offset);
+ if (is_binary) free(is_binary);
+ return REG_BADPAT;
+ }
+#endif
+
#ifdef DEBUG
DEBUG_PRINT1 ("\nCompiling pattern: ");
if (debug)
@@ -2031,7 +2354,7 @@ regex_compile (pattern, size, syntax, bufp)
unsigned debug_count;
for (debug_count = 0; debug_count < size; debug_count++)
- putchar (pattern[debug_count]);
+ PUT_CHAR (pattern[debug_count]);
putchar ('\n');
}
#endif /* DEBUG */
@@ -2039,7 +2362,14 @@ regex_compile (pattern, size, syntax, bufp)
/* Initialize the compile stack. */
compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
if (compile_stack.stack == NULL)
- return REG_ESPACE;
+ {
+#ifdef MBS_SUPPORT
+ if (pattern) free(pattern);
+ if (mbs_offset) free(mbs_offset);
+ if (is_binary) free(is_binary);
+#endif
+ return REG_ESPACE;
+ }
compile_stack.size = INIT_COMPILE_STACK_SIZE;
compile_stack.avail = 0;
@@ -2068,18 +2398,34 @@ regex_compile (pattern, size, syntax, bufp)
{ /* If zero allocated, but buffer is non-null, try to realloc
enough space. This loses if buffer's address is bogus, but
that is the user's responsibility. */
- RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
+#ifdef MBS_SUPPORT
+ /* Free bufp->buffer and allocate an array for wchar_t pattern
+ buffer. */
+ free(bufp->buffer);
+ COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE/sizeof(US_CHAR_TYPE),
+ US_CHAR_TYPE);
+#else
+ RETALLOC (COMPILED_BUFFER_VAR, INIT_BUF_SIZE, US_CHAR_TYPE);
+#endif /* MBS_SUPPORT */
}
else
{ /* Caller did not allocate a buffer. Do it for them. */
- bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
+ COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE / sizeof(US_CHAR_TYPE),
+ US_CHAR_TYPE);
}
- if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
+ if (!COMPILED_BUFFER_VAR) FREE_STACK_RETURN (REG_ESPACE);
+#ifdef MBS_SUPPORT
+ bufp->buffer = (char*)COMPILED_BUFFER_VAR;
+#endif /* MBS_SUPPORT */
bufp->allocated = INIT_BUF_SIZE;
}
+#ifdef MBS_SUPPORT
+ else
+ COMPILED_BUFFER_VAR = (US_CHAR_TYPE*) bufp->buffer;
+#endif
- begalt = b = bufp->buffer;
+ begalt = b = COMPILED_BUFFER_VAR;
/* Loop through the uncompiled pattern until we're at the end. */
while (p != pend)
@@ -2204,7 +2550,7 @@ regex_compile (pattern, size, syntax, bufp)
assert (p - 1 > pattern);
/* Allocate the space for the jump. */
- GET_BUFFER_SPACE (3);
+ GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
/* We know we are not at the first character of the pattern,
because laststart was nonzero. And we've already
@@ -2221,20 +2567,23 @@ regex_compile (pattern, size, syntax, bufp)
}
else
/* Anything else. */
- STORE_JUMP (maybe_pop_jump, b, laststart - 3);
+ STORE_JUMP (maybe_pop_jump, b, laststart -
+ (1 + OFFSET_ADDRESS_SIZE));
/* We've added more stuff to the buffer. */
- b += 3;
+ b += 1 + OFFSET_ADDRESS_SIZE;
}
/* On failure, jump from laststart to b + 3, which will be the
end of the buffer after this jump is inserted. */
- GET_BUFFER_SPACE (3);
+ /* ifdef MBS_SUPPORT, 'b + 1 + OFFSET_ADDRESS_SIZE' instead of
+ 'b + 3'. */
+ GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
: on_failure_jump,
- laststart, b + 3);
+ laststart, b + 1 + OFFSET_ADDRESS_SIZE);
pending_exact = 0;
- b += 3;
+ b += 1 + OFFSET_ADDRESS_SIZE;
if (!zero_times_ok)
{
@@ -2243,9 +2592,10 @@ regex_compile (pattern, size, syntax, bufp)
`on_failure_jump' instruction of the loop. This
effects a skip over that instruction the first time
we hit that loop. */
- GET_BUFFER_SPACE (3);
- INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
- b += 3;
+ GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
+ INSERT_JUMP (dummy_failure_jump, laststart, laststart +
+ 2 + 2 * OFFSET_ADDRESS_SIZE);
+ b += 1 + OFFSET_ADDRESS_SIZE;
}
}
break;
@@ -2260,10 +2610,467 @@ regex_compile (pattern, size, syntax, bufp)
case '[':
{
boolean had_char_class = false;
+#ifdef MBS_SUPPORT
+ CHAR_TYPE range_start = 0xffffffff;
+#else
unsigned int range_start = 0xffffffff;
-
+#endif
if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+#ifdef MBS_SUPPORT
+ /* We assume a charset(_not) structure as a wchar_t array.
+ charset[0] = (re_opcode_t) charset(_not)
+ charset[1] = l (= length of char_classes)
+ charset[2] = m (= length of collating_symbols)
+ charset[3] = n (= length of equivalence_classes)
+ charset[4] = o (= length of char_ranges)
+ charset[5] = p (= length of chars)
+
+ charset[6] = char_class (wctype_t)
+ ...
+ charset[l+5] = char_class (wctype_t)
+
+ charset[l+6] = collating_symbol (wchar_t)
+ ...
+ charset[l+m+5] = collating_symbol (wchar_t)
+ ifdef _LIBC we use the index if
+ _NL_COLLATE_SYMB_EXTRAMB instead of
+ wchar_t string.
+
+ charset[l+m+6] = equivalence_classes (wchar_t)
+ ...
+ charset[l+m+n+5] = equivalence_classes (wchar_t)
+ ifdef _LIBC we use the index in
+ _NL_COLLATE_WEIGHT instead of
+ wchar_t string.
+
+ charset[l+m+n+6] = range_start
+ charset[l+m+n+7] = range_end
+ ...
+ charset[l+m+n+2o+4] = range_start
+ charset[l+m+n+2o+5] = range_end
+ ifdef _LIBC we use the value looked up
+ in _NL_COLLATE_COLLSEQ instead of
+ wchar_t character.
+
+ charset[l+m+n+2o+6] = char
+ ...
+ charset[l+m+n+2o+p+5] = char
+
+ */
+
+ /* We need at least 6 spaces: the opcode, the length of
+ char_classes, the length of collating_symbols, the length of
+ equivalence_classes, the length of char_ranges, the length of
+ chars. */
+ GET_BUFFER_SPACE (6);
+
+ /* Save b as laststart. And We use laststart as the pointer
+ to the first element of the charset here.
+ In other words, laststart[i] indicates charset[i]. */
+ laststart = b;
+
+ /* We test `*p == '^' twice, instead of using an if
+ statement, so we only need one BUF_PUSH. */
+ BUF_PUSH (*p == '^' ? charset_not : charset);
+ if (*p == '^')
+ p++;
+
+ /* Push the length of char_classes, the length of
+ collating_symbols, the length of equivalence_classes, the
+ length of char_ranges and the length of chars. */
+ BUF_PUSH_3 (0, 0, 0);
+ BUF_PUSH_2 (0, 0);
+
+ /* Remember the first position in the bracket expression. */
+ p1 = p;
+
+ /* charset_not matches newline according to a syntax bit. */
+ if ((re_opcode_t) b[-6] == charset_not
+ && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
+ {
+ BUF_PUSH('\n');
+ laststart[5]++; /* Update the length of characters */
+ }
+
+ /* Read in characters and ranges, setting map bits. */
+ for (;;)
+ {
+ if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+
+ PATFETCH (c);
+
+ /* \ might escape characters inside [...] and [^...]. */
+ if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
+ {
+ if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
+
+ PATFETCH (c1);
+ BUF_PUSH(c1);
+ laststart[5]++; /* Update the length of chars */
+ range_start = c1;
+ continue;
+ }
+
+ /* Could be the end of the bracket expression. If it's
+ not (i.e., when the bracket expression is `[]' so
+ far), the ']' character bit gets set way below. */
+ if (c == ']' && p != p1 + 1)
+ break;
+
+ /* Look ahead to see if it's a range when the last thing
+ was a character class. */
+ if (had_char_class && c == '-' && *p != ']')
+ FREE_STACK_RETURN (REG_ERANGE);
+
+ /* Look ahead to see if it's a range when the last thing
+ was a character: if this is a hyphen not at the
+ beginning or the end of a list, then it's the range
+ operator. */
+ if (c == '-'
+ && !(p - 2 >= pattern && p[-2] == '[')
+ && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
+ && *p != ']')
+ {
+ reg_errcode_t ret;
+ /* Allocate the space for range_start and range_end. */
+ GET_BUFFER_SPACE (2);
+ /* Update the pointer to indicate end of buffer. */
+ b += 2;
+ ret = compile_range (range_start, &p, pend, translate,
+ syntax, b, laststart);
+ if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
+ range_start = 0xffffffff;
+ }
+ else if (p[0] == '-' && p[1] != ']')
+ { /* This handles ranges made up of characters only. */
+ reg_errcode_t ret;
+
+ /* Move past the `-'. */
+ PATFETCH (c1);
+ /* Allocate the space for range_start and range_end. */
+ GET_BUFFER_SPACE (2);
+ /* Update the pointer to indicate end of buffer. */
+ b += 2;
+ ret = compile_range (c, &p, pend, translate, syntax, b,
+ laststart);
+ if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
+ range_start = 0xffffffff;
+ }
+
+ /* See if we're at the beginning of a possible character
+ class. */
+ else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
+ { /* Leave room for the null. */
+ char str[CHAR_CLASS_MAX_LENGTH + 1];
+
+ PATFETCH (c);
+ c1 = 0;
+
+ /* If pattern is `[[:'. */
+ if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+
+ for (;;)
+ {
+ PATFETCH (c);
+ if ((c == ':' && *p == ']') || p == pend)
+ break;
+ if (c1 < CHAR_CLASS_MAX_LENGTH)
+ str[c1++] = c;
+ else
+ /* This is in any case an invalid class name. */
+ str[0] = '\0';
+ }
+ str[c1] = '\0';
+
+ /* If isn't a word bracketed by `[:' and `:]':
+ undo the ending character, the letters, and leave
+ the leading `:' and `[' (but store them as character). */
+ if (c == ':' && *p == ']')
+ {
+ wctype_t wt;
+ /* Query the character class as wctype_t. */
+ wt = IS_CHAR_CLASS (str);
+ if (wt == 0)
+ FREE_STACK_RETURN (REG_ECTYPE);
+
+ /* Throw away the ] at the end of the character
+ class. */
+ PATFETCH (c);
+
+ if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+
+ /* Allocate the space for character class. */
+ GET_BUFFER_SPACE(1);
+ /* Update the pointer to indicate end of buffer. */
+ b++;
+ /* Move data which follow character classes
+ not to violate the data. */
+ insert_space(1, laststart+6, b-1);
+ /* Store the character class. */
+ laststart[6] = (CHAR_TYPE) wt;
+ laststart[1]++; /* Update length of char_classes */
+
+ had_char_class = true;
+ }
+ else
+ {
+ c1++;
+ while (c1--)
+ PATUNFETCH;
+ BUF_PUSH ('[');
+ BUF_PUSH (':');
+ laststart[5] += 2; /* Update the length of characters */
+ range_start = ':';
+ had_char_class = false;
+ }
+ }
+ else if (syntax & RE_CHAR_CLASSES && c == '[' && (*p == '='
+ || *p == '.'))
+ {
+ CHAR_TYPE str[128]; /* Should be large enough. */
+ CHAR_TYPE delim = *p; /* '=' or '.' */
+# ifdef _LIBC
+ uint32_t nrules =
+ _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
+# endif
+ PATFETCH (c);
+ c1 = 0;
+
+ /* If pattern is `[[=' or '[[.'. */
+ if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+
+ for (;;)
+ {
+ PATFETCH (c);
+ if ((c == delim && *p == ']') || p == pend)
+ break;
+ if (c1 < sizeof (str) - 1)
+ str[c1++] = c;
+ else
+ /* This is in any case an invalid class name. */
+ str[0] = '\0';
+ }
+ str[c1] = '\0';
+
+ if (c == delim && *p == ']' && str[0] != '\0')
+ {
+ unsigned int i, offset;
+ /* If we have no collation data we use the default
+ collation in which each character is in a class
+ by itself. It also means that ASCII is the
+ character set and therefore we cannot have character
+ with more than one byte in the multibyte
+ representation. */
+
+ /* If not defined _LIBC, we push the name and
+ `\0' for the sake of matching performance. */
+ int datasize = c1 + 1;
+
+# ifdef _LIBC
+ int32_t idx = 0;
+ if (nrules == 0)
+# endif
+ {
+ if (c1 != 1)
+ FREE_STACK_RETURN (REG_ECOLLATE);
+ }
+# ifdef _LIBC
+ else
+ {
+ const int32_t *table;
+ const int32_t *weights;
+ const int32_t *extra;
+ const int32_t *indirect;
+ wint_t *cp;
+
+ /* This #include defines a local function! */
+# include <locale/weightwc.h>
+
+ if(delim == '=')
+ {
+ /* We push the index for equivalence class. */
+ cp = (wint_t*)str;
+
+ table = (const int32_t *)
+ _NL_CURRENT (LC_COLLATE,
+ _NL_COLLATE_TABLEWC);
+ weights = (const int32_t *)
+ _NL_CURRENT (LC_COLLATE,
+ _NL_COLLATE_WEIGHTWC);
+ extra = (const int32_t *)
+ _NL_CURRENT (LC_COLLATE,
+ _NL_COLLATE_EXTRAWC);
+ indirect = (const int32_t *)
+ _NL_CURRENT (LC_COLLATE,
+ _NL_COLLATE_INDIRECTWC);
+
+ idx = findidx ((const wint_t**)&cp);
+ if (idx == 0 || cp < (wint_t*) str + c1)
+ /* This is no valid character. */
+ FREE_STACK_RETURN (REG_ECOLLATE);
+
+ str[0] = (wchar_t)idx;
+ }
+ else /* delim == '.' */
+ {
+ /* We push collation sequence value
+ for collating symbol. */
+ int32_t table_size;
+ const int32_t *symb_table;
+ const unsigned char *extra;
+ int32_t idx;
+ int32_t elem;
+ int32_t second;
+ int32_t hash;
+ char char_str[c1];
+
+ /* We have to convert the name to a single-byte
+ string. This is possible since the names
+ consist of ASCII characters and the internal
+ representation is UCS4. */
+ for (i = 0; i < c1; ++i)
+ char_str[i] = str[i];
+
+ table_size =
+ _NL_CURRENT_WORD (LC_COLLATE,
+ _NL_COLLATE_SYMB_HASH_SIZEMB);
+ symb_table = (const int32_t *)
+ _NL_CURRENT (LC_COLLATE,
+ _NL_COLLATE_SYMB_TABLEMB);
+ extra = (const unsigned char *)
+ _NL_CURRENT (LC_COLLATE,
+ _NL_COLLATE_SYMB_EXTRAMB);
+
+ /* Locate the character in the hashing table. */
+ hash = elem_hash (char_str, c1);
+
+ idx = 0;
+ elem = hash % table_size;
+ second = hash % (table_size - 2);
+ while (symb_table[2 * elem] != 0)
+ {
+ /* First compare the hashing value. */
+ if (symb_table[2 * elem] == hash
+ && c1 == extra[symb_table[2 * elem + 1]]
+ && memcmp (str,
+ &extra[symb_table[2 * elem + 1]
+ + 1], c1) == 0)
+ {
+ /* Yep, this is the entry. */
+ idx = symb_table[2 * elem + 1];
+ idx += 1 + extra[idx];
+ break;
+ }
+
+ /* Next entry. */
+ elem += second;
+ }
+
+ if (symb_table[2 * elem] != 0)
+ {
+ /* Compute the index of the byte sequence
+ in the table. */
+ idx += 1 + extra[idx];
+ /* Adjust for the alignment. */
+ idx = (idx + 3) & ~4;
+
+ str[0] = (wchar_t) &extra[idx + 4];
+ }
+ else if (symb_table[2 * elem] == 0 && c1 == 1)
+ {
+ /* No valid character. Match it as a
+ single byte character. */
+ had_char_class = false;
+ BUF_PUSH(str[0]);
+ /* Update the length of characters */
+ laststart[5]++;
+ range_start = str[0];
+
+ /* Throw away the ] at the end of the
+ collating symbol. */
+ PATFETCH (c);
+ /* exit from the switch block. */
+ continue;
+ }
+ else
+ FREE_STACK_RETURN (REG_ECOLLATE);
+ }
+ datasize = 1;
+ }
+# endif
+ /* Throw away the ] at the end of the equivalence
+ class (or collating symbol). */
+ PATFETCH (c);
+
+ /* Allocate the space for the equivalence class
+ (or collating symbol) (and '\0' if needed). */
+ GET_BUFFER_SPACE(datasize);
+ /* Update the pointer to indicate end of buffer. */
+ b += datasize;
+
+ if (delim == '=')
+ { /* equivalence class */
+ /* Calculate the offset of char_ranges,
+ which is next to equivalence_classes. */
+ offset = laststart[1] + laststart[2]
+ + laststart[3] +6;
+ /* Insert space. */
+ insert_space(datasize, laststart + offset, b - 1);
+
+ /* Write the equivalence_class and \0. */
+ for (i = 0 ; i < datasize ; i++)
+ laststart[offset + i] = str[i];
+
+ /* Update the length of equivalence_classes. */
+ laststart[3] += datasize;
+ had_char_class = true;
+ }
+ else /* delim == '.' */
+ { /* collating symbol */
+ /* Calculate the offset of the equivalence_classes,
+ which is next to collating_symbols. */
+ offset = laststart[1] + laststart[2] + 6;
+ /* Insert space and write the collationg_symbol
+ and \0. */
+ insert_space(datasize, laststart + offset, b-1);
+ for (i = 0 ; i < datasize ; i++)
+ laststart[offset + i] = str[i];
+
+ /* In re_match_2_internal if range_start < -1, we
+ assume -range_start is the offset of the
+ collating symbol which is specified as
+ the character of the range start. So we assign
+ -(laststart[1] + laststart[2] + 6) to
+ range_start. */
+ range_start = -(laststart[1] + laststart[2] + 6);
+ /* Update the length of collating_symbol. */
+ laststart[2] += datasize;
+ had_char_class = false;
+ }
+ }
+ else
+ {
+ c1++;
+ while (c1--)
+ PATUNFETCH;
+ BUF_PUSH ('[');
+ BUF_PUSH (delim);
+ laststart[5] += 2; /* Update the length of characters */
+ range_start = delim;
+ had_char_class = false;
+ }
+ }
+ else
+ {
+ had_char_class = false;
+ BUF_PUSH(c);
+ laststart[5]++; /* Update the length of characters */
+ range_start = c;
+ }
+ }
+
+#else /* not MBS_SUPPORT */
/* Ensure that we have enough space to push a charset: the
opcode, the length count, and the bitset; 34 bytes in all. */
GET_BUFFER_SPACE (34);
@@ -2378,7 +3185,7 @@ regex_compile (pattern, size, syntax, bufp)
the leading `:' and `[' (but set bits for them). */
if (c == ':' && *p == ']')
{
-#if defined _LIBC || WIDE_CHAR_SUPPORT
+# if defined _LIBC || WIDE_CHAR_SUPPORT
boolean is_lower = STREQ (str, "lower");
boolean is_upper = STREQ (str, "upper");
wctype_t wt;
@@ -2396,13 +3203,13 @@ regex_compile (pattern, size, syntax, bufp)
for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
{
-# ifdef _LIBC
+# ifdef _LIBC
if (__iswctype (__btowc (ch), wt))
SET_LIST_BIT (ch);
-# else
+# else
if (iswctype (btowc (ch), wt))
SET_LIST_BIT (ch);
-# endif
+# endif
if (translate && (is_upper || is_lower)
&& (ISUPPER (ch) || ISLOWER (ch)))
@@ -2410,7 +3217,7 @@ regex_compile (pattern, size, syntax, bufp)
}
had_char_class = true;
-#else
+# else
int ch;
boolean is_alnum = STREQ (str, "alnum");
boolean is_alpha = STREQ (str, "alpha");
@@ -2458,7 +3265,7 @@ regex_compile (pattern, size, syntax, bufp)
SET_LIST_BIT (ch);
}
had_char_class = true;
-#endif /* libc || wctype.h */
+# endif /* libc || wctype.h */
}
else
{
@@ -2474,10 +3281,10 @@ regex_compile (pattern, size, syntax, bufp)
else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '=')
{
unsigned char str[MB_LEN_MAX + 1];
-#ifdef _LIBC
+# ifdef _LIBC
uint32_t nrules =
_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
-#endif
+# endif
PATFETCH (c);
c1 = 0;
@@ -2506,9 +3313,9 @@ regex_compile (pattern, size, syntax, bufp)
character set and therefore we cannot have character
with more than one byte in the multibyte
representation. */
-#ifdef _LIBC
+# ifdef _LIBC
if (nrules == 0)
-#endif
+# endif
{
if (c1 != 1)
FREE_STACK_RETURN (REG_ECOLLATE);
@@ -2520,7 +3327,7 @@ regex_compile (pattern, size, syntax, bufp)
/* Set the bit for the character. */
SET_LIST_BIT (str[0]);
}
-#ifdef _LIBC
+# ifdef _LIBC
else
{
/* Try to match the byte sequence in `str' against
@@ -2536,7 +3343,7 @@ regex_compile (pattern, size, syntax, bufp)
int ch;
/* This #include defines a local function! */
-# include <locale/weight.h>
+# include <locale/weight.h>
table = (const int32_t *)
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
@@ -2582,7 +3389,7 @@ regex_compile (pattern, size, syntax, bufp)
while (cnt < len
&& (weights[idx + 1 + cnt]
== weights[idx2 + 1 + cnt]))
- ++len;
+ ++cnt;
if (cnt == len)
/* They match. Mark the character as
@@ -2591,7 +3398,7 @@ regex_compile (pattern, size, syntax, bufp)
}
}
}
-#endif
+# endif
had_char_class = true;
}
else
@@ -2608,15 +3415,15 @@ regex_compile (pattern, size, syntax, bufp)
else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '.')
{
unsigned char str[128]; /* Should be large enough. */
-#ifdef _LIBC
+# ifdef _LIBC
uint32_t nrules =
_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
-#endif
+# endif
PATFETCH (c);
c1 = 0;
- /* If pattern is `[[='. */
+ /* If pattern is `[[.'. */
if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
for (;;)
@@ -2641,9 +3448,9 @@ regex_compile (pattern, size, syntax, bufp)
character set and therefore we cannot have character
with more than one byte in the multibyte
representation. */
-#ifdef _LIBC
+# ifdef _LIBC
if (nrules == 0)
-#endif
+# endif
{
if (c1 != 1)
FREE_STACK_RETURN (REG_ECOLLATE);
@@ -2656,7 +3463,7 @@ regex_compile (pattern, size, syntax, bufp)
SET_LIST_BIT (str[0]);
range_start = ((const unsigned char *) str)[0];
}
-#ifdef _LIBC
+# ifdef _LIBC
else
{
/* Try to match the byte sequence in `str' against
@@ -2734,7 +3541,7 @@ regex_compile (pattern, size, syntax, bufp)
++idx;
}
}
-#endif
+# endif
had_char_class = false;
}
else
@@ -2761,6 +3568,7 @@ regex_compile (pattern, size, syntax, bufp)
while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
b[-1]--;
b += b[-1];
+#endif /* MBS_SUPPORT */
}
break;
@@ -2831,10 +3639,10 @@ regex_compile (pattern, size, syntax, bufp)
group. They are all relative offsets, so that if the
whole pattern moves because of realloc, they will still
be valid. */
- COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
+ COMPILE_STACK_TOP.begalt_offset = begalt - COMPILED_BUFFER_VAR;
COMPILE_STACK_TOP.fixup_alt_jump
- = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
- COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
+ = fixup_alt_jump ? fixup_alt_jump - COMPILED_BUFFER_VAR + 1 : 0;
+ COMPILE_STACK_TOP.laststart_offset = b - COMPILED_BUFFER_VAR;
COMPILE_STACK_TOP.regnum = regnum;
/* We will eventually replace the 0 with the number of
@@ -2843,7 +3651,8 @@ regex_compile (pattern, size, syntax, bufp)
represent in the compiled pattern. */
if (regnum <= MAX_REGNUM)
{
- COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2;
+ COMPILE_STACK_TOP.inner_group_offset = b
+ - COMPILED_BUFFER_VAR + 2;
BUF_PUSH_3 (start_memory, regnum, 0);
}
@@ -2902,12 +3711,12 @@ regex_compile (pattern, size, syntax, bufp)
regnum_t this_group_regnum;
compile_stack.avail--;
- begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
+ begalt = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.begalt_offset;
fixup_alt_jump
= COMPILE_STACK_TOP.fixup_alt_jump
- ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
+ ? COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.fixup_alt_jump - 1
: 0;
- laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
+ laststart = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.laststart_offset;
this_group_regnum = COMPILE_STACK_TOP.regnum;
/* If we've reached MAX_REGNUM groups, then this open
won't actually generate any code, so we'll have to
@@ -2918,8 +3727,8 @@ regex_compile (pattern, size, syntax, bufp)
groups were inside this one. */
if (this_group_regnum <= MAX_REGNUM)
{
- unsigned char *inner_group_loc
- = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
+ US_CHAR_TYPE *inner_group_loc
+ = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.inner_group_offset;
*inner_group_loc = regnum - this_group_regnum;
BUF_PUSH_3 (stop_memory, this_group_regnum,
@@ -2938,10 +3747,11 @@ regex_compile (pattern, size, syntax, bufp)
/* Insert before the previous alternative a jump which
jumps to this alternative if the former fails. */
- GET_BUFFER_SPACE (3);
- INSERT_JUMP (on_failure_jump, begalt, b + 6);
+ GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
+ INSERT_JUMP (on_failure_jump, begalt,
+ b + 2 + 2 * OFFSET_ADDRESS_SIZE);
pending_exact = 0;
- b += 3;
+ b += 1 + OFFSET_ADDRESS_SIZE;
/* The alternative before this one has a jump after it
which gets executed if it gets matched. Adjust that
@@ -2966,8 +3776,8 @@ regex_compile (pattern, size, syntax, bufp)
to be filled in later either by next alternative or
when know we're at the end of a series of alternatives. */
fixup_alt_jump = b;
- GET_BUFFER_SPACE (3);
- b += 3;
+ GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
+ b += 1 + OFFSET_ADDRESS_SIZE;
laststart = 0;
begalt = b;
@@ -2988,7 +3798,6 @@ regex_compile (pattern, size, syntax, bufp)
/* At least (most) this many matches must be made. */
int lower_bound = -1, upper_bound = -1;
-
beg_interval = p - 1;
if (p == pend)
@@ -3054,12 +3863,15 @@ regex_compile (pattern, size, syntax, bufp)
/* If the upper bound is zero, don't want to succeed at
all; jump from `laststart' to `b + 3', which will be
- the end of the buffer after we insert the jump. */
+ the end of the buffer after we insert the jump. */
+ /* ifdef MBS_SUPPORT, 'b + 1 + OFFSET_ADDRESS_SIZE'
+ instead of 'b + 3'. */
if (upper_bound == 0)
{
- GET_BUFFER_SPACE (3);
- INSERT_JUMP (jump, laststart, b + 3);
- b += 3;
+ GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
+ INSERT_JUMP (jump, laststart, b + 1
+ + OFFSET_ADDRESS_SIZE);
+ b += 1 + OFFSET_ADDRESS_SIZE;
}
/* Otherwise, we have a nontrivial interval. When
@@ -3074,7 +3886,8 @@ regex_compile (pattern, size, syntax, bufp)
else
{ /* If the upper bound is > 1, we need to insert
more at the end of the loop. */
- unsigned nbytes = 10 + (upper_bound > 1) * 10;
+ unsigned nbytes = 2 + 4 * OFFSET_ADDRESS_SIZE +
+ (upper_bound > 1) * (2 + 4 * OFFSET_ADDRESS_SIZE);
GET_BUFFER_SPACE (nbytes);
@@ -3084,16 +3897,21 @@ regex_compile (pattern, size, syntax, bufp)
because `re_compile_fastmap' needs to know.
Jump to the `jump_n' we might insert below. */
INSERT_JUMP2 (succeed_n, laststart,
- b + 5 + (upper_bound > 1) * 5,
- lower_bound);
- b += 5;
+ b + 1 + 2 * OFFSET_ADDRESS_SIZE
+ + (upper_bound > 1) * (1 + 2 * OFFSET_ADDRESS_SIZE)
+ , lower_bound);
+ b += 1 + 2 * OFFSET_ADDRESS_SIZE;
/* Code to initialize the lower bound. Insert
before the `succeed_n'. The `5' is the last two
bytes of this `set_number_at', plus 3 bytes of
the following `succeed_n'. */
- insert_op2 (set_number_at, laststart, 5, lower_bound, b);
- b += 5;
+ /* ifdef MBS_SUPPORT, The '1+2*OFFSET_ADDRESS_SIZE'
+ is the 'set_number_at', plus '1+OFFSET_ADDRESS_SIZE'
+ of the following `succeed_n'. */
+ insert_op2 (set_number_at, laststart, 1
+ + 2 * OFFSET_ADDRESS_SIZE, lower_bound, b);
+ b += 1 + 2 * OFFSET_ADDRESS_SIZE;
if (upper_bound > 1)
{ /* More than one repetition is allowed, so
@@ -3103,9 +3921,10 @@ regex_compile (pattern, size, syntax, bufp)
When we've reached this during matching,
we'll have matched the interval once, so
jump back only `upper_bound - 1' times. */
- STORE_JUMP2 (jump_n, b, laststart + 5,
+ STORE_JUMP2 (jump_n, b, laststart
+ + 2 * OFFSET_ADDRESS_SIZE + 1,
upper_bound - 1);
- b += 5;
+ b += 1 + 2 * OFFSET_ADDRESS_SIZE;
/* The location we want to set is the second
parameter of the `jump_n'; that is `b-2' as
@@ -3123,7 +3942,7 @@ regex_compile (pattern, size, syntax, bufp)
reinitialize the bounds. */
insert_op2 (set_number_at, laststart, b - laststart,
upper_bound - 1, b);
- b += 5;
+ b += 1 + 2 * OFFSET_ADDRESS_SIZE;
}
}
pending_exact = 0;
@@ -3262,6 +4081,11 @@ regex_compile (pattern, size, syntax, bufp)
normal_char:
/* If no exactn currently being built. */
if (!pending_exact
+#ifdef MBS_SUPPORT
+ /* If last exactn handle binary(or character) and
+ new exactn handle character(or binary). */
+ || is_exactn_bin != is_binary[p - 1 - pattern]
+#endif /* MBS_SUPPORT */
/* If last exactn not at current position. */
|| pending_exact + *pending_exact + 1 != b
@@ -3283,7 +4107,16 @@ regex_compile (pattern, size, syntax, bufp)
laststart = b;
+#ifdef MBS_SUPPORT
+ /* Is this exactn binary data or character? */
+ is_exactn_bin = is_binary[p - 1 - pattern];
+ if (is_exactn_bin)
+ BUF_PUSH_2 (exactn_bin, 0);
+ else
+ BUF_PUSH_2 (exactn, 0);
+#else
BUF_PUSH_2 (exactn, 0);
+#endif /* MBS_SUPPORT */
pending_exact = b - 1;
}
@@ -3307,10 +4140,19 @@ regex_compile (pattern, size, syntax, bufp)
if (syntax & RE_NO_POSIX_BACKTRACKING)
BUF_PUSH (succeed);
+#ifdef MBS_SUPPORT
+ free (pattern);
+ free (mbs_offset);
+ free (is_binary);
+#endif
free (compile_stack.stack);
/* We have succeeded; set the length of the buffer. */
+#ifdef MBS_SUPPORT
+ bufp->used = (int) b - (int) COMPILED_BUFFER_VAR;
+#else
bufp->used = b - bufp->buffer;
+#endif
#ifdef DEBUG
if (debug)
@@ -3367,44 +4209,47 @@ regex_compile (pattern, size, syntax, bufp)
/* Subroutines for `regex_compile'. */
/* Store OP at LOC followed by two-byte integer parameter ARG. */
+/* ifdef MBS_SUPPORT, integer parameter is 1 wchar_t. */
static void
store_op1 (op, loc, arg)
re_opcode_t op;
- unsigned char *loc;
+ US_CHAR_TYPE *loc;
int arg;
{
- *loc = (unsigned char) op;
+ *loc = (US_CHAR_TYPE) op;
STORE_NUMBER (loc + 1, arg);
}
/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
+/* ifdef MBS_SUPPORT, integer parameter is 1 wchar_t. */
static void
store_op2 (op, loc, arg1, arg2)
re_opcode_t op;
- unsigned char *loc;
+ US_CHAR_TYPE *loc;
int arg1, arg2;
{
- *loc = (unsigned char) op;
+ *loc = (US_CHAR_TYPE) op;
STORE_NUMBER (loc + 1, arg1);
- STORE_NUMBER (loc + 3, arg2);
+ STORE_NUMBER (loc + 1 + OFFSET_ADDRESS_SIZE, arg2);
}
/* Copy the bytes from LOC to END to open up three bytes of space at LOC
for OP followed by two-byte integer parameter ARG. */
+/* ifdef MBS_SUPPORT, integer parameter is 1 wchar_t. */
static void
insert_op1 (op, loc, arg, end)
re_opcode_t op;
- unsigned char *loc;
+ US_CHAR_TYPE *loc;
int arg;
- unsigned char *end;
+ US_CHAR_TYPE *end;
{
- register unsigned char *pfrom = end;
- register unsigned char *pto = end + 3;
+ register US_CHAR_TYPE *pfrom = end;
+ register US_CHAR_TYPE *pto = end + 1 + OFFSET_ADDRESS_SIZE;
while (pfrom != loc)
*--pto = *--pfrom;
@@ -3414,16 +4259,17 @@ insert_op1 (op, loc, arg, end)
/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
+/* ifdef MBS_SUPPORT, integer parameter is 1 wchar_t. */
static void
insert_op2 (op, loc, arg1, arg2, end)
re_opcode_t op;
- unsigned char *loc;
+ US_CHAR_TYPE *loc;
int arg1, arg2;
- unsigned char *end;
+ US_CHAR_TYPE *end;
{
- register unsigned char *pfrom = end;
- register unsigned char *pto = end + 5;
+ register US_CHAR_TYPE *pfrom = end;
+ register US_CHAR_TYPE *pto = end + 1 + 2 * OFFSET_ADDRESS_SIZE;
while (pfrom != loc)
*--pto = *--pfrom;
@@ -3438,10 +4284,10 @@ insert_op2 (op, loc, arg1, arg2, end)
static boolean
at_begline_loc_p (pattern, p, syntax)
- const char *pattern, *p;
+ const CHAR_TYPE *pattern, *p;
reg_syntax_t syntax;
{
- const char *prev = p - 2;
+ const CHAR_TYPE *prev = p - 2;
boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
return
@@ -3457,12 +4303,12 @@ at_begline_loc_p (pattern, p, syntax)
static boolean
at_endline_loc_p (p, pend, syntax)
- const char *p, *pend;
+ const CHAR_TYPE *p, *pend;
reg_syntax_t syntax;
{
- const char *next = p;
+ const CHAR_TYPE *next = p;
boolean next_backslash = *next == '\\';
- const char *next_next = p + 1 < pend ? p + 1 : 0;
+ const CHAR_TYPE *next_next = p + 1 < pend ? p + 1 : 0;
return
/* Before a subexpression? */
@@ -3493,7 +4339,97 @@ group_in_compile_stack (compile_stack, regnum)
return false;
}
+#ifdef MBS_SUPPORT
+/* This insert space into the pattern. */
+static void
+insert_space (num, loc, end)
+ int num;
+ CHAR_TYPE *loc;
+ CHAR_TYPE *end;
+{
+ register CHAR_TYPE *pto = end;
+ register CHAR_TYPE *pfrom = end - num;
+
+ while (pfrom >= loc)
+ *pto-- = *pfrom--;
+}
+#endif /* MBS_SUPPORT */
+
+#ifdef MBS_SUPPORT
+static reg_errcode_t
+compile_range (range_start_char, p_ptr, pend, translate, syntax, b,
+ char_set)
+ CHAR_TYPE range_start_char;
+ const CHAR_TYPE **p_ptr, *pend;
+ CHAR_TYPE *char_set, *b;
+ RE_TRANSLATE_TYPE translate;
+ reg_syntax_t syntax;
+{
+ const CHAR_TYPE *p = *p_ptr;
+ CHAR_TYPE range_start, range_end;
+ reg_errcode_t ret;
+# ifdef _LIBC
+ uint32_t nrules;
+ uint32_t start_val, end_val;
+# endif
+ if (p == pend)
+ return REG_ERANGE;
+
+# ifdef _LIBC
+ nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
+ if (nrules != 0)
+ {
+ const char *collseq = (const char *) _NL_CURRENT(LC_COLLATE,
+ _NL_COLLATE_COLLSEQWC);
+
+ if (range_start_char < -1)
+ {
+ /* range_start is a collating symbol. */
+ int32_t *wextra;
+ /* Retreive the index and get collation sequence value. */
+ wextra = (int32_t*)char_set[-range_start_char];
+ start_val = wextra[1 + *wextra];
+ }
+ else
+ start_val = collseq_table_lookup(collseq, TRANSLATE(range_start_char));
+
+ end_val = collseq_table_lookup (collseq, TRANSLATE (p[0]));
+
+ /* Report an error if the range is empty and the syntax prohibits
+ this. */
+ ret = ((syntax & RE_NO_EMPTY_RANGES)
+ && (start_val > end_val))? REG_ERANGE : REG_NOERROR;
+
+ /* Insert space to the end of the char_ranges. */
+ insert_space(2, b - char_set[5] - 2, b - 1);
+ *(b - char_set[5] - 2) = (wchar_t)start_val;
+ *(b - char_set[5] - 1) = (wchar_t)end_val;
+ char_set[4]++; /* ranges_index */
+ }
+ else
+# endif
+ {
+ range_start = (range_start_char >= 0)? TRANSLATE (range_start_char):
+ range_start_char;
+ range_end = TRANSLATE (p[0]);
+ /* Report an error if the range is empty and the syntax prohibits
+ this. */
+ ret = ((syntax & RE_NO_EMPTY_RANGES)
+ && (range_start > range_end))? REG_ERANGE : REG_NOERROR;
+
+ /* Insert space to the end of the char_ranges. */
+ insert_space(2, b - char_set[5] - 2, b - 1);
+ *(b - char_set[5] - 2) = range_start;
+ *(b - char_set[5] - 1) = range_end;
+ char_set[4]++; /* ranges_index */
+ }
+ /* Have to increment the pointer into the pattern string, so the
+ caller isn't still at the ending character. */
+ (*p_ptr)++;
+ return ret;
+}
+#else
/* Read the ending character of a range (in a bracket expression) from the
uncompiled pattern *P_PTR (which ends at PEND). We assume the
starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
@@ -3516,13 +4452,13 @@ compile_range (range_start_char, p_ptr, pend, translate, syntax, b)
unsigned this_char;
const char *p = *p_ptr;
reg_errcode_t ret;
-#if _LIBC
+# if _LIBC
const unsigned char *collseq;
unsigned int start_colseq;
unsigned int end_colseq;
-#else
+# else
unsigned end_char;
-#endif
+# endif
if (p == pend)
return REG_ERANGE;
@@ -3534,7 +4470,7 @@ compile_range (range_start_char, p_ptr, pend, translate, syntax, b)
/* Report an error if the range is empty and the syntax prohibits this. */
ret = syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
-#if _LIBC
+# if _LIBC
collseq = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
_NL_COLLATE_COLLSEQMB);
@@ -3550,21 +4486,28 @@ compile_range (range_start_char, p_ptr, pend, translate, syntax, b)
ret = REG_NOERROR;
}
}
-#else
+# else
/* Here we see why `this_char' has to be larger than an `unsigned
char' -- we would otherwise go into an infinite loop, since all
characters <= 0xff. */
range_start_char = TRANSLATE (range_start_char);
- end_char = TRANSLATE (p[0]);
+ /* TRANSLATE(p[0]) is casted to char (not unsigned char) in TRANSLATE,
+ and some compilers cast it to int implicitly, so following for_loop
+ may fall to (almost) infinite loop.
+ e.g. If translate[p[0]] = 0xff, end_char may equals to 0xffffffff.
+ To avoid this, we cast p[0] to unsigned int and truncate it. */
+ end_char = ((unsigned)TRANSLATE(p[0]) & ((1 << BYTEWIDTH) - 1));
+
for (this_char = range_start_char; this_char <= end_char; ++this_char)
{
SET_LIST_BIT (TRANSLATE (this_char));
ret = REG_NOERROR;
}
-#endif
+# endif
return ret;
}
+#endif /* MBS_SUPPORT */
/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
@@ -3579,6 +4522,19 @@ compile_range (range_start_char, p_ptr, pend, translate, syntax, b)
Returns 0 if we succeed, -2 if an internal error. */
+#ifdef MBS_SUPPORT
+/* local function for re_compile_fastmap.
+ truncate wchar_t character to char. */
+unsigned char
+truncate_wchar(c)
+ CHAR_TYPE c;
+{
+ unsigned char buf[MB_LEN_MAX];
+ int retval = wctomb(buf, c);
+ return retval > 0 ? buf[0] : (unsigned char)c;
+}
+#endif /* MBS_SUPPORT */
+
int
re_compile_fastmap (bufp)
struct re_pattern_buffer *bufp;
@@ -3592,9 +4548,17 @@ re_compile_fastmap (bufp)
#endif
register char *fastmap = bufp->fastmap;
- unsigned char *pattern = bufp->buffer;
- unsigned char *p = pattern;
- register unsigned char *pend = pattern + bufp->used;
+
+#ifdef MBS_SUPPORT
+ /* We need to cast pattern to (wchar_t*), because we casted this compiled
+ pattern to (char*) in regex_compile. */
+ US_CHAR_TYPE *pattern = (US_CHAR_TYPE*)bufp->buffer;
+ register US_CHAR_TYPE *pend = (US_CHAR_TYPE*) (bufp->buffer + bufp->used);
+#else
+ US_CHAR_TYPE *pattern = bufp->buffer;
+ register US_CHAR_TYPE *pend = pattern + bufp->used;
+#endif /* MBS_SUPPORT */
+ US_CHAR_TYPE *p = pattern;
#ifdef REL_ALLOC
/* This holds the pointer to the failure stack, when
@@ -3657,11 +4621,30 @@ re_compile_fastmap (bufp)
/* Following are the cases which match a character. These end
with `break'. */
+#ifdef MBS_SUPPORT
+ case exactn:
+ fastmap[truncate_wchar(p[1])] = 1;
+ break;
+ case exactn_bin:
+ fastmap[p[1]] = 1;
+ break;
+#else
case exactn:
fastmap[p[1]] = 1;
break;
+#endif /* MBS_SUPPORT */
+#ifdef MBS_SUPPORT
+ /* It is hard to distinguish fastmap from (multi byte) characters
+ which depends on current locale. */
+ case charset:
+ case charset_not:
+ case wordchar:
+ case notwordchar:
+ bufp->can_be_null = 1;
+ goto done;
+#else
case charset:
for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
@@ -3692,7 +4675,7 @@ re_compile_fastmap (bufp)
if (SYNTAX (j) != Sword)
fastmap[j] = 1;
break;
-
+#endif
case anychar:
{
@@ -3822,13 +4805,13 @@ re_compile_fastmap (bufp)
case succeed_n:
/* Get to the number of times to succeed. */
- p += 2;
+ p += OFFSET_ADDRESS_SIZE;
/* Increment p past the n for when k != 0. */
EXTRACT_NUMBER_AND_INCR (k, p);
if (k == 0)
{
- p -= 4;
+ p -= 2 * OFFSET_ADDRESS_SIZE;
succeed_n_p = true; /* Spaghetti code alert. */
goto handle_on_failure_jump;
}
@@ -3836,7 +4819,7 @@ re_compile_fastmap (bufp)
case set_number_at:
- p += 4;
+ p += 2 * OFFSET_ADDRESS_SIZE;
continue;
@@ -3913,7 +4896,7 @@ weak_alias (__re_set_registers, re_set_registers)
/* Searching routines. */
/* Like re_search_2, below, but only one string is specified, and
- doesn't let you say where to stop matching. */
+ doesn't let you say where to stop matching. */
int
re_search (bufp, string, size, startpos, range, regs)
@@ -4092,12 +5075,24 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
weak_alias (__re_search_2, re_search_2)
#endif
+#ifdef MBS_SUPPORT
+/* This converts PTR, a pointer into one of the search wchar_t strings
+ `string1' and `string2' into an multibyte string offset from the
+ beginning of that string. We use mbs_offset to optimize.
+ See convert_mbs_to_wcs. */
+# define POINTER_TO_OFFSET(ptr) \
+ (FIRST_STRING_P (ptr) \
+ ? ((regoff_t)(mbs_offset1 != NULL? mbs_offset1[(ptr)-string1] : 0)) \
+ : ((regoff_t)((mbs_offset2 != NULL? mbs_offset2[(ptr)-string2] : 0) \
+ + csize1)))
+#else
/* This converts PTR, a pointer into one of the search strings `string1'
and `string2' into an offset from the beginning of that string. */
-#define POINTER_TO_OFFSET(ptr) \
+# define POINTER_TO_OFFSET(ptr) \
(FIRST_STRING_P (ptr) \
? ((regoff_t) ((ptr) - string1)) \
: ((regoff_t) ((ptr) - string2 + size1)))
+#endif /* MBS_SUPPORT */
/* Macros for dealing with the split strings in re_match_2. */
@@ -4127,10 +5122,17 @@ weak_alias (__re_search_2, re_search_2)
two special cases to check for: if past the end of string1, look at
the first character in string2; and if before the beginning of
string2, look at the last character in string1. */
-#define WORDCHAR_P(d) \
+#ifdef MBS_SUPPORT
+/* Use internationalized API instead of SYNTAX. */
+# define WORDCHAR_P(d) \
+ (iswalnum ((wint_t)((d) == end1 ? *string2 \
+ : (d) == string2 - 1 ? *(end1 - 1) : *(d))) != 0)
+#else
+# define WORDCHAR_P(d) \
(SYNTAX ((d) == end1 ? *string2 \
: (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
== Sword)
+#endif /* MBS_SUPPORT */
/* Disabled due to a compiler bug -- see comment at case wordbound */
#if 0
@@ -4144,7 +5146,28 @@ weak_alias (__re_search_2, re_search_2)
/* Free everything we malloc. */
#ifdef MATCH_MAY_ALLOCATE
# define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
-# define FREE_VARIABLES() \
+# ifdef MBS_SUPPORT
+# define FREE_VARIABLES() \
+ do { \
+ REGEX_FREE_STACK (fail_stack.stack); \
+ FREE_VAR (regstart); \
+ FREE_VAR (regend); \
+ FREE_VAR (old_regstart); \
+ FREE_VAR (old_regend); \
+ FREE_VAR (best_regstart); \
+ FREE_VAR (best_regend); \
+ FREE_VAR (reg_info); \
+ FREE_VAR (reg_dummy); \
+ FREE_VAR (reg_info_dummy); \
+ FREE_VAR (string1); \
+ FREE_VAR (string2); \
+ FREE_VAR (mbs_offset1); \
+ FREE_VAR (mbs_offset2); \
+ FREE_VAR (is_binary1); \
+ FREE_VAR (is_binary2); \
+ } while (0)
+# else /* not MBS_SUPPORT */
+# define FREE_VARIABLES() \
do { \
REGEX_FREE_STACK (fail_stack.stack); \
FREE_VAR (regstart); \
@@ -4157,8 +5180,21 @@ weak_alias (__re_search_2, re_search_2)
FREE_VAR (reg_dummy); \
FREE_VAR (reg_info_dummy); \
} while (0)
+# endif /* MBS_SUPPORT */
#else
-# define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
+# ifdef MBS_SUPPORT
+# define FREE_VARIABLES() \
+ do { \
+ if (string1) free (string1); \
+ if (string2) free (string2); \
+ if (mbs_offset1) free (mbs_offset1); \
+ if (mbs_offset2) free (mbs_offset2); \
+ if (is_binary1) free (is_binary1); \
+ if (is_binary2) free (is_binary2); \
+ } while (0)
+# eles
+# define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
+# endif /* MBS_SUPPORT */
#endif /* not MATCH_MAY_ALLOCATE */
/* These values must meet several constraints. They must not be valid
@@ -4197,16 +5233,16 @@ weak_alias (__re_match, re_match)
# endif
#endif /* not emacs */
-static boolean group_match_null_string_p _RE_ARGS ((unsigned char **p,
- unsigned char *end,
+static boolean group_match_null_string_p _RE_ARGS ((US_CHAR_TYPE **p,
+ US_CHAR_TYPE *end,
register_info_type *reg_info));
-static boolean alt_match_null_string_p _RE_ARGS ((unsigned char *p,
- unsigned char *end,
+static boolean alt_match_null_string_p _RE_ARGS ((US_CHAR_TYPE *p,
+ US_CHAR_TYPE *end,
register_info_type *reg_info));
-static boolean common_op_match_null_string_p _RE_ARGS ((unsigned char **p,
- unsigned char *end,
+static boolean common_op_match_null_string_p _RE_ARGS ((US_CHAR_TYPE **p,
+ US_CHAR_TYPE *end,
register_info_type *reg_info));
-static int bcmp_translate _RE_ARGS ((const char *s1, const char *s2,
+static int bcmp_translate _RE_ARGS ((const CHAR_TYPE *s1, const CHAR_TYPE *s2,
int len, char *translate));
/* re_match_2 matches the compiled pattern in BUFP against the
@@ -4244,38 +5280,93 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
weak_alias (__re_match_2, re_match_2)
#endif
+#ifdef MBS_SUPPORT
+/* This check the substring (from 0, to length) of the multibyte string,
+ to which offset_buffer correspond. And count how many wchar_t_characters
+ the substring occupy. We use offset_buffer to optimization.
+ See convert_mbs_to_wcs. */
+static int
+count_mbs_length(offset_buffer, length)
+ int *offset_buffer;
+ int length;
+{
+ int wcs_size;
+
+ /* Check whether the size is valid. */
+ if (length < 0)
+ return -1;
+
+ if (offset_buffer == NULL)
+ return 0;
+
+ for (wcs_size = 0 ; offset_buffer[wcs_size] != -1 ; wcs_size++)
+ {
+ if (offset_buffer[wcs_size] == length)
+ return wcs_size;
+ if (offset_buffer[wcs_size] > length)
+ /* It is a fragment of a wide character. */
+ return -1;
+ }
+
+ /* We reached at the sentinel. */
+ return -1;
+}
+#endif /* MBS_SUPPORT */
+
/* This is a separate function so that we can force an alloca cleanup
afterwards. */
static int
+#ifdef MBS_SUPPORT
+re_match_2_internal (bufp, cstring1, csize1, cstring2, csize2, pos, regs, stop)
+ struct re_pattern_buffer *bufp;
+ const char *cstring1, *cstring2;
+ int csize1, csize2;
+#else
re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
struct re_pattern_buffer *bufp;
const char *string1, *string2;
int size1, size2;
+#endif
int pos;
struct re_registers *regs;
int stop;
{
/* General temporaries. */
int mcnt;
- unsigned char *p1;
+ US_CHAR_TYPE *p1;
+#ifdef MBS_SUPPORT
+ /* We need wchar_t* buffers correspond to string1, string2. */
+ CHAR_TYPE *string1 = NULL, *string2 = NULL;
+ /* We need the size of wchar_t buffers correspond to csize1, csize2. */
+ int size1 = 0, size2 = 0;
+ /* offset buffer for optimizatoin. See convert_mbs_to_wc. */
+ int *mbs_offset1 = NULL, *mbs_offset2 = NULL;
+ /* They hold whether each wchar_t is binary data or not. */
+ int *is_binary1 = NULL, *is_binary2 = NULL;
+#endif /* MBS_SUPPORT */
/* Just past the end of the corresponding string. */
- const char *end1, *end2;
+ const CHAR_TYPE *end1, *end2;
/* Pointers into string1 and string2, just past the last characters in
each to consider matching. */
- const char *end_match_1, *end_match_2;
+ const CHAR_TYPE *end_match_1, *end_match_2;
/* Where we are in the data, and the end of the current string. */
- const char *d, *dend;
+ const CHAR_TYPE *d, *dend;
/* Where we are in the pattern, and the end of the pattern. */
- unsigned char *p = bufp->buffer;
- register unsigned char *pend = p + bufp->used;
+#ifdef MBS_SUPPORT
+ US_CHAR_TYPE *pattern, *p;
+ register US_CHAR_TYPE *pend;
+#else
+ US_CHAR_TYPE *p = bufp->buffer;
+ register US_CHAR_TYPE *pend = p + bufp->used;
+#endif /* MBS_SUPPORT */
/* Mark the opcode just after a start_memory, so we can test for an
empty subpattern when we get to the stop_memory. */
- unsigned char *just_past_start_mem = 0;
+ US_CHAR_TYPE *just_past_start_mem = 0;
/* We use this to map every character in the string. */
RE_TRANSLATE_TYPE translate = bufp->translate;
@@ -4320,7 +5411,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
stopped matching the regnum-th subexpression. (The zeroth register
keeps track of what the whole pattern matches.) */
#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
- const char **regstart, **regend;
+ const CHAR_TYPE **regstart, **regend;
#endif
/* If a group that's operated upon by a repetition operator fails to
@@ -4329,7 +5420,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
are when we last see its open-group operator. Similarly for a
register's end. */
#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
- const char **old_regstart, **old_regend;
+ const CHAR_TYPE **old_regstart, **old_regend;
#endif
/* The is_active field of reg_info helps us keep track of which (possibly
@@ -4348,7 +5439,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
turn happens only if we have not yet matched the entire string. */
unsigned best_regs_set = false;
#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
- const char **best_regstart, **best_regend;
+ const CHAR_TYPE **best_regstart, **best_regend;
#endif
/* Logically, this is `best_regend[0]'. But we don't want to have to
@@ -4359,14 +5450,14 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
the end of the best match so far in a separate variable. We
initialize this to NULL so that when we backtrack the first time
and need to test it, it's not garbage. */
- const char *match_end = NULL;
+ const CHAR_TYPE *match_end = NULL;
/* This helps SET_REGS_MATCHED avoid doing redundant work. */
int set_regs_matched_done = 0;
/* Used when we pop values we don't care about. */
#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
- const char **reg_dummy;
+ const CHAR_TYPE **reg_dummy;
register_info_type *reg_info_dummy;
#endif
@@ -4387,14 +5478,14 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
array indexing. We should fix this. */
if (bufp->re_nsub)
{
- regstart = REGEX_TALLOC (num_regs, const char *);
- regend = REGEX_TALLOC (num_regs, const char *);
- old_regstart = REGEX_TALLOC (num_regs, const char *);
- old_regend = REGEX_TALLOC (num_regs, const char *);
- best_regstart = REGEX_TALLOC (num_regs, const char *);
- best_regend = REGEX_TALLOC (num_regs, const char *);
+ regstart = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
+ regend = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
+ old_regstart = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
+ old_regend = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
+ best_regstart = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
+ best_regend = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
reg_info = REGEX_TALLOC (num_regs, register_info_type);
- reg_dummy = REGEX_TALLOC (num_regs, const char *);
+ reg_dummy = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
if (!(regstart && regend && old_regstart && old_regend && reg_info
@@ -4415,12 +5506,62 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
#endif /* MATCH_MAY_ALLOCATE */
/* The starting position is bogus. */
+#ifdef MBS_SUPPORT
+ if (pos < 0 || pos > csize1 + csize2)
+#else
if (pos < 0 || pos > size1 + size2)
+#endif
{
FREE_VARIABLES ();
return -1;
}
+#ifdef MBS_SUPPORT
+ /* Allocate wchar_t array for string1 and string2 and
+ fill them with converted string. */
+ if (csize1 != 0)
+ {
+ string1 = TALLOC (csize1 + 1, CHAR_TYPE);
+ mbs_offset1 = TALLOC (csize1 + 1, int);
+ is_binary1 = TALLOC (csize1 + 1, int);
+ if (!string1 || !mbs_offset1 || !is_binary1)
+ {
+ if (string1) free(string1);
+ if (mbs_offset1) free(mbs_offset1);
+ if (is_binary1) free(is_binary1);
+ return -2;
+ }
+ size1 = convert_mbs_to_wcs(string1, cstring1, csize1,
+ mbs_offset1, is_binary1);
+ string1[size1] = L'\0'; /* for a sentinel */
+ }
+ if (csize2 != 0)
+ {
+ string2 = REGEX_TALLOC (csize2 + 1, CHAR_TYPE);
+ mbs_offset2 = REGEX_TALLOC (csize2 + 1, int);
+ is_binary2 = TALLOC (csize2 + 1, int);
+ if (!string2 || !mbs_offset2 || !is_binary2)
+ {
+ if (string1) free(string1);
+ if (mbs_offset1) free(mbs_offset1);
+ if (is_binary1) free(is_binary1);
+ if (string2) free(string2);
+ if (mbs_offset2) free(mbs_offset2);
+ if (is_binary2) free(is_binary2);
+ return -2;
+ }
+ size2 = convert_mbs_to_wcs(string2, cstring2, csize2,
+ mbs_offset2, is_binary2);
+ string2[size2] = L'\0'; /* for a sentinel */
+ }
+
+ /* We need to cast pattern to (wchar_t*), because we casted this compiled
+ pattern to (char*) in regex_compile. */
+ p = pattern = (CHAR_TYPE*)bufp->buffer;
+ pend = (CHAR_TYPE*)(bufp->buffer + bufp->used);
+
+#endif /* MBS_SUPPORT */
+
/* Initialize subexpression text positions to -1 to mark ones that no
start_memory/stop_memory has been seen for. Also initialize the
register information struct. */
@@ -4448,6 +5589,25 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
end2 = string2 + size2;
/* Compute where to stop matching, within the two strings. */
+#ifdef MBS_SUPPORT
+ if (stop <= csize1)
+ {
+ mcnt = count_mbs_length(mbs_offset1, stop);
+ end_match_1 = string1 + mcnt;
+ end_match_2 = string2;
+ }
+ else
+ {
+ end_match_1 = end1;
+ mcnt = count_mbs_length(mbs_offset2, stop-csize1);
+ end_match_2 = string2 + mcnt;
+ }
+ if (mcnt < 0)
+ { /* count_mbs_length return error. */
+ FREE_VARIABLES ();
+ return -1;
+ }
+#else
if (stop <= size1)
{
end_match_1 = string1 + stop;
@@ -4458,6 +5618,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
end_match_1 = end1;
end_match_2 = string2 + stop - size1;
}
+#endif /* MBS_SUPPORT */
/* `p' scans through the pattern as `d' scans through the data.
`dend' is the end of the input string that `d' points within. `d'
@@ -4465,6 +5626,26 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
this happens before fetching; therefore, at the beginning of the
loop, `d' can be pointing at the end of a string, but it cannot
equal `string2'. */
+#ifdef MBS_SUPPORT
+ if (size1 > 0 && pos <= csize1)
+ {
+ mcnt = count_mbs_length(mbs_offset1, pos);
+ d = string1 + mcnt;
+ dend = end_match_1;
+ }
+ else
+ {
+ mcnt = count_mbs_length(mbs_offset2, pos-csize1);
+ d = string2 + mcnt;
+ dend = end_match_2;
+ }
+
+ if (mcnt < 0)
+ { /* count_mbs_length return error. */
+ FREE_VARIABLES ();
+ return -1;
+ }
+#else
if (size1 > 0 && pos <= size1)
{
d = string1 + pos;
@@ -4475,6 +5656,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
d = string2 + pos - size1;
dend = end_match_2;
}
+#endif /* MBS_SUPPORT */
DEBUG_PRINT1 ("The compiled pattern is:\n");
DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
@@ -4564,11 +5746,10 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
succeed_label:
DEBUG_PRINT1 ("Accepting match.\n");
-
/* If caller wants register contents data back, do it. */
if (regs && !bufp->no_sub)
{
- /* Have the register data arrays been allocated? */
+ /* Have the register data arrays been allocated? */
if (bufp->regs_allocated == REGS_UNALLOCATED)
{ /* No. So allocate them with malloc. We need one
extra element beyond `num_regs' for the `-1' marker
@@ -4612,9 +5793,18 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
if (regs->num_regs > 0)
{
regs->start[0] = pos;
+#ifdef MBS_SUPPORT
+ if (MATCHING_IN_FIRST_STRING)
+ regs->end[0] = mbs_offset1 != NULL ?
+ mbs_offset1[d-string1] : 0;
+ else
+ regs->end[0] = csize1 + (mbs_offset2 != NULL ?
+ mbs_offset2[d-string2] : 0);
+#else
regs->end[0] = (MATCHING_IN_FIRST_STRING
? ((regoff_t) (d - string1))
: ((regoff_t) (d - string2 + size1)));
+#endif /* MBS_SUPPORT */
}
/* Go through the first `min (num_regs, regs->num_regs)'
@@ -4647,9 +5837,18 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
nfailure_points_pushed - nfailure_points_popped);
DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
+#ifdef MBS_SUPPORT
+ if (MATCHING_IN_FIRST_STRING)
+ mcnt = mbs_offset1 != NULL ? mbs_offset1[d-string1] : 0;
+ else
+ mcnt = (mbs_offset2 != NULL ? mbs_offset2[d-string2] : 0) +
+ csize1;
+ mcnt -= pos;
+#else
mcnt = d - pos - (MATCHING_IN_FIRST_STRING
? string1
: string2 - size1);
+#endif /* MBS_SUPPORT */
DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
@@ -4674,6 +5873,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
byte in the pattern defines n, and the n bytes after that
are the characters to match. */
case exactn:
+#ifdef MBS_SUPPORT
+ case exactn_bin:
+#endif
mcnt = *p++;
DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
@@ -4684,9 +5886,23 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
do
{
PREFETCH ();
- if ((unsigned char) translate[(unsigned char) *d++]
- != (unsigned char) *p++)
+#ifdef MBS_SUPPORT
+ if (*d <= 0xff)
+ {
+ if ((US_CHAR_TYPE) translate[(unsigned char) *d++]
+ != (US_CHAR_TYPE) *p++)
+ goto fail;
+ }
+ else
+ {
+ if (*d++ != (CHAR_TYPE) *p++)
+ goto fail;
+ }
+#else
+ if ((US_CHAR_TYPE) translate[(unsigned char) *d++]
+ != (US_CHAR_TYPE) *p++)
goto fail;
+#endif /* MBS_SUPPORT */
}
while (--mcnt);
}
@@ -4695,7 +5911,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
do
{
PREFETCH ();
- if (*d++ != (char) *p++) goto fail;
+ if (*d++ != (CHAR_TYPE) *p++) goto fail;
}
while (--mcnt);
}
@@ -4722,14 +5938,347 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
case charset:
case charset_not:
{
- register unsigned char c;
+ register US_CHAR_TYPE c;
+#ifdef MBS_SUPPORT
+ unsigned int i, char_class_length, coll_symbol_length,
+ equiv_class_length, ranges_length, chars_length, length;
+ CHAR_TYPE *workp, *workp2, *charset_top;
+#define WORK_BUFFER_SIZE 128
+ CHAR_TYPE str_buf[WORK_BUFFER_SIZE];
+# ifdef _LIBC
+ uint32_t nrules;
+# endif /* _LIBC */
+#endif /* MBS_SUPPORT */
boolean not = (re_opcode_t) *(p - 1) == charset_not;
DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
-
PREFETCH ();
c = TRANSLATE (*d); /* The character to match. */
+#ifdef MBS_SUPPORT
+# ifdef _LIBC
+ nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
+# endif /* _LIBC */
+ charset_top = p - 1;
+ char_class_length = *p++;
+ coll_symbol_length = *p++;
+ equiv_class_length = *p++;
+ ranges_length = *p++;
+ chars_length = *p++;
+ /* p points charset[6], so the address of the next instruction
+ (charset[l+m+n+2o+k+p']) equals p[l+m+n+2*o+p'],
+ where l=length of char_classes, m=length of collating_symbol,
+ n=equivalence_class, o=length of char_range,
+ p'=length of character. */
+ workp = p;
+ /* Update p to indicate the next instruction. */
+ p += char_class_length + coll_symbol_length+ equiv_class_length +
+ 2*ranges_length + chars_length;
+
+ /* match with char_class? */
+ for (i = 0; i < char_class_length ; i++)
+ if (iswctype((wint_t)c, (wctype_t)(*workp++)))
+ goto char_set_matched;
+
+ /* match with collating_symbol? */
+# ifdef _LIBC
+ if (nrules != 0)
+ {
+ for (workp2 = workp + coll_symbol_length ; workp < workp2 ;
+ workp++)
+ {
+ int32_t *wextra;
+ wextra = (int32_t*) *workp++;
+ for (i = 0; i < *wextra; ++i)
+ if (TRANSLATE(d[i]) != wextra[1 + i])
+ break;
+ if (i == *wextra)
+ {
+ /* Update d, however d will be incremented at
+ char_set_matched:, we decrement d here. */
+ d += i - 1;
+ goto char_set_matched;
+ }
+ }
+ }
+ else /* (nrules == 0) */
+# endif
+ /* If we can't look up collation data, we use wcscoll
+ instead. */
+ {
+ for (workp2 = workp + coll_symbol_length ; workp < workp2 ;)
+ {
+ const CHAR_TYPE *backup_d = d, *backup_dend = dend;
+ length = wcslen(workp);
+
+ /* If wcscoll(the collating symbol, whole string) > 0,
+ any substring of the string never match with the
+ collating symbol. */
+ if (wcscoll(workp, d) > 0)
+ {
+ workp += length + 1;
+ continue;
+ }
+
+ /* First, we compare the collating symbol with
+ the first character of the string.
+ If it don't match, we add the next character to
+ the compare buffer in turn. */
+ for (i = 0 ; i < WORK_BUFFER_SIZE-1 ; i++, d++)
+ {
+ int match;
+ if (d == dend)
+ {
+ if (dend == end_match_2)
+ break;
+ d = string2;
+ dend = end_match_2;
+ }
+
+ /* add next character to the compare buffer. */
+ str_buf[i] = TRANSLATE(*d);
+ str_buf[i+1] = '\0';
+
+ match = wcscoll(workp, str_buf);
+ if (match == 0)
+ goto char_set_matched;
+
+ if (match < 0)
+ /* (str_buf > workp) indicate (str_buf + X > workp),
+ because for all X (str_buf + X > str_buf).
+ So we don't need continue this loop. */
+ break;
+
+ /* Otherwise(str_buf < workp),
+ (str_buf+next_character) may equals (workp).
+ So we continue this loop. */
+ }
+ /* not matched */
+ d = backup_d;
+ dend = backup_dend;
+ workp += length + 1;
+ }
+ }
+ /* match with equivalence_class? */
+# ifdef _LIBC
+ if (nrules != 0)
+ {
+ const CHAR_TYPE *backup_d = d, *backup_dend = dend;
+ /* Try to match the equivalence class against
+ those known to the collate implementation. */
+ const int32_t *table;
+ const int32_t *weights;
+ const int32_t *extra;
+ const int32_t *indirect;
+ int32_t idx, idx2;
+ wint_t *cp;
+ size_t len;
+
+ /* This #include defines a local function! */
+# include <locale/weightwc.h>
+
+ table = (const int32_t *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEWC);
+ weights = (const wint_t *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTWC);
+ extra = (const wint_t *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAWC);
+ indirect = (const int32_t *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTWC);
+
+ /* Write 1 collating element to str_buf, and
+ get its index. */
+ idx2 = 0;
+
+ for (i = 0 ; idx2 == 0 && i < WORK_BUFFER_SIZE - 1; i++)
+ {
+ cp = (wint_t*)str_buf;
+ if (d == dend)
+ {
+ if (dend == end_match_2)
+ break;
+ d = string2;
+ dend = end_match_2;
+ }
+ str_buf[i] = TRANSLATE(*(d+i));
+ str_buf[i+1] = '\0'; /* sentinel */
+ idx2 = findidx ((const wint_t**)&cp);
+ }
+
+ /* Update d, however d will be incremented at
+ char_set_matched:, we decrement d here. */
+ d = backup_d + (wint_t)cp - (wint_t)str_buf - 1;
+ if (d >= dend)
+ {
+ if (dend == end_match_2)
+ d = dend;
+ else
+ {
+ d = string2;
+ dend = end_match_2;
+ }
+ }
+
+ len = weights[idx2];
+
+ for (workp2 = workp + equiv_class_length ; workp < workp2 ;
+ workp++)
+ {
+ idx = (int32_t)*workp;
+ /* We already checked idx != 0 in regex_compile. */
+
+ if (idx2 != 0 && len == weights[idx])
+ {
+ int cnt = 0;
+ while (cnt < len && (weights[idx + 1 + cnt]
+ == weights[idx2 + 1 + cnt]))
+ ++cnt;
+
+ if (cnt == len)
+ goto char_set_matched;
+ }
+ }
+ /* not matched */
+ d = backup_d;
+ dend = backup_dend;
+ }
+ else /* (nrules == 0) */
+# endif
+ /* If we can't look up collation data, we use wcscoll
+ instead. */
+ {
+ for (workp2 = workp + equiv_class_length ; workp < workp2 ;)
+ {
+ const CHAR_TYPE *backup_d = d, *backup_dend = dend;
+ length = wcslen(workp);
+
+ /* If wcscoll(the collating symbol, whole string) > 0,
+ any substring of the string never match with the
+ collating symbol. */
+ if (wcscoll(workp, d) > 0)
+ {
+ workp += length + 1;
+ break;
+ }
+
+ /* First, we compare the equivalence class with
+ the first character of the string.
+ If it don't match, we add the next character to
+ the compare buffer in turn. */
+ for (i = 0 ; i < WORK_BUFFER_SIZE - 1 ; i++, d++)
+ {
+ int match;
+ if (d == dend)
+ {
+ if (dend == end_match_2)
+ break;
+ d = string2;
+ dend = end_match_2;
+ }
+
+ /* add next character to the compare buffer. */
+ str_buf[i] = TRANSLATE(*d);
+ str_buf[i+1] = '\0';
+
+ match = wcscoll(workp, str_buf);
+
+ if (match == 0)
+ goto char_set_matched;
+
+ if (match < 0)
+ /* (str_buf > workp) indicate (str_buf + X > workp),
+ because for all X (str_buf + X > str_buf).
+ So we don't need continue this loop. */
+ break;
+
+ /* Otherwise(str_buf < workp),
+ (str_buf+next_character) may equals (workp).
+ So we continue this loop. */
+ }
+ /* not matched */
+ d = backup_d;
+ dend = backup_dend;
+ workp += length + 1;
+ }
+ }
+
+ /* match with char_range? */
+#ifdef _LIBC
+ if (nrules != 0)
+ {
+ uint32_t collseqval;
+ const char *collseq = (const char *)
+ _NL_CURRENT(LC_COLLATE, _NL_COLLATE_COLLSEQWC);
+
+ collseqval = collseq_table_lookup (collseq, c);
+
+ for (; workp < p - chars_length ;)
+ {
+ uint32_t start_val, end_val;
+
+ /* We already compute the collation sequence value
+ of the characters (or collating symbols). */
+ start_val = (uint32_t) *workp++; /* range_start */
+ end_val = (uint32_t) *workp++; /* range_end */
+
+ if (start_val <= collseqval && collseqval <= end_val)
+ goto char_set_matched;
+ }
+ }
+ else
+#endif
+ {
+ /* We set range_start_char at str_buf[0], range_end_char
+ at str_buf[4], and compared char at str_buf[2]. */
+ str_buf[1] = 0;
+ str_buf[2] = c;
+ str_buf[3] = 0;
+ str_buf[5] = 0;
+ for (; workp < p - chars_length ;)
+ {
+ wchar_t *range_start_char, *range_end_char;
+
+ /* match if (range_start_char <= c <= range_end_char). */
+
+ /* If range_start(or end) < 0, we assume -range_start(end)
+ is the offset of the collating symbol which is specified
+ as the character of the range start(end). */
+
+ /* range_start */
+ if (*workp < 0)
+ range_start_char = charset_top - (*workp++);
+ else
+ {
+ str_buf[0] = *workp++;
+ range_start_char = str_buf;
+ }
+
+ /* range_end */
+ if (*workp < 0)
+ range_end_char = charset_top - (*workp++);
+ else
+ {
+ str_buf[4] = *workp++;
+ range_end_char = str_buf + 4;
+ }
+
+ if (wcscoll(range_start_char, str_buf+2) <= 0 &&
+ wcscoll(str_buf+2, range_end_char) <= 0)
+
+ goto char_set_matched;
+ }
+ }
+
+ /* match with char? */
+ for (; workp < p ; workp++)
+ if (c == *workp)
+ goto char_set_matched;
+
+ not = !not;
+
+ char_set_matched:
+ if (not) goto fail;
+#else
/* Cast to `unsigned' instead of `unsigned char' in case the
bit list is a full 32 bytes long. */
if (c < (unsigned) (*p * BYTEWIDTH)
@@ -4739,7 +6288,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
p += 1 + *p;
if (!not) goto fail;
-
+#undef WORK_BUFFER_SIZE
+#endif /* MBS_SUPPORT */
SET_REGS_MATCHED ();
d++;
break;
@@ -4834,7 +6384,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
it isn't necessarily one less than now: consider
(a(b)c(d(e)f)g). When group 3 ends, after the f), the
new highest active register is 1. */
- unsigned char r = *p - 1;
+ US_CHAR_TYPE r = *p - 1;
while (r > 0 && !IS_ACTIVE (reg_info[r]))
r--;
@@ -4877,7 +6427,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
case dummy_failure_jump:
EXTRACT_NUMBER_AND_INCR (mcnt, p1);
if (is_a_jump_n)
- p1 += 2;
+ p1 += OFFSET_ADDRESS_SIZE;
break;
default:
@@ -4891,7 +6441,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
by forcing a failure after pushing on the stack the
on_failure_jump's jump in the pattern, and d. */
if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
- && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
+ && (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == start_memory
+ && p1[2+OFFSET_ADDRESS_SIZE] == *p)
{
/* If this group ever matched anything, then restore
what its registers were before trying this last
@@ -4937,7 +6488,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
followed by the numeric value of <digit> as the register number. */
case duplicate:
{
- register const char *d2, *dend2;
+ register const CHAR_TYPE *d2, *dend2;
int regno = *p++; /* Get which register to match against. */
DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
@@ -4987,7 +6538,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
past them. */
if (translate
? bcmp_translate (d, d2, mcnt, translate)
- : memcmp (d, d2, mcnt))
+ : memcmp (d, d2, mcnt*sizeof(US_CHAR_TYPE)))
goto fail;
d += mcnt, d2 += mcnt;
@@ -5143,7 +6694,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
EXTRACT_NUMBER_AND_INCR (mcnt, p);
DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
{
- register unsigned char *p2 = p;
+ register US_CHAR_TYPE *p2 = p;
/* Compare the beginning of the repeat with what in the
pattern follows its end. If we can establish that there
@@ -5168,9 +6719,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
&& ((re_opcode_t) *p2 == stop_memory
|| (re_opcode_t) *p2 == start_memory))
p2 += 3;
- else if (p2 + 6 < pend
+ else if (p2 + 2 + 2 * OFFSET_ADDRESS_SIZE < pend
&& (re_opcode_t) *p2 == dummy_failure_jump)
- p2 += 6;
+ p2 += 2 + 2 * OFFSET_ADDRESS_SIZE;
else
break;
}
@@ -5186,24 +6737,34 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
/* Consider what happens when matching ":\(.*\)"
against ":/". I don't really understand this code
yet. */
- p[-3] = (unsigned char) pop_failure_jump;
+ p[-(1+OFFSET_ADDRESS_SIZE)] = (US_CHAR_TYPE)
+ pop_failure_jump;
DEBUG_PRINT1
(" End of pattern: change to `pop_failure_jump'.\n");
}
else if ((re_opcode_t) *p2 == exactn
+#ifdef MBS_SUPPORT
+ || (re_opcode_t) *p2 == exactn_bin
+#endif
|| (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
{
- register unsigned char c
- = *p2 == (unsigned char) endline ? '\n' : p2[2];
+ register US_CHAR_TYPE c
+ = *p2 == (US_CHAR_TYPE) endline ? '\n' : p2[2];
- if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
+ if (((re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn
+#ifdef MBS_SUPPORT
+ || (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn_bin
+#endif
+ ) && p1[3+OFFSET_ADDRESS_SIZE] != c)
{
- p[-3] = (unsigned char) pop_failure_jump;
+ p[-(1+OFFSET_ADDRESS_SIZE)] = (US_CHAR_TYPE)
+ pop_failure_jump;
DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
- c, p1[5]);
+ c, p1[3+OFFSET_ADDRESS_SIZE]);
}
+#ifndef MBS_SUPPORT
else if ((re_opcode_t) p1[3] == charset
|| (re_opcode_t) p1[3] == charset_not)
{
@@ -5221,7 +6782,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
}
}
+#endif /* not MBS_SUPPORT */
}
+#ifndef MBS_SUPPORT
else if ((re_opcode_t) *p2 == charset)
{
/* We win if the first character of the loop is not part
@@ -5270,11 +6833,12 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
}
}
}
+#endif /* not MBS_SUPPORT */
}
- p -= 2; /* Point at relative address again. */
+ p -= OFFSET_ADDRESS_SIZE; /* Point at relative address again. */
if ((re_opcode_t) p[-1] != pop_failure_jump)
{
- p[-1] = (unsigned char) jump;
+ p[-1] = (US_CHAR_TYPE) jump;
DEBUG_PRINT1 (" Match => jump.\n");
goto unconditional_jump;
}
@@ -5295,8 +6859,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
register from the stack, since lowest will == highest in
`pop_failure_point'. */
active_reg_t dummy_low_reg, dummy_high_reg;
- unsigned char *pdummy;
- const char *sdummy;
+ US_CHAR_TYPE *pdummy = NULL;
+ const CHAR_TYPE *sdummy = NULL;
DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
POP_FAILURE_POINT (sdummy, pdummy,
@@ -5361,7 +6925,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
/* Have to succeed matching what follows at least n times.
After that, handle like `on_failure_jump'. */
case succeed_n:
- EXTRACT_NUMBER (mcnt, p + 2);
+ EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
assert (mcnt >= 0);
@@ -5369,46 +6933,58 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
if (mcnt > 0)
{
mcnt--;
- p += 2;
+ p += OFFSET_ADDRESS_SIZE;
STORE_NUMBER_AND_INCR (p, mcnt);
#ifdef _LIBC
- DEBUG_PRINT3 (" Setting %p to %d.\n", p - 2, mcnt);
+ DEBUG_PRINT3 (" Setting %p to %d.\n", p - OFFSET_ADDRESS_SIZE
+ , mcnt);
#else
- DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - 2, mcnt);
+ DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - OFFSET_ADDRESS_SIZE
+ , mcnt);
#endif
}
else if (mcnt == 0)
{
#ifdef _LIBC
- DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n", p+2);
+ DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n",
+ p + OFFSET_ADDRESS_SIZE);
#else
- DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2);
-#endif
- p[2] = (unsigned char) no_op;
- p[3] = (unsigned char) no_op;
+ DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n",
+ p + OFFSET_ADDRESS_SIZE);
+#endif /* _LIBC */
+
+#ifdef MBS_SUPPORT
+ p[1] = (US_CHAR_TYPE) no_op;
+#else
+ p[2] = (US_CHAR_TYPE) no_op;
+ p[3] = (US_CHAR_TYPE) no_op;
+#endif /* MBS_SUPPORT */
goto on_failure;
}
break;
case jump_n:
- EXTRACT_NUMBER (mcnt, p + 2);
+ EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
/* Originally, this is how many times we CAN jump. */
if (mcnt)
{
mcnt--;
- STORE_NUMBER (p + 2, mcnt);
+ STORE_NUMBER (p + OFFSET_ADDRESS_SIZE, mcnt);
+
#ifdef _LIBC
- DEBUG_PRINT3 (" Setting %p to %d.\n", p + 2, mcnt);
+ DEBUG_PRINT3 (" Setting %p to %d.\n", p + OFFSET_ADDRESS_SIZE,
+ mcnt);
#else
- DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + 2, mcnt);
-#endif
+ DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + OFFSET_ADDRESS_SIZE,
+ mcnt);
+#endif /* _LIBC */
goto unconditional_jump;
}
/* If don't have to jump any more, skip over the rest of command. */
else
- p += 4;
+ p += 2 * OFFSET_ADDRESS_SIZE;
break;
case set_number_at:
@@ -5640,12 +7216,12 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
static boolean
group_match_null_string_p (p, end, reg_info)
- unsigned char **p, *end;
+ US_CHAR_TYPE **p, *end;
register_info_type *reg_info;
{
int mcnt;
/* Point to after the args to the start_memory. */
- unsigned char *p1 = *p + 2;
+ US_CHAR_TYPE *p1 = *p + 2;
while (p1 < end)
{
@@ -5683,14 +7259,16 @@ group_match_null_string_p (p, end, reg_info)
with an on_failure_jump (see above) that jumps to right
past a jump_past_alt. */
- while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
+ while ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] ==
+ jump_past_alt)
{
/* `mcnt' holds how many bytes long the alternative
is, including the ending `jump_past_alt' and
its number. */
- if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
- reg_info))
+ if (!alt_match_null_string_p (p1, p1 + mcnt -
+ (1 + OFFSET_ADDRESS_SIZE),
+ reg_info))
return false;
/* Move to right after this alternative, including the
@@ -5706,10 +7284,11 @@ group_match_null_string_p (p, end, reg_info)
alternative that starts with an on_failure_jump. */
p1++;
EXTRACT_NUMBER_AND_INCR (mcnt, p1);
- if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
+ if ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] !=
+ jump_past_alt)
{
/* Get to the beginning of the n-th alternative. */
- p1 -= 3;
+ p1 -= 1 + OFFSET_ADDRESS_SIZE;
break;
}
}
@@ -5717,7 +7296,7 @@ group_match_null_string_p (p, end, reg_info)
/* Deal with the last alternative: go back and get number
of the `jump_past_alt' just before it. `mcnt' contains
the length of the alternative. */
- EXTRACT_NUMBER (mcnt, p1 - 2);
+ EXTRACT_NUMBER (mcnt, p1 - OFFSET_ADDRESS_SIZE);
if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
return false;
@@ -5749,11 +7328,11 @@ group_match_null_string_p (p, end, reg_info)
static boolean
alt_match_null_string_p (p, end, reg_info)
- unsigned char *p, *end;
+ US_CHAR_TYPE *p, *end;
register_info_type *reg_info;
{
int mcnt;
- unsigned char *p1 = p;
+ US_CHAR_TYPE *p1 = p;
while (p1 < end)
{
@@ -5786,13 +7365,13 @@ alt_match_null_string_p (p, end, reg_info)
static boolean
common_op_match_null_string_p (p, end, reg_info)
- unsigned char **p, *end;
+ US_CHAR_TYPE **p, *end;
register_info_type *reg_info;
{
int mcnt;
boolean ret;
int reg_no;
- unsigned char *p1 = *p;
+ US_CHAR_TYPE *p1 = *p;
switch ((re_opcode_t) *p1++)
{
@@ -5838,12 +7417,12 @@ common_op_match_null_string_p (p, end, reg_info)
case succeed_n:
/* Get to the number of times to succeed. */
- p1 += 2;
+ p1 += OFFSET_ADDRESS_SIZE;
EXTRACT_NUMBER_AND_INCR (mcnt, p1);
if (mcnt == 0)
{
- p1 -= 4;
+ p1 -= 2 * OFFSET_ADDRESS_SIZE;
EXTRACT_NUMBER_AND_INCR (mcnt, p1);
p1 += mcnt;
}
@@ -5857,7 +7436,7 @@ common_op_match_null_string_p (p, end, reg_info)
break;
case set_number_at:
- p1 += 4;
+ p1 += 2 * OFFSET_ADDRESS_SIZE;
default:
/* All other opcodes mean we cannot match the empty string. */
@@ -5874,15 +7453,21 @@ common_op_match_null_string_p (p, end, reg_info)
static int
bcmp_translate (s1, s2, len, translate)
- const char *s1, *s2;
+ const CHAR_TYPE *s1, *s2;
register int len;
RE_TRANSLATE_TYPE translate;
{
- register const unsigned char *p1 = (const unsigned char *) s1;
- register const unsigned char *p2 = (const unsigned char *) s2;
+ register const US_CHAR_TYPE *p1 = (const US_CHAR_TYPE *) s1;
+ register const US_CHAR_TYPE *p2 = (const US_CHAR_TYPE *) s2;
while (len)
{
+#ifdef MBS_SUPPORT
+ if (((*p1<=0xff)?translate[*p1++]:*p1++)
+ != ((*p2<=0xff)?translate[*p2++]:*p2++))
+ return 1;
+#else
if (translate[*p1++] != translate[*p2++]) return 1;
+#endif /* MBS_SUPPORT */
len--;
}
return 0;