diff options
author | Ulrich Drepper <drepper@redhat.com> | 2002-04-24 21:54:53 +0000 |
---|---|---|
committer | Ulrich Drepper <drepper@redhat.com> | 2002-04-24 21:54:53 +0000 |
commit | 612546c60dd28d7af44fbb2bc98c69c33b4a0c49 (patch) | |
tree | 11dc6bf94bd7beb3271366b33a86be162b3bfa6e /posix | |
parent | be479a6dfe81c5b426e2cbabd62be2c042ab2d45 (diff) | |
download | glibc-612546c60dd28d7af44fbb2bc98c69c33b4a0c49.zip glibc-612546c60dd28d7af44fbb2bc98c69c33b4a0c49.tar.gz glibc-612546c60dd28d7af44fbb2bc98c69c33b4a0c49.tar.bz2 |
Update.
2002-04-22 Isamu Hasegawa <isamu@yamato.ibm.com>
* posix/regcomp.c (re_compile_internal): Adapt it to new interface
of buffer building functions.
* posix/regex_internal.c (re_string_allocate): New function.
(re_string_realloc_buffers): New function.
(re_string_skip_chars): New function.
(re_string_reconstruct): New function.
(re_string_construct): Adapt it to new interface of buffer building
functions.
(re_string_construct_common): Likewise.
(build_wcs_buffer): Likewise.
(build_wcs_upper_buffer): Likewise.
(build_upper_buffer): Likewise.
(re_string_translate_buffer): Likewise.
(re_string_context_at): Adapt it to variable length buffers.
* posix/regex_internal.h (re_string_t): Add new fields to handle
variable length buffers.
(re_match_context_t): Likewise.
* posix/regexec.c (re_search_internal): Adapt it to new interface
of re_string_t and re_match_context_t.
(acquire_init_state_context): Likewise.
(check_matching): Likewise.
(check_halt_state_context): Likewise.
(proceed_next_node): Likewise.
(set_regs): Likewise.
(sift_states_backward): Likewise.
(clean_state_log_if_need): Likewise.
(sift_states_iter_mb): Likewise.
(sift_states_iter_bkref): Likewise.
(add_epsilon_backreference): Likewise.
(transit_state): Likewise.
(transit_state_sb): Likewise.
(transit_state_mb): Likewise.
(transit_state_bkref): Likewise.
(transit_state_bkref_loop): Likewise.
(check_node_accept): Likewise.
(match_ctx_init): Likewise.
(extend_buffers): New function.
2002-04-21 Bruno Haible <bruno@clisp.org>
* iconvdata/tst-table.sh: For the second check, use the truncated
GB18030 charmap table, like for the first check.
Diffstat (limited to 'posix')
-rw-r--r-- | posix/regcomp.c | 8 | ||||
-rw-r--r-- | posix/regex_internal.c | 511 | ||||
-rw-r--r-- | posix/regex_internal.h | 112 | ||||
-rw-r--r-- | posix/regexec.c | 631 |
4 files changed, 760 insertions, 502 deletions
diff --git a/posix/regcomp.c b/posix/regcomp.c index 04d67a1..149814c 100644 --- a/posix/regcomp.c +++ b/posix/regcomp.c @@ -692,12 +692,8 @@ re_compile_internal (preg, pattern, length, syntax) return err; } - if (syntax & RE_ICASE) - err = re_string_construct_toupper (®exp, pattern, length, - preg->translate); - else - err = re_string_construct (®exp, pattern, length, preg->translate); - + err = re_string_construct (®exp, pattern, length, preg->translate, + syntax & RE_ICASE); if (BE (err != REG_NOERROR, 0)) { re_free (dfa); diff --git a/posix/regex_internal.c b/posix/regex_internal.c index 8b16dd6..b688d0f 100644 --- a/posix/regex_internal.c +++ b/posix/regex_internal.c @@ -58,14 +58,9 @@ #include "regex_internal.h" static void re_string_construct_common (const unsigned char *str, - int len, re_string_t *pstr); -#ifdef RE_ENABLE_I18N -static reg_errcode_t build_wcs_buffer (re_string_t *pstr); -static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr); -#endif /* RE_ENABLE_I18N */ -static reg_errcode_t build_upper_buffer (re_string_t *pstr); -static reg_errcode_t re_string_translate_buffer (re_string_t *pstr, - RE_TRANSLATE_TYPE trans); + int len, re_string_t *pstr, + RE_TRANSLATE_TYPE trans, int icase); +static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx); static re_dfastate_t *create_newstate_common (re_dfa_t *dfa, const re_node_set *nodes, unsigned int hash); @@ -83,278 +78,416 @@ static unsigned int inline calc_state_hash (const re_node_set *nodes, /* Functions for string operation. */ -/* Construct string object. */ +/* This function allocate the buffers. It is necessary to call + re_string_reconstruct before using the object. */ + static reg_errcode_t -re_string_construct (pstr, str, len, trans) +re_string_allocate (pstr, str, len, init_len, trans, icase) re_string_t *pstr; const unsigned char *str; - int len; + int len, init_len, icase; RE_TRANSLATE_TYPE trans; { reg_errcode_t ret; - re_string_construct_common (str, len, pstr); -#ifdef RE_ENABLE_I18N - if (MB_CUR_MAX >1 && pstr->len > 0) + int init_buf_len = (len + 1 < init_len) ? len + 1: init_len; + re_string_construct_common (str, len, pstr, trans, icase); + + ret = re_string_realloc_buffers (pstr, init_buf_len); + if (BE (ret != REG_NOERROR, 0)) + return ret; + + pstr->mbs_case = (MBS_CASE_ALLOCATED (pstr) ? pstr->mbs_case + : (unsigned char *)str); + pstr->mbs = MBS_ALLOCATED (pstr) ? pstr->mbs : pstr->mbs_case; + pstr->valid_len = (MBS_CASE_ALLOCATED (pstr) || MBS_ALLOCATED (pstr) + || MB_CUR_MAX > 1) ? pstr->valid_len : len; + return REG_NOERROR; +} + +/* This function allocate the buffers, and initialize them. */ + +static reg_errcode_t +re_string_construct (pstr, str, len, trans, icase) + re_string_t *pstr; + const unsigned char *str; + int len, icase; + RE_TRANSLATE_TYPE trans; +{ + reg_errcode_t ret; + re_string_construct_common (str, len, pstr, trans, icase); + /* Set 0 so that this function can initialize whole buffers. */ + pstr->valid_len = 0; + + if (len > 0) { - ret = build_wcs_buffer (pstr); + ret = re_string_realloc_buffers (pstr, len + 1); if (BE (ret != REG_NOERROR, 0)) return ret; } + pstr->mbs_case = (MBS_CASE_ALLOCATED (pstr) ? pstr->mbs_case + : (unsigned char *)str); + pstr->mbs = MBS_ALLOCATED (pstr) ? pstr->mbs : pstr->mbs_case; + + if (icase) + { +#ifdef RE_ENABLE_I18N + if (MB_CUR_MAX > 1) + build_wcs_upper_buffer (pstr); + else + build_upper_buffer (pstr); #endif /* RE_ENABLE_I18N */ - pstr->mbs_case = str; - if (trans != NULL) + } + else { - ret = re_string_translate_buffer (pstr, trans); - if (BE (ret != REG_NOERROR, 0)) - return ret; +#ifdef RE_ENABLE_I18N + if (MB_CUR_MAX > 1) + build_wcs_buffer (pstr); + else +#endif /* RE_ENABLE_I18N */ + { + if (trans != NULL) + re_string_translate_buffer (pstr); + else + pstr->valid_len = len; + } } + + /* Initialized whole buffers, then valid_len == bufs_len. */ + pstr->valid_len = pstr->bufs_len; return REG_NOERROR; } -/* Construct string object. We use this function instead of - re_string_construct for case insensitive mode. */ +/* Helper functions for re_string_allocate, and re_string_construct. */ static reg_errcode_t -re_string_construct_toupper (pstr, str, len, trans) +re_string_realloc_buffers (pstr, new_buf_len) re_string_t *pstr; - const unsigned char *str; - int len; - RE_TRANSLATE_TYPE trans; + int new_buf_len; { - reg_errcode_t ret; - /* Set case sensitive buffer. */ - re_string_construct_common (str, len, pstr); #ifdef RE_ENABLE_I18N - if (MB_CUR_MAX >1) + if (MB_CUR_MAX > 1) { - if (BE (pstr->len > 0, 1)) - { - ret = build_wcs_upper_buffer (pstr); - if (BE (ret != REG_NOERROR, 0)) - return ret; - } + pstr->wcs = re_realloc (pstr->wcs, wchar_t, new_buf_len); + if (BE (pstr->wcs == NULL, 0)) + return REG_ESPACE; } - else #endif /* RE_ENABLE_I18N */ + if (MBS_ALLOCATED (pstr)) { - if (BE (pstr->len > 0, 1)) - { - ret = build_upper_buffer (pstr); - if (BE (ret != REG_NOERROR, 0)) - return ret; - } + pstr->mbs = re_realloc (pstr->mbs, unsigned char, new_buf_len); + if (BE (pstr->mbs == NULL, 0)) + return REG_ESPACE; } - pstr->mbs_case = str; - if (trans != NULL) + if (MBS_CASE_ALLOCATED (pstr)) { - ret = re_string_translate_buffer (pstr, trans); - if (BE (ret != REG_NOERROR, 0)) - return ret; + pstr->mbs_case = re_realloc (pstr->mbs_case, unsigned char, new_buf_len); + if (BE (pstr->mbs_case == NULL, 0)) + return REG_ESPACE; + if (!MBS_ALLOCATED (pstr)) + pstr->mbs = pstr->mbs_case; } + pstr->bufs_len = new_buf_len; return REG_NOERROR; } -/* Helper functions for re_string_construct_*. */ + static void -re_string_construct_common (str, len, pstr) +re_string_construct_common (str, len, pstr, trans, icase) const unsigned char *str; int len; re_string_t *pstr; + RE_TRANSLATE_TYPE trans; + int icase; { - pstr->mbs = str; - pstr->cur_idx = 0; + memset (pstr, '\0', sizeof (re_string_t)); + pstr->raw_mbs = str; pstr->len = len; -#ifdef RE_ENABLE_I18N - pstr->wcs = NULL; -#endif - pstr->mbs_case = NULL; - pstr->mbs_alloc = 0; - pstr->mbs_case_alloc = 0; + pstr->trans = trans; + pstr->icase = icase ? 1 : 0; } #ifdef RE_ENABLE_I18N -/* Build wide character buffer for `pstr'. +/* Build wide character buffer PSTR->WCS. If the byte sequence of the string are: <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3> Then wide character buffer will be: <wc1> , WEOF , <wc2> , WEOF , <wc3> We use WEOF for padding, they indicate that the position isn't - a first byte of a multibyte character. */ + a first byte of a multibyte character. -static reg_errcode_t + Note that this function assumes PSTR->VALID_LEN elements are already + built and starts from PSTR->VALID_LEN. */ + +static void build_wcs_buffer (pstr) re_string_t *pstr; { - mbstate_t state, prev_st; - wchar_t wc; - int char_idx, char_len, mbclen; - - pstr->wcs = re_malloc (wchar_t, pstr->len + 1); - if (BE (pstr->wcs == NULL, 0)) - return REG_ESPACE; - - memset (&state, '\0', sizeof (mbstate_t)); - char_len = pstr->len; - for (char_idx = 0; char_idx < char_len ;) + mbstate_t prev_st; + int byte_idx, end_idx, mbclen, remain_len; + /* Build the buffers from pstr->valid_len to either pstr->len or + pstr->bufs_len. */ + end_idx = (pstr->bufs_len > pstr->len)? pstr->len : pstr->bufs_len; + for (byte_idx = pstr->valid_len; byte_idx < end_idx;) { - int next_idx, remain_len = char_len - char_idx; - prev_st = state; - mbclen = mbrtowc (&wc, pstr->mbs + char_idx, remain_len, &state); - if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0)) - /* We treat these cases as a singlebyte character. */ + wchar_t wc; + remain_len = end_idx - byte_idx; + prev_st = pstr->cur_state; + mbclen = mbrtowc (&wc, pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, + remain_len, &pstr->cur_state); + if (BE (mbclen == (size_t) -2, 0)) + { + /* The buffer doesn't have enough space, finish to build. */ + pstr->cur_state = prev_st; + break; + } + else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0)) { + /* We treat these cases as a singlebyte character. */ mbclen = 1; - wc = (wchar_t) pstr->mbs[char_idx++]; - state = prev_st; + wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; + pstr->cur_state = prev_st; + } + + /* Apply the translateion if we need. */ + if (pstr->trans != NULL && mbclen == 1) + { + int ch = pstr->trans[pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]]; + pstr->mbs_case[byte_idx] = ch; } /* Write wide character and padding. */ - pstr->wcs[char_idx++] = wc; - for (next_idx = char_idx + mbclen - 1; char_idx < next_idx ;) - pstr->wcs[char_idx++] = WEOF; + pstr->wcs[byte_idx++] = wc; + /* Write paddings. */ + for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) + pstr->wcs[byte_idx++] = WEOF; } - return REG_NOERROR; + pstr->valid_len = byte_idx; } -static reg_errcode_t +/* Build wide character buffer PSTR->WCS like build_wcs_buffer, + but for REG_ICASE. */ + +static void build_wcs_upper_buffer (pstr) re_string_t *pstr; { - mbstate_t state, prev_st; - wchar_t wc; - unsigned char *mbs_upper; - int char_idx, char_len, mbclen; - - pstr->wcs = re_malloc (wchar_t, pstr->len + 1); - mbs_upper = re_malloc (unsigned char, pstr->len + 1); - if (BE (pstr->wcs == NULL || mbs_upper == NULL, 0)) + mbstate_t prev_st; + int byte_idx, end_idx, mbclen, remain_len; + /* Build the buffers from pstr->valid_len to either pstr->len or + pstr->bufs_len. */ + end_idx = (pstr->bufs_len > pstr->len)? pstr->len : pstr->bufs_len; + for (byte_idx = pstr->valid_len; byte_idx < end_idx;) { - pstr->wcs = NULL; - return REG_ESPACE; - } - - memset (&state, '\0', sizeof (mbstate_t)); - char_len = pstr->len; - for (char_idx = 0 ; char_idx < char_len ; char_idx += mbclen) - { - int byte_idx, remain_len = char_len - char_idx; - prev_st = state; - mbclen = mbrtowc (&wc, pstr->mbs + char_idx, remain_len, &state); - if (mbclen == 1) + wchar_t wc; + remain_len = end_idx - byte_idx; + prev_st = pstr->cur_state; + mbclen = mbrtowc (&wc, pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, + remain_len, &pstr->cur_state); + if (BE (mbclen == (size_t) -2, 0)) { - pstr->wcs[char_idx] = wc; - if (islower (pstr->mbs[char_idx])) - mbs_upper[char_idx] = toupper (pstr->mbs[char_idx]); - else - mbs_upper[char_idx] = pstr->mbs[char_idx]; + /* The buffer doesn't have enough space, finish to build. */ + pstr->cur_state = prev_st; + break; } - else if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 - || mbclen == 0, 0)) - /* We treat these cases as a singlebyte character. */ + else if (mbclen == 1 || mbclen == (size_t) -1 || mbclen == 0) { - mbclen = 1; - pstr->wcs[char_idx] = (wchar_t) pstr->mbs[char_idx]; - mbs_upper[char_idx] = pstr->mbs[char_idx]; - state = prev_st; + /* In case of a singlebyte character. */ + int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; + /* Apply the translateion if we need. */ + if (pstr->trans != NULL && mbclen == 1) + { + ch = pstr->trans[ch]; + pstr->mbs_case[byte_idx] = ch; + } + pstr->wcs[byte_idx] = iswlower (wc) ? toupper (wc) : wc; + pstr->mbs[byte_idx++] = islower (ch) ? toupper (ch) : ch; + if (BE (mbclen == (size_t) -1, 0)) + pstr->cur_state = prev_st; } else /* mbclen > 1 */ { - pstr->wcs[char_idx] = wc; if (iswlower (wc)) - wcrtomb (mbs_upper + char_idx, towupper (wc), &prev_st); + wcrtomb (pstr->mbs + byte_idx, towupper (wc), &prev_st); else - memcpy (mbs_upper + char_idx, pstr->mbs + char_idx, mbclen); - for (byte_idx = 1 ; byte_idx < mbclen ; byte_idx++) - pstr->wcs[char_idx + byte_idx] = WEOF; + memcpy (pstr->mbs + byte_idx, + pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen); + pstr->wcs[byte_idx++] = iswlower (wc) ? toupper (wc) : wc; + /* Write paddings. */ + for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) + pstr->wcs[byte_idx++] = WEOF; } } - pstr->mbs = mbs_upper; - pstr->mbs_alloc = 1; - return REG_NOERROR; + pstr->valid_len = byte_idx; +} + +/* Skip characters until the index becomes greater than NEW_RAW_IDX. + Return the index. */ + +static int +re_string_skip_chars (pstr, new_raw_idx) + re_string_t *pstr; + int new_raw_idx; +{ + mbstate_t prev_st; + int rawbuf_idx, mbclen; + + /* Skip the characters which are not necessary to check. */ + for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_len; + rawbuf_idx < new_raw_idx;) + { + int remain_len = pstr->len - rawbuf_idx; + prev_st = pstr->cur_state; + mbclen = mbrlen (pstr->raw_mbs + rawbuf_idx, remain_len, + &pstr->cur_state); + if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0)) + { + /* We treat these cases as a singlebyte character. */ + mbclen = 1; + pstr->cur_state = prev_st; + } + /* Then proceed the next character. */ + rawbuf_idx += mbclen; + } + return rawbuf_idx; } #endif /* RE_ENABLE_I18N */ -static reg_errcode_t +/* Build the buffer PSTR->MBS, and apply the translation if we need. + This function is used in case of REG_ICASE. */ + +static void build_upper_buffer (pstr) re_string_t *pstr; { - unsigned char *mbs_upper; - int char_idx, char_len; - - mbs_upper = re_malloc (unsigned char, pstr->len + 1); - if (BE (mbs_upper == NULL, 0)) - return REG_ESPACE; + int char_idx, end_idx; + end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; - char_len = pstr->len; - for (char_idx = 0 ; char_idx < char_len ; char_idx ++) + for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx) { - if (islower (pstr->mbs[char_idx])) - mbs_upper[char_idx] = toupper (pstr->mbs[char_idx]); + int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx]; + if (pstr->trans != NULL) + { + ch = pstr->trans[ch]; + pstr->mbs_case[char_idx] = ch; + } + if (islower (ch)) + pstr->mbs[char_idx] = toupper (ch); else - mbs_upper[char_idx] = pstr->mbs[char_idx]; + pstr->mbs[char_idx] = ch; } - pstr->mbs = mbs_upper; - pstr->mbs_alloc = 1; - return REG_NOERROR; + pstr->valid_len = char_idx; } -/* Apply TRANS to the buffer in PSTR. We assume that wide char buffer - is already constructed if MB_CUR_MAX > 1. */ +/* Apply TRANS to the buffer in PSTR. */ -static reg_errcode_t -re_string_translate_buffer (pstr, trans) +static void +re_string_translate_buffer (pstr) re_string_t *pstr; - RE_TRANSLATE_TYPE trans; { - int buf_idx; - unsigned char *transed_buf, *transed_case_buf; -#ifdef DEBUG - assert (trans != NULL); -#endif - if (pstr->mbs_alloc) + int buf_idx, end_idx; + end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; + + for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx) { - transed_buf = (unsigned char *) pstr->mbs; - transed_case_buf = re_malloc (unsigned char, pstr->len + 1); - if (BE (transed_case_buf == NULL, 0)) - return REG_ESPACE; - pstr->mbs_case_alloc = 1; + int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx]; + pstr->mbs_case[buf_idx] = pstr->trans[ch]; } - else + + pstr->valid_len = buf_idx; +} + +/* This function re-construct the buffers. + Concretely, convert to wide character in case of MB_CUR_MAX > 1, + convert to upper case in case of REG_ICASE, apply translation. */ + +static reg_errcode_t +re_string_reconstruct (pstr, idx, eflags, newline) + re_string_t *pstr; + int idx, eflags, newline; +{ + int offset = idx - pstr->raw_mbs_idx; + if (offset < 0) { - transed_buf = re_malloc (unsigned char, pstr->len + 1); - if (BE (transed_buf == NULL, 0)) - return REG_ESPACE; - transed_case_buf = NULL; - pstr->mbs_alloc = 1; + /* Reset buffer. */ + memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); + pstr->valid_len = pstr->raw_mbs_idx = 0; + pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF + : CONTEXT_NEWLINE | CONTEXT_BEGBUF); + if (!MBS_CASE_ALLOCATED (pstr)) + pstr->mbs_case = (unsigned char *)pstr->raw_mbs; + if (!MBS_ALLOCATED (pstr) && !MBS_CASE_ALLOCATED (pstr)) + pstr->mbs = (unsigned char *)pstr->raw_mbs; + offset = idx; } - for (buf_idx = 0 ; buf_idx < pstr->len ; buf_idx++) + + if (offset != 0) { + pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags, + newline); + /* Are the characters which are already checked remain? */ + if (offset < pstr->valid_len) + { + /* Yes, move them to the front of the buffer. */ #ifdef RE_ENABLE_I18N - if (MB_CUR_MAX > 1 && !re_string_is_single_byte_char (pstr, buf_idx)) - transed_buf[buf_idx] = pstr->mbs[buf_idx]; - else + if (MB_CUR_MAX > 1) + memmove (pstr->wcs, pstr->wcs + offset, + (pstr->valid_len - offset) * sizeof (wchar_t)); +#endif /* RE_ENABLE_I18N */ + if (MBS_ALLOCATED (pstr)) + memmove (pstr->mbs, pstr->mbs + offset, + pstr->valid_len - offset); + if (MBS_CASE_ALLOCATED (pstr)) + memmove (pstr->mbs_case, pstr->mbs_case + offset, + pstr->valid_len - offset); + pstr->valid_len -= offset; +#if DEBUG + assert (pstr->valid_len > 0); #endif - transed_buf[buf_idx] = trans[pstr->mbs[buf_idx]]; - if (transed_case_buf) + } + else { + /* No, skip all characters until IDX. */ + pstr->valid_len = 0; #ifdef RE_ENABLE_I18N - if (MB_CUR_MAX > 1 && !re_string_is_single_byte_char (pstr, buf_idx)) - transed_case_buf[buf_idx] = pstr->mbs_case[buf_idx]; - else -#endif - transed_case_buf[buf_idx] = trans[pstr->mbs_case[buf_idx]]; + if (MB_CUR_MAX > 1) + { + int wcs_idx; + pstr->valid_len = re_string_skip_chars (pstr, idx) - idx; + for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx) + pstr->wcs[wcs_idx] = WEOF; + } +#endif /* RE_ENABLE_I18N */ + } + if (!MBS_CASE_ALLOCATED (pstr)) + { + pstr->mbs_case += offset; + /* In case of !MBS_ALLOCATED && !MBS_CASE_ALLOCATED. */ + if (!MBS_ALLOCATED (pstr)) + pstr->mbs += offset; } } - if (pstr->mbs_case_alloc == 1) + pstr->raw_mbs_idx = idx; + pstr->len -= offset; + + /* Then build the buffers. */ +#ifdef RE_ENABLE_I18N + if (MB_CUR_MAX > 1) { - pstr->mbs = transed_buf; - pstr->mbs_case = transed_case_buf; + if (pstr->icase) + build_wcs_upper_buffer (pstr); + else + build_wcs_buffer (pstr); } else +#endif /* RE_ENABLE_I18N */ { - pstr->mbs = transed_buf; - pstr->mbs_case = transed_buf; + if (pstr->icase) + build_upper_buffer (pstr); + else if (pstr->trans != NULL) + re_string_translate_buffer (pstr); } + pstr->cur_idx = 0; + return REG_NOERROR; } @@ -365,13 +498,14 @@ re_string_destruct (pstr) #ifdef RE_ENABLE_I18N re_free (pstr->wcs); #endif /* RE_ENABLE_I18N */ - if (pstr->mbs_alloc) - re_free ((void *) pstr->mbs); - if (pstr->mbs_case_alloc) - re_free ((void *) pstr->mbs_case); + if (MBS_ALLOCATED (pstr)) + re_free (pstr->mbs); + if (MBS_CASE_ALLOCATED (pstr)) + re_free (pstr->mbs_case); } /* Return the context at IDX in INPUT. */ + static unsigned int re_string_context_at (input, idx, eflags, newline_anchor) const re_string_t *input; @@ -380,17 +514,13 @@ re_string_context_at (input, idx, eflags, newline_anchor) int c; if (idx < 0 || idx == input->len) { - unsigned int context = 0; if (idx < 0) - context = CONTEXT_BEGBUF; + /* In this case, we use the value stored in input->tip_context, + since we can't know the character in input->mbs[-1] here. */ + return input->tip_context; else /* (idx == input->len) */ - context = CONTEXT_ENDBUF; - - if ((idx < 0 && !(eflags & REG_NOTBOL)) - || (idx == input->len && !(eflags & REG_NOTEOL))) - return CONTEXT_NEWLINE | context; - else - return context; + return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF + : CONTEXT_NEWLINE | CONTEXT_ENDBUF); } c = re_string_byte_at (input, idx); if (IS_WORD_CHAR (c)) @@ -737,6 +867,7 @@ re_node_set_insert (set, elem) if (set->nelem - idx > 0) memcpy (new_array + idx + 1, set->elems + idx, sizeof (int) * (set->nelem - idx)); + re_free (set->elems); set->elems = new_array; } else diff --git a/posix/regex_internal.h b/posix/regex_internal.h index bb28102..f676ae2 100644 --- a/posix/regex_internal.h +++ b/posix/regex_internal.h @@ -201,33 +201,67 @@ typedef struct struct re_string_t { + /* Indicate the raw buffer which is the original string passed as an + argument of regexec(), re_search(), etc.. */ + const unsigned char *raw_mbs; + /* Index in RAW_MBS. Each character mbs[i] corresponds to + raw_mbs[raw_mbs_idx + i]. */ + int raw_mbs_idx; /* Store the multibyte string. In case of "case insensitive mode" like - REG_ICASE, upper cases of the string are stored. */ - const unsigned char *mbs; + REG_ICASE, upper cases of the string are stored, otherwise MBS points + the same address that RAW_MBS points. */ + unsigned char *mbs; /* Store the case sensitive multibyte string. In case of "case insensitive mode", the original string are stored, otherwise MBS_CASE points the same address that MBS points. */ - const unsigned char *mbs_case; - int cur_idx; - int len; + unsigned char *mbs_case; #ifdef RE_ENABLE_I18N /* Store the wide character string which is corresponding to MBS. */ wchar_t *wcs; + mbstate_t cur_state; #endif - /* 1 if mbs is allocated by regex library. */ - unsigned int mbs_alloc : 1; - /* 1 if mbs_case is allocated by regex library. */ - unsigned int mbs_case_alloc : 1; + /* The length of the valid characters in the buffers. */ + int valid_len; + /* The length of the buffers MBS, MBS_CASE, and WCS. */ + int bufs_len; + /* The index in MBS, which is updated by re_string_fetch_byte. */ + int cur_idx; + /* This is length_of_RAW_MBS - RAW_MBS_IDX. */ + int len; + /* The context of mbs[0]. We store the context independently, since + the context of mbs[0] may be different from raw_mbs[0], which is + the beginning of the input string. */ + unsigned int tip_context; + /* The translation passed as a part of an argument of re_compile_pattern. */ + RE_TRANSLATE_TYPE trans; + /* 1 if REG_ICASE. */ + unsigned int icase : 1; }; typedef struct re_string_t re_string_t; +/* In case of REG_ICASE, we allocate the buffer dynamically for mbs. */ +#define MBS_ALLOCATED(pstr) (pstr->icase) +/* In case that we need translation, we allocate the buffer dynamically + for mbs_case. Note that mbs == mbs_case if not REG_ICASE. */ +#define MBS_CASE_ALLOCATED(pstr) (pstr->trans != NULL) + +static reg_errcode_t re_string_allocate (re_string_t *pstr, + const unsigned char *str, int len, + int init_len, + RE_TRANSLATE_TYPE trans, int icase); static reg_errcode_t re_string_construct (re_string_t *pstr, const unsigned char *str, int len, - RE_TRANSLATE_TYPE trans); -static reg_errcode_t re_string_construct_toupper (re_string_t *pstr, - const unsigned char *str, - int len, - RE_TRANSLATE_TYPE trans); + RE_TRANSLATE_TYPE trans, int icase); +static reg_errcode_t re_string_reconstruct (re_string_t *pstr, int idx, + int eflags, int newline); +static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr, + int new_buf_len); +#ifdef RE_ENABLE_I18N +static void build_wcs_buffer (re_string_t *pstr); +static void build_wcs_upper_buffer (re_string_t *pstr); +#endif /* RE_ENABLE_I18N */ +static void build_upper_buffer (re_string_t *pstr); +static void re_string_translate_buffer (re_string_t *pstr); static void re_string_destruct (re_string_t *pstr); #ifdef RE_ENABLE_I18N static int re_string_elem_size_at (const re_string_t *pstr, int idx); @@ -253,8 +287,7 @@ static unsigned int re_string_context_at (const re_string_t *input, int idx, #define re_string_cur_idx(pstr) ((pstr)->cur_idx) #define re_string_get_buffer(pstr) ((pstr)->mbs) #define re_string_length(pstr) ((pstr)->len) -#define re_string_byte_at(pstr,idx) \ - ((pstr)->mbs[idx]) +#define re_string_byte_at(pstr,idx) ((pstr)->mbs[idx]) #define re_string_skip_bytes(pstr,idx) ((pstr)->cur_idx += (idx)) #define re_string_set_index(pstr,idx) ((pstr)->cur_idx = (idx)) @@ -279,27 +312,6 @@ struct bin_tree_t }; typedef struct bin_tree_t bin_tree_t; -struct re_backref_cache_entry -{ - int node; - int from; - int to; - int flag; -}; - -typedef struct -{ - int eflags; - int match_first; - int match_last; - int state_log_top; - /* Back reference cache. */ - int nbkref_ents; - int abkref_ents; - struct re_backref_cache_entry *bkref_ents; - int max_bkref_len; -} re_match_context_t; - #define CONTEXT_WORD 1 #define CONTEXT_NEWLINE (CONTEXT_WORD << 1) @@ -363,6 +375,32 @@ struct re_state_table_entry re_dfastate_t **array; }; +struct re_backref_cache_entry +{ + int node; + int from; + int to; + int flag; +}; + +typedef struct +{ + /* EFLAGS of the argument of regexec. */ + int eflags; + /* Where the matching ends. */ + int match_last; + /* The string object corresponding to the input string. */ + re_string_t *input; + /* The state log used by the matcher. */ + re_dfastate_t **state_log; + int state_log_top; + /* Back reference cache. */ + int nbkref_ents; + int abkref_ents; + struct re_backref_cache_entry *bkref_ents; + int max_bkref_len; +} re_match_context_t; + struct re_dfa_t { re_bitset_ptr_t word_char; diff --git a/posix/regexec.c b/posix/regexec.c index a069d7d..e888970 100644 --- a/posix/regexec.c +++ b/posix/regexec.c @@ -39,7 +39,7 @@ #include "regex_internal.h" static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags, - int n); + re_string_t *input, int n); static void match_ctx_free (re_match_context_t *cache); static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node, int from, int to); @@ -49,68 +49,54 @@ static reg_errcode_t re_search_internal (const regex_t *preg, regmatch_t pmatch[], int eflags); static inline re_dfastate_t *acquire_init_state_context (reg_errcode_t *err, const regex_t *preg, - const re_string_t *input, - int idx, int eflags); -static int check_matching (const regex_t *preg, re_string_t *input, - re_match_context_t *mctx, re_dfastate_t **state_log, - int start_idx, int fl_search, int fl_longest_match); + const re_match_context_t *mctx, + int idx); +static int check_matching (const regex_t *preg, re_match_context_t *mctx, + int fl_search, int fl_longest_match); static int check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context); static int check_halt_state_context (const regex_t *preg, const re_dfastate_t *state, - const re_string_t *input, int idx, - int eflags); + const re_match_context_t *mctx, int idx); static int proceed_next_node (const regex_t *preg, - re_dfastate_t **state_log, const re_match_context_t *mctx, - const re_string_t *input, int *pidx, int node, re_node_set *eps_via_nodes); -static reg_errcode_t set_regs (const regex_t *preg, re_dfastate_t **state_log, +static reg_errcode_t set_regs (const regex_t *preg, const re_match_context_t *mctx, - const re_string_t *input, size_t nmatch, - regmatch_t *pmatch, int last); -static int sift_states_iter_mb (const regex_t *preg, re_dfastate_t **state_log, + size_t nmatch, regmatch_t *pmatch, int last); +static int sift_states_iter_mb (const regex_t *preg, const re_match_context_t *mctx, - const re_string_t *input, int node_idx, - int str_idx, int max_str_idx); + int node_idx, int str_idx, int max_str_idx); static int sift_states_iter_bkref (const re_dfa_t *dfa, re_dfastate_t **state_log, struct re_backref_cache_entry *mctx_entry, - int node_idx, int idx, int match_first, - int match_last); + int node_idx, int idx, int match_last); static reg_errcode_t sift_states_backward (const regex_t *preg, - re_dfastate_t **state_log, const re_match_context_t *mctx, - const re_string_t *input, int last_node); +static reg_errcode_t clean_state_log_if_need (re_match_context_t *mctx, + int next_state_log_idx); static reg_errcode_t add_epsilon_backreference (const re_dfa_t *dfa, const re_match_context_t *mctx, const re_node_set *plog, int idx, re_node_set *state_buf); static re_dfastate_t *transit_state (reg_errcode_t *err, const regex_t *preg, - re_dfastate_t *state, re_string_t *input, - int fl_search, re_dfastate_t **state_log, - re_match_context_t *mctx); + re_match_context_t *mctx, + re_dfastate_t *state, int fl_search); static re_dfastate_t *transit_state_sb (reg_errcode_t *err, const regex_t *preg, re_dfastate_t *pstate, - re_string_t *input, int fl_search, + int fl_search, re_match_context_t *mctx); static reg_errcode_t transit_state_mb (const regex_t *preg, re_dfastate_t *pstate, - const re_string_t *input, - re_dfastate_t **state_log, re_match_context_t *mctx); static reg_errcode_t transit_state_bkref (const regex_t *preg, re_dfastate_t *pstate, - const re_string_t *input, - re_dfastate_t **state_log, re_match_context_t *mctx); static reg_errcode_t transit_state_bkref_loop (const regex_t *preg, - const re_string_t *input, re_node_set *nodes, re_dfastate_t **work_state_log, - re_dfastate_t **state_log, re_match_context_t *mctx); static re_dfastate_t **build_trtable (const regex_t *dfa, const re_dfastate_t *state, @@ -124,7 +110,8 @@ static int group_nodes_into_DFAstates (const regex_t *dfa, re_node_set *states_node, bitset *states_ch); static int check_node_accept (const regex_t *preg, const re_token_t *node, - const re_string_t *input, int idx, int eflags); + const re_match_context_t *mctx, int idx); +static reg_errcode_t extend_buffers (re_match_context_t *mctx); /* Entry point for POSIX code. */ @@ -523,7 +510,7 @@ re_search_internal (preg, string, length, start, range, nmatch, pmatch, eflags) reg_errcode_t err; re_dfa_t *dfa = (re_dfa_t *)preg->buffer; re_string_t input; - re_dfastate_t **state_log; + int left_lim, right_lim, incr; int fl_longest_match, match_first, match_last = -1; re_match_context_t mctx; char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate) @@ -540,53 +527,91 @@ re_search_internal (preg, string, length, start, range, nmatch, pmatch, eflags) /* We must check the longest matching, if nmatch > 0. */ fl_longest_match = (nmatch != 0); + err = re_string_allocate (&input, string, length, dfa->nodes_len + 1, + preg->translate, preg->syntax & RE_ICASE); + if (BE (err != REG_NOERROR, 0)) + return err; + + err = match_ctx_init (&mctx, eflags, &input, dfa->nbackref * 2); + if (BE (err != REG_NOERROR, 0)) + return err; + /* We will log all the DFA states through which the dfa pass, if nmatch > 1, or this dfa has "multibyte node", which is a back-reference or a node which can accept multibyte character or multi character collating element. */ if (nmatch > 1 || dfa->has_mb_node) { - state_log = re_malloc (re_dfastate_t *, length + 1); - if (BE (state_log == NULL, 0)) + mctx.state_log = re_malloc (re_dfastate_t *, dfa->nodes_len + 1); + if (BE (mctx.state_log == NULL, 0)) return REG_ESPACE; } else - state_log = NULL; - - if (preg->syntax & RE_ICASE) - err = re_string_construct_toupper (&input, string, length, preg->translate); - else - err = re_string_construct (&input, string, length, preg->translate); - if (BE (err != REG_NOERROR, 0)) - return err; - - err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2); - if (BE (err != REG_NOERROR, 0)) - return err; + mctx.state_log = NULL; #ifdef DEBUG /* We assume front-end functions already check them. */ assert (start + range >= 0 && start + range <= length); #endif + match_first = start; + input.tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF + : CONTEXT_NEWLINE | CONTEXT_BEGBUF); + /* Check incrementally whether of not the input string match. */ - for (match_first = start; ;) + incr = (range < 0) ? -1 : 1; + left_lim = (range < 0) ? start + range : start; + right_lim = (range < 0) ? start : start + range; + + for (;;) { - if ((match_first < length - && (fastmap == NULL - || fastmap[re_string_byte_at (&input, match_first)])) - || preg->can_be_null) + /* At first get the current byte from input string. */ + int ch; + if (MB_CUR_MAX > 1 && (preg->syntax & RE_ICASE || preg->translate)) + { + /* In this case, we can't determin easily the current byte, + since it might be a component byte of a multibyte character. + Then we use the constructed buffer instead. */ + /* If MATCH_FIRST is out of the valid range, reconstruct the + buffers. */ + if (input.raw_mbs_idx + input.valid_len <= match_first) + re_string_reconstruct (&input, match_first, eflags, + preg->newline_anchor); + /* If MATCH_FIRST is out of the buffer, leave it as '\0'. + Note that MATCH_FIRST must not be smaller than 0. */ + ch = ((match_first >= length) ? 0 + : re_string_byte_at (&input, match_first - input.raw_mbs_idx)); + } + else + { + /* We apply translate/conversion manually, since it is trivial + in this case. */ + /* If MATCH_FIRST is out of the buffer, leave it as '\0'. + Note that MATCH_FIRST must not be smaller than 0. */ + ch = (match_first < length) ? (unsigned char)string[match_first] : 0; + /* Apply translation if we need. */ + ch = preg->translate ? preg->translate[ch] : ch; + /* In case of case insensitive mode, convert to upper case. */ + ch = ((preg->syntax & RE_ICASE) && islower (ch)) ? toupper (ch) : ch; + } + + /* Eliminate inappropriate one by fastmap. */ + if (preg->can_be_null || fastmap == NULL || fastmap[ch]) { + /* Reconstruct the buffers so that the matcher can assume that + the matching starts from the begining of the buffer. */ + re_string_reconstruct (&input, match_first, eflags, + preg->newline_anchor); #ifdef RE_ENABLE_I18N - if (MB_CUR_MAX == 1 || re_string_first_byte (&input, match_first)) + /* Eliminate it when it is a component of a multibyte character + and isn't the head of a multibyte character. */ + if (MB_CUR_MAX == 1 || re_string_first_byte (&input, 0)) #endif { - /* We assume that the matching starts from `match_first'. */ - re_string_set_index (&input, match_first); - mctx.match_first = mctx.state_log_top = match_first; - mctx.nbkref_ents = mctx.max_bkref_len = 0; - match_last = check_matching (preg, &input, &mctx, state_log, - match_first, 0, fl_longest_match); + /* It seems to be appropriate one, then use the matcher. */ + /* We assume that the matching starts from 0. */ + mctx.state_log_top = mctx.nbkref_ents = mctx.max_bkref_len = 0; + match_last = check_matching (preg, &mctx, 0, fl_longest_match); if (match_last != -1) { if (BE (match_last == -2, 0)) @@ -597,18 +622,9 @@ re_search_internal (preg, string, length, start, range, nmatch, pmatch, eflags) } } /* Update counter. */ - if (range < 0) - { - --match_first; - if (match_first < start + range) - break; - } - else - { - ++match_first; - if (match_first > start + range) - break; - } + match_first += incr; + if (match_first < left_lim || right_lim < match_first) + break; } /* Set pmatch[] if we need. */ @@ -621,30 +637,38 @@ re_search_internal (preg, string, length, start, range, nmatch, pmatch, eflags) pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1; /* Set the points where matching start/end. */ - pmatch[0].rm_so = mctx.match_first; + pmatch[0].rm_so = 0; mctx.match_last = pmatch[0].rm_eo = match_last; if (!preg->no_sub && nmatch > 1) { /* We need the ranges of all the subexpressions. */ int halt_node; - re_dfastate_t *pstate = state_log[match_last]; + re_dfastate_t *pstate = mctx.state_log[match_last]; #ifdef DEBUG - assert (state_log != NULL); + assert (mctx.state_log != NULL); #endif - halt_node = check_halt_state_context (preg, pstate, &input, - match_last, eflags); - err = sift_states_backward (preg, state_log, &mctx, &input, halt_node); + halt_node = check_halt_state_context (preg, pstate, &mctx, + match_last); + err = sift_states_backward (preg, &mctx, halt_node); if (BE (err != REG_NOERROR, 0)) return err; - err = set_regs (preg, state_log, &mctx, &input, nmatch, pmatch, - halt_node); + err = set_regs (preg, &mctx, nmatch, pmatch, halt_node); if (BE (err != REG_NOERROR, 0)) return err; } + + /* At last, add the offset to the each registers, since we slided + the buffers so that We can assume that the matching starts from 0. */ + for (reg_idx = 0; reg_idx < nmatch; ++reg_idx) + if (pmatch[reg_idx].rm_so != -1) + { + pmatch[reg_idx].rm_so += match_first; + pmatch[reg_idx].rm_eo += match_first; + } } - re_free (state_log); + re_free (mctx.state_log); if (dfa->nbackref) match_ctx_free (&mctx); re_string_destruct (&input); @@ -656,11 +680,11 @@ re_search_internal (preg, string, length, start, range, nmatch, pmatch, eflags) since initial states may have constraints like "\<", "^", etc.. */ static inline re_dfastate_t * -acquire_init_state_context (err, preg, input, idx, eflags) +acquire_init_state_context (err, preg, mctx, idx) reg_errcode_t *err; const regex_t *preg; - const re_string_t *input; - int idx, eflags; + const re_match_context_t *mctx; + int idx; { re_dfa_t *dfa = (re_dfa_t *) preg->buffer; @@ -668,7 +692,7 @@ acquire_init_state_context (err, preg, input, idx, eflags) if (dfa->init_state->has_constraint) { unsigned int context; - context = re_string_context_at (input, idx - 1, eflags, + context = re_string_context_at (mctx->input, idx - 1, mctx->eflags, preg->newline_anchor); if (IS_WORD_CONTEXT (context)) return dfa->init_state_word; @@ -697,52 +721,51 @@ acquire_init_state_context (err, preg, input, idx, eflags) and return the index where the matching end, return -1 if not match, or return -2 in case of an error. FL_SEARCH means we must search where the matching starts, - FL_LONGEST_MATCH means we want the POSIX longest matching. */ + FL_LONGEST_MATCH means we want the POSIX longest matching. + Note that the matcher assume that the maching starts from the current + index of the buffer. */ static int -check_matching (preg, input, mctx, state_log, start_idx, fl_search, - fl_longest_match) +check_matching (preg, mctx, fl_search, fl_longest_match) const regex_t *preg; - re_string_t *input; re_match_context_t *mctx; - re_dfastate_t **state_log; - int start_idx, fl_search, fl_longest_match; + int fl_search, fl_longest_match; { reg_errcode_t err; - int match = 0, match_last = -1; + int match = 0; + int match_last = -1; + int cur_str_idx = re_string_cur_idx (mctx->input); re_dfastate_t *cur_state; - cur_state = acquire_init_state_context (&err, preg, input, start_idx, - mctx->eflags); + cur_state = acquire_init_state_context (&err, preg, mctx, cur_str_idx); /* An initial state must not be NULL(invalid state). */ if (BE (cur_state == NULL, 0)) return -2; - if (state_log != NULL) - state_log[start_idx] = cur_state; + if (mctx->state_log != NULL) + mctx->state_log[cur_str_idx] = cur_state; /* If the RE accepts NULL string. */ if (cur_state->halt) { if (!cur_state->has_constraint - || check_halt_state_context (preg, cur_state, input, start_idx, - mctx->eflags)) + || check_halt_state_context (preg, cur_state, mctx, cur_str_idx)) { if (!fl_longest_match) - return start_idx; + return cur_str_idx; else { - match_last = start_idx; + match_last = cur_str_idx; match = 1; } } } - while (!re_string_eoi (input)) + while (!re_string_eoi (mctx->input)) { - cur_state = transit_state (&err, preg, cur_state, input, - fl_search && !match, state_log, mctx); + cur_state = transit_state (&err, preg, mctx, cur_state, + fl_search && !match); if (cur_state == NULL) /* Reached at the invalid state or an error. */ { - int cur_str_idx = re_string_cur_idx (input); + cur_str_idx = re_string_cur_idx (mctx->input); if (BE (err != REG_NOERROR, 0)) return -2; if (fl_search && !match) @@ -750,27 +773,27 @@ check_matching (preg, input, mctx, state_log, start_idx, fl_search, /* Restart from initial state, since we are searching the point from where matching start. */ #ifdef RE_ENABLE_I18N - if (MB_CUR_MAX == 1 || re_string_first_byte (input, cur_str_idx)) + if (MB_CUR_MAX == 1 + || re_string_first_byte (mctx->input, cur_str_idx)) #endif /* RE_ENABLE_I18N */ - cur_state = acquire_init_state_context (&err, preg, input, - cur_str_idx, - mctx->eflags); + cur_state = acquire_init_state_context (&err, preg, mctx, + cur_str_idx); if (BE (cur_state == NULL && err != REG_NOERROR, 0)) return -2; - if (state_log != NULL) - state_log[cur_str_idx] = cur_state; + if (mctx->state_log != NULL) + mctx->state_log[cur_str_idx] = cur_state; } else if (!fl_longest_match && match) break; else /* (fl_longest_match && match) || (!fl_search && !match) */ { - if (state_log == NULL) + if (mctx->state_log == NULL) break; else { int max = mctx->state_log_top; for (; cur_str_idx <= max; ++cur_str_idx) - if (state_log[cur_str_idx] != NULL) + if (mctx->state_log[cur_str_idx] != NULL) break; if (cur_str_idx > max) break; @@ -783,12 +806,11 @@ check_matching (preg, input, mctx, state_log, start_idx, fl_search, /* Reached at a halt state. Check the halt state can satisfy the current context. */ if (!cur_state->has_constraint - || check_halt_state_context (preg, cur_state, input, - re_string_cur_idx (input), - mctx->eflags)) + || check_halt_state_context (preg, cur_state, mctx, + re_string_cur_idx (mctx->input))) { /* We found an appropriate halt state. */ - match_last = re_string_cur_idx (input); + match_last = re_string_cur_idx (mctx->input); match = 1; if (!fl_longest_match) break; @@ -823,11 +845,11 @@ static int check_halt_node_context (dfa, node, context) match the context, return the node. */ static int -check_halt_state_context (preg, state, input, idx, eflags) +check_halt_state_context (preg, state, mctx, idx) const regex_t *preg; const re_dfastate_t *state; - const re_string_t *input; - int idx, eflags; + const re_match_context_t *mctx; + int idx; { re_dfa_t *dfa = (re_dfa_t *) preg->buffer; int i; @@ -835,7 +857,8 @@ check_halt_state_context (preg, state, input, idx, eflags) #ifdef DEBUG assert (state->halt); #endif - context = re_string_context_at (input, idx, eflags, preg->newline_anchor); + context = re_string_context_at (mctx->input, idx, mctx->eflags, + preg->newline_anchor); for (i = 0; i < state->nodes.nelem; ++i) if (check_halt_node_context (dfa, state->nodes.elems[i], context)) return state->nodes.elems[i]; @@ -848,11 +871,9 @@ check_halt_state_context (preg, state, input, idx, eflags) of errors. */ static int -proceed_next_node (preg, state_log, mctx, input, pidx, node, eps_via_nodes) +proceed_next_node (preg, mctx, pidx, node, eps_via_nodes) const regex_t *preg; - re_dfastate_t **state_log; const re_match_context_t *mctx; - const re_string_t *input; int *pidx, node; re_node_set *eps_via_nodes; { @@ -863,9 +884,9 @@ proceed_next_node (preg, state_log, mctx, input, pidx, node, eps_via_nodes) err = re_node_set_insert (eps_via_nodes, node); if (BE (err < 0, 0)) return -1; - for (i = 0; i < state_log[*pidx]->nodes.nelem; ++i) + for (i = 0; i < mctx->state_log[*pidx]->nodes.nelem; ++i) { - int candidate = state_log[*pidx]->nodes.elems[i]; + int candidate = mctx->state_log[*pidx]->nodes.elems[i]; if (!re_node_set_contains (dfa->edests + node, candidate) && !(dfa->nodes[candidate].type == OP_CONTEXT_NODE && re_node_set_contains (dfa->edests + node, @@ -892,7 +913,7 @@ proceed_next_node (preg, state_log, mctx, input, pidx, node, eps_via_nodes) } if (ACCEPT_MB_NODE (type)) - naccepted = check_node_accept_bytes (preg, entity, input, *pidx); + naccepted = check_node_accept_bytes (preg, entity, mctx->input, *pidx); else if (type == OP_BACK_REF) { for (i = 0; i < mctx->nbkref_ents; ++i) @@ -907,11 +928,12 @@ proceed_next_node (preg, state_log, mctx, input, pidx, node, eps_via_nodes) if (BE (err < 0, 0)) return -1; dest_node = dfa->nexts[node]; - if (re_node_set_contains (&state_log[*pidx]->nodes, dest_node)) + if (re_node_set_contains (&mctx->state_log[*pidx]->nodes, + dest_node)) return dest_node; - for (i = 0; i < state_log[*pidx]->nodes.nelem; ++i) + for (i = 0; i < mctx->state_log[*pidx]->nodes.nelem; ++i) { - dest_node = state_log[*pidx]->nodes.elems[i]; + dest_node = mctx->state_log[*pidx]->nodes.elems[i]; if ((dfa->nodes[dest_node].type == OP_CONTEXT_NODE && (dfa->nexts[node] == dfa->nodes[dest_node].opr.ctx_info->entity))) @@ -921,13 +943,12 @@ proceed_next_node (preg, state_log, mctx, input, pidx, node, eps_via_nodes) } if (naccepted != 0 - || check_node_accept (preg, dfa->nodes + node, input, *pidx, - mctx->eflags)) + || check_node_accept (preg, dfa->nodes + node, mctx, *pidx)) { dest_node = dfa->nexts[node]; *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted; #ifdef DEBUG - assert (state_log[*pidx] != NULL); + assert (mctx->state_log[*pidx] != NULL); #endif re_node_set_empty (eps_via_nodes); return dest_node; @@ -946,11 +967,9 @@ proceed_next_node (preg, state_log, mctx, input, pidx, node, eps_via_nodes) pmatch[i].rm_so == pmatch[i].rm_eo == -1 (i > 1). */ static reg_errcode_t -set_regs (preg, state_log, mctx, input, nmatch, pmatch, last_node) +set_regs (preg, mctx, nmatch, pmatch, last_node) const regex_t *preg; - re_dfastate_t **state_log; const re_match_context_t *mctx; - const re_string_t *input; size_t nmatch; regmatch_t *pmatch; int last_node; @@ -961,7 +980,7 @@ set_regs (preg, state_log, mctx, input, nmatch, pmatch, last_node) int i; #ifdef DEBUG assert (nmatch > 1); - assert (state_log != NULL); + assert (mctx->state_log != NULL); #endif cur_node = dfa->init_node; real_nmatch = (nmatch <= preg->re_nsub) ? nmatch : preg->re_nsub + 1; @@ -1002,8 +1021,7 @@ set_regs (preg, state_log, mctx, input, nmatch, pmatch, last_node) break; /* Proceed to next node. */ - cur_node = proceed_next_node (preg, state_log, mctx, input, &idx, - cur_node, &eps_via_nodes); + cur_node = proceed_next_node (preg, mctx, &idx, cur_node, &eps_via_nodes); if (BE (cur_node < 0, 0)) return REG_ESPACE; } @@ -1013,15 +1031,15 @@ set_regs (preg, state_log, mctx, input, nmatch, pmatch, last_node) #define NUMBER_OF_STATE 1 -/* This function checks the STATE_LOG from the MCTX->match_last - to MCTX->match_first and sift the nodes in each states according to - the following rules. Updated state_log will be wrote to STATE_LOG. +/* This function checks the STATE_LOG from the MCTX->match_last to 0 + and sift the nodes in each states according to the following rules. + Updated state_log will be wrote to STATE_LOG. Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if... 1. When STR_IDX == MATCH_LAST(the last index in the state_log): If `a' isn't the LAST_NODE and `a' can't epsilon transit to the LAST_NODE, we throw away the node `a'. - 2. When MATCH_FIRST <= STR_IDX < MATCH_LAST and `a' accepts + 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts string `s' and transit to `b': i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw away the node `a'. @@ -1037,11 +1055,9 @@ set_regs (preg, state_log, mctx, input, nmatch, pmatch, last_node) ((state) != NULL && re_node_set_contains (&(state)->nodes, node)) static reg_errcode_t -sift_states_backward (preg, state_log, mctx, input, last_node) +sift_states_backward (preg, mctx, last_node) const regex_t *preg; - re_dfastate_t **state_log; const re_match_context_t *mctx; - const re_string_t *input; int last_node; { reg_errcode_t err; @@ -1051,12 +1067,12 @@ sift_states_backward (preg, state_log, mctx, input, last_node) re_node_set *plog; /* Points the state_log[str_idx]->nodes */ #ifdef DEBUG - assert (state_log != NULL && state_log[str_idx] != NULL); + assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL); #endif err = re_node_set_alloc (&state_buf, NUMBER_OF_STATE); if (BE (err != REG_NOERROR, 0)) return err; - plog = &state_log[str_idx]->nodes; + plog = &mctx->state_log[str_idx]->nodes; /* Build sifted state_log[str_idx]. It has the nodes which can epsilon transit to the last_node and the last_node itself. */ @@ -1064,7 +1080,8 @@ sift_states_backward (preg, state_log, mctx, input, last_node) if (BE (err != REG_NOERROR, 0)) return err; - if (state_log[str_idx] != NULL && state_log[str_idx]->has_backref) + if (mctx->state_log[str_idx] != NULL + && mctx->state_log[str_idx]->has_backref) { err = add_epsilon_backreference (dfa, mctx, plog, str_idx, &state_buf); if (BE (err != REG_NOERROR, 0)) @@ -1072,19 +1089,19 @@ sift_states_backward (preg, state_log, mctx, input, last_node) } /* Update state log. */ - state_log[str_idx] = re_acquire_state (&err, dfa, &state_buf); - if (BE (state_log[str_idx] == NULL && err != REG_NOERROR, 0)) + mctx->state_log[str_idx] = re_acquire_state (&err, dfa, &state_buf); + if (BE (mctx->state_log[str_idx] == NULL && err != REG_NOERROR, 0)) return err; /* Then check each states in the state_log. */ - while (str_idx > mctx->match_first) + while (str_idx > 0) { int i, j; /* Update counters. */ re_node_set_empty (&state_buf); --str_idx; - plog = ((state_log[str_idx] == NULL) ? &empty_set - : &state_log[str_idx]->nodes); + plog = ((mctx->state_log[str_idx] == NULL) ? &empty_set + : &mctx->state_log[str_idx]->nodes); /* Then build the next sifted state. We build the next sifted state on `state_buf', and update @@ -1106,27 +1123,25 @@ sift_states_backward (preg, state_log, mctx, input, last_node) /* If the node may accept `multi byte'. */ if (ACCEPT_MB_NODE (type)) - naccepted = sift_states_iter_mb (preg, state_log, mctx, input, - entity, str_idx, + naccepted = sift_states_iter_mb (preg, mctx, entity, str_idx, mctx->match_last); /* If the node is a back reference. */ else if (type == OP_BACK_REF) for (j = 0; j < mctx->nbkref_ents; ++j) { - naccepted = sift_states_iter_bkref (dfa, state_log, + naccepted = sift_states_iter_bkref (dfa, mctx->state_log, mctx->bkref_ents + j, prev_node, str_idx, - mctx->match_first, mctx->match_last); if (naccepted) break; } if (!naccepted - && check_node_accept (preg, dfa->nodes + prev_node, input, - str_idx, mctx->eflags) - && STATE_NODE_CONTAINS (state_log[str_idx + 1], + && check_node_accept (preg, dfa->nodes + prev_node, mctx, + str_idx) + && STATE_NODE_CONTAINS (mctx->state_log[str_idx + 1], dfa->nexts[prev_node])) naccepted = 1; @@ -1140,7 +1155,8 @@ sift_states_backward (preg, state_log, mctx, input, last_node) if (BE (err != REG_NOERROR, 0)) return err; } - if (state_log[str_idx] != NULL && state_log[str_idx]->has_backref) + if (mctx->state_log[str_idx] != NULL + && mctx->state_log[str_idx]->has_backref) { err = add_epsilon_backreference (dfa, mctx, plog, str_idx, &state_buf); if (BE (err != REG_NOERROR, 0)) @@ -1148,8 +1164,8 @@ sift_states_backward (preg, state_log, mctx, input, last_node) } /* Update state_log. */ - state_log[str_idx] = re_acquire_state (&err, dfa, &state_buf); - if (BE (state_log[str_idx] == NULL && err != REG_NOERROR, 0)) + mctx->state_log[str_idx] = re_acquire_state (&err, dfa, &state_buf); + if (BE (mctx->state_log[str_idx] == NULL && err != REG_NOERROR, 0)) return err; } @@ -1159,36 +1175,44 @@ sift_states_backward (preg, state_log, mctx, input, last_node) /* Helper functions. */ -static inline void -clean_state_log_if_need (state_log, mctx, next_state_log_idx) - re_dfastate_t **state_log; +static inline reg_errcode_t +clean_state_log_if_need (mctx, next_state_log_idx) re_match_context_t *mctx; int next_state_log_idx; { int top = mctx->state_log_top; + + if (next_state_log_idx >= mctx->input->bufs_len + || (next_state_log_idx >= mctx->input->valid_len + && mctx->input->valid_len < mctx->input->len)) + { + reg_errcode_t err; + err = extend_buffers (mctx); + if (BE (err != REG_NOERROR, 0)) + return err; + } + if (top < next_state_log_idx) { - memset (state_log + top + 1, '\0', + memset (mctx->state_log + top + 1, '\0', sizeof (re_dfastate_t *) * (next_state_log_idx - top)); mctx->state_log_top = next_state_log_idx; } + return REG_NOERROR; } static int -sift_states_iter_mb (preg, state_log, mctx, input, node_idx, str_idx, - max_str_idx) +sift_states_iter_mb (preg, mctx, node_idx, str_idx, max_str_idx) const regex_t *preg; - re_dfastate_t **state_log; const re_match_context_t *mctx; - const re_string_t *input; int node_idx, str_idx, max_str_idx; { re_dfa_t *dfa = (re_dfa_t *) preg->buffer; int naccepted; /* Check the node can accept `multi byte'. */ - naccepted = check_node_accept_bytes (preg, node_idx, input, str_idx); + naccepted = check_node_accept_bytes (preg, node_idx, mctx->input, str_idx); if (naccepted > 0 && str_idx + naccepted <= max_str_idx && - !STATE_NODE_CONTAINS (state_log[str_idx + naccepted], + !STATE_NODE_CONTAINS (mctx->state_log[str_idx + naccepted], dfa->nexts[node_idx])) /* The node can't accept the `multi byte', or the destination was already throwed away, then the node @@ -1200,12 +1224,11 @@ sift_states_iter_mb (preg, state_log, mctx, input, node_idx, str_idx, } static int -sift_states_iter_bkref (dfa, state_log, mctx_entry, node_idx, idx, match_first, - match_last) +sift_states_iter_bkref (dfa, state_log, mctx_entry, node_idx, idx, match_last) const re_dfa_t *dfa; re_dfastate_t **state_log; struct re_backref_cache_entry *mctx_entry; - int node_idx, idx, match_first, match_last; + int node_idx, idx, match_last; { int naccepted = 0; int from_idx, to_idx; @@ -1244,7 +1267,7 @@ add_epsilon_backreference (dfa, mctx, plog, idx, state_buf) if (entry->from == entry->to && entry->from == idx) break; } - if (j < mctx->nbkref_ents || idx == mctx->match_first) + if (j < mctx->nbkref_ents || idx == 0) { reg_errcode_t err; err = re_node_set_add_intersect (state_buf, plog, @@ -1266,30 +1289,38 @@ add_epsilon_backreference (dfa, mctx, plog, idx, state_buf) update the destination of STATE_LOG. */ static re_dfastate_t * -transit_state (err, preg, state, input, fl_search, state_log, mctx) +transit_state (err, preg, mctx, state, fl_search) reg_errcode_t *err; const regex_t *preg; - re_dfastate_t *state, **state_log; - re_string_t *input; - int fl_search; re_match_context_t *mctx; + re_dfastate_t *state; + int fl_search; { re_dfa_t *dfa = (re_dfa_t *) preg->buffer; re_dfastate_t **trtable, *next_state; unsigned char ch; + if (re_string_cur_idx (mctx->input) + 1 >= mctx->input->bufs_len + || (re_string_cur_idx (mctx->input) + 1 >= mctx->input->valid_len + && mctx->input->valid_len < mctx->input->len)) + { + *err = extend_buffers (mctx); + if (BE (*err != REG_NOERROR, 0)) + return NULL; + } + *err = REG_NOERROR; if (state == NULL) { next_state = state; - re_string_skip_bytes (input, 1); + re_string_skip_bytes (mctx->input, 1); } else { /* If the current state can accept multibyte. */ if (state->accept_mb) { - *err = transit_state_mb (preg, state, input, state_log, mctx); + *err = transit_state_mb (preg, state, mctx); if (BE (*err != REG_NOERROR, 0)) return NULL; } @@ -1298,7 +1329,7 @@ transit_state (err, preg, state, input, fl_search, state_log, mctx) if (1) { /* Use transition table */ - ch = re_string_fetch_byte (input); + ch = re_string_fetch_byte (mctx->input); trtable = fl_search ? state->trtable_search : state->trtable; if (trtable == NULL) { @@ -1313,25 +1344,24 @@ transit_state (err, preg, state, input, fl_search, state_log, mctx) else { /* don't use transition table */ - next_state = transit_state_sb (err, preg, state, input, fl_search, - mctx); + next_state = transit_state_sb (err, preg, state, fl_search, mctx); if (BE (next_state == NULL && err != REG_NOERROR, 0)) return NULL; } } /* Update the state_log if we need. */ - if (state_log != NULL) + if (mctx->state_log != NULL) { - int cur_idx = re_string_cur_idx (input); + int cur_idx = re_string_cur_idx (mctx->input); if (cur_idx > mctx->state_log_top) { - state_log[cur_idx] = next_state; + mctx->state_log[cur_idx] = next_state; mctx->state_log_top = cur_idx; } - else if (state_log[cur_idx] == 0) + else if (mctx->state_log[cur_idx] == 0) { - state_log[cur_idx] = next_state; + mctx->state_log[cur_idx] = next_state; } else { @@ -1342,7 +1372,7 @@ transit_state (err, preg, state, input, fl_search, state_log, mctx) the destination of a multibyte char/collating element/ back reference. Then the next state is the union set of these destinations and the results of the transition table. */ - pstate = state_log[cur_idx]; + pstate = mctx->state_log[cur_idx]; log_nodes = pstate->entrance_nodes; if (next_state != NULL) { @@ -1357,9 +1387,10 @@ transit_state (err, preg, state, input, fl_search, state_log, mctx) /* Note: We already add the nodes of the initial state, then we don't need to add them here. */ - context = re_string_context_at (input, re_string_cur_idx (input) - 1, + context = re_string_context_at (mctx->input, + re_string_cur_idx (mctx->input) - 1, mctx->eflags, preg->newline_anchor); - next_state = state_log[cur_idx] + next_state = mctx->state_log[cur_idx] = re_acquire_state_context (err, dfa, &next_nodes, context); /* We don't need to check errors here, since the return value of this function is next_state and ERR is already set. */ @@ -1370,10 +1401,10 @@ transit_state (err, preg, state, input, fl_search, state_log, mctx) /* If the next state has back references. */ if (next_state != NULL && next_state->has_backref) { - *err = transit_state_bkref (preg, next_state, input, state_log, mctx); + *err = transit_state_bkref (preg, next_state, mctx); if (BE (*err != REG_NOERROR, 0)) return NULL; - next_state = state_log[cur_idx]; + next_state = mctx->state_log[cur_idx]; } } return next_state; @@ -1385,18 +1416,17 @@ transit_state (err, preg, state, input, fl_search, state_log, mctx) accepting the current input byte. */ static re_dfastate_t * -transit_state_sb (err, preg, state, input, fl_search, mctx) +transit_state_sb (err, preg, state, fl_search, mctx) reg_errcode_t *err; const regex_t *preg; re_dfastate_t *state; - re_string_t *input; int fl_search; re_match_context_t *mctx; { re_dfa_t *dfa = (re_dfa_t *) preg->buffer; re_node_set next_nodes; re_dfastate_t *next_state; - int node_cnt, cur_str_idx = re_string_cur_idx (input); + int node_cnt, cur_str_idx = re_string_cur_idx (mctx->input); unsigned int context; *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1); @@ -1405,8 +1435,7 @@ transit_state_sb (err, preg, state, input, fl_search, mctx) for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt) { int cur_node = state->nodes.elems[node_cnt]; - if (check_node_accept (preg, dfa->nodes + cur_node, input, - cur_str_idx, mctx->eflags)) + if (check_node_accept (preg, dfa->nodes + cur_node, mctx, cur_str_idx)) { *err = re_node_set_merge (&next_nodes, dfa->eclosures + dfa->nexts[cur_node]); @@ -1434,22 +1463,21 @@ transit_state_sb (err, preg, state, input, fl_search, mctx) return NULL; } } - context = re_string_context_at (input, cur_str_idx, mctx->eflags, + context = re_string_context_at (mctx->input, cur_str_idx, mctx->eflags, preg->newline_anchor); next_state = re_acquire_state_context (err, dfa, &next_nodes, context); /* We don't need to check errors here, since the return value of this function is next_state and ERR is already set. */ re_node_set_free (&next_nodes); - re_string_skip_bytes (input, 1); + re_string_skip_bytes (mctx->input, 1); return next_state; } static reg_errcode_t -transit_state_mb (preg, pstate, input, state_log, mctx) +transit_state_mb (preg, pstate, mctx) const regex_t *preg; - re_dfastate_t *pstate, **state_log; - const re_string_t *input; + re_dfastate_t *pstate; re_match_context_t *mctx; { reg_errcode_t err; @@ -1466,7 +1494,8 @@ transit_state_mb (preg, pstate, input, state_log, mctx) if (dfa->nodes[cur_node_idx].type == OP_CONTEXT_NODE) { - context = re_string_context_at (input, re_string_cur_idx (input), + context = re_string_context_at (mctx->input, + re_string_cur_idx (mctx->input), mctx->eflags, preg->newline_anchor); if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint, context)) @@ -1476,14 +1505,16 @@ transit_state_mb (preg, pstate, input, state_log, mctx) /* How many bytes the node can accepts? */ if (ACCEPT_MB_NODE (dfa->nodes[cur_node_idx].type)) - naccepted = check_node_accept_bytes (preg, cur_node_idx, input, - re_string_cur_idx (input)); + naccepted = check_node_accept_bytes (preg, cur_node_idx, mctx->input, + re_string_cur_idx (mctx->input)); if (naccepted == 0) continue; /* The node can accepts `naccepted' bytes. */ - dest_idx = re_string_cur_idx (input) + naccepted; - clean_state_log_if_need (state_log, mctx, dest_idx); + dest_idx = re_string_cur_idx (mctx->input) + naccepted; + err = clean_state_log_if_need (mctx, dest_idx); + if (BE (err != REG_NOERROR, 0)) + return err; #ifdef DEBUG assert (dfa->nexts[cur_node_idx] != -1); #endif @@ -1491,7 +1522,7 @@ transit_state_mb (preg, pstate, input, state_log, mctx) then we use pstate->nodes.elems[i] instead. */ new_nodes = dfa->eclosures + dfa->nexts[pstate->nodes.elems[i]]; - dest_state = state_log[dest_idx]; + dest_state = mctx->state_log[dest_idx]; if (dest_state == NULL) dest_nodes = *new_nodes; else @@ -1501,11 +1532,11 @@ transit_state_mb (preg, pstate, input, state_log, mctx) if (BE (err != REG_NOERROR, 0)) return err; } - context = re_string_context_at (input, dest_idx - 1, mctx->eflags, + context = re_string_context_at (mctx->input, dest_idx - 1, mctx->eflags, preg->newline_anchor); - state_log[dest_idx] = re_acquire_state_context (&err, dfa, &dest_nodes, - context); - if (BE (state_log[dest_idx] == NULL && err != REG_NOERROR, 0)) + mctx->state_log[dest_idx] + = re_acquire_state_context (&err, dfa, &dest_nodes, context); + if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0)) return err; if (dest_state != NULL) re_node_set_free (&dest_nodes); @@ -1514,24 +1545,20 @@ transit_state_mb (preg, pstate, input, state_log, mctx) } static reg_errcode_t -transit_state_bkref (preg, pstate, input, state_log, mctx) +transit_state_bkref (preg, pstate, mctx) const regex_t *preg; - re_dfastate_t *pstate, **state_log; - const re_string_t *input; + re_dfastate_t *pstate; re_match_context_t *mctx; { reg_errcode_t err; re_dfastate_t **work_state_log; -#ifdef DEBUG - assert (mctx->match_first != -1); -#endif - work_state_log = re_malloc (re_dfastate_t *, re_string_cur_idx (input) + 1); + work_state_log = re_malloc (re_dfastate_t *, + re_string_cur_idx (mctx->input) + 1); if (BE (work_state_log == NULL, 0)) return REG_ESPACE; - err = transit_state_bkref_loop (preg, input, &pstate->nodes, work_state_log, - state_log, mctx); + err = transit_state_bkref_loop (preg, &pstate->nodes, work_state_log, mctx); re_free (work_state_log); return err; } @@ -1539,23 +1566,24 @@ transit_state_bkref (preg, pstate, input, state_log, mctx) /* Caller must allocate `work_state_log'. */ static reg_errcode_t -transit_state_bkref_loop (preg, input, nodes, work_state_log, state_log, mctx) +transit_state_bkref_loop (preg, nodes, work_state_log, mctx) const regex_t *preg; - const re_string_t *input; re_node_set *nodes; - re_dfastate_t **work_state_log, **state_log; + re_dfastate_t **work_state_log; re_match_context_t *mctx; { reg_errcode_t err; re_dfa_t *dfa = (re_dfa_t *) preg->buffer; int i, j; + re_dfastate_t **state_log_bak; regmatch_t *cur_regs = re_malloc (regmatch_t, preg->re_nsub + 1); - int cur_str_idx = re_string_cur_idx (input); + int cur_str_idx = re_string_cur_idx (mctx->input); if (BE (cur_regs == NULL, 0)) return REG_ESPACE; for (i = 0; i < nodes->nelem; ++i) { + unsigned char *buf; int dest_str_idx, subexp_idx, prev_nelem, subexp_len; int node_idx = nodes->elems[i]; unsigned int context; @@ -1569,8 +1597,8 @@ transit_state_bkref_loop (preg, input, nodes, work_state_log, state_log, mctx) else if (node->type == OP_CONTEXT_NODE && dfa->nodes[node->opr.ctx_info->entity].type == OP_BACK_REF) { - context = re_string_context_at (input, cur_str_idx, mctx->eflags, - preg->newline_anchor); + context = re_string_context_at (mctx->input, cur_str_idx, + mctx->eflags, preg->newline_anchor); if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context)) continue; subexp_idx = dfa->nodes[node->opr.ctx_info->entity].opr.idx; @@ -1580,29 +1608,37 @@ transit_state_bkref_loop (preg, input, nodes, work_state_log, state_log, mctx) /* `node' is a backreference. At first, set registers to check the backreference. */ - cur_regs[0].rm_so = mctx->match_first; + cur_regs[0].rm_so = 0; cur_regs[0].rm_eo = cur_str_idx; - memcpy (work_state_log + mctx->match_first, - state_log + mctx->match_first, - sizeof (re_dfastate_t *) - * (cur_str_idx - mctx->match_first + 1)); + memcpy (work_state_log, mctx->state_log, + sizeof (re_dfastate_t *) * (cur_str_idx + 1)); mctx->match_last = cur_str_idx; - sift_states_backward (preg, work_state_log, mctx, input, node_idx); - if (!STATE_NODE_CONTAINS (work_state_log[mctx->match_first], - dfa->init_node)) + state_log_bak = mctx->state_log; + mctx->state_log = work_state_log; + sift_states_backward (preg, mctx, node_idx); + if (!STATE_NODE_CONTAINS (work_state_log[0], dfa->init_node)) continue; for (j = 1; j <= preg->re_nsub; ++j) cur_regs[j].rm_so = cur_regs[j].rm_eo = -1; - set_regs (preg, work_state_log, mctx, input, - subexp_idx + 1, cur_regs, node_idx); + set_regs (preg, mctx, subexp_idx + 1, cur_regs, node_idx); + mctx->state_log = state_log_bak; /* Then check that the backreference can match the input string. */ subexp_len = cur_regs[subexp_idx].rm_eo - cur_regs[subexp_idx].rm_so; - if (subexp_len < 0 - || (strncmp ((re_string_get_buffer (input) - + cur_regs[subexp_idx].rm_so), - re_string_get_buffer (input) + cur_str_idx, subexp_len) - != 0)) + if (subexp_len < 0 || cur_str_idx + subexp_len > mctx->input->len) + continue; + + if (cur_str_idx + subexp_len > mctx->input->valid_len + && mctx->input->valid_len < mctx->input->len) + { + reg_errcode_t err; + err = extend_buffers (mctx); + if (BE (err != REG_NOERROR, 0)) + return err; + } + buf = re_string_get_buffer (mctx->input); + if (strncmp (buf + cur_regs[subexp_idx].rm_so, buf + cur_str_idx, + subexp_len) != 0) continue; /* Successfully matched, add a new cache entry. */ @@ -1610,7 +1646,9 @@ transit_state_bkref_loop (preg, input, nodes, work_state_log, state_log, mctx) err = match_ctx_add_entry (mctx, node_idx, cur_str_idx, dest_str_idx); if (BE (err != REG_NOERROR, 0)) return err; - clean_state_log_if_need (state_log, mctx, dest_str_idx); + err = clean_state_log_if_need (mctx, dest_str_idx); + if (BE (err != REG_NOERROR, 0)) + return err; /* And add the epsilon closures (which is `new_dest_nodes') of the backreference to appropriate state_log. */ @@ -1621,19 +1659,20 @@ transit_state_bkref_loop (preg, input, nodes, work_state_log, state_log, mctx) new_dest_nodes = dfa->nodes[node_idx].opr.ctx_info->bkref_eclosure; else new_dest_nodes = dfa->eclosures + dfa->nexts[node_idx]; - context = (IS_WORD_CHAR (re_string_byte_at (input, dest_str_idx - 1)) + context = (IS_WORD_CHAR (re_string_byte_at (mctx->input, + dest_str_idx - 1)) ? CONTEXT_WORD : 0); - dest_state = state_log[dest_str_idx]; + dest_state = mctx->state_log[dest_str_idx]; - prev_nelem = ((state_log[cur_str_idx] == NULL) ? 0 - : state_log[cur_str_idx]->nodes.nelem); + prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0 + : mctx->state_log[cur_str_idx]->nodes.nelem); /* Add `new_dest_node' to state_log. */ if (dest_state == NULL) { - state_log[dest_str_idx] = re_acquire_state_context (&err, dfa, - new_dest_nodes, - context); - if (BE (state_log[dest_str_idx] == NULL && err != REG_NOERROR, 0)) + mctx->state_log[dest_str_idx] + = re_acquire_state_context (&err, dfa, new_dest_nodes, context); + if (BE (mctx->state_log[dest_str_idx] == NULL + && err != REG_NOERROR, 0)) return err; } else @@ -1643,20 +1682,21 @@ transit_state_bkref_loop (preg, input, nodes, work_state_log, state_log, mctx) new_dest_nodes); if (BE (err != REG_NOERROR, 0)) return err; - state_log[dest_str_idx] = re_acquire_state_context (&err, dfa, - &dest_nodes, - context); - if (BE (state_log[dest_str_idx] == NULL && err != REG_NOERROR, 0)) + mctx->state_log[dest_str_idx] + = re_acquire_state_context (&err, dfa, &dest_nodes, context); + if (BE (mctx->state_log[dest_str_idx] == NULL + && err != REG_NOERROR, 0)) return err; re_node_set_free (&dest_nodes); } /* We need to check recursively if the backreference can epsilon transit. */ - if (subexp_len == 0 && state_log[cur_str_idx]->nodes.nelem > prev_nelem) + if (subexp_len == 0 + && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem) { - err = transit_state_bkref_loop (preg, input, new_dest_nodes, - work_state_log, state_log, mctx); + err = transit_state_bkref_loop (preg, new_dest_nodes, work_state_log, + mctx); if (BE (err != REG_NOERROR, 0)) return err; } @@ -2170,11 +2210,11 @@ find_collation_sequence_value (mbs, mbs_len) byte of the INPUT. */ static int -check_node_accept (preg, node, input, idx, eflags) +check_node_accept (preg, node, mctx, idx) const regex_t *preg; const re_token_t *node; - const re_string_t *input; - int idx, eflags; + const re_match_context_t *mctx; + int idx; { const re_dfa_t *dfa = (re_dfa_t *) preg->buffer; const re_token_t *cur_node; @@ -2183,7 +2223,8 @@ check_node_accept (preg, node, input, idx, eflags) { /* The node has constraints. Check whether the current context satisfies the constraints. */ - unsigned int context = re_string_context_at (input, idx, eflags, + unsigned int context = re_string_context_at (mctx->input, idx, + mctx->eflags, preg->newline_anchor); if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context)) return 0; @@ -2192,7 +2233,7 @@ check_node_accept (preg, node, input, idx, eflags) else cur_node = node; - ch = re_string_byte_at (input, idx); + ch = re_string_byte_at (mctx->input, idx); if (cur_node->type == CHARACTER) return cur_node->opr.c == ch; else if (cur_node->type == SIMPLE_BRACKET) @@ -2203,17 +2244,69 @@ check_node_accept (preg, node, input, idx, eflags) else return 0; } + +/* Extend the buffers, if the buffers have run out. */ + +static reg_errcode_t +extend_buffers (mctx) + re_match_context_t *mctx; +{ + reg_errcode_t ret; + re_string_t *pstr = mctx->input; + + /* Double the lengthes of the buffers. */ + ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2); + if (BE (ret != REG_NOERROR, 0)) + return ret; + + if (mctx->state_log != NULL) + { + /* And double the length of state_log. */ + mctx->state_log = re_realloc (mctx->state_log, re_dfastate_t *, + pstr->bufs_len * 2); + if (BE (mctx->state_log == NULL, 0)) + return REG_ESPACE; + } + + /* Then reconstruct the buffers. */ + if (pstr->icase) + { +#ifdef RE_ENABLE_I18N + if (MB_CUR_MAX > 1) + build_wcs_upper_buffer (pstr); + else +#endif /* RE_ENABLE_I18N */ + build_upper_buffer (pstr); + } + else + { +#ifdef RE_ENABLE_I18N + if (MB_CUR_MAX > 1) + build_wcs_buffer (pstr); + else +#endif /* RE_ENABLE_I18N */ + { + if (pstr->trans != NULL) + re_string_translate_buffer (pstr); + else + pstr->valid_len = pstr->bufs_len; + } + } + return REG_NOERROR; +} + /* Functions for matching context. */ static reg_errcode_t -match_ctx_init (mctx, eflags, n) +match_ctx_init (mctx, eflags, input, n) re_match_context_t *mctx; - int eflags; - int n; + int eflags, n; + re_string_t *input; { mctx->eflags = eflags; - mctx->match_first = mctx->match_last = -1; + mctx->input = input; + mctx->match_last = -1; if (n > 0) { mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n); |