diff options
-rw-r--r-- | ChangeLog | 41 | ||||
-rw-r--r-- | posix/BOOST.tests | 27 | ||||
-rw-r--r-- | posix/PCRE.tests | 28 | ||||
-rw-r--r-- | posix/regcomp.c | 243 | ||||
-rw-r--r-- | posix/regex_internal.c | 1 | ||||
-rw-r--r-- | posix/regex_internal.h | 1 | ||||
-rw-r--r-- | posix/regexec.c | 79 | ||||
-rw-r--r-- | posix/rxspencer/tests | 7 |
8 files changed, 245 insertions, 182 deletions
@@ -1,3 +1,24 @@ +2003-12-23 Paolo Bonzini <bonzini@gnu.org> + + * posix/regex_internal.c (re_dfa_add_node): Initialize opt_subexp. + * posix/regex_internal.h (re_token_type_t): Put OP_DUP_PLUS + among the tokens, rather than among the epsilon-transiting nodes. + (re_token_t): Add the opt_subexp flag. + * posix/regcomp.c (optimize_utf8, calc_first, + calc_next, calc_epsdest): Don't consider OP_DUP_PLUS. + (mark_opt_subexp, mark_opt_subexp_iter): New functions. + (parse_dup_op): Mostly rewritten, lowering OP_DUP_PLUS to + OP_DUP_ASTERISK and marking optional subexpressions + as such using mark_opt_subexp. + * posix/regexec.c (set_regs): Initialize PREV_INDEX_MATCH + and pass it to update_regs. + (update_regs): Use the PREV_INDEX_MATCH parameter, together + with the opt_subexp flag, in order to discard a final empty + match of a repeated subexpression. + * posix/BOOST.tests: Adjust test vectors. + * posix/PCRE.tests: Likewise. + * posix/rxspencer/tests: Likewise. + 2000-05-22 Jakub Jelinek <jakub@redhat.com> * sysdeps/i386/fpu/bits/mathinline.h (__expm1_code): Avoid using ?: @@ -9,7 +30,7 @@ * posix/tst-rxspencer.c: Likewise. Based on a patch by Alex Davis <alex14641@yahoo.com>. -2002-12-17 Paolo Bonzini <bonzini@gnu.org> +2003-12-17 Paolo Bonzini <bonzini@gnu.org> * posix/regex_internal.h [!_LIBC] (internal_function): Define. (re_string_allocate, re_string_construct, re_string_reconstruct, @@ -60,7 +81,7 @@ (re_string_peek_byte_case, re_fetch_byte_case): Change declaration from ANSI to K&R. -2002-12-16 Paolo Bonzini <bonzini@gnu.org> +2003-12-16 Paolo Bonzini <bonzini@gnu.org> * posix/regexec.c (build_trtable): Don't allocate the trtable until state->word_trtable is known. Don't hardcode UINT_BITS @@ -172,7 +193,7 @@ * sysdeps/unix/sysv/linux/powerpc/sys/procfs.h [!__PPC64_ELF_H]: Extend conditional to include typedef elf_vrreg_t. -2002-12-17 Paolo Bonzini <bonzini@gnu.org> +2003-12-17 Paolo Bonzini <bonzini@gnu.org> * posix/regexec.c (re_search_internal): Limit search to the beginning of the buffer if the initial states are empty for @@ -197,7 +218,7 @@ * posix/regex_internal.h: Make sure the regex code compile with non-GCC compilers by hiding attributes. -2002-12-16 Jakub Jelinek <jakub@redhat.com> +2003-12-16 Jakub Jelinek <jakub@redhat.com> Paolo Bonzini <bonzini@gnu.org> * posix/regexec.c (group_nodes_into_DFAstates): Never produce @@ -6084,7 +6105,7 @@ invalid input if the -c option is used. (main): Correctly append IGNORE string for -c option. -2002-04-06 Kaz Kojima <kkojima@rr.iij4u.or.jp> +2003-04-06 Kaz Kojima <kkojima@rr.iij4u.or.jp> * sysdeps/sh/bits/atomic.h: Moved to ... * sysdeps/unix/sysv/linux/sh/bits/atomic.h: ... here. Add comments. @@ -8037,7 +8058,7 @@ * inet/rcmd.c (rresvport_af): Avoid using invliad values. Wrap around in search if port IPPORT_RESERVED/2 has been test. -2002-02-20 Paolo Bonzini <bonzini@gnu.org> +2003-02-20 Paolo Bonzini <bonzini@gnu.org> * posix/regcomp.c: Remove inclusions. * posix/regexec.c: Likewise. @@ -9032,7 +9053,7 @@ * io/ftwtest-sh: Add test for relative path argument to nftw() with FTW_CHDIR option. -2002-01-12 Franz Sirl <Franz.Sirl-kernel@lauterbach.com> +2003-01-12 Franz Sirl <Franz.Sirl-kernel@lauterbach.com> * sysdeps/unix/sysv/linux/kernel-features.h (__ASSUME_VFORK_SYSCALL): Define for powerpc. @@ -9045,12 +9066,12 @@ (INLINE_SYSCALL): Make use of INTERNAL_SYSCALL. * sysdeps/unix/sysv/linux/powerpc/powerpc32/vfork.S: New file. -2002-01-09 Richard Henderson <rth@redhat.com> +2003-01-09 Richard Henderson <rth@redhat.com> * sysdeps/alpha/dl-machine.h (elf_machine_rela): Compute DTPREL64 and TPREL64 without loadbase applied. -2002-01-09 Richard Henderson <rth@redhat.com> +2003-01-09 Richard Henderson <rth@redhat.com> * sysdeps/unix/alpha/sysdep.S: Use correct definition of errno for NOT_IN_libc. @@ -9443,7 +9464,7 @@ * sysdeps/unix/sysv/linux/m68k/bits/stat.h: Add nanosecond fields. -2002-01-03 Paul Eggert <eggert@twinsun.com> +2003-01-03 Paul Eggert <eggert@twinsun.com> * malloc/obstack.h (__INT_TO_PTR) [__STDC__]: Cast result to (void *) to avoid diagnostic with native c89 on SGI IRIX 6.5 diff --git a/posix/BOOST.tests b/posix/BOOST.tests index 32b6c92..98fd3b6 100644 --- a/posix/BOOST.tests +++ b/posix/BOOST.tests @@ -432,27 +432,22 @@ a(b|c){2,4}d abcbd 0 5 3 4 a(b|c){2,4}d abcbcd 0 6 4 5 a(b|c){2,}d abcd 0 4 2 3 a(b|c){2,}d abcbd 0 5 3 4 -; perl only: -a(b|c?)+d abcd 0 4 3 3 -a(b+|((c)*))+d abd 0 3 2 2 2 2 -1 -1 -a(b+|((c)*))+d abcd 0 4 3 3 3 3 2 3 +; perl only: these conflict with the POSIX test below +;a(b|c?)+d abcd 0 4 3 3 +;a(b+|((c)*))+d abd 0 3 2 2 2 2 -1 -1 +;a(b+|((c)*))+d abcd 0 4 3 3 3 3 2 3 ; posix only: - match_default extended REG_EXTENDED REG_STARTEND -; XXX FIXME the following 4 tests fail ATM -;a(b|c?)+d abcd 0 4 2 3 -;a(b|((c)*))+d abcd 0 4 2 3 2 3 2 3 -;a(b+|((c)*))+d abd 0 3 1 2 -1 -1 -1 -1 -;a(b+|((c)*))+d abcd 0 4 2 3 2 3 2 3 - +a(b|c?)+d abcd 0 4 2 3 +a(b|((c)*))+d abcd 0 4 2 3 2 3 2 3 +a(b+|((c)*))+d abd 0 3 1 2 -1 -1 -1 -1 +a(b+|((c)*))+d abcd 0 4 2 3 2 3 2 3 a(b|((c)*))+d ad 0 2 1 1 1 1 -1 -1 - -; XXX FIXME the following 3 tests fail ATM -;a(b|((c)*))*d abcd 0 4 2 3 2 3 2 3 -;a(b+|((c)*))*d abd 0 3 1 2 -1 -1 -1 -1 -;a(b+|((c)*))*d abcd 0 4 2 3 2 3 2 3 - +a(b|((c)*))*d abcd 0 4 2 3 2 3 2 3 +a(b+|((c)*))*d abd 0 3 1 2 -1 -1 -1 -1 +a(b+|((c)*))*d abcd 0 4 2 3 2 3 2 3 a(b|((c)*))*d ad 0 2 1 1 1 1 -1 -1 - match_default normal REG_PERL diff --git a/posix/PCRE.tests b/posix/PCRE.tests index 182f29f..7ea5b9e 100644 --- a/posix/PCRE.tests +++ b/posix/PCRE.tests @@ -1151,13 +1151,13 @@ No match /(abc|)+/ abc 0: abc - 1: + 1: abc abcabc 0: abcabc - 1: + 1: abc abcabcabc 0: abcabcabc - 1: + 1: abc xyz 0: 1: @@ -1165,46 +1165,44 @@ No match /([a]*)*/ a 0: a - 1: + 1: a aaaaa 0: aaaaa - 1: + 1: aaaaa /([ab]*)*/ a 0: a - 1: + 1: a b 0: b - 1: + 1: b ababab 0: ababab - 1: + 1: ababab aaaabcde 0: aaaab - 1: + 1: aaaab bbbb 0: bbbb - 1: + 1: bbbb /([^a]*)*/ b 0: b - 1: + 1: b bbbb 0: bbbb - 1: + 1: bbbb aaa 0: - 1: /([^ab]*)*/ cccc 0: cccc - 1: + 1: cccc abab 0: - 1: /abc/ abc diff --git a/posix/regcomp.c b/posix/regcomp.c index 1a7e519..38b1b9c 100644 --- a/posix/regcomp.c +++ b/posix/regcomp.c @@ -135,6 +135,8 @@ static bin_tree_t *re_dfa_add_tree_node (re_dfa_t *dfa, const re_token_t *token) __attribute ((noinline)); static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa); +static void mark_opt_subexp (const bin_tree_t *src, re_dfa_t *dfa); +static void mark_opt_subexp_iter (const bin_tree_t *src, re_dfa_t *dfa, int idx); /* This table gives an error message for each of the error codes listed in regex.h. Obviously the order here has to be same as there. @@ -1031,7 +1033,6 @@ optimize_utf8 (dfa) case END_OF_RE: case OP_DUP_ASTERISK: case OP_DUP_QUESTION: - case OP_DUP_PLUS: case OP_OPEN_SUBEXP: case OP_CLOSE_SUBEXP: break; @@ -1150,6 +1151,7 @@ calc_first (dfa, node) case OP_CLOSE_BRACKET: case OP_OPEN_DUP_NUM: case OP_CLOSE_DUP_NUM: + case OP_DUP_PLUS: case OP_NON_MATCH_LIST: case OP_OPEN_COLL_ELEM: case OP_CLOSE_COLL_ELEM: @@ -1176,14 +1178,6 @@ calc_first (dfa, node) case OP_CLOSE_SUBEXP: node->first = idx; break; - case OP_DUP_PLUS: -#ifdef DEBUG - assert (node->left != NULL); -#endif - if (node->left->first == -1) - calc_first (dfa, node->left); - node->first = node->left->first; - break; case OP_ALT: node->first = idx; break; @@ -1223,7 +1217,6 @@ calc_next (dfa, node) switch (type) { case OP_DUP_ASTERISK: - case OP_DUP_PLUS: node->next = idx; break; case CONCAT: @@ -1258,7 +1251,6 @@ calc_epsdest (dfa, node) if (node->type == 0) { if (dfa->nodes[idx].type == OP_DUP_ASTERISK - || dfa->nodes[idx].type == OP_DUP_PLUS || dfa->nodes[idx].type == OP_DUP_QUESTION) { if (node->left->first == -1) @@ -2377,8 +2369,8 @@ parse_sub_exp (regexp, preg, token, syntax, nest, err) /* This function parse repetition operators like "*", "+", "{1,3}" etc. */ static bin_tree_t * -parse_dup_op (dup_elem, regexp, dfa, token, syntax, err) - bin_tree_t *dup_elem; +parse_dup_op (elem, regexp, dfa, token, syntax, err) + bin_tree_t *elem; re_string_t *regexp; re_dfa_t *dfa; re_token_t *token; @@ -2386,15 +2378,14 @@ parse_dup_op (dup_elem, regexp, dfa, token, syntax, err) reg_errcode_t *err; { re_token_t dup_token; - bin_tree_t *tree = dup_elem, *work_tree; - int start_idx = re_string_cur_idx (regexp); + bin_tree_t *tree = NULL; + int i, start, end, start_idx = re_string_cur_idx (regexp); re_token_t start_token = *token; + if (token->type == OP_OPEN_DUP_NUM) { - int i; - int end = 0; - int start = fetch_number (regexp, token, syntax); - bin_tree_t *elem; + end = 0; + start = fetch_number (regexp, token, syntax); if (start == -1) { if (token->type == CHARACTER && token->opr.c == ',') @@ -2415,123 +2406,104 @@ parse_dup_op (dup_elem, regexp, dfa, token, syntax, err) if (BE (start == -2 || end == -2, 0)) { /* Invalid sequence. */ - if (token->type == OP_CLOSE_DUP_NUM) - goto parse_dup_op_invalid_interval; - else - goto parse_dup_op_ebrace; + if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0)) + { + if (token->type == END_OF_RE) + *err = REG_EBRACE; + else + *err = REG_BADBR; + + return NULL; + } + + /* If the syntax bit is set, rollback. */ + re_string_set_index (regexp, start_idx); + *token = start_token; + token->type = CHARACTER; + /* mb_partial and word_char bits should be already initialized by + peek_token. */ + return elem; } - if (BE ((start == 0 && end == 0) || tree == NULL, 0)) + + if (BE (end != -1 && start > end, 0)) { - /* We treat "<re>{0}" and "<re>{0,0}" as null string. - Similarly "<re>{0}{m,n}". */ - fetch_token (token, regexp, syntax); + /* First number greater than second. */ + *err = REG_BADBR; return NULL; } + } + else + { + start = (token->type == OP_DUP_PLUS) ? 1 : 0; + end = (token->type == OP_DUP_QUESTION) ? 1 : -1; + } + + /* Treat "<re>{0}*" etc. as "<re>{0}". */ + if (BE (elem == NULL, 0)) + start = end = 0; - /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */ - elem = tree; - for (i = 1; i < start; ++i) + /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */ + else if (BE (start > 0, 0)) + { + tree = elem; + for (i = 2; i <= start; ++i) { - work_tree = duplicate_tree (elem, dfa); - tree = create_tree (dfa, tree, work_tree, CONCAT, 0); - if (BE (work_tree == NULL || tree == NULL, 0)) + elem = duplicate_tree (elem, dfa); + tree = create_tree (dfa, tree, elem, CONCAT, 0); + if (BE (elem == NULL || tree == NULL, 0)) goto parse_dup_op_espace; } + } - if (end == -1) - { - /* We treat "<re>{0,}" as "<re>*". */ - dup_token.type = OP_DUP_ASTERISK; - if (start > 0) - { - elem = duplicate_tree (elem, dfa); - work_tree = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token); - tree = create_tree (dfa, tree, work_tree, CONCAT, 0); - if (BE (elem == NULL || work_tree == NULL || tree == NULL, 0)) - goto parse_dup_op_espace; - } - else - { - tree = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token); - if (BE (tree == NULL, 0)) - goto parse_dup_op_espace; - } - } - else if (BE (start > end, 0)) + if (BE (end != start, 1)) + { + dup_token.type = (end == -1 ? OP_DUP_ASTERISK : OP_DUP_QUESTION); + if (BE (start > 0, 0)) { - /* First number greater than first. */ - *err = REG_BADBR; - return NULL; + elem = duplicate_tree (elem, dfa); + if (BE (elem == NULL, 0)) + goto parse_dup_op_espace; + + /* This subexpression will be marked as optional, so that + empty matches do not touch the registers. */ + mark_opt_subexp (elem, dfa); + + /* Prepare the tree with the modifier. */ + elem = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token); + tree = create_tree (dfa, tree, elem, CONCAT, 0); } - else if (end - start > 0) + else { - /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */ - dup_token.type = OP_DUP_QUESTION; - if (start > 0) - { - elem = duplicate_tree (elem, dfa); - elem = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token); - tree = create_tree (dfa, tree, elem, CONCAT, 0); - if (BE (elem == NULL || tree == NULL, 0)) - goto parse_dup_op_espace; - } - else - { - tree = elem = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token); - if (BE (tree == NULL, 0)) - goto parse_dup_op_espace; - } - for (i = 1; i < end - start; ++i) + /* We do not need to duplicate the tree because we have not + created it yet. */ + mark_opt_subexp (elem, dfa); + tree = elem = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token); + } + + if (BE (elem == NULL || tree == NULL, 0)) + goto parse_dup_op_espace; + + /* This loop is actually executed only when end != -1, + to rewrite <re>{0,n} as <re>?<re>?<re>?... We have + already created the start+1-th copy. */ + for (i = start + 2; i <= end; ++i) + { + elem = duplicate_tree (elem, dfa); + tree = create_tree (dfa, tree, elem, CONCAT, 0); + if (BE (elem == NULL || tree == NULL, 0)) { - work_tree = duplicate_tree (elem, dfa); - tree = create_tree (dfa, tree, work_tree, CONCAT, 0); - if (BE (work_tree == NULL || tree == NULL, 0)) - { - *err = REG_ESPACE; - return NULL; - } + *err = REG_ESPACE; + return NULL; } - } - } - /* Treat "<re>{0}*" etc. as "<re>{0}". */ - else if (tree == NULL) - ; - else - { - tree = re_dfa_add_tree_node (dfa, tree, NULL, token); - if (BE (tree == NULL, 0)) - { - *err = REG_ESPACE; - return NULL; - } + } } + fetch_token (token, regexp, syntax); return tree; parse_dup_op_espace: *err = REG_ESPACE; return NULL; - - parse_dup_op_ebrace: - if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0)) - { - *err = REG_EBRACE; - return NULL; - } - goto parse_dup_op_rollback; - parse_dup_op_invalid_interval: - if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0)) - { - *err = REG_BADBR; - return NULL; - } - parse_dup_op_rollback: - re_string_set_index (regexp, start_idx); - *token = start_token; - token->type = CHARACTER; - /* mb_partial and word_char bits should be already initialized by - peek_token. */ - return dup_elem; } /* Size of the names for collating symbol/equivalence_class/character_class. @@ -3737,6 +3709,47 @@ re_dfa_add_tree_node (dfa, left, right, token) return create_tree (dfa, left, right, 0, new_idx); } +/* Mark the tree SRC as an optional subexpression. */ + +static void +mark_opt_subexp (src, dfa) + const bin_tree_t *src; + re_dfa_t *dfa; +{ + /* Pass an OPT_SUBEXP_IDX which is != 1 if the duplicated tree is + a subexpression. */ + if (src->type == CONCAT + && src->left->type == NON_TYPE + && dfa->nodes[src->left->node_idx].type == OP_OPEN_SUBEXP) + mark_opt_subexp_iter (src, dfa, dfa->nodes[src->left->node_idx].opr.idx); +} + + +/* Recursive tree walker for mark_opt_subexp. */ + +static void +mark_opt_subexp_iter (src, dfa, idx) + const bin_tree_t *src; + re_dfa_t *dfa; +{ + int node_idx; + + if (src->type == NON_TYPE) + { + node_idx = src->node_idx; + if ((dfa->nodes[node_idx].type == OP_OPEN_SUBEXP + || dfa->nodes[node_idx].type == OP_CLOSE_SUBEXP) + && dfa->nodes[node_idx].opr.idx == idx) + dfa->nodes[node_idx].opt_subexp = 1; + } + + if (src->left != NULL) + mark_opt_subexp_iter (src->left, dfa, idx); + + if (src->right != NULL) + mark_opt_subexp_iter (src->right, dfa, idx); +} + /* Duplicate the node SRC, and return new node. */ diff --git a/posix/regex_internal.c b/posix/regex_internal.c index 45b45aa..4cff96f 100644 --- a/posix/regex_internal.c +++ b/posix/regex_internal.c @@ -1306,6 +1306,7 @@ re_dfa_add_node (dfa, token, mode) dfa->nodes_alloc = new_nodes_alloc; } dfa->nodes[dfa->nodes_len] = token; + dfa->nodes[dfa->nodes_len].opt_subexp = 0; dfa->nodes[dfa->nodes_len].duplicated = 0; dfa->nodes[dfa->nodes_len].constraint = 0; return dfa->nodes_len++; diff --git a/posix/regex_internal.h b/posix/regex_internal.h index 8f11ba7..18414a5 100644 --- a/posix/regex_internal.h +++ b/posix/regex_internal.h @@ -281,6 +281,7 @@ typedef struct #endif unsigned int constraint : 10; /* context constraint */ unsigned int duplicated : 1; + unsigned int opt_subexp : 1; #ifdef RE_ENABLE_I18N /* These 2 bits can be moved into the union if needed (e.g. if running out of bits; move opr.c to opr.c.c and move the flags to opr.c.flags). */ diff --git a/posix/regexec.c b/posix/regexec.c index b0f9a53..973d1c7 100644 --- a/posix/regexec.c +++ b/posix/regexec.c @@ -62,7 +62,8 @@ static int check_halt_node_context (const re_dfa_t *dfa, int node, static int check_halt_state_context (const regex_t *preg, const re_dfastate_t *state, const re_match_context_t *mctx, int idx) internal_function; -static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch, int cur_node, +static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch, + regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch) internal_function; static int proceed_next_node (const regex_t *preg, int nregs, regmatch_t *regs, const re_match_context_t *mctx, @@ -1277,11 +1278,13 @@ set_regs (preg, mctx, nmatch, pmatch, fl_backtrack) regmatch_t *pmatch; int fl_backtrack; { - re_dfa_t *dfa = (re_dfa_t *)preg->buffer; + re_dfa_t *dfa = (re_dfa_t *) preg->buffer; int idx, cur_node, real_nmatch; re_node_set eps_via_nodes; struct re_fail_stack_t *fs; - struct re_fail_stack_t fs_body = {0, 2, NULL}; + struct re_fail_stack_t fs_body = { 0, 2, NULL }; + regmatch_t *prev_idx_match; + #ifdef DEBUG assert (nmatch > 1); assert (mctx->state_log != NULL); @@ -1290,15 +1293,23 @@ set_regs (preg, mctx, nmatch, pmatch, fl_backtrack) { fs = &fs_body; fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc); + if (fs->stack == NULL) + return REG_ESPACE; } else fs = NULL; + cur_node = dfa->init_node; real_nmatch = (nmatch <= preg->re_nsub) ? nmatch : preg->re_nsub + 1; re_node_set_init_empty (&eps_via_nodes); + + prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * real_nmatch); + memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * real_nmatch); + for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;) { - update_regs (dfa, pmatch, cur_node, idx, real_nmatch); + update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, real_nmatch); + if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node) { int reg_idx; @@ -1362,31 +1373,55 @@ free_fail_stack_return (fs) } static void -update_regs (dfa, pmatch, cur_node, cur_idx, nmatch) +update_regs (dfa, pmatch, prev_idx_match, cur_node, cur_idx, nmatch) re_dfa_t *dfa; - regmatch_t *pmatch; + regmatch_t *pmatch, *prev_idx_match; int cur_node, cur_idx, nmatch; { int type = dfa->nodes[cur_node].type; - int reg_num; - if (type != OP_OPEN_SUBEXP && type != OP_CLOSE_SUBEXP) - return; - reg_num = dfa->nodes[cur_node].opr.idx + 1; - if (reg_num >= nmatch) - return; if (type == OP_OPEN_SUBEXP) { + int reg_num = dfa->nodes[cur_node].opr.idx + 1; + /* We are at the first node of this sub expression. */ - pmatch[reg_num].rm_so = cur_idx; - pmatch[reg_num].rm_eo = -1; + if (reg_num < nmatch) + { + pmatch[reg_num].rm_so = cur_idx; + pmatch[reg_num].rm_eo = -1; + } } else if (type == OP_CLOSE_SUBEXP) - /* We are at the first node of this sub expression. */ - pmatch[reg_num].rm_eo = cur_idx; + { + int reg_num = dfa->nodes[cur_node].opr.idx + 1; + if (reg_num < nmatch) + { + /* We are at the last node of this sub expression. */ + if (pmatch[reg_num].rm_so < cur_idx) + { + pmatch[reg_num].rm_eo = cur_idx; + /* This is a non-empty match or we are not inside an optional + subexpression. Accept this right away. */ + memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch); + } + else + { + if (dfa->nodes[cur_node].opt_subexp + && prev_idx_match[reg_num].rm_so != -1) + /* We transited through an empty match for an optional + subexpression, like (a?)*, and this is not the subexp's + first match. Copy back the old content of the registers + so that matches of an inner subexpression are undone as + well, like in ((a?))*. */ + memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch); + else + /* We completed a subexpression, but it may be part of + an optional one, so do not update PREV_IDX_MATCH. */ + pmatch[reg_num].rm_eo = cur_idx; + } + } + } } -#define NUMBER_OF_STATE 1 - /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0 and sift the nodes in each states according to the following rules. Updated state_log will be wrote to STATE_LOG. @@ -3279,7 +3314,7 @@ out_free: character ch. See group_nodes_into_DFAstates. */ for (j = 0; (dests_ch[j][i] & mask) == 0; ++j) ; - + /* j-th destination accepts the word character ch. */ if (IS_WORD_CHAR (ch)) trtable[ch] = dest_states_word[j]; @@ -3298,7 +3333,7 @@ out_free: 2 * SBC_MAX); if (BE (trtable == NULL, 0)) goto out_free; - + /* For all characters ch...: */ for (i = 0; i < BITSET_UINTS; ++i) for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1; @@ -3310,13 +3345,13 @@ out_free: character ch. See group_nodes_into_DFAstates. */ for (j = 0; (dests_ch[j][i] & mask) == 0; ++j) ; - + /* j-th destination accepts the word character ch. */ trtable[ch] = dest_states[j]; trtable[ch + SBC_MAX] = dest_states_word[j]; } } - + /* new line */ if (bitset_contain (acceptable, NEWLINE_CHAR)) { diff --git a/posix/rxspencer/tests b/posix/rxspencer/tests index 8e93133..f4a3fb3 100644 --- a/posix/rxspencer/tests +++ b/posix/rxspencer/tests @@ -431,7 +431,7 @@ a(b|c)*d - abcd abcd c a(b|c)+d - abd abd b a(b|c)+d - abcd abcd c a(b|c?)+d - ad ad @d -a(b|c?)+d - abcd abcd @d +a(b|c?)+d - abcd abcd c a(b|c){0,0}d - ad ad - a(b|c){0,1}d - ad ad - a(b|c){0,1}d - abd abd b @@ -452,9 +452,8 @@ a(b|c){2,4}d - abcbd abcbd b a(b|c){2,4}d - abcbcd abcbcd c a(b|c){2,}d - abcd abcd c a(b|c){2,}d - abcbd abcbd b -a(b+|((c)*))+d - abd abd @d,@d,- -# XXX Needs to be checked. -#a(b+|((c)*))+d - abcd abcd @d,@d,- +a(b+|((c)*))+d - abd abd b,-,- +a(b+|((c)*))+d - abcd abcd c,c,c # check out the STARTEND option [abc] &# a(b)c b |