From 65e6becf5b1b9ca1e911986d030b8b31b5dd4cfa Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Mon, 24 Nov 2003 19:30:51 +0000 Subject: Update. 2003-11-24 Jakub Jelinek * posix/regex_internal.h (re_token_t): Add word_char bit. Add comment. (re_dfa_t): Add sb_char field. (bitset_mask): New function. * posix/regcomp.c (free_dfa_content): Free sb_char. (init_dfa): Don't initialize word_char unnecessarily. Initialize sb_char. (duplicate_node): Don't duplicate !word_char CHARACTERs with NEXT_WORD_CONSTRAINT constraint or word_char CHARACTERs with NEXT_NOTWORD_CONSTRAINT. Return -1 in *new_idx instead. (duplicate_node_closure): Handle clone_dest == -1 from duplicate_node. (peek_token): Initialize word_char bit. (parse_expression, parse_dup_op): Add comments. (parse_bracket_exp): Don't set bitmask bits for multi-byte char starting bytes here at the beginning. Mask off the bits right before creating SIMPLE_BRACKET. (build_charclass_op): Likewise. * posix/regexec.c (group_nodes_into_DFAstates) : Only set accept bits for single-byte characters. (group_nodes_into_DFAstates): Don't rely on characters 0 .. 127 being single byte encoded and the rest multi-byte. * posix/bug-regex19.c (tests): Add new tests. (do_mb_tests): Initialize t to *test. (main): Fail even on do_mb_tests errors. --- posix/bug-regex19.c | 45 ++++++++++++++++++-- posix/regcomp.c | 112 +++++++++++++++++++++++++++++++++++++++---------- posix/regex_internal.h | 15 +++++++ posix/regexec.c | 37 +++++++++++----- 4 files changed, 174 insertions(+), 35 deletions(-) (limited to 'posix') diff --git a/posix/bug-regex19.c b/posix/bug-regex19.c index 01093e6..f24e0aa 100644 --- a/posix/bug-regex19.c +++ b/posix/bug-regex19.c @@ -102,6 +102,20 @@ static struct test_s {ERE, ".\\b.", "=A=", 0, 0}, {ERE, ".\\b.", "==", 0, -1}, {ERE, ".\\b.", "ABA", 0, -1}, + {ERE, "[^k]\\b[^k]", "AA~", 0, 1}, + {ERE, "[^k]\\b[^k]", "=A=", 0, 0}, + {ERE, "[^k]\\b[^k]", "Ak~kA~", 0, 4}, + {ERE, "[^k]\\b[^k]", "==", 0, -1}, + {ERE, "[^k]\\b[^k]", "ABA", 0, -1}, + {ERE, "[^k]\\b[^k]", "Ak~", 0, -1}, + {ERE, "[^k]\\b[^k]", "k=k", 0, -1}, + {ERE, "[^C]\\b[^C]", "AA~", 0, 1}, + {ERE, "[^C]\\b[^C]", "=A=", 0, 0}, + {ERE, "[^C]\\b[^C]", "AC~CA~", 0, 4}, + {ERE, "[^C]\\b[^C]", "==", 0, -1}, + {ERE, "[^C]\\b[^C]", "ABA", 0, -1}, + {ERE, "[^C]\\b[^C]", "AC~", 0, -1}, + {ERE, "[^C]\\b[^C]", "C=C", 0, -1}, {ERE, "\\<(A|!|.B)", "A=AC", 0, 0}, {ERE, "\\<(A|!|.B)", "=AC", 0, 1}, {ERE, "\\<(A|!|.B)", "!AC", 0, 1}, @@ -140,12 +154,38 @@ static struct test_s {ERE, ".\\<.", "AA~", 0, -1}, {ERE, ".\\<.", "==", 0, -1}, {ERE, ".\\<.", "ABA", 0, -1}, + {ERE, "[^k]\\<[^k]", "=k=A=", 0, 2}, + {ERE, "[^k]\\<[^k]", "kk~", 0, -1}, + {ERE, "[^k]\\<[^k]", "==", 0, -1}, + {ERE, "[^k]\\<[^k]", "ABA", 0, -1}, + {ERE, "[^k]\\<[^k]", "=k=", 0, -1}, + {ERE, "[^C]\\<[^C]", "=C=A=", 0, 2}, + {ERE, "[^C]\\<[^C]", "CC~", 0, -1}, + {ERE, "[^C]\\<[^C]", "==", 0, -1}, + {ERE, "[^C]\\<[^C]", "ABA", 0, -1}, + {ERE, "[^C]\\<[^C]", "=C=", 0, -1}, {ERE, ".\\B.", "ABA", 0, 0}, {ERE, ".\\B.", "=BDC", 0, 1}, + {ERE, "[^k]\\B[^k]", "kkkABA", 0, 3}, + {ERE, "[^k]\\B[^k]", "kBk", 0, -1}, + {ERE, "[^C]\\B[^C]", "CCCABA", 0, 3}, + {ERE, "[^C]\\B[^C]", "CBC", 0, -1}, {ERE, ".(\\b|\\B).", "=~AB", 0, 1}, {ERE, ".(\\b|\\B).", "A=C", 0, 0}, {ERE, ".(\\b|\\B).", "ABC", 0, 0}, {ERE, ".(\\b|\\B).", "=~\\!", 0, -1}, + {ERE, "[^k](\\b|\\B)[^k]", "=~AB", 0, 1}, + {ERE, "[^k](\\b|\\B)[^k]", "A=C", 0, 0}, + {ERE, "[^k](\\b|\\B)[^k]", "ABC", 0, 0}, + {ERE, "[^k](\\b|\\B)[^k]", "=~kBD", 0, 3}, + {ERE, "[^k](\\b|\\B)[^k]", "=~\\!", 0, -1}, + {ERE, "[^k](\\b|\\B)[^k]", "=~kB", 0, -1}, + {ERE, "[^C](\\b|\\B)[^C]", "=~AB", 0, 1}, + {ERE, "[^C](\\b|\\B)[^C]", "A=C", 0, 0}, + {ERE, "[^C](\\b|\\B)[^C]", "ABC", 0, 0}, + {ERE, "[^C](\\b|\\B)[^C]", "=~CBD", 0, 3}, + {ERE, "[^C](\\b|\\B)[^C]", "=~\\!", 0, -1}, + {ERE, "[^C](\\b|\\B)[^C]", "=~CB", 0, -1}, {ERE, "\\b([A]|[!]|.B)", "A=AC", 0, 0}, {ERE, "\\b([A]|[!]|.B)", "=AC", 0, 1}, {ERE, "\\b([A]|[!]|.B)", "!AC", 0, 1}, @@ -288,6 +328,7 @@ do_mb_tests (const struct test_s *test) char string[strlen (test->string) * 4 + 1]; char fail[8 + sizeof ("UTF-8 ")]; + t = *test; t.pattern = pattern; t.string = string; strcpy (fail, "UTF-8 "); @@ -367,9 +408,7 @@ main (void) ret = 1; } ret |= do_one_test (&tests[i], "UTF-8 "); - // Until the implementation is fixed, ignore the results of the - // MB tests. - /* ret |= */do_mb_tests (&tests[i]); + ret |= do_mb_tests (&tests[i]); } return ret; diff --git a/posix/regcomp.c b/posix/regcomp.c index 6fb1844..52f1fa2 100644 --- a/posix/regcomp.c +++ b/posix/regcomp.c @@ -607,6 +607,9 @@ free_dfa_content (re_dfa_t *dfa) } re_free (dfa->state_table); re_free (dfa->word_char); +#ifdef RE_ENABLE_I18N + re_free (dfa->sb_char); +#endif #ifdef DEBUG re_free (dfa->re_str); #endif @@ -831,7 +834,7 @@ init_dfa (dfa, pat_len) dfa->subexps_alloc = 1; dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc); - dfa->word_char = NULL; + /* dfa->word_char = NULL; */ dfa->mb_cur_max = MB_CUR_MAX; #ifdef _LIBC @@ -841,6 +844,25 @@ init_dfa (dfa, pat_len) dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII) != 0); #endif +#ifdef RE_ENABLE_I18N + if (dfa->mb_cur_max > 1) + { + int i, j, ch; + + dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1); + if (BE (dfa->sb_char == NULL, 0)) + return REG_ESPACE; +#ifdef _LIBC + if (dfa->is_utf8) + memset (dfa->sb_char, 255, sizeof (unsigned int) * BITSET_UINTS / 2); + else +#endif + for (i = 0, ch = 0; i < BITSET_UINTS; ++i) + for (j = 0; j < UINT_BITS; ++j, ++ch) + if (btowc (ch) != WEOF) + dfa->sb_char[i] |= 1 << j; + } +#endif if (BE (dfa->nodes == NULL || dfa->state_table == NULL || dfa->subexps == NULL, 0)) @@ -1311,6 +1333,8 @@ duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node, if (BE (err != REG_NOERROR, 0)) return err; dfa->nexts[clone_node] = dfa->nexts[org_node]; + if (clone_dest == -1) + break; ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); if (BE (ret < 0, 0)) return REG_ESPACE; @@ -1348,6 +1372,8 @@ duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node, err = duplicate_node (&clone_dest, dfa, org_dest, constraint); if (BE (err != REG_NOERROR, 0)) return err; + if (clone_dest == -1) + break; ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); if (BE (ret < 0, 0)) return REG_ESPACE; @@ -1366,13 +1392,16 @@ duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node, err = duplicate_node (&clone_dest, dfa, org_dest, constraint); if (BE (err != REG_NOERROR, 0)) return err; - ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); - if (BE (ret < 0, 0)) - return REG_ESPACE; - err = duplicate_node_closure (dfa, org_dest, clone_dest, - root_node, constraint); - if (BE (err != REG_NOERROR, 0)) - return err; + if (clone_dest != -1) + { + ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); + if (BE (ret < 0, 0)) + return REG_ESPACE; + err = duplicate_node_closure (dfa, org_dest, clone_dest, + root_node, constraint); + if (BE (err != REG_NOERROR, 0)) + return err; + } } else { @@ -1387,6 +1416,8 @@ duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node, err = duplicate_node (&clone_dest, dfa, org_dest, constraint); if (BE (err != REG_NOERROR, 0)) return err; + if (clone_dest == -1) + break; ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); if (BE (ret < 0, 0)) return REG_ESPACE; @@ -1426,7 +1457,21 @@ duplicate_node (new_idx, dfa, org_idx, constraint) int *new_idx, org_idx; unsigned int constraint; { - int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx], 1); + int dup_idx; + + if (dfa->nodes[org_idx].type == CHARACTER + && (((constraint & NEXT_WORD_CONSTRAINT) + && !dfa->nodes[org_idx].word_char) + || ((constraint & NEXT_NOTWORD_CONSTRAINT) + && dfa->nodes[org_idx].word_char))) + { + /* \W etc. can never match. Don't duplicate them, instead + tell the caller they shouldn't be added to edests. */ + *new_idx = -1; + return REG_NOERROR; + } + + dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx], 1); if (BE (dup_idx == -1, 0)) return REG_ESPACE; dfa->nodes[dup_idx].constraint = constraint; @@ -1614,6 +1659,7 @@ peek_token (token, input, syntax) c = re_string_peek_byte (input, 0); token->opr.c = c; + token->word_char = 0; #ifdef RE_ENABLE_I18N token->mb_partial = 0; if (input->mb_cur_max > 1 && @@ -1636,6 +1682,17 @@ peek_token (token, input, syntax) c2 = re_string_peek_byte_case (input, 1); token->opr.c = c2; token->type = CHARACTER; +#ifdef RE_ENABLE_I18N + if (input->mb_cur_max > 1) + { + wint_t wc = re_string_wchar_at (input, + re_string_cur_idx (input) + 1); + token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; + } + else +#endif + token->word_char = IS_WORD_CHAR (c2) != 0; + switch (c2) { case '|': @@ -1739,6 +1796,16 @@ peek_token (token, input, syntax) } token->type = CHARACTER; +#ifdef RE_ENABLE_I18N + if (input->mb_cur_max > 1) + { + wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input)); + token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; + } + else +#endif + token->word_char = IS_WORD_CHAR (token->opr.c); + switch (c) { case '\n': @@ -2140,6 +2207,8 @@ parse_expression (regexp, preg, token, syntax, nest, err) /* Then we can these characters as normal characters. */ token->type = CHARACTER; + /* mb_partial and word_char bits should be initialized already + by peek_token. */ tree = re_dfa_add_tree_node (dfa, NULL, NULL, token); if (BE (tree == NULL, 0)) { @@ -2475,6 +2544,8 @@ parse_dup_op (dup_elem, regexp, dfa, token, syntax, err) re_string_set_index (regexp, start_idx); *token = start_token; token->type = CHARACTER; + /* mb_partial and word_char bits should be already initialized by + peek_token. */ return dup_elem; } @@ -2952,7 +3023,6 @@ parse_bracket_exp (regexp, dfa, token, syntax, err) if (token->type == OP_NON_MATCH_LIST) { #ifdef RE_ENABLE_I18N - int i; mbcset->non_match = 1; #else /* not RE_ENABLE_I18N */ non_match = 1; @@ -2966,12 +3036,6 @@ parse_bracket_exp (regexp, dfa, token, syntax, err) *err = REG_BADPAT; goto parse_bracket_exp_free_return; } -#ifdef RE_ENABLE_I18N - if (dfa->mb_cur_max > 1) - for (i = 0; i < SBC_MAX; ++i) - if (__btowc (i) == WEOF) - bitset_set (sbcset, i); -#endif /* RE_ENABLE_I18N */ } /* We treat the first ']' as a normal character. */ @@ -3126,6 +3190,11 @@ parse_bracket_exp (regexp, dfa, token, syntax, err) if (non_match) #endif /* not RE_ENABLE_I18N */ bitset_not (sbcset); +#ifdef RE_ENABLE_I18N + /* Ensure only single byte characters are set. */ + if (dfa->mb_cur_max > 1) + bitset_mask (sbcset, dfa->sb_char); +#endif /* RE_ENABLE_I18N */ /* Build a tree for simple bracket. */ br_token.type = SIMPLE_BRACKET; @@ -3493,16 +3562,11 @@ build_charclass_op (dfa, trans, class_name, extra, not, err) if (not) { #ifdef RE_ENABLE_I18N - int i; /* if (syntax & RE_HAT_LISTS_NOT_NEWLINE) bitset_set(cset->sbcset, '\0'); */ mbcset->non_match = 1; - if (dfa->mb_cur_max > 1) - for (i = 0; i < SBC_MAX; ++i) - if (__btowc (i) == WEOF) - bitset_set (sbcset, i); #else /* not RE_ENABLE_I18N */ non_match = 1; #endif /* not RE_ENABLE_I18N */ @@ -3536,6 +3600,12 @@ build_charclass_op (dfa, trans, class_name, extra, not, err) #endif /* not RE_ENABLE_I18N */ bitset_not (sbcset); +#ifdef RE_ENABLE_I18N + /* Ensure only single byte characters are set. */ + if (dfa->mb_cur_max > 1) + bitset_mask (sbcset, dfa->sb_char); +#endif + /* Build a tree for simple bracket. */ br_token.type = SIMPLE_BRACKET; br_token.opr.sbcset = sbcset; diff --git a/posix/regex_internal.h b/posix/regex_internal.h index 5111f6d..f8e99ee 100644 --- a/posix/regex_internal.h +++ b/posix/regex_internal.h @@ -133,6 +133,7 @@ typedef unsigned int *re_bitset_ptr_t; static inline void bitset_not (bitset set); static inline void bitset_merge (bitset dest, const bitset src); static inline void bitset_not_merge (bitset dest, const bitset src); +static inline void bitset_mask (bitset dest, const bitset src); #define PREV_WORD_CONSTRAINT 0x0001 #define PREV_NOTWORD_CONSTRAINT 0x0002 @@ -281,8 +282,11 @@ typedef struct unsigned int constraint : 10; /* context constraint */ unsigned int duplicated : 1; #ifdef RE_ENABLE_I18N + /* These 2 bits can be moved into the union if needed (e.g. if running out + of bits; move opr.c to opr.c.c and move the flags to opr.c.flags). */ unsigned int mb_partial : 1; #endif + unsigned int word_char : 1; } re_token_t; #define IS_EPSILON_NODE(type) ((type) & EPSILON_BIT) @@ -601,6 +605,7 @@ struct re_dfa_t re_dfastate_t *init_state_begbuf; bin_tree_t *str_tree; bin_tree_storage_t *str_tree_storage; + re_bitset_ptr_t sb_char; int str_tree_storage_idx; /* number of subexpressions `re_nsub' is in regex_t. */ @@ -711,6 +716,16 @@ bitset_not_merge (dest, src) dest[i] |= ~src[i]; } +static inline void +bitset_mask (dest, src) + bitset dest; + const bitset src; +{ + int bitset_i; + for (bitset_i = 0; bitset_i < BITSET_UINTS; ++bitset_i) + dest[bitset_i] &= src[bitset_i]; +} + #if defined RE_ENABLE_I18N && !defined RE_NO_INTERNAL_PROTOTYPES /* Inline functions for re_string. */ static inline int diff --git a/posix/regexec.c b/posix/regexec.c index 0b52485..58ac9c8 100644 --- a/posix/regexec.c +++ b/posix/regexec.c @@ -3341,7 +3341,12 @@ group_nodes_into_DFAstates (preg, state, dests_node, dests_ch) } else if (type == OP_PERIOD) { - bitset_set_all (accepts); +#ifdef RE_ENABLE_I18N + if (dfa->mb_cur_max > 1) + bitset_merge (accepts, dfa->sb_char); + else +#endif + bitset_set_all (accepts); if (!(preg->syntax & RE_DOT_NEWLINE)) bitset_clear (accepts, '\n'); if (preg->syntax & RE_DOT_NOT_NULL) @@ -3362,8 +3367,6 @@ group_nodes_into_DFAstates (preg, state, dests_node, dests_ch) match it the context. */ if (constraint) { - int word_char_max; - if (constraint & NEXT_NEWLINE_CONSTRAINT) { int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR); @@ -3379,16 +3382,28 @@ group_nodes_into_DFAstates (preg, state, dests_node, dests_ch) continue; } - /* This assumes ASCII compatible locale. We cannot say - anything about the non-ascii chars. */ - word_char_max - = dfa->mb_cur_max > 1 ? BITSET_UINTS / 2 : BITSET_UINTS; if (constraint & NEXT_WORD_CONSTRAINT) - for (j = 0; j < word_char_max; ++j) - accepts[j] &= dfa->word_char[j]; + { +#ifdef RE_ENABLE_I18N + if (dfa->mb_cur_max > 1) + for (j = 0; j < BITSET_UINTS; ++j) + accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]); + else +#endif + for (j = 0; j < BITSET_UINTS; ++j) + accepts[j] &= dfa->word_char[j]; + } if (constraint & NEXT_NOTWORD_CONSTRAINT) - for (j = 0; j < word_char_max; ++j) - accepts[j] &= ~dfa->word_char[j]; + { +#ifdef RE_ENABLE_I18N + if (dfa->mb_cur_max > 1) + for (j = 0; j < BITSET_UINTS; ++j) + accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]); + else +#endif + for (j = 0; j < BITSET_UINTS; ++j) + accepts[j] &= ~dfa->word_char[j]; + } } /* Then divide `accepts' into DFA states, or create a new -- cgit v1.1