aboutsummaryrefslogtreecommitdiff
path: root/posix
diff options
context:
space:
mode:
Diffstat (limited to 'posix')
-rw-r--r--posix/Makefile3
-rw-r--r--posix/Versions4
-rw-r--r--posix/bug-regex4.c1
-rw-r--r--posix/bug-regex6.c74
-rw-r--r--posix/regcomp.c60
-rw-r--r--posix/regex_internal.h45
-rw-r--r--posix/regexec.c94
7 files changed, 195 insertions, 86 deletions
diff --git a/posix/Makefile b/posix/Makefile
index 008154d..db58cf5 100644
--- a/posix/Makefile
+++ b/posix/Makefile
@@ -71,7 +71,7 @@ tests := tstgetopt testfnm runtests runptests \
tst-getlogin tst-mmap tst-getaddrinfo tst-truncate \
tst-truncate64 tst-fork tst-fnmatch tst-regexloc tst-dir \
tst-chmod bug-regex1 bug-regex2 bug-regex3 bug-regex4 \
- tst-gnuglob tst-regex bug-regex5
+ tst-gnuglob tst-regex bug-regex5 bug-regex6
ifeq (yes,$(build-shared))
test-srcs := globtest
tests += wordexp-test tst-exec tst-spawn
@@ -125,6 +125,7 @@ tst-regexloc-ENV = LOCPATH=$(common-objpfx)localedata
bug-regex1-ENV = LOCPATH=$(common-objpfx)localedata
tst-regex-ENV = LOCPATH=$(common-objpfx)localedata
bug-regex5-ENV = LOCPATH=$(common-objpfx)localedata
+bug-regex6-ENV = LOCPATH=$(common-objpfx)localedata
testcases.h: TESTS TESTS2C.sed
sed -f TESTS2C.sed < $< > $@T
diff --git a/posix/Versions b/posix/Versions
index c79a042..07dc49a 100644
--- a/posix/Versions
+++ b/posix/Versions
@@ -105,6 +105,10 @@ libc {
# Extended Interface.
fnmatch;
}
+ GLIBC_2.2.6 {
+ # For syscall wrapper
+ __nanosleep;
+ }
GLIBC_PRIVATE {
# functions which have an additional interface since they are
# are cancelable.
diff --git a/posix/bug-regex4.c b/posix/bug-regex4.c
index 76841e9..5ecca1a 100644
--- a/posix/bug-regex4.c
+++ b/posix/bug-regex4.c
@@ -35,7 +35,6 @@ main (void)
setlocale (LC_ALL, "C");
- setlocale (LC_ALL, "C");
s = re_compile_pattern ("ab[cde]", 7, &regex);
if (s != NULL)
{
diff --git a/posix/bug-regex6.c b/posix/bug-regex6.c
new file mode 100644
index 0000000..9a06898
--- /dev/null
+++ b/posix/bug-regex6.c
@@ -0,0 +1,74 @@
+/* Test for regexec.
+ Copyright (C) 2002 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Jakub Jelinek <jakub@redhat.com>, 2002.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sys/types.h>
+#include <regex.h>
+#include <locale.h>
+#include <stdio.h>
+
+int
+main (int argc, char *argv[])
+{
+ regex_t re;
+ regmatch_t mat[10];
+ int i, j, ret = 0;
+ char *locales[] = { "C", "de_DE.UTF-8" };
+ char *string = "http://www.regex.com/pattern/matching.html#intro";
+ regmatch_t expect[10] = {
+ { 0, 48 }, { 0, 5 }, { 0, 4 }, { 5, 20 }, { 7, 20 }, { 20, 42 },
+ { -1, -1 }, { -1, -1 }, { 42, 48 }, { 43, 48 } };
+
+ for (i = 0; i < sizeof (locales) / sizeof (locales[0]); ++i)
+ {
+ if (setlocale (LC_ALL, locales[i]) == NULL)
+ {
+ puts ("cannot set locale");
+ ret = 1;
+ }
+ else if (regcomp (&re,
+ "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?",
+ REG_EXTENDED) != REG_NOERROR)
+ {
+ puts ("cannot compile expression \"[a-f]*\"");
+ ret = 1;
+ }
+ else if (regexec (&re, string, 10, mat, 0) == REG_NOMATCH)
+ {
+ puts ("no match");
+ ret = 1;
+ }
+ else
+ {
+ if (! memcmp (mat, expect, sizeof (mat)))
+ printf ("matching ok for %s locale\n", locales[i]);
+ else
+ {
+ printf ("matching failed for %s locale:\n", locales[i]);
+ ret = 1;
+ for (j = 0; j < 9; ++j)
+ if (mat[j].rm_so != -1)
+ printf ("%d: %.*s\n", j, mat[j].rm_eo - mat[j].rm_so,
+ string + mat[j].rm_so);
+ }
+ }
+ }
+
+ return ret;
+}
diff --git a/posix/regcomp.c b/posix/regcomp.c
index b9b0560..5136042 100644
--- a/posix/regcomp.c
+++ b/posix/regcomp.c
@@ -724,7 +724,6 @@ re_compile_internal (preg, pattern, length, syntax)
dfa = re_realloc (preg->buffer, re_dfa_t, 1);
if (dfa == NULL)
return REG_ESPACE;
- memset (dfa, '\0', sizeof (re_dfa_t));
preg->allocated = sizeof (re_dfa_t);
}
preg->buffer = (unsigned char *) dfa;
@@ -781,6 +780,9 @@ init_dfa (dfa, pat_len)
int pat_len;
{
int table_size;
+
+ memset (dfa, '\0', sizeof (re_dfa_t));
+
dfa->nodes_alloc = pat_len + 1;
dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
@@ -1001,8 +1003,6 @@ calc_first (dfa, node)
switch (type)
{
#ifdef DEBUG
- case OP_OPEN_SUBEXP:
- case OP_CLOSE_SUBEXP:
case OP_OPEN_BRACKET:
case OP_CLOSE_BRACKET:
case OP_OPEN_DUP_NUM:
@@ -1028,6 +1028,8 @@ calc_first (dfa, node)
case SIMPLE_BRACKET:
case OP_BACK_REF:
case ANCHOR:
+ case OP_OPEN_SUBEXP:
+ case OP_CLOSE_SUBEXP:
node->first = idx;
break;
case OP_DUP_PLUS:
@@ -1041,14 +1043,6 @@ calc_first (dfa, node)
case OP_ALT:
node->first = idx;
break;
- case SUBEXP:
- if (node->left == NULL)
- {
- if (node->next == -1)
- calc_next (dfa, node);
- node->first = node->next;
- break;
- }
/* else fall through */
default:
#ifdef DEBUG
@@ -1161,7 +1155,9 @@ calc_epsdest (dfa, node)
}
re_node_set_init_2 (dfa->edests + idx, left, right);
}
- else if (dfa->nodes[idx].type == ANCHOR)
+ else if (dfa->nodes[idx].type == ANCHOR
+ || dfa->nodes[idx].type == OP_OPEN_SUBEXP
+ || dfa->nodes[idx].type == OP_CLOSE_SUBEXP)
re_node_set_init_1 (dfa->edests + idx, node->next);
}
}
@@ -2055,8 +2051,9 @@ parse_sub_exp (regexp, preg, token, syntax, nest, err)
reg_errcode_t *err;
{
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
- bin_tree_t *tree;
+ bin_tree_t *tree, *left_par, *right_par;
size_t cur_nsub;
+ int new_idx;
cur_nsub = preg->re_nsub++;
if (dfa->subexps_alloc < preg->re_nsub)
{
@@ -2073,30 +2070,39 @@ parse_sub_exp (regexp, preg, token, syntax, nest, err)
}
dfa->subexps[cur_nsub].start = dfa->nodes_len;
dfa->subexps[cur_nsub].end = -1;
+
+ new_idx = re_dfa_add_node (dfa, *token, 0);
+ left_par = create_tree (NULL, NULL, 0, new_idx);
+ if (BE (new_idx == -1 || left_par == NULL, 0))
+ return *err = REG_ESPACE, NULL;
+ dfa->nodes[new_idx].opr.idx = cur_nsub;
*token = fetch_token (regexp, syntax);
/* The subexpression may be a null string. */
if (token->type == OP_CLOSE_SUBEXP)
- {
- tree = create_tree (NULL, NULL, SUBEXP, 0);
- if (BE (tree == NULL, 0))
- return *err = REG_ESPACE, NULL;
- dfa->subexps[cur_nsub].end = dfa->nodes_len;
- }
+ tree = NULL;
else
{
tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
if (BE (*err != REG_NOERROR && tree == NULL, 0))
return NULL;
- dfa->subexps[cur_nsub].end = dfa->nodes_len;
- if (BE (token->type != OP_CLOSE_SUBEXP, 0))
- {
- free_bin_tree (tree);
- *err = REG_BADPAT;
- return NULL;
- }
- tree = create_tree (tree, NULL, SUBEXP, 0);
}
+ if (BE (token->type != OP_CLOSE_SUBEXP, 0))
+ {
+ free_bin_tree (tree);
+ *err = REG_BADPAT;
+ return NULL;
+ }
+ new_idx = re_dfa_add_node (dfa, *token, 0);
+ dfa->subexps[cur_nsub].end = dfa->nodes_len;
+ right_par = create_tree (NULL, NULL, 0, new_idx);
+ tree = ((tree == NULL) ? right_par
+ : create_tree (tree, right_par, CONCAT, 0));
+ tree = create_tree (left_par, tree, CONCAT, 0);
+ if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
+ return *err = REG_ESPACE, NULL;
+ dfa->nodes[new_idx].opr.idx = cur_nsub;
+
return tree;
}
diff --git a/posix/regex_internal.h b/posix/regex_internal.h
index 95ae46e..2062254 100644
--- a/posix/regex_internal.h
+++ b/posix/regex_internal.h
@@ -96,8 +96,6 @@ typedef enum
NON_TYPE = 0,
/* Token type, these are used only by token. */
- OP_OPEN_SUBEXP,
- OP_CLOSE_SUBEXP,
OP_OPEN_BRACKET,
OP_CLOSE_BRACKET,
OP_CHARSET_RANGE,
@@ -124,6 +122,8 @@ typedef enum
#endif /* RE_ENABLE_I18N */
/* Node type, These are used by token, node, tree. */
+ OP_OPEN_SUBEXP,
+ OP_CLOSE_SUBEXP,
OP_PERIOD,
CHARACTER,
END_OF_RE,
@@ -142,24 +142,18 @@ typedef enum
#ifdef RE_ENABLE_I18N
typedef struct
{
- /* If this character set is the non-matching list. */
- unsigned int non_match : 1;
-
/* Multibyte characters. */
wchar_t *mbchars;
- int nmbchars;
/* Collating symbols. */
# ifdef _LIBC
int32_t *coll_syms;
# endif
- int ncoll_syms;
/* Equivalence classes. */
# ifdef _LIBC
int32_t *equiv_classes;
# endif
- int nequiv_classes;
/* Range expressions. */
# ifdef _LIBC
@@ -169,17 +163,32 @@ typedef struct
wchar_t *range_starts;
wchar_t *range_ends;
# endif /* not _LIBC */
- int nranges;
/* Character classes. */
wctype_t *char_classes;
+
+ /* If this character set is the non-matching list. */
+ unsigned int non_match : 1;
+
+ /* # of multibyte characters. */
+ int nmbchars;
+
+ /* # of collating symbols. */
+ int ncoll_syms;
+
+ /* # of equivalence classes. */
+ int nequiv_classes;
+
+ /* # of range expressions. */
+ int nranges;
+
+ /* # of character classes. */
int nchar_classes;
} re_charset_t;
#endif /* RE_ENABLE_I18N */
typedef struct
{
- re_token_type_t type;
union
{
unsigned char c; /* for CHARACTER */
@@ -195,6 +204,11 @@ typedef struct
re_node_set *bkref_eclosure;
} *ctx_info;
} opr;
+#if __GNUC__ >= 2
+ re_token_type_t type : 8;
+#else
+ re_token_type_t type;
+#endif
unsigned int constraint : 10; /* context constraint */
unsigned int duplicated : 1;
#ifdef RE_ENABLE_I18N
@@ -203,8 +217,9 @@ typedef struct
} re_token_t;
#define IS_EPSILON_NODE(type) \
- ((type) == OP_ALT || (type) == OP_DUP_ASTERISK || (type) == OP_DUP_PLUS || \
- (type) == OP_DUP_QUESTION || (type) == ANCHOR)
+ ((type) == OP_ALT || (type) == OP_DUP_ASTERISK || (type) == OP_DUP_PLUS \
+ || (type) == OP_DUP_QUESTION || (type) == ANCHOR \
+ || (type) == OP_OPEN_SUBEXP || (type) == OP_CLOSE_SUBEXP)
#define ACCEPT_MB_NODE(type) \
((type) == COMPLEX_BRACKET || (type) == OP_PERIOD)
@@ -214,9 +229,6 @@ struct re_string_t
/* Indicate the raw buffer which is the original string passed as an
argument of regexec(), re_search(), etc.. */
const unsigned char *raw_mbs;
- /* Index in RAW_MBS. Each character mbs[i] corresponds to
- raw_mbs[raw_mbs_idx + i]. */
- int raw_mbs_idx;
/* Store the multibyte string. In case of "case insensitive mode" like
REG_ICASE, upper cases of the string are stored, otherwise MBS points
the same address that RAW_MBS points. */
@@ -230,6 +242,9 @@ struct re_string_t
wint_t *wcs;
mbstate_t cur_state;
#endif
+ /* Index in RAW_MBS. Each character mbs[i] corresponds to
+ raw_mbs[raw_mbs_idx + i]. */
+ int raw_mbs_idx;
/* The length of the valid characters in the buffers. */
int valid_len;
/* The length of the buffers MBS, MBS_CASE, and WCS. */
diff --git a/posix/regexec.c b/posix/regexec.c
index 2c7a277..5dd3a06 100644
--- a/posix/regexec.c
+++ b/posix/regexec.c
@@ -58,6 +58,8 @@ static int check_halt_node_context (const re_dfa_t *dfa, int node,
static int check_halt_state_context (const regex_t *preg,
const re_dfastate_t *state,
const re_match_context_t *mctx, int idx);
+static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch, int cur_node,
+ int cur_idx, int nmatch);
static int proceed_next_node (const regex_t *preg,
const re_match_context_t *mctx,
int *pidx, int node, re_node_set *eps_via_nodes);
@@ -886,24 +888,38 @@ proceed_next_node (preg, mctx, pidx, node, eps_via_nodes)
re_node_set *eps_via_nodes;
{
re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
- int i, dest_node = -1, err;
+ int i, err, dest_node, cur_entity;
+ dest_node = -1;
+ cur_entity = ((dfa->nodes[node].type == OP_CONTEXT_NODE)
+ ? dfa->nodes[node].opr.ctx_info->entity : node);
if (IS_EPSILON_NODE (dfa->nodes[node].type))
{
+ int dest_entity = INT_MAX;
err = re_node_set_insert (eps_via_nodes, node);
if (BE (err < 0, 0))
return -1;
for (i = 0; i < mctx->state_log[*pidx]->nodes.nelem; ++i)
{
- int candidate = mctx->state_log[*pidx]->nodes.elems[i];
- if (!re_node_set_contains (dfa->edests + node, candidate)
- && !(dfa->nodes[candidate].type == OP_CONTEXT_NODE
- && re_node_set_contains (dfa->edests + node,
- dfa->nodes[candidate].opr.ctx_info->entity)))
- continue;
- dest_node = candidate;
+ int candidate, candidate_entity;
+ candidate = mctx->state_log[*pidx]->nodes.elems[i];
+ candidate_entity = ((dfa->nodes[candidate].type == OP_CONTEXT_NODE)
+ ? dfa->nodes[candidate].opr.ctx_info->entity
+ : candidate);
+ if (!re_node_set_contains (dfa->edests + node, candidate))
+ if (candidate == candidate_entity
+ || !re_node_set_contains (dfa->edests + node, candidate_entity))
+ continue;
+
/* In order to avoid infinite loop like "(a*)*". */
- if (!re_node_set_contains (eps_via_nodes, dest_node))
- break;
+ if (cur_entity > candidate_entity
+ && re_node_set_contains (eps_via_nodes, candidate))
+ continue;
+
+ if (dest_entity > candidate_entity)
+ {
+ dest_node = candidate;
+ dest_entity = candidate_entity;
+ }
}
#ifdef DEBUG
assert (dest_node != -1);
@@ -986,9 +1002,8 @@ set_regs (preg, mctx, nmatch, pmatch, last_node)
int last_node;
{
re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
- int idx, cur_node, node_entity, real_nmatch;
+ int idx, cur_node, real_nmatch;
re_node_set eps_via_nodes;
- int i;
#ifdef DEBUG
assert (nmatch > 1);
assert (mctx->state_log != NULL);
@@ -998,36 +1013,7 @@ set_regs (preg, mctx, nmatch, pmatch, last_node)
re_node_set_init_empty (&eps_via_nodes);
for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
{
- node_entity = ((dfa->nodes[cur_node].type == OP_CONTEXT_NODE)
- ? dfa->nodes[cur_node].opr.ctx_info->entity : cur_node);
- for (i = 1; i < real_nmatch; ++i)
- {
- if (dfa->subexps[i - 1].start == dfa->subexps[i - 1].end)
- {
- /* In case of the null subexpression like '()'. */
- if (dfa->subexps[i - 1].start == node_entity)
- {
- pmatch[i].rm_so = idx;
- pmatch[i].rm_eo = idx;
- }
- }
- else if (dfa->subexps[i - 1].start <= node_entity
- && node_entity < dfa->subexps[i - 1].end)
- {
- if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo != -1)
- /* We are at the first node of this sub expression. */
- {
- pmatch[i].rm_so = idx;
- pmatch[i].rm_eo = -1;
- }
- }
- else
- {
- if (pmatch[i].rm_so != -1 && pmatch[i].rm_eo == -1)
- /* We are at the last node of this sub expression. */
- pmatch[i].rm_eo = idx;
- }
- }
+ update_regs (dfa, pmatch, cur_node, idx, real_nmatch);
if (idx == pmatch[0].rm_eo && cur_node == last_node)
break;
@@ -1040,6 +1026,30 @@ set_regs (preg, mctx, nmatch, pmatch, last_node)
return REG_NOERROR;
}
+static void
+update_regs (dfa, pmatch, cur_node, cur_idx, nmatch)
+ re_dfa_t *dfa;
+ regmatch_t *pmatch;
+ int cur_node, cur_idx, nmatch;
+{
+ int type = dfa->nodes[cur_node].type;
+ int reg_num;
+ if (type != OP_OPEN_SUBEXP && type != OP_CLOSE_SUBEXP)
+ return;
+ reg_num = dfa->nodes[cur_node].opr.idx + 1;
+ if (reg_num >= nmatch)
+ return;
+ if (type == OP_OPEN_SUBEXP)
+ {
+ /* We are at the first node of this sub expression. */
+ pmatch[reg_num].rm_so = cur_idx;
+ pmatch[reg_num].rm_eo = -1;
+ }
+ else if (type == OP_CLOSE_SUBEXP)
+ /* We are at the first node of this sub expression. */
+ pmatch[reg_num].rm_eo = cur_idx;
+ }
+
#define NUMBER_OF_STATE 1
/* This function checks the STATE_LOG from the MCTX->match_last to 0