aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--libstdc++-v3/ChangeLog33
-rw-r--r--libstdc++-v3/include/bits/regex.tcc35
-rw-r--r--libstdc++-v3/include/bits/regex_automaton.h21
-rw-r--r--libstdc++-v3/include/bits/regex_automaton.tcc9
-rw-r--r--libstdc++-v3/include/bits/regex_compiler.h2
-rw-r--r--libstdc++-v3/include/bits/regex_compiler.tcc20
-rw-r--r--libstdc++-v3/include/bits/regex_executor.h10
-rw-r--r--libstdc++-v3/include/bits/regex_executor.tcc143
-rw-r--r--libstdc++-v3/include/bits/regex_scanner.tcc2
-rw-r--r--libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/char/emptygroup.cc1
10 files changed, 175 insertions, 101 deletions
diff --git a/libstdc++-v3/ChangeLog b/libstdc++-v3/ChangeLog
index 2a2c068..6cc371c 100644
--- a/libstdc++-v3/ChangeLog
+++ b/libstdc++-v3/ChangeLog
@@ -1,3 +1,36 @@
+2014-04-27 Tim Shen <timshen91@gmail.com>
+
+ * include/bits/regex_automaton.h (_NFA<>::_M_insert_repeat):
+ Add _S_opcode_repeat support to distingush a loop from
+ _S_opcode_alternative.
+ * include/bits/regex_automaton.tcc (_State_base::_M_print,
+ _State_base::_M_dot, _NFA<>::_M_eliminate_dummy,
+ _StateSeq<>::_M_clone): Likewise.
+ * include/bits/regex_compiler.tcc (_Compiler<>::_M_quantifier):
+ Likewise.
+ * include/bits/regex_executor.tcc (_Executor<>::_M_dfs): Likewise.
+ * include/bits/regex_scanner.tcc (_Scanner<>::_M_eat_escape_ecma):
+ Uglify local variable __i.
+ * include/bits/regex_compiler.h (_BracketMatcher<>::_M_make_cache):
+ Use size_t instead of int to compare with vector::size().
+
+2014-04-27 Tim Shen <timshen91@gmail.com>
+
+ * include/bits/regex_executor.h: Add _M_rep_count to track how
+ many times this repeat node are visited.
+ * include/bits/regex_executor.tcc (_Executor<>::_M_rep_once_more,
+ _Executor<>::_M_dfs): Use _M_rep_count to prevent entering
+ infinite loop.
+
+2014-04-27 Tim Shen <timshen91@gmail.com>
+
+ * include/bits/regex.tcc (__regex_algo_impl<>): Remove
+ _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT and use
+ _GLIBCXX_REGEX_USE_THOMPSON_NFA instead.
+ * include/bits/regex_automaton.h: Remove quantifier counting variable.
+ * include/bits/regex_automaton.tcc (_State_base::_M_dot):
+ Adjust debug NFA dump.
+
2014-04-25 Lars Gullik Bjønnes <larsbj@gullik.org>
PR libstdc++/60710
diff --git a/libstdc++-v3/include/bits/regex.tcc b/libstdc++-v3/include/bits/regex.tcc
index 5fa1f01..0d737a0 100644
--- a/libstdc++-v3/include/bits/regex.tcc
+++ b/libstdc++-v3/include/bits/regex.tcc
@@ -28,12 +28,12 @@
* Do not attempt to use it directly. @headername{regex}
*/
-// See below __regex_algo_impl to get what this is talking about. The default
-// value 1 indicated a conservative optimization without giving up worst case
-// performance.
-#ifndef _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT
-#define _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT 1
-#endif
+// A non-standard switch to let the user pick the matching algorithm.
+// If _GLIBCXX_REGEX_USE_THOMPSON_NFA is defined, the thompson NFA
+// algorithm will be used. This algorithm is not enabled by default,
+// and cannot be used if the regex contains back-references, but has better
+// (polynomial instead of exponential) worst case performace.
+// See __regex_algo_impl below.
namespace std _GLIBCXX_VISIBILITY(default)
{
@@ -66,24 +66,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
for (auto& __it : __res)
__it.matched = false;
- // This function decide which executor to use under given circumstances.
- // The _S_auto policy now is the following: if a NFA has no
- // back-references and has more than _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT
- // quantifiers (*, +, ?), the BFS executor will be used, other wise
- // DFS executor. This is because DFS executor has a exponential upper
- // bound, but better best-case performace. Meanwhile, BFS executor can
- // effectively prevent from exponential-long time matching (which must
- // contains many quantifiers), but it's slower in average.
- //
- // For simple regex, BFS executor could be 2 or more times slower than
- // DFS executor.
- //
- // Of course, BFS executor cannot handle back-references.
+ // __policy is used by testsuites so that they can use Thompson NFA
+ // without defining a macro. Users should define
+ // _GLIBCXX_REGEX_USE_THOMPSON_NFA if they need to use this approach.
bool __ret;
if (!__re._M_automaton->_M_has_backref
- && (__policy == _RegexExecutorPolicy::_S_alternate
- || __re._M_automaton->_M_quant_count
- > _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT))
+#ifndef _GLIBCXX_REGEX_USE_THOMPSON_NFA
+ && __policy == _RegexExecutorPolicy::_S_alternate
+#endif
+ )
{
_Executor<_BiIter, _Alloc, _TraitsT, false>
__executor(__s, __e, __m, __re, __flags);
diff --git a/libstdc++-v3/include/bits/regex_automaton.h b/libstdc++-v3/include/bits/regex_automaton.h
index a442cfe..1b0da14 100644
--- a/libstdc++-v3/include/bits/regex_automaton.h
+++ b/libstdc++-v3/include/bits/regex_automaton.h
@@ -52,6 +52,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
{
_S_opcode_unknown,
_S_opcode_alternative,
+ _S_opcode_repeat,
_S_opcode_backref,
_S_opcode_line_begin_assertion,
_S_opcode_line_end_assertion,
@@ -74,9 +75,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
size_t _M_backref_index; // for _S_opcode_backref
struct
{
- // for _S_opcode_alternative.
- _StateIdT _M_quant_index;
- // for _S_opcode_alternative or _S_opcode_subexpr_lookahead
+ // for _S_opcode_alternative, _S_opcode_repeat and
+ // _S_opcode_subexpr_lookahead
_StateIdT _M_alt;
// for _S_opcode_word_boundary or _S_opcode_subexpr_lookahead or
// quantifiers (ungreedy if set true)
@@ -120,7 +120,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
explicit
_NFA_base(_FlagT __f)
: _M_flags(__f), _M_start_state(0), _M_subexpr_count(0),
- _M_quant_count(0), _M_has_backref(false)
+ _M_has_backref(false)
{ }
_NFA_base(_NFA_base&&) = default;
@@ -145,7 +145,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_FlagT _M_flags;
_StateIdT _M_start_state;
_SizeT _M_subexpr_count;
- _SizeT _M_quant_count;
bool _M_has_backref;
};
@@ -175,7 +174,17 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_StateT __tmp(_S_opcode_alternative);
// It labels every quantifier to make greedy comparison easier in BFS
// approach.
- __tmp._M_quant_index = this->_M_quant_count++;
+ __tmp._M_next = __next;
+ __tmp._M_alt = __alt;
+ return _M_insert_state(std::move(__tmp));
+ }
+
+ _StateIdT
+ _M_insert_repeat(_StateIdT __next, _StateIdT __alt, bool __neg)
+ {
+ _StateT __tmp(_S_opcode_repeat);
+ // It labels every quantifier to make greedy comparison easier in BFS
+ // approach.
__tmp._M_next = __next;
__tmp._M_alt = __alt;
__tmp._M_neg = __neg;
diff --git a/libstdc++-v3/include/bits/regex_automaton.tcc b/libstdc++-v3/include/bits/regex_automaton.tcc
index 1476ae2..e0ac3f9 100644
--- a/libstdc++-v3/include/bits/regex_automaton.tcc
+++ b/libstdc++-v3/include/bits/regex_automaton.tcc
@@ -41,6 +41,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
switch (_M_opcode)
{
case _S_opcode_alternative:
+ case _S_opcode_repeat:
ostr << "alt next=" << _M_next << " alt=" << _M_alt;
break;
case _S_opcode_subexpr_begin:
@@ -72,11 +73,12 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
switch (_M_opcode)
{
case _S_opcode_alternative:
+ case _S_opcode_repeat:
__ostr << __id << " [label=\"" << __id << "\\nALT\"];\n"
<< __id << " -> " << _M_next
- << " [label=\"epsilon\", tailport=\"s\"];\n"
+ << " [label=\"next\", tailport=\"s\"];\n"
<< __id << " -> " << _M_alt
- << " [label=\"epsilon\", tailport=\"n\"];\n";
+ << " [label=\"alt\", tailport=\"n\"];\n";
break;
case _S_opcode_backref:
__ostr << __id << " [label=\"" << __id << "\\nBACKREF "
@@ -174,6 +176,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
== _S_opcode_dummy)
__it._M_next = (*this)[__it._M_next]._M_next;
if (__it._M_opcode == _S_opcode_alternative
+ || __it._M_opcode == _S_opcode_repeat
|| __it._M_opcode == _S_opcode_subexpr_lookahead)
while (__it._M_alt >= 0 && (*this)[__it._M_alt]._M_opcode
== _S_opcode_dummy)
@@ -198,6 +201,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
auto __id = _M_nfa._M_insert_state(__dup);
__m[__u] = __id;
if (__dup._M_opcode == _S_opcode_alternative
+ || __dup._M_opcode == _S_opcode_repeat
|| __dup._M_opcode == _S_opcode_subexpr_lookahead)
if (__dup._M_alt != _S_invalid_state_id && __m[__dup._M_alt] == -1)
__stack.push(__dup._M_alt);
@@ -217,6 +221,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
__ref._M_next = __m[__ref._M_next];
}
if (__ref._M_opcode == _S_opcode_alternative
+ || __ref._M_opcode == _S_opcode_repeat
|| __ref._M_opcode == _S_opcode_subexpr_lookahead)
if (__ref._M_alt != _S_invalid_state_id)
{
diff --git a/libstdc++-v3/include/bits/regex_compiler.h b/libstdc++-v3/include/bits/regex_compiler.h
index f5a198f..d7e2162 100644
--- a/libstdc++-v3/include/bits/regex_compiler.h
+++ b/libstdc++-v3/include/bits/regex_compiler.h
@@ -421,7 +421,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
void
_M_make_cache(true_type)
{
- for (int __i = 0; __i < _M_cache.size(); __i++)
+ for (size_t __i = 0; __i < _M_cache.size(); __i++)
_M_cache[static_cast<_UnsignedCharT>(__i)] =
_M_apply(__i, false_type());
}
diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc
index 128dac1..3cf9e457 100644
--- a/libstdc++-v3/include/bits/regex_compiler.tcc
+++ b/libstdc++-v3/include/bits/regex_compiler.tcc
@@ -188,8 +188,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
{
__init();
auto __e = _M_pop();
- _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id,
- __e._M_start, __neg));
+ _StateSeqT __r(_M_nfa, _M_nfa._M_insert_repeat(_S_invalid_state_id,
+ __e._M_start, __neg));
__e._M_append(__r);
_M_stack.push(__r);
}
@@ -197,8 +197,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
{
__init();
auto __e = _M_pop();
- __e._M_append(_M_nfa._M_insert_alt(_S_invalid_state_id, __e._M_start,
- __neg));
+ __e._M_append(_M_nfa._M_insert_repeat(_S_invalid_state_id,
+ __e._M_start, __neg));
_M_stack.push(__e);
}
else if (_M_match_token(_ScannerT::_S_token_opt))
@@ -206,8 +206,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
__init();
auto __e = _M_pop();
auto __end = _M_nfa._M_insert_dummy();
- _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id,
- __e._M_start, __neg));
+ _StateSeqT __r(_M_nfa, _M_nfa._M_insert_repeat(_S_invalid_state_id,
+ __e._M_start, __neg));
__e._M_append(__end);
__r._M_append(__end);
_M_stack.push(__r);
@@ -244,8 +244,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
{
auto __tmp = __r._M_clone();
_StateSeqT __s(_M_nfa,
- _M_nfa._M_insert_alt(_S_invalid_state_id,
- __tmp._M_start, __neg));
+ _M_nfa._M_insert_repeat(_S_invalid_state_id,
+ __tmp._M_start, __neg));
__tmp._M_append(__s);
__e._M_append(__s);
}
@@ -261,8 +261,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
for (long __i = 0; __i < __n; ++__i)
{
auto __tmp = __r._M_clone();
- auto __alt = _M_nfa._M_insert_alt(__tmp._M_start,
- __end, __neg);
+ auto __alt = _M_nfa._M_insert_repeat(__tmp._M_start,
+ __end, __neg);
__stack.push(__alt);
__e._M_append(_StateSeqT(_M_nfa, __alt, __tmp._M_end));
}
diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h
index 708c78e..c110b88 100644
--- a/libstdc++-v3/include/bits/regex_executor.h
+++ b/libstdc++-v3/include/bits/regex_executor.h
@@ -73,6 +73,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_M_results(__results),
_M_match_queue(__dfs_mode ? nullptr
: new vector<pair<_StateIdT, _ResultsVec>>()),
+ _M_rep_count(_M_nfa.size()),
_M_visited(__dfs_mode ? nullptr : new vector<bool>(_M_nfa.size())),
_M_flags((__flags & regex_constants::match_prev_avail)
? (__flags
@@ -104,6 +105,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
private:
template<bool __match_mode>
void
+ _M_rep_once_more(_StateIdT);
+
+ template<bool __match_mode>
+ void
_M_dfs(_StateIdT __start);
template<bool __match_mode>
@@ -149,9 +154,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_ResultsVec& _M_results;
// Used in BFS, saving states that need to be considered for the next
// character.
- std::unique_ptr<vector<pair<_StateIdT, _ResultsVec>>> _M_match_queue;
+ unique_ptr<vector<pair<_StateIdT, _ResultsVec>>> _M_match_queue;
// Used in BFS, indicating that which state is already visited.
- std::unique_ptr<vector<bool>> _M_visited;
+ vector<pair<_BiIter, int>> _M_rep_count;
+ unique_ptr<vector<bool>> _M_visited;
_FlagT _M_flags;
// To record current solution.
_StateIdT _M_start_state;
diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc
index 68a5e04..7f89933 100644
--- a/libstdc++-v3/include/bits/regex_executor.tcc
+++ b/libstdc++-v3/include/bits/regex_executor.tcc
@@ -161,7 +161,39 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
return false;
}
- // TODO: Use a function vector to dispatch, instead of using switch-case.
+ // __rep_count records how many times (__rep_count.second)
+ // this node is visited under certain input iterator
+ // (__rep_count.first). This prevent the executor from entering
+ // infinite loop by refusing to continue when it's already been
+ // visited more than twice. It's `twice` instead of `once` because
+ // we need to spare one more time for potential group capture.
+ template<typename _BiIter, typename _Alloc, typename _TraitsT,
+ bool __dfs_mode>
+ template<bool __match_mode>
+ void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
+ _M_rep_once_more(_StateIdT __i)
+ {
+ const auto& __state = _M_nfa[__i];
+ auto& __rep_count = _M_rep_count[__i];
+ if (__rep_count.second == 0 || __rep_count.first != _M_current)
+ {
+ auto __back = __rep_count;
+ __rep_count.first = _M_current;
+ __rep_count.second = 1;
+ _M_dfs<__match_mode>(__state._M_alt);
+ __rep_count = __back;
+ }
+ else
+ {
+ if (__rep_count.second < 2)
+ {
+ __rep_count.second++;
+ _M_dfs<__match_mode>(__state._M_alt);
+ __rep_count.second--;
+ }
+ }
+ };
+
template<typename _BiIter, typename _Alloc, typename _TraitsT,
bool __dfs_mode>
template<bool __match_mode>
@@ -184,69 +216,61 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
// of this quantifier". Executing _M_next first or _M_alt first don't
// mean the same thing, and we need to choose the correct order under
// given greedy mode.
- case _S_opcode_alternative:
- // Greedy.
- if (!__state._M_neg)
- {
- // "Once more" is preferred in greedy mode.
- _M_dfs<__match_mode>(__state._M_alt);
- // If it's DFS executor and already accepted, we're done.
- if (!__dfs_mode || !_M_has_sol)
- _M_dfs<__match_mode>(__state._M_next);
- }
- else // Non-greedy mode
- {
- if (__dfs_mode)
- {
- // vice-versa.
+ case _S_opcode_repeat:
+ {
+ // Greedy.
+ if (!__state._M_neg)
+ {
+ _M_rep_once_more<__match_mode>(__i);
+ // If it's DFS executor and already accepted, we're done.
+ if (!__dfs_mode || !_M_has_sol)
_M_dfs<__match_mode>(__state._M_next);
- if (!_M_has_sol)
- _M_dfs<__match_mode>(__state._M_alt);
- }
- else
- {
- // DON'T attempt anything, because there's already another
- // state with higher priority accepted. This state cannot be
- // better by attempting its next node.
- if (!_M_has_sol)
- {
- _M_dfs<__match_mode>(__state._M_next);
- // DON'T attempt anything if it's already accepted. An
- // accepted state *must* be better than a solution that
- // matches a non-greedy quantifier one more time.
- if (!_M_has_sol)
- _M_dfs<__match_mode>(__state._M_alt);
- }
- }
+ }
+ else // Non-greedy mode
+ {
+ if (__dfs_mode)
+ {
+ // vice-versa.
+ _M_dfs<__match_mode>(__state._M_next);
+ if (!_M_has_sol)
+ _M_rep_once_more<__match_mode>(__i);
+ }
+ else
+ {
+ // DON'T attempt anything, because there's already another
+ // state with higher priority accepted. This state cannot be
+ // better by attempting its next node.
+ if (!_M_has_sol)
+ {
+ _M_dfs<__match_mode>(__state._M_next);
+ // DON'T attempt anything if it's already accepted. An
+ // accepted state *must* be better than a solution that
+ // matches a non-greedy quantifier one more time.
+ if (!_M_has_sol)
+ _M_rep_once_more<__match_mode>(__i);
+ }
+ }
+ }
}
break;
case _S_opcode_subexpr_begin:
- // If there's nothing changed since last visit, do NOT continue.
- // This prevents the executor from get into infinite loop when using
- // "()*" to match "".
- if (!_M_cur_results[__state._M_subexpr].matched
- || _M_cur_results[__state._M_subexpr].first != _M_current)
- {
- auto& __res = _M_cur_results[__state._M_subexpr];
- auto __back = __res.first;
- __res.first = _M_current;
- _M_dfs<__match_mode>(__state._M_next);
- __res.first = __back;
- }
+ {
+ auto& __res = _M_cur_results[__state._M_subexpr];
+ auto __back = __res.first;
+ __res.first = _M_current;
+ _M_dfs<__match_mode>(__state._M_next);
+ __res.first = __back;
+ }
break;
case _S_opcode_subexpr_end:
- if (_M_cur_results[__state._M_subexpr].second != _M_current
- || _M_cur_results[__state._M_subexpr].matched != true)
- {
- auto& __res = _M_cur_results[__state._M_subexpr];
- auto __back = __res;
- __res.second = _M_current;
- __res.matched = true;
- _M_dfs<__match_mode>(__state._M_next);
- __res = __back;
- }
- else
+ {
+ auto& __res = _M_cur_results[__state._M_subexpr];
+ auto __back = __res;
+ __res.second = _M_current;
+ __res.matched = true;
_M_dfs<__match_mode>(__state._M_next);
+ __res = __back;
+ }
break;
case _S_opcode_line_begin_assertion:
if (_M_at_begin())
@@ -339,6 +363,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
}
}
break;
+ case _S_opcode_alternative:
+ _M_dfs<__match_mode>(__state._M_alt);
+ if (!__dfs_mode || !_M_has_sol)
+ _M_dfs<__match_mode>(__state._M_next);
+ break;
default:
_GLIBCXX_DEBUG_ASSERT(false);
}
diff --git a/libstdc++-v3/include/bits/regex_scanner.tcc b/libstdc++-v3/include/bits/regex_scanner.tcc
index 5332d2e..f501ff6 100644
--- a/libstdc++-v3/include/bits/regex_scanner.tcc
+++ b/libstdc++-v3/include/bits/regex_scanner.tcc
@@ -335,7 +335,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
else if (__c == 'x' || __c == 'u')
{
_M_value.erase();
- for (int i = 0; i < (__c == 'x' ? 2 : 4); i++)
+ for (int __i = 0; __i < (__c == 'x' ? 2 : 4); __i++)
{
if (_M_current == _M_end
|| !_M_ctype.is(_CtypeT::xdigit, *_M_current))
diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/char/emptygroup.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/char/emptygroup.cc
index 3fdbdade..c33d1b6 100644
--- a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/char/emptygroup.cc
+++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/char/emptygroup.cc
@@ -50,6 +50,7 @@ test01()
const char s[] = "";
VERIFY( regex_match_debug(s, m, re) );
}
+ VERIFY(regex_match_debug("", regex("(?:)*")));
}
int