diff options
Diffstat (limited to 'libgo/go/regexp')
-rw-r--r-- | libgo/go/regexp/all_test.go | 63 | ||||
-rw-r--r-- | libgo/go/regexp/onepass.go | 90 | ||||
-rw-r--r-- | libgo/go/regexp/onepass_test.go | 99 | ||||
-rw-r--r-- | libgo/go/regexp/regexp.go | 15 | ||||
-rw-r--r-- | libgo/go/regexp/syntax/parse.go | 24 | ||||
-rw-r--r-- | libgo/go/regexp/syntax/parse_test.go | 9 | ||||
-rw-r--r-- | libgo/go/regexp/syntax/regexp.go | 4 | ||||
-rw-r--r-- | libgo/go/regexp/syntax/simplify_test.go | 4 | ||||
-rw-r--r-- | libgo/go/regexp/testdata/re2-search.txt | 5 |
9 files changed, 181 insertions, 132 deletions
diff --git a/libgo/go/regexp/all_test.go b/libgo/go/regexp/all_test.go index d78ae6a..88391ff 100644 --- a/libgo/go/regexp/all_test.go +++ b/libgo/go/regexp/all_test.go @@ -113,6 +113,25 @@ func TestMatchFunction(t *testing.T) { } } +func copyMatchTest(t *testing.T, test *FindTest) { + re := compileTest(t, test.pat, "") + if re == nil { + return + } + m1 := re.MatchString(test.text) + m2 := re.Copy().MatchString(test.text) + if m1 != m2 { + t.Errorf("Copied Regexp match failure on %s: original gave %t; copy gave %t; should be %t", + test, m1, m2, len(test.matches) > 0) + } +} + +func TestCopyMatch(t *testing.T) { + for _, test := range findTests { + copyMatchTest(t, &test) + } +} + type ReplaceTest struct { pattern, replacement, input, output string } @@ -201,6 +220,12 @@ var replaceTests = []ReplaceTest{ // Substitution when subexpression isn't found {"(x)?", "$1", "123", "123"}, {"abc", "$1", "123", "123"}, + + // Substitutions involving a (x){0} + {"(a)(b){0}(c)", ".$1|$3.", "xacxacx", "x.a|c.x.a|c.x"}, + {"(a)(((b))){0}c", ".$1.", "xacxacx", "x.a.x.a.x"}, + {"((a(b){0}){3}){5}(h)", "y caramb$2", "say aaaaaaaaaaaaaaaah", "say ay caramba"}, + {"((a(b){0}){3}){5}h", "y caramb$2", "say aaaaaaaaaaaaaaaah", "say ay caramba"}, } var replaceLiteralTests = []ReplaceTest{ @@ -334,6 +359,19 @@ var metaTests = []MetaTest{ {`!@#$%^&*()_+-=[{]}\|,<.>/?~`, `!@#\$%\^&\*\(\)_\+-=\[\{\]\}\\\|,<\.>/\?~`, `!@#`, false}, } +var literalPrefixTests = []MetaTest{ + // See golang.org/issue/11175. + // output is unused. + {`^0^0$`, ``, `0`, false}, + {`^0^`, ``, ``, false}, + {`^0$`, ``, `0`, true}, + {`$0^`, ``, ``, false}, + {`$0$`, ``, ``, false}, + {`^^0$$`, ``, ``, false}, + {`^$^$`, ``, ``, false}, + {`$$0^^`, ``, ``, false}, +} + func TestQuoteMeta(t *testing.T) { for _, tc := range metaTests { // Verify that QuoteMeta returns the expected string. @@ -365,7 +403,7 @@ func TestQuoteMeta(t *testing.T) { } func TestLiteralPrefix(t *testing.T) { - for _, tc := range metaTests { + for _, tc := range append(metaTests, literalPrefixTests...) { // Literal method needs to scan the pattern. re := MustCompile(tc.pattern) str, complete := re.LiteralPrefix() @@ -665,3 +703,26 @@ func BenchmarkOnePassLongNotPrefix(b *testing.B) { re.Match(x) } } + +func BenchmarkMatchParallelShared(b *testing.B) { + x := []byte("this is a long line that contains foo bar baz") + re := MustCompile("foo (ba+r)? baz") + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + re.Match(x) + } + }) +} + +func BenchmarkMatchParallelCopied(b *testing.B) { + x := []byte("this is a long line that contains foo bar baz") + re := MustCompile("foo (ba+r)? baz") + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + re := re.Copy() + for pb.Next() { + re.Match(x) + } + }) +} diff --git a/libgo/go/regexp/onepass.go b/libgo/go/regexp/onepass.go index e6f4285..2ce3902 100644 --- a/libgo/go/regexp/onepass.go +++ b/libgo/go/regexp/onepass.go @@ -59,7 +59,12 @@ func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) { buf.WriteRune(i.Rune[0]) pc, i = i.Out, &p.Inst[i.Out] } - return buf.String(), i.Op == syntax.InstEmptyWidth && (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText != 0, pc + if i.Op == syntax.InstEmptyWidth && + syntax.EmptyOp(i.Arg)&syntax.EmptyEndText != 0 && + p.Inst[i.Out].Op == syntax.InstMatch { + complete = true + } + return buf.String(), complete, pc } // OnePassNext selects the next actionable state of the prog, based on the input character. @@ -108,10 +113,6 @@ func (q *queueOnePass) clear() { q.nextIndex = 0 } -func (q *queueOnePass) reset() { - q.nextIndex = 0 -} - func (q *queueOnePass) contains(u uint32) bool { if u >= uint32(len(q.sparse)) { return false @@ -308,25 +309,9 @@ func makeOnePass(p *onePassProg) *onePassProg { var ( instQueue = newQueue(len(p.Inst)) visitQueue = newQueue(len(p.Inst)) - build func(uint32, *queueOnePass) check func(uint32, map[uint32]bool) bool onePassRunes = make([][]rune, len(p.Inst)) ) - build = func(pc uint32, q *queueOnePass) { - if q.contains(pc) { - return - } - inst := p.Inst[pc] - switch inst.Op { - case syntax.InstAlt, syntax.InstAltMatch: - q.insert(inst.Out) - build(inst.Out, q) - q.insert(inst.Arg) - case syntax.InstMatch, syntax.InstFail: - default: - q.insert(inst.Out) - } - } // check that paths from Alt instructions are unambiguous, and rebuild the new // program as a onepass program @@ -385,11 +370,11 @@ func makeOnePass(p *onePassProg) *onePassProg { m[pc] = inst.Op == syntax.InstMatch break case syntax.InstRune: - ok = check(inst.Out, m) m[pc] = false if len(inst.Next) > 0 { break } + instQueue.insert(inst.Out) if len(inst.Rune) == 0 { onePassRunes[pc] = []rune{} inst.Next = []uint32{inst.Out} @@ -413,11 +398,11 @@ func makeOnePass(p *onePassProg) *onePassProg { } inst.Op = syntax.InstRune case syntax.InstRune1: - ok = check(inst.Out, m) m[pc] = false if len(inst.Next) > 0 { break } + instQueue.insert(inst.Out) runes := []rune{} // expand case-folded runes if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 { @@ -437,19 +422,19 @@ func makeOnePass(p *onePassProg) *onePassProg { } inst.Op = syntax.InstRune case syntax.InstRuneAny: - ok = check(inst.Out, m) m[pc] = false if len(inst.Next) > 0 { break } + instQueue.insert(inst.Out) onePassRunes[pc] = append([]rune{}, anyRune...) inst.Next = []uint32{inst.Out} case syntax.InstRuneAnyNotNL: - ok = check(inst.Out, m) m[pc] = false if len(inst.Next) > 0 { break } + instQueue.insert(inst.Out) onePassRunes[pc] = append([]rune{}, anyRuneNotNL...) inst.Next = []uint32{} for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { @@ -463,24 +448,12 @@ func makeOnePass(p *onePassProg) *onePassProg { instQueue.insert(uint32(p.Start)) m := make(map[uint32]bool, len(p.Inst)) for !instQueue.empty() { - pc := instQueue.next() - inst := p.Inst[pc] visitQueue.clear() + pc := instQueue.next() if !check(uint32(pc), m) { p = notOnePass break } - switch inst.Op { - case syntax.InstAlt, syntax.InstAltMatch: - instQueue.insert(inst.Out) - instQueue.insert(inst.Arg) - case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop: - instQueue.insert(inst.Out) - case syntax.InstMatch: - case syntax.InstFail: - case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: - default: - } } if p != notOnePass { for i := range p.Inst { @@ -490,47 +463,6 @@ func makeOnePass(p *onePassProg) *onePassProg { return p } -// walk visits each Inst in the prog once, and applies the argument -// function(ip, next), in pre-order. -func walk(prog *syntax.Prog, funcs ...func(ip, next uint32)) { - var walk1 func(uint32) - progQueue := newQueue(len(prog.Inst)) - walk1 = func(ip uint32) { - if progQueue.contains(ip) { - return - } - progQueue.insert(ip) - inst := prog.Inst[ip] - switch inst.Op { - case syntax.InstAlt, syntax.InstAltMatch: - for _, f := range funcs { - f(ip, inst.Out) - f(ip, inst.Arg) - } - walk1(inst.Out) - walk1(inst.Arg) - default: - for _, f := range funcs { - f(ip, inst.Out) - } - walk1(inst.Out) - } - } - walk1(uint32(prog.Start)) -} - -// find returns the Insts that match the argument predicate function -func find(prog *syntax.Prog, f func(*syntax.Prog, int) bool) (matches []uint32) { - matches = []uint32{} - - for ip := range prog.Inst { - if f(prog, ip) { - matches = append(matches, uint32(ip)) - } - } - return -} - var notOnePass *onePassProg = nil // compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog diff --git a/libgo/go/regexp/onepass_test.go b/libgo/go/regexp/onepass_test.go index 7b2beea..8202ebe 100644 --- a/libgo/go/regexp/onepass_test.go +++ b/libgo/go/regexp/onepass_test.go @@ -140,47 +140,41 @@ var onePass = &onePassProg{} var onePassTests = []struct { re string onePass *onePassProg - prog string }{ - {`^(?:a|(?:a*))$`, notOnePass, noStr}, - {`^(?:(a)|(?:a*))$`, notOnePass, noStr}, - {`^(?:(?:(?:.(?:$))?))$`, onePass, `a`}, - {`^abcd$`, onePass, `abcd`}, - {`^abcd$`, onePass, `abcde`}, - {`^(?:(?:a{0,})*?)$`, onePass, `a`}, - {`^(?:(?:a+)*)$`, onePass, ``}, - {`^(?:(?:a|(?:aa)))$`, onePass, ``}, - {`^(?:[^\s\S])$`, onePass, ``}, - {`^(?:(?:a{3,4}){0,})$`, notOnePass, `aaaaaa`}, - {`^(?:(?:a+)*)$`, onePass, `a`}, - {`^(?:(?:(?:a*)+))$`, onePass, noStr}, - {`^(?:(?:a+)*)$`, onePass, ``}, - {`^[a-c]+$`, onePass, `abc`}, - {`^[a-c]*$`, onePass, `abcdabc`}, - {`^(?:a*)$`, onePass, `aaaaaaa`}, - {`^(?:(?:aa)|a)$`, onePass, `a`}, - {`^[a-c]*`, notOnePass, `abcdabc`}, - {`^[a-c]*$`, onePass, `abc`}, - {`^...$`, onePass, ``}, - {`^(?:a|(?:aa))$`, onePass, `a`}, - {`^[a-c]*`, notOnePass, `abcabc`}, - {`^a((b))c$`, onePass, noStr}, - {`^a.[l-nA-Cg-j]?e$`, onePass, noStr}, - {`^a((b))$`, onePass, noStr}, - {`^a(?:(b)|(c))c$`, onePass, noStr}, - {`^a(?:(b*)|(c))c$`, notOnePass, noStr}, - {`^a(?:b|c)$`, onePass, noStr}, - {`^a(?:b?|c)$`, onePass, noStr}, - {`^a(?:b?|c?)$`, notOnePass, noStr}, - {`^a(?:b?|c+)$`, onePass, noStr}, - {`^a(?:b+|(bc))d$`, notOnePass, noStr}, - {`^a(?:bc)+$`, onePass, noStr}, - {`^a(?:[bcd])+$`, onePass, noStr}, - {`^a((?:[bcd])+)$`, onePass, noStr}, - {`^a(:?b|c)*d$`, onePass, `abbbccbbcbbd"`}, - {`^.bc(d|e)*$`, onePass, `abcddddddeeeededd`}, - {`^(?:(?:aa)|.)$`, notOnePass, `a`}, - {`^(?:(?:a{1,2}){1,2})$`, notOnePass, `aaaa`}, + {`^(?:a|(?:a*))$`, notOnePass}, + {`^(?:(a)|(?:a*))$`, notOnePass}, + {`^(?:(?:(?:.(?:$))?))$`, onePass}, + {`^abcd$`, onePass}, + {`^(?:(?:a{0,})*?)$`, onePass}, + {`^(?:(?:a+)*)$`, onePass}, + {`^(?:(?:a|(?:aa)))$`, onePass}, + {`^(?:[^\s\S])$`, onePass}, + {`^(?:(?:a{3,4}){0,})$`, notOnePass}, + {`^(?:(?:(?:a*)+))$`, onePass}, + {`^[a-c]+$`, onePass}, + {`^[a-c]*$`, onePass}, + {`^(?:a*)$`, onePass}, + {`^(?:(?:aa)|a)$`, onePass}, + {`^[a-c]*`, notOnePass}, + {`^...$`, onePass}, + {`^(?:a|(?:aa))$`, onePass}, + {`^a((b))c$`, onePass}, + {`^a.[l-nA-Cg-j]?e$`, onePass}, + {`^a((b))$`, onePass}, + {`^a(?:(b)|(c))c$`, onePass}, + {`^a(?:(b*)|(c))c$`, notOnePass}, + {`^a(?:b|c)$`, onePass}, + {`^a(?:b?|c)$`, onePass}, + {`^a(?:b?|c?)$`, notOnePass}, + {`^a(?:b?|c+)$`, onePass}, + {`^a(?:b+|(bc))d$`, notOnePass}, + {`^a(?:bc)+$`, onePass}, + {`^a(?:[bcd])+$`, onePass}, + {`^a((?:[bcd])+)$`, onePass}, + {`^a(:?b|c)*d$`, onePass}, + {`^.bc(d|e)*$`, onePass}, + {`^(?:(?:aa)|.)$`, notOnePass}, + {`^(?:(?:a{1,2}){1,2})$`, notOnePass}, } func TestCompileOnePass(t *testing.T) { @@ -206,3 +200,28 @@ func TestCompileOnePass(t *testing.T) { } } } + +// TODO(cespare): Unify with onePassTests and rationalize one-pass test cases. +var onePassTests1 = []struct { + re string + match string +}{ + {`^a(/b+(#c+)*)*$`, "a/b#c"}, // golang.org/issue/11905 +} + +func TestRunOnePass(t *testing.T) { + for _, test := range onePassTests1 { + re, err := Compile(test.re) + if err != nil { + t.Errorf("Compile(%q): got err: %s", test.re, err) + continue + } + if re.onepass == notOnePass { + t.Errorf("Compile(%q): got notOnePass, want one-pass", test.re) + continue + } + if !re.MatchString(test.match) { + t.Errorf("onepass %q did not match %q", test.re, test.match) + } + } +} diff --git a/libgo/go/regexp/regexp.go b/libgo/go/regexp/regexp.go index 4e4b412..d7d0edb 100644 --- a/libgo/go/regexp/regexp.go +++ b/libgo/go/regexp/regexp.go @@ -104,6 +104,17 @@ func (re *Regexp) String() string { return re.expr } +// Copy returns a new Regexp object copied from re. +// +// When using a Regexp in multiple goroutines, giving each goroutine +// its own copy helps to avoid lock contention. +func (re *Regexp) Copy() *Regexp { + r := *re + r.mu = sync.Mutex{} + r.machine = nil + return &r +} + // Compile parses a regular expression and returns, if successful, // a Regexp object that can be used to match against text. // @@ -482,6 +493,10 @@ func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst } else { endPos = len(src) } + if nmatch > re.prog.NumCap { + nmatch = re.prog.NumCap + } + for searchPos <= endPos { a := re.doExecute(nil, bsrc, src, searchPos, nmatch) if len(a) == 0 { diff --git a/libgo/go/regexp/syntax/parse.go b/libgo/go/regexp/syntax/parse.go index d579a40..f38bbf6 100644 --- a/libgo/go/regexp/syntax/parse.go +++ b/libgo/go/regexp/syntax/parse.go @@ -470,9 +470,14 @@ func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp { } sub = out - // Round 2: Factor out common complex prefixes, - // just the first piece of each concatenation, - // whatever it is. This is good enough a lot of the time. + // Round 2: Factor out common simple prefixes, + // just the first piece of each concatenation. + // This will be good enough a lot of the time. + // + // Complex subexpressions (e.g. involving quantifiers) + // are not safe to factor because that collapses their + // distinct paths through the automaton, which affects + // correctness in some cases. start = 0 out = sub[:0] var first *Regexp @@ -485,7 +490,9 @@ func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp { var ifirst *Regexp if i < len(sub) { ifirst = p.leadingRegexp(sub[i]) - if first != nil && first.Equal(ifirst) { + if first != nil && first.Equal(ifirst) && + // first must be a character class OR a fixed repeat of a character class. + (isCharClass(first) || (first.Op == OpRepeat && first.Min == first.Max && isCharClass(first.Sub[0]))) { continue } } @@ -830,7 +837,14 @@ func Parse(s string, flags Flags) (*Regexp, error) { lit = t[2:i] t = t[i+2:] } - p.push(literalRegexp(lit, p.flags)) + for lit != "" { + c, rest, err := nextRune(lit) + if err != nil { + return nil, err + } + p.literal(c) + lit = rest + } break BigSwitch case 'z': p.op(OpEndText) diff --git a/libgo/go/regexp/syntax/parse_test.go b/libgo/go/regexp/syntax/parse_test.go index c4a1117..5ca54bb 100644 --- a/libgo/go/regexp/syntax/parse_test.go +++ b/libgo/go/regexp/syntax/parse_test.go @@ -144,6 +144,7 @@ var parseTests = []parseTest{ // Test Perl quoted literals {`\Q+|*?{[\E`, `str{+|*?{[}`}, {`\Q+\E+`, `plus{lit{+}}`}, + {`\Qab\E+`, `cat{lit{a}plus{lit{b}}}`}, {`\Q\\E`, `lit{\}`}, {`\Q\\\E`, `str{\\}`}, @@ -171,7 +172,7 @@ var parseTests = []parseTest{ // Factoring. {`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`}, - {`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}cc{0x79-0x7a}}cat{plus{lit{y}}lit{w}}}}`}, + {`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}lit{y}}cat{plus{lit{x}}lit{z}}cat{plus{lit{y}}lit{w}}}}`}, // Bug fixes. {`(?:.)`, `dot{}`}, @@ -194,12 +195,13 @@ var parseTests = []parseTest{ {`abc|x|abd`, `alt{str{abc}lit{x}str{abd}}`}, {`(?i)abc|ABD`, `cat{strfold{AB}cc{0x43-0x44 0x63-0x64}}`}, {`[ab]c|[ab]d`, `cat{cc{0x61-0x62}cc{0x63-0x64}}`}, - {`(?:xx|yy)c|(?:xx|yy)d`, - `cat{alt{str{xx}str{yy}}cc{0x63-0x64}}`}, + {`.c|.d`, `cat{dot{}cc{0x63-0x64}}`}, {`x{2}|x{2}[0-9]`, `cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}`}, {`x{2}y|x{2}[0-9]y`, `cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}`}, + {`a.*?c|a.*?b`, + `cat{lit{a}alt{cat{nstar{dot{}}lit{c}}cat{nstar{dot{}}lit{b}}}}`}, // Valid repetitions. {`((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))`, ``}, @@ -479,6 +481,7 @@ var invalidRegexps = []string{ `a{100000}`, `a{100000,}`, "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})", + `\Q\E*`, } var onlyPerl = []string{ diff --git a/libgo/go/regexp/syntax/regexp.go b/libgo/go/regexp/syntax/regexp.go index cea7d9e..75822cf 100644 --- a/libgo/go/regexp/syntax/regexp.go +++ b/libgo/go/regexp/syntax/regexp.go @@ -166,9 +166,9 @@ func writeRegexp(b *bytes.Buffer, re *Regexp) { case OpAnyChar: b.WriteString(`(?s:.)`) case OpBeginLine: - b.WriteRune('^') + b.WriteString(`(?m:^)`) case OpEndLine: - b.WriteRune('$') + b.WriteString(`(?m:$)`) case OpBeginText: b.WriteString(`\A`) case OpEndText: diff --git a/libgo/go/regexp/syntax/simplify_test.go b/libgo/go/regexp/syntax/simplify_test.go index 879eff5..5d0f1de 100644 --- a/libgo/go/regexp/syntax/simplify_test.go +++ b/libgo/go/regexp/syntax/simplify_test.go @@ -19,8 +19,8 @@ var simplifyTests = []struct { {`(ab)+`, `(ab)+`}, {`(ab)?`, `(ab)?`}, {`.`, `(?s:.)`}, - {`^`, `^`}, - {`$`, `$`}, + {`^`, `(?m:^)`}, + {`$`, `(?m:$)`}, {`[ac]`, `[ac]`}, {`[^ac]`, `[^ac]`}, diff --git a/libgo/go/regexp/testdata/re2-search.txt b/libgo/go/regexp/testdata/re2-search.txt index f648e55..4d02e9c 100644 --- a/libgo/go/regexp/testdata/re2-search.txt +++ b/libgo/go/regexp/testdata/re2-search.txt @@ -3665,3 +3665,8 @@ regexps "(?:a\\C*|ba\\C)$" -;-;-;- -;1-4;-;1-4 +strings +"abc" +regexps +"a.*?c|a.*?b" +0-3;0-3;0-3;0-3 |