diff options
author | Ian Lance Taylor <ian@gcc.gnu.org> | 2011-10-26 23:57:58 +0000 |
---|---|---|
committer | Ian Lance Taylor <ian@gcc.gnu.org> | 2011-10-26 23:57:58 +0000 |
commit | d8f412571f8768df2d3239e72392dfeabbad1559 (patch) | |
tree | 19d182df05ead7ff8ba7ee00a7d57555e1383fdf /libgo/go/exp/regexp | |
parent | e0c39d66d4f0607177b1cf8995dda56a667e07b3 (diff) | |
download | gcc-d8f412571f8768df2d3239e72392dfeabbad1559.zip gcc-d8f412571f8768df2d3239e72392dfeabbad1559.tar.gz gcc-d8f412571f8768df2d3239e72392dfeabbad1559.tar.bz2 |
Update Go library to last weekly.
From-SVN: r180552
Diffstat (limited to 'libgo/go/exp/regexp')
-rw-r--r-- | libgo/go/exp/regexp/all_test.go | 429 | ||||
-rw-r--r-- | libgo/go/exp/regexp/exec.go | 295 | ||||
-rw-r--r-- | libgo/go/exp/regexp/find_test.go | 472 | ||||
-rw-r--r-- | libgo/go/exp/regexp/regexp.go | 795 | ||||
-rw-r--r-- | libgo/go/exp/regexp/syntax/compile.go | 269 | ||||
-rw-r--r-- | libgo/go/exp/regexp/syntax/parse.go | 1797 | ||||
-rw-r--r-- | libgo/go/exp/regexp/syntax/parse_test.go | 350 | ||||
-rw-r--r-- | libgo/go/exp/regexp/syntax/perl_groups.go | 130 | ||||
-rw-r--r-- | libgo/go/exp/regexp/syntax/prog.go | 237 | ||||
-rw-r--r-- | libgo/go/exp/regexp/syntax/prog_test.go | 91 | ||||
-rw-r--r-- | libgo/go/exp/regexp/syntax/regexp.go | 284 | ||||
-rw-r--r-- | libgo/go/exp/regexp/syntax/simplify.go | 151 | ||||
-rw-r--r-- | libgo/go/exp/regexp/syntax/simplify_test.go | 151 |
13 files changed, 0 insertions, 5451 deletions
diff --git a/libgo/go/exp/regexp/all_test.go b/libgo/go/exp/regexp/all_test.go deleted file mode 100644 index 77f32ca..0000000 --- a/libgo/go/exp/regexp/all_test.go +++ /dev/null @@ -1,429 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package regexp - -import ( - "os" - "strings" - "testing" -) - -var good_re = []string{ - ``, - `.`, - `^.$`, - `a`, - `a*`, - `a+`, - `a?`, - `a|b`, - `a*|b*`, - `(a*|b)(c*|d)`, - `[a-z]`, - `[a-abc-c\-\]\[]`, - `[a-z]+`, - `[abc]`, - `[^1234]`, - `[^\n]`, - `\!\\`, -} - -/* -type stringError struct { - re string - err os.Error -} - -var bad_re = []stringError{ - {`*`, ErrBareClosure}, - {`+`, ErrBareClosure}, - {`?`, ErrBareClosure}, - {`(abc`, ErrUnmatchedLpar}, - {`abc)`, ErrUnmatchedRpar}, - {`x[a-z`, ErrUnmatchedLbkt}, - {`abc]`, ErrUnmatchedRbkt}, - {`[z-a]`, ErrBadRange}, - {`abc\`, ErrExtraneousBackslash}, - {`a**`, ErrBadClosure}, - {`a*+`, ErrBadClosure}, - {`a??`, ErrBadClosure}, - {`\x`, ErrBadBackslash}, -} -*/ - -func compileTest(t *testing.T, expr string, error os.Error) *Regexp { - re, err := Compile(expr) - if err != error { - t.Error("compiling `", expr, "`; unexpected error: ", err.String()) - } - return re -} - -func TestGoodCompile(t *testing.T) { - for i := 0; i < len(good_re); i++ { - compileTest(t, good_re[i], nil) - } -} - -/* -func TestBadCompile(t *testing.T) { - for i := 0; i < len(bad_re); i++ { - compileTest(t, bad_re[i].re, bad_re[i].err) - } -} -*/ - -func matchTest(t *testing.T, test *FindTest) { - re := compileTest(t, test.pat, nil) - if re == nil { - return - } - m := re.MatchString(test.text) - if m != (len(test.matches) > 0) { - t.Errorf("MatchString failure on %s: %t should be %t", test, m, len(test.matches) > 0) - } - // now try bytes - m = re.Match([]byte(test.text)) - if m != (len(test.matches) > 0) { - t.Errorf("Match failure on %s: %t should be %t", test, m, len(test.matches) > 0) - } -} - -func TestMatch(t *testing.T) { - for _, test := range findTests { - matchTest(t, &test) - } -} - -func matchFunctionTest(t *testing.T, test *FindTest) { - m, err := MatchString(test.pat, test.text) - if err == nil { - return - } - if m != (len(test.matches) > 0) { - t.Errorf("Match failure on %s: %t should be %t", test, m, len(test.matches) > 0) - } -} - -func TestMatchFunction(t *testing.T) { - for _, test := range findTests { - matchFunctionTest(t, &test) - } -} - -type ReplaceTest struct { - pattern, replacement, input, output string -} - -var replaceTests = []ReplaceTest{ - // Test empty input and/or replacement, with pattern that matches the empty string. - {"", "", "", ""}, - {"", "x", "", "x"}, - {"", "", "abc", "abc"}, - {"", "x", "abc", "xaxbxcx"}, - - // Test empty input and/or replacement, with pattern that does not match the empty string. - {"b", "", "", ""}, - {"b", "x", "", ""}, - {"b", "", "abc", "ac"}, - {"b", "x", "abc", "axc"}, - {"y", "", "", ""}, - {"y", "x", "", ""}, - {"y", "", "abc", "abc"}, - {"y", "x", "abc", "abc"}, - - // Multibyte characters -- verify that we don't try to match in the middle - // of a character. - {"[a-c]*", "x", "\u65e5", "x\u65e5x"}, - {"[^\u65e5]", "x", "abc\u65e5def", "xxx\u65e5xxx"}, - - // Start and end of a string. - {"^[a-c]*", "x", "abcdabc", "xdabc"}, - {"[a-c]*$", "x", "abcdabc", "abcdx"}, - {"^[a-c]*$", "x", "abcdabc", "abcdabc"}, - {"^[a-c]*", "x", "abc", "x"}, - {"[a-c]*$", "x", "abc", "x"}, - {"^[a-c]*$", "x", "abc", "x"}, - {"^[a-c]*", "x", "dabce", "xdabce"}, - {"[a-c]*$", "x", "dabce", "dabcex"}, - {"^[a-c]*$", "x", "dabce", "dabce"}, - {"^[a-c]*", "x", "", "x"}, - {"[a-c]*$", "x", "", "x"}, - {"^[a-c]*$", "x", "", "x"}, - - {"^[a-c]+", "x", "abcdabc", "xdabc"}, - {"[a-c]+$", "x", "abcdabc", "abcdx"}, - {"^[a-c]+$", "x", "abcdabc", "abcdabc"}, - {"^[a-c]+", "x", "abc", "x"}, - {"[a-c]+$", "x", "abc", "x"}, - {"^[a-c]+$", "x", "abc", "x"}, - {"^[a-c]+", "x", "dabce", "dabce"}, - {"[a-c]+$", "x", "dabce", "dabce"}, - {"^[a-c]+$", "x", "dabce", "dabce"}, - {"^[a-c]+", "x", "", ""}, - {"[a-c]+$", "x", "", ""}, - {"^[a-c]+$", "x", "", ""}, - - // Other cases. - {"abc", "def", "abcdefg", "defdefg"}, - {"bc", "BC", "abcbcdcdedef", "aBCBCdcdedef"}, - {"abc", "", "abcdabc", "d"}, - {"x", "xXx", "xxxXxxx", "xXxxXxxXxXxXxxXxxXx"}, - {"abc", "d", "", ""}, - {"abc", "d", "abc", "d"}, - {".+", "x", "abc", "x"}, - {"[a-c]*", "x", "def", "xdxexfx"}, - {"[a-c]+", "x", "abcbcdcdedef", "xdxdedef"}, - {"[a-c]*", "x", "abcbcdcdedef", "xdxdxexdxexfx"}, -} - -type ReplaceFuncTest struct { - pattern string - replacement func(string) string - input, output string -} - -var replaceFuncTests = []ReplaceFuncTest{ - {"[a-c]", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxayxbyxcydef"}, - {"[a-c]+", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxabcydef"}, - {"[a-c]*", func(s string) string { return "x" + s + "y" }, "defabcdef", "xydxyexyfxabcydxyexyfxy"}, -} - -func TestReplaceAll(t *testing.T) { - for _, tc := range replaceTests { - re, err := Compile(tc.pattern) - if err != nil { - t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err) - continue - } - actual := re.ReplaceAllString(tc.input, tc.replacement) - if actual != tc.output { - t.Errorf("%q.Replace(%q,%q) = %q; want %q", - tc.pattern, tc.input, tc.replacement, actual, tc.output) - } - // now try bytes - actual = string(re.ReplaceAll([]byte(tc.input), []byte(tc.replacement))) - if actual != tc.output { - t.Errorf("%q.Replace(%q,%q) = %q; want %q", - tc.pattern, tc.input, tc.replacement, actual, tc.output) - } - } -} - -func TestReplaceAllFunc(t *testing.T) { - for _, tc := range replaceFuncTests { - re, err := Compile(tc.pattern) - if err != nil { - t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err) - continue - } - actual := re.ReplaceAllStringFunc(tc.input, tc.replacement) - if actual != tc.output { - t.Errorf("%q.ReplaceFunc(%q,%q) = %q; want %q", - tc.pattern, tc.input, tc.replacement, actual, tc.output) - } - // now try bytes - actual = string(re.ReplaceAllFunc([]byte(tc.input), func(s []byte) []byte { return []byte(tc.replacement(string(s))) })) - if actual != tc.output { - t.Errorf("%q.ReplaceFunc(%q,%q) = %q; want %q", - tc.pattern, tc.input, tc.replacement, actual, tc.output) - } - } -} - -type MetaTest struct { - pattern, output, literal string - isLiteral bool -} - -var metaTests = []MetaTest{ - {``, ``, ``, true}, - {`foo`, `foo`, `foo`, true}, - {`foo\.\$`, `foo\\\.\\\$`, `foo.$`, true}, // has meta but no operator - {`foo.\$`, `foo\.\\\$`, `foo`, false}, // has escaped operators and real operators - {`!@#$%^&*()_+-=[{]}\|,<.>/?~`, `!@#\$%\^&\*\(\)_\+-=\[\{\]\}\\\|,<\.>/\?~`, `!@#`, false}, -} - -func TestQuoteMeta(t *testing.T) { - for _, tc := range metaTests { - // Verify that QuoteMeta returns the expected string. - quoted := QuoteMeta(tc.pattern) - if quoted != tc.output { - t.Errorf("QuoteMeta(`%s`) = `%s`; want `%s`", - tc.pattern, quoted, tc.output) - continue - } - - // Verify that the quoted string is in fact treated as expected - // by Compile -- i.e. that it matches the original, unquoted string. - if tc.pattern != "" { - re, err := Compile(quoted) - if err != nil { - t.Errorf("Unexpected error compiling QuoteMeta(`%s`): %v", tc.pattern, err) - continue - } - src := "abc" + tc.pattern + "def" - repl := "xyz" - replaced := re.ReplaceAllString(src, repl) - expected := "abcxyzdef" - if replaced != expected { - t.Errorf("QuoteMeta(`%s`).Replace(`%s`,`%s`) = `%s`; want `%s`", - tc.pattern, src, repl, replaced, expected) - } - } - } -} - -func TestLiteralPrefix(t *testing.T) { - for _, tc := range metaTests { - // Literal method needs to scan the pattern. - re := MustCompile(tc.pattern) - str, complete := re.LiteralPrefix() - if complete != tc.isLiteral { - t.Errorf("LiteralPrefix(`%s`) = %t; want %t", tc.pattern, complete, tc.isLiteral) - } - if str != tc.literal { - t.Errorf("LiteralPrefix(`%s`) = `%s`; want `%s`", tc.pattern, str, tc.literal) - } - } -} - -type numSubexpCase struct { - input string - expected int -} - -var numSubexpCases = []numSubexpCase{ - {``, 0}, - {`.*`, 0}, - {`abba`, 0}, - {`ab(b)a`, 1}, - {`ab(.*)a`, 1}, - {`(.*)ab(.*)a`, 2}, - {`(.*)(ab)(.*)a`, 3}, - {`(.*)((a)b)(.*)a`, 4}, - {`(.*)(\(ab)(.*)a`, 3}, - {`(.*)(\(a\)b)(.*)a`, 3}, -} - -func TestNumSubexp(t *testing.T) { - for _, c := range numSubexpCases { - re := MustCompile(c.input) - n := re.NumSubexp() - if n != c.expected { - t.Errorf("NumSubexp for %q returned %d, expected %d", c.input, n, c.expected) - } - } -} - -func BenchmarkLiteral(b *testing.B) { - x := strings.Repeat("x", 50) + "y" - b.StopTimer() - re := MustCompile("y") - b.StartTimer() - for i := 0; i < b.N; i++ { - if !re.MatchString(x) { - println("no match!") - break - } - } -} - -func BenchmarkNotLiteral(b *testing.B) { - x := strings.Repeat("x", 50) + "y" - b.StopTimer() - re := MustCompile(".y") - b.StartTimer() - for i := 0; i < b.N; i++ { - if !re.MatchString(x) { - println("no match!") - break - } - } -} - -func BenchmarkMatchClass(b *testing.B) { - b.StopTimer() - x := strings.Repeat("xxxx", 20) + "w" - re := MustCompile("[abcdw]") - b.StartTimer() - for i := 0; i < b.N; i++ { - if !re.MatchString(x) { - println("no match!") - break - } - } -} - -func BenchmarkMatchClass_InRange(b *testing.B) { - b.StopTimer() - // 'b' is between 'a' and 'c', so the charclass - // range checking is no help here. - x := strings.Repeat("bbbb", 20) + "c" - re := MustCompile("[ac]") - b.StartTimer() - for i := 0; i < b.N; i++ { - if !re.MatchString(x) { - println("no match!") - break - } - } -} - -func BenchmarkReplaceAll(b *testing.B) { - x := "abcdefghijklmnopqrstuvwxyz" - b.StopTimer() - re := MustCompile("[cjrw]") - b.StartTimer() - for i := 0; i < b.N; i++ { - re.ReplaceAllString(x, "") - } -} - -func BenchmarkAnchoredLiteralShortNonMatch(b *testing.B) { - b.StopTimer() - x := []byte("abcdefghijklmnopqrstuvwxyz") - re := MustCompile("^zbc(d|e)") - b.StartTimer() - for i := 0; i < b.N; i++ { - re.Match(x) - } -} - -func BenchmarkAnchoredLiteralLongNonMatch(b *testing.B) { - b.StopTimer() - x := []byte("abcdefghijklmnopqrstuvwxyz") - for i := 0; i < 15; i++ { - x = append(x, x...) - } - re := MustCompile("^zbc(d|e)") - b.StartTimer() - for i := 0; i < b.N; i++ { - re.Match(x) - } -} - -func BenchmarkAnchoredShortMatch(b *testing.B) { - b.StopTimer() - x := []byte("abcdefghijklmnopqrstuvwxyz") - re := MustCompile("^.bc(d|e)") - b.StartTimer() - for i := 0; i < b.N; i++ { - re.Match(x) - } -} - -func BenchmarkAnchoredLongMatch(b *testing.B) { - b.StopTimer() - x := []byte("abcdefghijklmnopqrstuvwxyz") - for i := 0; i < 15; i++ { - x = append(x, x...) - } - re := MustCompile("^.bc(d|e)") - b.StartTimer() - for i := 0; i < b.N; i++ { - re.Match(x) - } -} diff --git a/libgo/go/exp/regexp/exec.go b/libgo/go/exp/regexp/exec.go deleted file mode 100644 index 0670bb9..0000000 --- a/libgo/go/exp/regexp/exec.go +++ /dev/null @@ -1,295 +0,0 @@ -package regexp - -import "exp/regexp/syntax" - -// A queue is a 'sparse array' holding pending threads of execution. -// See http://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html -type queue struct { - sparse []uint32 - dense []entry -} - -// A entry is an entry on a queue. -// It holds both the instruction pc and the actual thread. -// Some queue entries are just place holders so that the machine -// knows it has considered that pc. Such entries have t == nil. -type entry struct { - pc uint32 - t *thread -} - -// A thread is the state of a single path through the machine: -// an instruction and a corresponding capture array. -// See http://swtch.com/~rsc/regexp/regexp2.html -type thread struct { - inst *syntax.Inst - cap []int -} - -// A machine holds all the state during an NFA simulation for p. -type machine struct { - re *Regexp // corresponding Regexp - p *syntax.Prog // compiled program - q0, q1 queue // two queues for runq, nextq - pool []*thread // pool of available threads - matched bool // whether a match was found - matchcap []int // capture information for the match -} - -// progMachine returns a new machine running the prog p. -func progMachine(p *syntax.Prog) *machine { - m := &machine{p: p} - n := len(m.p.Inst) - m.q0 = queue{make([]uint32, n), make([]entry, 0, n)} - m.q1 = queue{make([]uint32, n), make([]entry, 0, n)} - ncap := p.NumCap - if ncap < 2 { - ncap = 2 - } - m.matchcap = make([]int, ncap) - return m -} - -// alloc allocates a new thread with the given instruction. -// It uses the free pool if possible. -func (m *machine) alloc(i *syntax.Inst) *thread { - var t *thread - if n := len(m.pool); n > 0 { - t = m.pool[n-1] - m.pool = m.pool[:n-1] - } else { - t = new(thread) - t.cap = make([]int, cap(m.matchcap)) - } - t.cap = t.cap[:len(m.matchcap)] - t.inst = i - return t -} - -// free returns t to the free pool. -func (m *machine) free(t *thread) { - m.pool = append(m.pool, t) -} - -// match runs the machine over the input starting at pos. -// It reports whether a match was found. -// If so, m.matchcap holds the submatch information. -func (m *machine) match(i input, pos int) bool { - startCond := m.re.cond - if startCond == ^syntax.EmptyOp(0) { // impossible - return false - } - m.matched = false - for i := range m.matchcap { - m.matchcap[i] = -1 - } - runq, nextq := &m.q0, &m.q1 - rune, rune1 := endOfText, endOfText - width, width1 := 0, 0 - rune, width = i.step(pos) - if rune != endOfText { - rune1, width1 = i.step(pos + width) - } - // TODO: Let caller specify the initial flag setting. - // For now assume pos == 0 is beginning of text and - // pos != 0 is not even beginning of line. - // TODO: Word boundary. - var flag syntax.EmptyOp - if pos == 0 { - flag = syntax.EmptyBeginText | syntax.EmptyBeginLine - } - - // Update flag using lookahead rune. - if rune1 == '\n' { - flag |= syntax.EmptyEndLine - } - if rune1 == endOfText { - flag |= syntax.EmptyEndText - } - - for { - if len(runq.dense) == 0 { - if startCond&syntax.EmptyBeginText != 0 && pos != 0 { - // Anchored match, past beginning of text. - break - } - if m.matched { - // Have match; finished exploring alternatives. - break - } - if len(m.re.prefix) > 0 && rune1 != m.re.prefixRune && i.canCheckPrefix() { - // Match requires literal prefix; fast search for it. - advance := i.index(m.re, pos) - if advance < 0 { - break - } - pos += advance - rune, width = i.step(pos) - rune1, width1 = i.step(pos + width) - } - } - if !m.matched { - if len(m.matchcap) > 0 { - m.matchcap[0] = pos - } - m.add(runq, uint32(m.p.Start), pos, m.matchcap, flag) - } - // TODO: word boundary - flag = 0 - if rune == '\n' { - flag |= syntax.EmptyBeginLine - } - if rune1 == '\n' { - flag |= syntax.EmptyEndLine - } - if rune1 == endOfText { - flag |= syntax.EmptyEndText - } - m.step(runq, nextq, pos, pos+width, rune, flag) - if width == 0 { - break - } - pos += width - rune, width = rune1, width1 - if rune != endOfText { - rune1, width1 = i.step(pos + width) - } - runq, nextq = nextq, runq - } - m.clear(nextq) - return m.matched -} - -// clear frees all threads on the thread queue. -func (m *machine) clear(q *queue) { - for _, d := range q.dense { - if d.t != nil { - m.free(d.t) - } - } - q.dense = q.dense[:0] -} - -// step executes one step of the machine, running each of the threads -// on runq and appending new threads to nextq. -// The step processes the rune c (which may be endOfText), -// which starts at position pos and ends at nextPos. -// nextCond gives the setting for the empty-width flags after c. -func (m *machine) step(runq, nextq *queue, pos, nextPos, c int, nextCond syntax.EmptyOp) { - for j := 0; j < len(runq.dense); j++ { - d := &runq.dense[j] - t := d.t - if t == nil { - continue - } - /* - * If we support leftmost-longest matching: - if longest && matched && match[0] < t.cap[0] { - m.free(t) - continue - } - */ - - i := t.inst - switch i.Op { - default: - panic("bad inst") - - case syntax.InstMatch: - if len(t.cap) > 0 { - t.cap[1] = pos - copy(m.matchcap, t.cap) - } - m.matched = true - for _, d := range runq.dense[j+1:] { - if d.t != nil { - m.free(d.t) - } - } - runq.dense = runq.dense[:0] - - case syntax.InstRune: - if i.MatchRune(c) { - m.add(nextq, i.Out, nextPos, t.cap, nextCond) - } - } - m.free(t) - } - runq.dense = runq.dense[:0] -} - -// add adds an entry to q for pc, unless the q already has such an entry. -// It also recursively adds an entry for all instructions reachable from pc by following -// empty-width conditions satisfied by cond. pos gives the current position -// in the input. -func (m *machine) add(q *queue, pc uint32, pos int, cap []int, cond syntax.EmptyOp) { - if pc == 0 { - return - } - if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc { - return - } - - j := len(q.dense) - q.dense = q.dense[:j+1] - d := &q.dense[j] - d.t = nil - d.pc = pc - q.sparse[pc] = uint32(j) - - i := &m.p.Inst[pc] - switch i.Op { - default: - panic("unhandled") - case syntax.InstFail: - // nothing - case syntax.InstAlt, syntax.InstAltMatch: - m.add(q, i.Out, pos, cap, cond) - m.add(q, i.Arg, pos, cap, cond) - case syntax.InstEmptyWidth: - if syntax.EmptyOp(i.Arg)&^cond == 0 { - m.add(q, i.Out, pos, cap, cond) - } - case syntax.InstNop: - m.add(q, i.Out, pos, cap, cond) - case syntax.InstCapture: - if int(i.Arg) < len(cap) { - opos := cap[i.Arg] - cap[i.Arg] = pos - m.add(q, i.Out, pos, cap, cond) - cap[i.Arg] = opos - } else { - m.add(q, i.Out, pos, cap, cond) - } - case syntax.InstMatch, syntax.InstRune: - t := m.alloc(i) - if len(t.cap) > 0 { - copy(t.cap, cap) - } - d.t = t - } -} - -// empty is a non-nil 0-element slice, -// so doExecute can avoid an allocation -// when 0 captures are requested from a successful match. -var empty = make([]int, 0) - -// doExecute finds the leftmost match in the input and returns -// the position of its subexpressions. -func (re *Regexp) doExecute(i input, pos int, ncap int) []int { - m := re.get() - m.matchcap = m.matchcap[:ncap] - if !m.match(i, pos) { - re.put(m) - return nil - } - if ncap == 0 { - re.put(m) - return empty // empty but not nil - } - cap := make([]int, ncap) - copy(cap, m.matchcap) - re.put(m) - return cap -} diff --git a/libgo/go/exp/regexp/find_test.go b/libgo/go/exp/regexp/find_test.go deleted file mode 100644 index dddc348..0000000 --- a/libgo/go/exp/regexp/find_test.go +++ /dev/null @@ -1,472 +0,0 @@ -// Copyright 2010 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package regexp - -import ( - "fmt" - "strings" - "testing" -) - -// For each pattern/text pair, what is the expected output of each function? -// We can derive the textual results from the indexed results, the non-submatch -// results from the submatched results, the single results from the 'all' results, -// and the byte results from the string results. Therefore the table includes -// only the FindAllStringSubmatchIndex result. -type FindTest struct { - pat string - text string - matches [][]int -} - -func (t FindTest) String() string { - return fmt.Sprintf("pat: %#q text: %#q", t.pat, t.text) -} - -var findTests = []FindTest{ - {``, ``, build(1, 0, 0)}, - {`^abcdefg`, "abcdefg", build(1, 0, 7)}, - {`a+`, "baaab", build(1, 1, 4)}, - {"abcd..", "abcdef", build(1, 0, 6)}, - {`a`, "a", build(1, 0, 1)}, - {`x`, "y", nil}, - {`b`, "abc", build(1, 1, 2)}, - {`.`, "a", build(1, 0, 1)}, - {`.*`, "abcdef", build(1, 0, 6)}, - {`^`, "abcde", build(1, 0, 0)}, - {`$`, "abcde", build(1, 5, 5)}, - {`^abcd$`, "abcd", build(1, 0, 4)}, - {`^bcd'`, "abcdef", nil}, - {`^abcd$`, "abcde", nil}, - {`a+`, "baaab", build(1, 1, 4)}, - {`a*`, "baaab", build(3, 0, 0, 1, 4, 5, 5)}, - {`[a-z]+`, "abcd", build(1, 0, 4)}, - {`[^a-z]+`, "ab1234cd", build(1, 2, 6)}, - {`[a\-\]z]+`, "az]-bcz", build(2, 0, 4, 6, 7)}, - {`[^\n]+`, "abcd\n", build(1, 0, 4)}, - {`[日本語]+`, "日本語日本語", build(1, 0, 18)}, - {`日本語+`, "日本語", build(1, 0, 9)}, - {`日本語+`, "日本語語語語", build(1, 0, 18)}, - {`()`, "", build(1, 0, 0, 0, 0)}, - {`(a)`, "a", build(1, 0, 1, 0, 1)}, - {`(.)(.)`, "日a", build(1, 0, 4, 0, 3, 3, 4)}, - {`(.*)`, "", build(1, 0, 0, 0, 0)}, - {`(.*)`, "abcd", build(1, 0, 4, 0, 4)}, - {`(..)(..)`, "abcd", build(1, 0, 4, 0, 2, 2, 4)}, - {`(([^xyz]*)(d))`, "abcd", build(1, 0, 4, 0, 4, 0, 3, 3, 4)}, - {`((a|b|c)*(d))`, "abcd", build(1, 0, 4, 0, 4, 2, 3, 3, 4)}, - {`(((a|b|c)*)(d))`, "abcd", build(1, 0, 4, 0, 4, 0, 3, 2, 3, 3, 4)}, - {`\a\f\n\r\t\v`, "\a\f\n\r\t\v", build(1, 0, 6)}, - {`[\a\f\n\r\t\v]+`, "\a\f\n\r\t\v", build(1, 0, 6)}, - - {`a*(|(b))c*`, "aacc", build(1, 0, 4, 2, 2, -1, -1)}, - {`(.*).*`, "ab", build(1, 0, 2, 0, 2)}, - {`[.]`, ".", build(1, 0, 1)}, - {`/$`, "/abc/", build(1, 4, 5)}, - {`/$`, "/abc", nil}, - - // multiple matches - {`.`, "abc", build(3, 0, 1, 1, 2, 2, 3)}, - {`(.)`, "abc", build(3, 0, 1, 0, 1, 1, 2, 1, 2, 2, 3, 2, 3)}, - {`.(.)`, "abcd", build(2, 0, 2, 1, 2, 2, 4, 3, 4)}, - {`ab*`, "abbaab", build(3, 0, 3, 3, 4, 4, 6)}, - {`a(b*)`, "abbaab", build(3, 0, 3, 1, 3, 3, 4, 4, 4, 4, 6, 5, 6)}, - - // fixed bugs - {`ab$`, "cab", build(1, 1, 3)}, - {`axxb$`, "axxcb", nil}, - {`data`, "daXY data", build(1, 5, 9)}, - {`da(.)a$`, "daXY data", build(1, 5, 9, 7, 8)}, - {`zx+`, "zzx", build(1, 1, 3)}, - - // can backslash-escape any punctuation - {`\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~`, - `!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)}, - {`[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~]+`, - `!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)}, - {"\\`", "`", build(1, 0, 1)}, - {"[\\`]+", "`", build(1, 0, 1)}, - - // long set of matches (longer than startSize) - { - ".", - "qwertyuiopasdfghjklzxcvbnm1234567890", - build(36, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, - 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, - 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, - 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36), - }, -} - -// build is a helper to construct a [][]int by extracting n sequences from x. -// This represents n matches with len(x)/n submatches each. -func build(n int, x ...int) [][]int { - ret := make([][]int, n) - runLength := len(x) / n - j := 0 - for i := range ret { - ret[i] = make([]int, runLength) - copy(ret[i], x[j:]) - j += runLength - if j > len(x) { - panic("invalid build entry") - } - } - return ret -} - -// First the simple cases. - -func TestFind(t *testing.T) { - for _, test := range findTests { - re := MustCompile(test.pat) - if re.String() != test.pat { - t.Errorf("String() = `%s`; should be `%s`", re.String(), test.pat) - } - result := re.Find([]byte(test.text)) - switch { - case len(test.matches) == 0 && len(result) == 0: - // ok - case test.matches == nil && result != nil: - t.Errorf("expected no match; got one: %s", test) - case test.matches != nil && result == nil: - t.Errorf("expected match; got none: %s", test) - case test.matches != nil && result != nil: - expect := test.text[test.matches[0][0]:test.matches[0][1]] - if expect != string(result) { - t.Errorf("expected %q got %q: %s", expect, result, test) - } - } - } -} - -func TestFindString(t *testing.T) { - for _, test := range findTests { - result := MustCompile(test.pat).FindString(test.text) - switch { - case len(test.matches) == 0 && len(result) == 0: - // ok - case test.matches == nil && result != "": - t.Errorf("expected no match; got one: %s", test) - case test.matches != nil && result == "": - // Tricky because an empty result has two meanings: no match or empty match. - if test.matches[0][0] != test.matches[0][1] { - t.Errorf("expected match; got none: %s", test) - } - case test.matches != nil && result != "": - expect := test.text[test.matches[0][0]:test.matches[0][1]] - if expect != result { - t.Errorf("expected %q got %q: %s", expect, result, test) - } - } - } -} - -func testFindIndex(test *FindTest, result []int, t *testing.T) { - switch { - case len(test.matches) == 0 && len(result) == 0: - // ok - case test.matches == nil && result != nil: - t.Errorf("expected no match; got one: %s", test) - case test.matches != nil && result == nil: - t.Errorf("expected match; got none: %s", test) - case test.matches != nil && result != nil: - expect := test.matches[0] - if expect[0] != result[0] || expect[1] != result[1] { - t.Errorf("expected %v got %v: %s", expect, result, test) - } - } -} - -func TestFindIndex(t *testing.T) { - for _, test := range findTests { - testFindIndex(&test, MustCompile(test.pat).FindIndex([]byte(test.text)), t) - } -} - -func TestFindStringIndex(t *testing.T) { - for _, test := range findTests { - testFindIndex(&test, MustCompile(test.pat).FindStringIndex(test.text), t) - } -} - -func TestFindReaderIndex(t *testing.T) { - for _, test := range findTests { - testFindIndex(&test, MustCompile(test.pat).FindReaderIndex(strings.NewReader(test.text)), t) - } -} - -// Now come the simple All cases. - -func TestFindAll(t *testing.T) { - for _, test := range findTests { - result := MustCompile(test.pat).FindAll([]byte(test.text), -1) - switch { - case test.matches == nil && result == nil: - // ok - case test.matches == nil && result != nil: - t.Errorf("expected no match; got one: %s", test) - case test.matches != nil && result == nil: - t.Fatalf("expected match; got none: %s", test) - case test.matches != nil && result != nil: - if len(test.matches) != len(result) { - t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) - continue - } - for k, e := range test.matches { - expect := test.text[e[0]:e[1]] - if expect != string(result[k]) { - t.Errorf("match %d: expected %q got %q: %s", k, expect, result[k], test) - } - } - } - } -} - -func TestFindAllString(t *testing.T) { - for _, test := range findTests { - result := MustCompile(test.pat).FindAllString(test.text, -1) - switch { - case test.matches == nil && result == nil: - // ok - case test.matches == nil && result != nil: - t.Errorf("expected no match; got one: %s", test) - case test.matches != nil && result == nil: - t.Errorf("expected match; got none: %s", test) - case test.matches != nil && result != nil: - if len(test.matches) != len(result) { - t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) - continue - } - for k, e := range test.matches { - expect := test.text[e[0]:e[1]] - if expect != result[k] { - t.Errorf("expected %q got %q: %s", expect, result, test) - } - } - } - } -} - -func testFindAllIndex(test *FindTest, result [][]int, t *testing.T) { - switch { - case test.matches == nil && result == nil: - // ok - case test.matches == nil && result != nil: - t.Errorf("expected no match; got one: %s", test) - case test.matches != nil && result == nil: - t.Errorf("expected match; got none: %s", test) - case test.matches != nil && result != nil: - if len(test.matches) != len(result) { - t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) - return - } - for k, e := range test.matches { - if e[0] != result[k][0] || e[1] != result[k][1] { - t.Errorf("match %d: expected %v got %v: %s", k, e, result[k], test) - } - } - } -} - -func TestFindAllIndex(t *testing.T) { - for _, test := range findTests { - testFindAllIndex(&test, MustCompile(test.pat).FindAllIndex([]byte(test.text), -1), t) - } -} - -func TestFindAllStringIndex(t *testing.T) { - for _, test := range findTests { - testFindAllIndex(&test, MustCompile(test.pat).FindAllStringIndex(test.text, -1), t) - } -} - -// Now come the Submatch cases. - -func testSubmatchBytes(test *FindTest, n int, submatches []int, result [][]byte, t *testing.T) { - if len(submatches) != len(result)*2 { - t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test) - return - } - for k := 0; k < len(submatches); k += 2 { - if submatches[k] == -1 { - if result[k/2] != nil { - t.Errorf("match %d: expected nil got %q: %s", n, result, test) - } - continue - } - expect := test.text[submatches[k]:submatches[k+1]] - if expect != string(result[k/2]) { - t.Errorf("match %d: expected %q got %q: %s", n, expect, result, test) - return - } - } -} - -func TestFindSubmatch(t *testing.T) { - for _, test := range findTests { - result := MustCompile(test.pat).FindSubmatch([]byte(test.text)) - switch { - case test.matches == nil && result == nil: - // ok - case test.matches == nil && result != nil: - t.Errorf("expected no match; got one: %s", test) - case test.matches != nil && result == nil: - t.Errorf("expected match; got none: %s", test) - case test.matches != nil && result != nil: - testSubmatchBytes(&test, 0, test.matches[0], result, t) - } - } -} - -func testSubmatchString(test *FindTest, n int, submatches []int, result []string, t *testing.T) { - if len(submatches) != len(result)*2 { - t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test) - return - } - for k := 0; k < len(submatches); k += 2 { - if submatches[k] == -1 { - if result[k/2] != "" { - t.Errorf("match %d: expected nil got %q: %s", n, result, test) - } - continue - } - expect := test.text[submatches[k]:submatches[k+1]] - if expect != result[k/2] { - t.Errorf("match %d: expected %q got %q: %s", n, expect, result, test) - return - } - } -} - -func TestFindStringSubmatch(t *testing.T) { - for _, test := range findTests { - result := MustCompile(test.pat).FindStringSubmatch(test.text) - switch { - case test.matches == nil && result == nil: - // ok - case test.matches == nil && result != nil: - t.Errorf("expected no match; got one: %s", test) - case test.matches != nil && result == nil: - t.Errorf("expected match; got none: %s", test) - case test.matches != nil && result != nil: - testSubmatchString(&test, 0, test.matches[0], result, t) - } - } -} - -func testSubmatchIndices(test *FindTest, n int, expect, result []int, t *testing.T) { - if len(expect) != len(result) { - t.Errorf("match %d: expected %d matches; got %d: %s", n, len(expect)/2, len(result)/2, test) - return - } - for k, e := range expect { - if e != result[k] { - t.Errorf("match %d: submatch error: expected %v got %v: %s", n, expect, result, test) - } - } -} - -func testFindSubmatchIndex(test *FindTest, result []int, t *testing.T) { - switch { - case test.matches == nil && result == nil: - // ok - case test.matches == nil && result != nil: - t.Errorf("expected no match; got one: %s", test) - case test.matches != nil && result == nil: - t.Errorf("expected match; got none: %s", test) - case test.matches != nil && result != nil: - testSubmatchIndices(test, 0, test.matches[0], result, t) - } -} - -func TestFindSubmatchIndex(t *testing.T) { - for _, test := range findTests { - testFindSubmatchIndex(&test, MustCompile(test.pat).FindSubmatchIndex([]byte(test.text)), t) - } -} - -func TestFindStringSubmatchIndex(t *testing.T) { - for _, test := range findTests { - testFindSubmatchIndex(&test, MustCompile(test.pat).FindStringSubmatchIndex(test.text), t) - } -} - -func TestFindReaderSubmatchIndex(t *testing.T) { - for _, test := range findTests { - testFindSubmatchIndex(&test, MustCompile(test.pat).FindReaderSubmatchIndex(strings.NewReader(test.text)), t) - } -} - -// Now come the monster AllSubmatch cases. - -func TestFindAllSubmatch(t *testing.T) { - for _, test := range findTests { - result := MustCompile(test.pat).FindAllSubmatch([]byte(test.text), -1) - switch { - case test.matches == nil && result == nil: - // ok - case test.matches == nil && result != nil: - t.Errorf("expected no match; got one: %s", test) - case test.matches != nil && result == nil: - t.Errorf("expected match; got none: %s", test) - case len(test.matches) != len(result): - t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) - case test.matches != nil && result != nil: - for k, match := range test.matches { - testSubmatchBytes(&test, k, match, result[k], t) - } - } - } -} - -func TestFindAllStringSubmatch(t *testing.T) { - for _, test := range findTests { - result := MustCompile(test.pat).FindAllStringSubmatch(test.text, -1) - switch { - case test.matches == nil && result == nil: - // ok - case test.matches == nil && result != nil: - t.Errorf("expected no match; got one: %s", test) - case test.matches != nil && result == nil: - t.Errorf("expected match; got none: %s", test) - case len(test.matches) != len(result): - t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) - case test.matches != nil && result != nil: - for k, match := range test.matches { - testSubmatchString(&test, k, match, result[k], t) - } - } - } -} - -func testFindAllSubmatchIndex(test *FindTest, result [][]int, t *testing.T) { - switch { - case test.matches == nil && result == nil: - // ok - case test.matches == nil && result != nil: - t.Errorf("expected no match; got one: %s", test) - case test.matches != nil && result == nil: - t.Errorf("expected match; got none: %s", test) - case len(test.matches) != len(result): - t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) - case test.matches != nil && result != nil: - for k, match := range test.matches { - testSubmatchIndices(test, k, match, result[k], t) - } - } -} - -func TestFindAllSubmatchIndex(t *testing.T) { - for _, test := range findTests { - testFindAllSubmatchIndex(&test, MustCompile(test.pat).FindAllSubmatchIndex([]byte(test.text), -1), t) - } -} - -func TestFindAllStringSubmatchIndex(t *testing.T) { - for _, test := range findTests { - testFindAllSubmatchIndex(&test, MustCompile(test.pat).FindAllStringSubmatchIndex(test.text, -1), t) - } -} diff --git a/libgo/go/exp/regexp/regexp.go b/libgo/go/exp/regexp/regexp.go deleted file mode 100644 index 1b75900..0000000 --- a/libgo/go/exp/regexp/regexp.go +++ /dev/null @@ -1,795 +0,0 @@ -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package regexp implements a simple regular expression library. -// -// The syntax of the regular expressions accepted is the same -// general syntax used by Perl, Python, and other languages. -// More precisely, it is the syntax accepted by RE2 and described at -// http://code.google.com/p/re2/wiki/Syntax, except for \C. -// -// All characters are UTF-8-encoded code points. -// -// There are 16 methods of Regexp that match a regular expression and identify -// the matched text. Their names are matched by this regular expression: -// -// Find(All)?(String)?(Submatch)?(Index)? -// -// If 'All' is present, the routine matches successive non-overlapping -// matches of the entire expression. Empty matches abutting a preceding -// match are ignored. The return value is a slice containing the successive -// return values of the corresponding non-'All' routine. These routines take -// an extra integer argument, n; if n >= 0, the function returns at most n -// matches/submatches. -// -// If 'String' is present, the argument is a string; otherwise it is a slice -// of bytes; return values are adjusted as appropriate. -// -// If 'Submatch' is present, the return value is a slice identifying the -// successive submatches of the expression. Submatches are matches of -// parenthesized subexpressions within the regular expression, numbered from -// left to right in order of opening parenthesis. Submatch 0 is the match of -// the entire expression, submatch 1 the match of the first parenthesized -// subexpression, and so on. -// -// If 'Index' is present, matches and submatches are identified by byte index -// pairs within the input string: result[2*n:2*n+1] identifies the indexes of -// the nth submatch. The pair for n==0 identifies the match of the entire -// expression. If 'Index' is not present, the match is identified by the -// text of the match/submatch. If an index is negative, it means that -// subexpression did not match any string in the input. -// -// There is also a subset of the methods that can be applied to text read -// from a RuneReader: -// -// MatchReader, FindReaderIndex, FindReaderSubmatchIndex -// -// This set may grow. Note that regular expression matches may need to -// examine text beyond the text returned by a match, so the methods that -// match text from a RuneReader may read arbitrarily far into the input -// before returning. -// -// (There are a few other methods that do not match this pattern.) -// -package regexp - -import ( - "bytes" - "exp/regexp/syntax" - "io" - "os" - "strings" - "sync" - "utf8" -) - -var debug = false - -// Error is the local type for a parsing error. -type Error string - -func (e Error) String() string { - return string(e) -} - -// Regexp is the representation of a compiled regular expression. -// The public interface is entirely through methods. -// A Regexp is safe for concurrent use by multiple goroutines. -type Regexp struct { - // read-only after Compile - expr string // as passed to Compile - prog *syntax.Prog // compiled program - prefix string // required prefix in unanchored matches - prefixBytes []byte // prefix, as a []byte - prefixComplete bool // prefix is the entire regexp - prefixRune int // first rune in prefix - cond syntax.EmptyOp // empty-width conditions required at start of match - - // cache of machines for running regexp - mu sync.Mutex - machine []*machine -} - -// String returns the source text used to compile the regular expression. -func (re *Regexp) String() string { - return re.expr -} - -// Compile parses a regular expression and returns, if successful, a Regexp -// object that can be used to match against text. -func Compile(expr string) (*Regexp, os.Error) { - re, err := syntax.Parse(expr, syntax.Perl) - if err != nil { - return nil, err - } - prog, err := syntax.Compile(re) - if err != nil { - return nil, err - } - regexp := &Regexp{ - expr: expr, - prog: prog, - } - regexp.prefix, regexp.prefixComplete = prog.Prefix() - if regexp.prefix != "" { - // TODO(rsc): Remove this allocation by adding - // IndexString to package bytes. - regexp.prefixBytes = []byte(regexp.prefix) - regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix) - } - regexp.cond = prog.StartCond() - return regexp, nil -} - -// get returns a machine to use for matching re. -// It uses the re's machine cache if possible, to avoid -// unnecessary allocation. -func (re *Regexp) get() *machine { - re.mu.Lock() - if n := len(re.machine); n > 0 { - z := re.machine[n-1] - re.machine = re.machine[:n-1] - re.mu.Unlock() - return z - } - re.mu.Unlock() - z := progMachine(re.prog) - z.re = re - return z -} - -// put returns a machine to the re's machine cache. -// There is no attempt to limit the size of the cache, so it will -// grow to the maximum number of simultaneous matches -// run using re. (The cache empties when re gets garbage collected.) -func (re *Regexp) put(z *machine) { - re.mu.Lock() - re.machine = append(re.machine, z) - re.mu.Unlock() -} - -// MustCompile is like Compile but panics if the expression cannot be parsed. -// It simplifies safe initialization of global variables holding compiled regular -// expressions. -func MustCompile(str string) *Regexp { - regexp, error := Compile(str) - if error != nil { - panic(`regexp: compiling "` + str + `": ` + error.String()) - } - return regexp -} - -// NumSubexp returns the number of parenthesized subexpressions in this Regexp. -func (re *Regexp) NumSubexp() int { - // NumCap/2 because captures count ( and ) separately. - // -1 because NumCap counts $0 but NumSubexp does not. - return re.prog.NumCap/2 - 1 -} - -const endOfText = -1 - -// input abstracts different representations of the input text. It provides -// one-character lookahead. -type input interface { - step(pos int) (rune int, width int) // advance one rune - canCheckPrefix() bool // can we look ahead without losing info? - hasPrefix(re *Regexp) bool - index(re *Regexp, pos int) int -} - -// inputString scans a string. -type inputString struct { - str string -} - -func newInputString(str string) *inputString { - return &inputString{str: str} -} - -func (i *inputString) step(pos int) (int, int) { - if pos < len(i.str) { - return utf8.DecodeRuneInString(i.str[pos:len(i.str)]) - } - return endOfText, 0 -} - -func (i *inputString) canCheckPrefix() bool { - return true -} - -func (i *inputString) hasPrefix(re *Regexp) bool { - return strings.HasPrefix(i.str, re.prefix) -} - -func (i *inputString) index(re *Regexp, pos int) int { - return strings.Index(i.str[pos:], re.prefix) -} - -// inputBytes scans a byte slice. -type inputBytes struct { - str []byte -} - -func newInputBytes(str []byte) *inputBytes { - return &inputBytes{str: str} -} - -func (i *inputBytes) step(pos int) (int, int) { - if pos < len(i.str) { - return utf8.DecodeRune(i.str[pos:len(i.str)]) - } - return endOfText, 0 -} - -func (i *inputBytes) canCheckPrefix() bool { - return true -} - -func (i *inputBytes) hasPrefix(re *Regexp) bool { - return bytes.HasPrefix(i.str, re.prefixBytes) -} - -func (i *inputBytes) index(re *Regexp, pos int) int { - return bytes.Index(i.str[pos:], re.prefixBytes) -} - -// inputReader scans a RuneReader. -type inputReader struct { - r io.RuneReader - atEOT bool - pos int -} - -func newInputReader(r io.RuneReader) *inputReader { - return &inputReader{r: r} -} - -func (i *inputReader) step(pos int) (int, int) { - if !i.atEOT && pos != i.pos { - return endOfText, 0 - - } - r, w, err := i.r.ReadRune() - if err != nil { - i.atEOT = true - return endOfText, 0 - } - i.pos += w - return r, w -} - -func (i *inputReader) canCheckPrefix() bool { - return false -} - -func (i *inputReader) hasPrefix(re *Regexp) bool { - return false -} - -func (i *inputReader) index(re *Regexp, pos int) int { - return -1 -} - -// LiteralPrefix returns a literal string that must begin any match -// of the regular expression re. It returns the boolean true if the -// literal string comprises the entire regular expression. -func (re *Regexp) LiteralPrefix() (prefix string, complete bool) { - return re.prefix, re.prefixComplete -} - -// MatchReader returns whether the Regexp matches the text read by the -// RuneReader. The return value is a boolean: true for match, false for no -// match. -func (re *Regexp) MatchReader(r io.RuneReader) bool { - return re.doExecute(newInputReader(r), 0, 0) != nil -} - -// MatchString returns whether the Regexp matches the string s. -// The return value is a boolean: true for match, false for no match. -func (re *Regexp) MatchString(s string) bool { - return re.doExecute(newInputString(s), 0, 0) != nil -} - -// Match returns whether the Regexp matches the byte slice b. -// The return value is a boolean: true for match, false for no match. -func (re *Regexp) Match(b []byte) bool { - return re.doExecute(newInputBytes(b), 0, 0) != nil -} - -// MatchReader checks whether a textual regular expression matches the text -// read by the RuneReader. More complicated queries need to use Compile and -// the full Regexp interface. -func MatchReader(pattern string, r io.RuneReader) (matched bool, error os.Error) { - re, err := Compile(pattern) - if err != nil { - return false, err - } - return re.MatchReader(r), nil -} - -// MatchString checks whether a textual regular expression -// matches a string. More complicated queries need -// to use Compile and the full Regexp interface. -func MatchString(pattern string, s string) (matched bool, error os.Error) { - re, err := Compile(pattern) - if err != nil { - return false, err - } - return re.MatchString(s), nil -} - -// Match checks whether a textual regular expression -// matches a byte slice. More complicated queries need -// to use Compile and the full Regexp interface. -func Match(pattern string, b []byte) (matched bool, error os.Error) { - re, err := Compile(pattern) - if err != nil { - return false, err - } - return re.Match(b), nil -} - -// ReplaceAllString returns a copy of src in which all matches for the Regexp -// have been replaced by repl. No support is provided for expressions -// (e.g. \1 or $1) in the replacement string. -func (re *Regexp) ReplaceAllString(src, repl string) string { - return re.ReplaceAllStringFunc(src, func(string) string { return repl }) -} - -// ReplaceAllStringFunc returns a copy of src in which all matches for the -// Regexp have been replaced by the return value of of function repl (whose -// first argument is the matched string). No support is provided for -// expressions (e.g. \1 or $1) in the replacement string. -func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string { - lastMatchEnd := 0 // end position of the most recent match - searchPos := 0 // position where we next look for a match - buf := new(bytes.Buffer) - for searchPos <= len(src) { - a := re.doExecute(newInputString(src), searchPos, 2) - if len(a) == 0 { - break // no more matches - } - - // Copy the unmatched characters before this match. - io.WriteString(buf, src[lastMatchEnd:a[0]]) - - // Now insert a copy of the replacement string, but not for a - // match of the empty string immediately after another match. - // (Otherwise, we get double replacement for patterns that - // match both empty and nonempty strings.) - if a[1] > lastMatchEnd || a[0] == 0 { - io.WriteString(buf, repl(src[a[0]:a[1]])) - } - lastMatchEnd = a[1] - - // Advance past this match; always advance at least one character. - _, width := utf8.DecodeRuneInString(src[searchPos:]) - if searchPos+width > a[1] { - searchPos += width - } else if searchPos+1 > a[1] { - // This clause is only needed at the end of the input - // string. In that case, DecodeRuneInString returns width=0. - searchPos++ - } else { - searchPos = a[1] - } - } - - // Copy the unmatched characters after the last match. - io.WriteString(buf, src[lastMatchEnd:]) - - return buf.String() -} - -// ReplaceAll returns a copy of src in which all matches for the Regexp -// have been replaced by repl. No support is provided for expressions -// (e.g. \1 or $1) in the replacement text. -func (re *Regexp) ReplaceAll(src, repl []byte) []byte { - return re.ReplaceAllFunc(src, func([]byte) []byte { return repl }) -} - -// ReplaceAllFunc returns a copy of src in which all matches for the -// Regexp have been replaced by the return value of of function repl (whose -// first argument is the matched []byte). No support is provided for -// expressions (e.g. \1 or $1) in the replacement string. -func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { - lastMatchEnd := 0 // end position of the most recent match - searchPos := 0 // position where we next look for a match - buf := new(bytes.Buffer) - for searchPos <= len(src) { - a := re.doExecute(newInputBytes(src), searchPos, 2) - if len(a) == 0 { - break // no more matches - } - - // Copy the unmatched characters before this match. - buf.Write(src[lastMatchEnd:a[0]]) - - // Now insert a copy of the replacement string, but not for a - // match of the empty string immediately after another match. - // (Otherwise, we get double replacement for patterns that - // match both empty and nonempty strings.) - if a[1] > lastMatchEnd || a[0] == 0 { - buf.Write(repl(src[a[0]:a[1]])) - } - lastMatchEnd = a[1] - - // Advance past this match; always advance at least one character. - _, width := utf8.DecodeRune(src[searchPos:]) - if searchPos+width > a[1] { - searchPos += width - } else if searchPos+1 > a[1] { - // This clause is only needed at the end of the input - // string. In that case, DecodeRuneInString returns width=0. - searchPos++ - } else { - searchPos = a[1] - } - } - - // Copy the unmatched characters after the last match. - buf.Write(src[lastMatchEnd:]) - - return buf.Bytes() -} - -var specialBytes = []byte(`\.+*?()|[]{}^$`) - -func special(b byte) bool { - return bytes.IndexByte(specialBytes, b) >= 0 -} - -// QuoteMeta returns a string that quotes all regular expression metacharacters -// inside the argument text; the returned string is a regular expression matching -// the literal text. For example, QuoteMeta(`[foo]`) returns `\[foo\]`. -func QuoteMeta(s string) string { - b := make([]byte, 2*len(s)) - - // A byte loop is correct because all metacharacters are ASCII. - j := 0 - for i := 0; i < len(s); i++ { - if special(s[i]) { - b[j] = '\\' - j++ - } - b[j] = s[i] - j++ - } - return string(b[0:j]) -} - -// Find matches in slice b if b is non-nil, otherwise find matches in string s. -func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) { - var end int - if b == nil { - end = len(s) - } else { - end = len(b) - } - - for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; { - var in input - if b == nil { - in = newInputString(s) - } else { - in = newInputBytes(b) - } - matches := re.doExecute(in, pos, re.prog.NumCap) - if len(matches) == 0 { - break - } - - accept := true - if matches[1] == pos { - // We've found an empty match. - if matches[0] == prevMatchEnd { - // We don't allow an empty match right - // after a previous match, so ignore it. - accept = false - } - var width int - // TODO: use step() - if b == nil { - _, width = utf8.DecodeRuneInString(s[pos:end]) - } else { - _, width = utf8.DecodeRune(b[pos:end]) - } - if width > 0 { - pos += width - } else { - pos = end + 1 - } - } else { - pos = matches[1] - } - prevMatchEnd = matches[1] - - if accept { - deliver(matches) - i++ - } - } -} - -// Find returns a slice holding the text of the leftmost match in b of the regular expression. -// A return value of nil indicates no match. -func (re *Regexp) Find(b []byte) []byte { - a := re.doExecute(newInputBytes(b), 0, 2) - if a == nil { - return nil - } - return b[a[0]:a[1]] -} - -// FindIndex returns a two-element slice of integers defining the location of -// the leftmost match in b of the regular expression. The match itself is at -// b[loc[0]:loc[1]]. -// A return value of nil indicates no match. -func (re *Regexp) FindIndex(b []byte) (loc []int) { - a := re.doExecute(newInputBytes(b), 0, 2) - if a == nil { - return nil - } - return a[0:2] -} - -// FindString returns a string holding the text of the leftmost match in s of the regular -// expression. If there is no match, the return value is an empty string, -// but it will also be empty if the regular expression successfully matches -// an empty string. Use FindStringIndex or FindStringSubmatch if it is -// necessary to distinguish these cases. -func (re *Regexp) FindString(s string) string { - a := re.doExecute(newInputString(s), 0, 2) - if a == nil { - return "" - } - return s[a[0]:a[1]] -} - -// FindStringIndex returns a two-element slice of integers defining the -// location of the leftmost match in s of the regular expression. The match -// itself is at s[loc[0]:loc[1]]. -// A return value of nil indicates no match. -func (re *Regexp) FindStringIndex(s string) []int { - a := re.doExecute(newInputString(s), 0, 2) - if a == nil { - return nil - } - return a[0:2] -} - -// FindReaderIndex returns a two-element slice of integers defining the -// location of the leftmost match of the regular expression in text read from -// the RuneReader. The match itself is at s[loc[0]:loc[1]]. A return -// value of nil indicates no match. -func (re *Regexp) FindReaderIndex(r io.RuneReader) []int { - a := re.doExecute(newInputReader(r), 0, 2) - if a == nil { - return nil - } - return a[0:2] -} - -// FindSubmatch returns a slice of slices holding the text of the leftmost -// match of the regular expression in b and the matches, if any, of its -// subexpressions, as defined by the 'Submatch' descriptions in the package -// comment. -// A return value of nil indicates no match. -func (re *Regexp) FindSubmatch(b []byte) [][]byte { - a := re.doExecute(newInputBytes(b), 0, re.prog.NumCap) - if a == nil { - return nil - } - ret := make([][]byte, len(a)/2) - for i := range ret { - if a[2*i] >= 0 { - ret[i] = b[a[2*i]:a[2*i+1]] - } - } - return ret -} - -// FindSubmatchIndex returns a slice holding the index pairs identifying the -// leftmost match of the regular expression in b and the matches, if any, of -// its subexpressions, as defined by the 'Submatch' and 'Index' descriptions -// in the package comment. -// A return value of nil indicates no match. -func (re *Regexp) FindSubmatchIndex(b []byte) []int { - return re.doExecute(newInputBytes(b), 0, re.prog.NumCap) -} - -// FindStringSubmatch returns a slice of strings holding the text of the -// leftmost match of the regular expression in s and the matches, if any, of -// its subexpressions, as defined by the 'Submatch' description in the -// package comment. -// A return value of nil indicates no match. -func (re *Regexp) FindStringSubmatch(s string) []string { - a := re.doExecute(newInputString(s), 0, re.prog.NumCap) - if a == nil { - return nil - } - ret := make([]string, len(a)/2) - for i := range ret { - if a[2*i] >= 0 { - ret[i] = s[a[2*i]:a[2*i+1]] - } - } - return ret -} - -// FindStringSubmatchIndex returns a slice holding the index pairs -// identifying the leftmost match of the regular expression in s and the -// matches, if any, of its subexpressions, as defined by the 'Submatch' and -// 'Index' descriptions in the package comment. -// A return value of nil indicates no match. -func (re *Regexp) FindStringSubmatchIndex(s string) []int { - return re.doExecute(newInputString(s), 0, re.prog.NumCap) -} - -// FindReaderSubmatchIndex returns a slice holding the index pairs -// identifying the leftmost match of the regular expression of text read by -// the RuneReader, and the matches, if any, of its subexpressions, as defined -// by the 'Submatch' and 'Index' descriptions in the package comment. A -// return value of nil indicates no match. -func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int { - return re.doExecute(newInputReader(r), 0, re.prog.NumCap) -} - -const startSize = 10 // The size at which to start a slice in the 'All' routines. - -// FindAll is the 'All' version of Find; it returns a slice of all successive -// matches of the expression, as defined by the 'All' description in the -// package comment. -// A return value of nil indicates no match. -func (re *Regexp) FindAll(b []byte, n int) [][]byte { - if n < 0 { - n = len(b) + 1 - } - result := make([][]byte, 0, startSize) - re.allMatches("", b, n, func(match []int) { - result = append(result, b[match[0]:match[1]]) - }) - if len(result) == 0 { - return nil - } - return result -} - -// FindAllIndex is the 'All' version of FindIndex; it returns a slice of all -// successive matches of the expression, as defined by the 'All' description -// in the package comment. -// A return value of nil indicates no match. -func (re *Regexp) FindAllIndex(b []byte, n int) [][]int { - if n < 0 { - n = len(b) + 1 - } - result := make([][]int, 0, startSize) - re.allMatches("", b, n, func(match []int) { - result = append(result, match[0:2]) - }) - if len(result) == 0 { - return nil - } - return result -} - -// FindAllString is the 'All' version of FindString; it returns a slice of all -// successive matches of the expression, as defined by the 'All' description -// in the package comment. -// A return value of nil indicates no match. -func (re *Regexp) FindAllString(s string, n int) []string { - if n < 0 { - n = len(s) + 1 - } - result := make([]string, 0, startSize) - re.allMatches(s, nil, n, func(match []int) { - result = append(result, s[match[0]:match[1]]) - }) - if len(result) == 0 { - return nil - } - return result -} - -// FindAllStringIndex is the 'All' version of FindStringIndex; it returns a -// slice of all successive matches of the expression, as defined by the 'All' -// description in the package comment. -// A return value of nil indicates no match. -func (re *Regexp) FindAllStringIndex(s string, n int) [][]int { - if n < 0 { - n = len(s) + 1 - } - result := make([][]int, 0, startSize) - re.allMatches(s, nil, n, func(match []int) { - result = append(result, match[0:2]) - }) - if len(result) == 0 { - return nil - } - return result -} - -// FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice -// of all successive matches of the expression, as defined by the 'All' -// description in the package comment. -// A return value of nil indicates no match. -func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte { - if n < 0 { - n = len(b) + 1 - } - result := make([][][]byte, 0, startSize) - re.allMatches("", b, n, func(match []int) { - slice := make([][]byte, len(match)/2) - for j := range slice { - if match[2*j] >= 0 { - slice[j] = b[match[2*j]:match[2*j+1]] - } - } - result = append(result, slice) - }) - if len(result) == 0 { - return nil - } - return result -} - -// FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns -// a slice of all successive matches of the expression, as defined by the -// 'All' description in the package comment. -// A return value of nil indicates no match. -func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int { - if n < 0 { - n = len(b) + 1 - } - result := make([][]int, 0, startSize) - re.allMatches("", b, n, func(match []int) { - result = append(result, match) - }) - if len(result) == 0 { - return nil - } - return result -} - -// FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it -// returns a slice of all successive matches of the expression, as defined by -// the 'All' description in the package comment. -// A return value of nil indicates no match. -func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string { - if n < 0 { - n = len(s) + 1 - } - result := make([][]string, 0, startSize) - re.allMatches(s, nil, n, func(match []int) { - slice := make([]string, len(match)/2) - for j := range slice { - if match[2*j] >= 0 { - slice[j] = s[match[2*j]:match[2*j+1]] - } - } - result = append(result, slice) - }) - if len(result) == 0 { - return nil - } - return result -} - -// FindAllStringSubmatchIndex is the 'All' version of -// FindStringSubmatchIndex; it returns a slice of all successive matches of -// the expression, as defined by the 'All' description in the package -// comment. -// A return value of nil indicates no match. -func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int { - if n < 0 { - n = len(s) + 1 - } - result := make([][]int, 0, startSize) - re.allMatches(s, nil, n, func(match []int) { - result = append(result, match) - }) - if len(result) == 0 { - return nil - } - return result -} diff --git a/libgo/go/exp/regexp/syntax/compile.go b/libgo/go/exp/regexp/syntax/compile.go deleted file mode 100644 index 5ea2425..0000000 --- a/libgo/go/exp/regexp/syntax/compile.go +++ /dev/null @@ -1,269 +0,0 @@ -package syntax - -import ( - "os" - "unicode" -) - -// A patchList is a list of instruction pointers that need to be filled in (patched). -// Because the pointers haven't been filled in yet, we can reuse their storage -// to hold the list. It's kind of sleazy, but works well in practice. -// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration. -// -// These aren't really pointers: they're integers, so we can reinterpret them -// this way without using package unsafe. A value l denotes -// p.inst[l>>1].Out (l&1==0) or .Arg (l&1==1). -// l == 0 denotes the empty list, okay because we start every program -// with a fail instruction, so we'll never want to point at its output link. -type patchList uint32 - -func (l patchList) next(p *Prog) patchList { - i := &p.Inst[l>>1] - if l&1 == 0 { - return patchList(i.Out) - } - return patchList(i.Arg) -} - -func (l patchList) patch(p *Prog, val uint32) { - for l != 0 { - i := &p.Inst[l>>1] - if l&1 == 0 { - l = patchList(i.Out) - i.Out = val - } else { - l = patchList(i.Arg) - i.Arg = val - } - } -} - -func (l1 patchList) append(p *Prog, l2 patchList) patchList { - if l1 == 0 { - return l2 - } - if l2 == 0 { - return l1 - } - - last := l1 - for { - next := last.next(p) - if next == 0 { - break - } - last = next - } - - i := &p.Inst[last>>1] - if last&1 == 0 { - i.Out = uint32(l2) - } else { - i.Arg = uint32(l2) - } - return l1 -} - -// A frag represents a compiled program fragment. -type frag struct { - i uint32 // index of first instruction - out patchList // where to record end instruction -} - -type compiler struct { - p *Prog -} - -// Compile compiles the regexp into a program to be executed. -func Compile(re *Regexp) (*Prog, os.Error) { - var c compiler - c.init() - f := c.compile(re) - f.out.patch(c.p, c.inst(InstMatch).i) - c.p.Start = int(f.i) - return c.p, nil -} - -func (c *compiler) init() { - c.p = new(Prog) - c.p.NumCap = 2 // implicit ( and ) for whole match $0 - c.inst(InstFail) -} - -var anyRuneNotNL = []int{0, '\n' - 1, '\n' - 1, unicode.MaxRune} -var anyRune = []int{0, unicode.MaxRune} - -func (c *compiler) compile(re *Regexp) frag { - switch re.Op { - case OpNoMatch: - return c.fail() - case OpEmptyMatch: - return c.nop() - case OpLiteral: - if len(re.Rune) == 0 { - return c.nop() - } - var f frag - for j := range re.Rune { - f1 := c.rune(re.Rune[j : j+1]) - if j == 0 { - f = f1 - } else { - f = c.cat(f, f1) - } - } - return f - case OpCharClass: - return c.rune(re.Rune) - case OpAnyCharNotNL: - return c.rune(anyRuneNotNL) - case OpAnyChar: - return c.rune(anyRune) - case OpBeginLine: - return c.empty(EmptyBeginLine) - case OpEndLine: - return c.empty(EmptyEndLine) - case OpBeginText: - return c.empty(EmptyBeginText) - case OpEndText: - return c.empty(EmptyEndText) - case OpWordBoundary: - return c.empty(EmptyWordBoundary) - case OpNoWordBoundary: - return c.empty(EmptyNoWordBoundary) - case OpCapture: - bra := c.cap(uint32(re.Cap << 1)) - sub := c.compile(re.Sub[0]) - ket := c.cap(uint32(re.Cap<<1 | 1)) - return c.cat(c.cat(bra, sub), ket) - case OpStar: - return c.star(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0) - case OpPlus: - return c.plus(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0) - case OpQuest: - return c.quest(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0) - case OpConcat: - if len(re.Sub) == 0 { - return c.nop() - } - var f frag - for i, sub := range re.Sub { - if i == 0 { - f = c.compile(sub) - } else { - f = c.cat(f, c.compile(sub)) - } - } - return f - case OpAlternate: - var f frag - for _, sub := range re.Sub { - f = c.alt(f, c.compile(sub)) - } - return f - } - panic("regexp: unhandled case in compile") -} - -func (c *compiler) inst(op InstOp) frag { - // TODO: impose length limit - f := frag{i: uint32(len(c.p.Inst))} - c.p.Inst = append(c.p.Inst, Inst{Op: op}) - return f -} - -func (c *compiler) nop() frag { - f := c.inst(InstNop) - f.out = patchList(f.i << 1) - return f -} - -func (c *compiler) fail() frag { - return frag{} -} - -func (c *compiler) cap(arg uint32) frag { - f := c.inst(InstCapture) - f.out = patchList(f.i << 1) - c.p.Inst[f.i].Arg = arg - - if c.p.NumCap < int(arg)+1 { - c.p.NumCap = int(arg) + 1 - } - return f -} - -func (c *compiler) cat(f1, f2 frag) frag { - // concat of failure is failure - if f1.i == 0 || f2.i == 0 { - return frag{} - } - - // TODO: elide nop - - f1.out.patch(c.p, f2.i) - return frag{f1.i, f2.out} -} - -func (c *compiler) alt(f1, f2 frag) frag { - // alt of failure is other - if f1.i == 0 { - return f2 - } - if f2.i == 0 { - return f1 - } - - f := c.inst(InstAlt) - i := &c.p.Inst[f.i] - i.Out = f1.i - i.Arg = f2.i - f.out = f1.out.append(c.p, f2.out) - return f -} - -func (c *compiler) quest(f1 frag, nongreedy bool) frag { - f := c.inst(InstAlt) - i := &c.p.Inst[f.i] - if nongreedy { - i.Arg = f1.i - f.out = patchList(f.i << 1) - } else { - i.Out = f1.i - f.out = patchList(f.i<<1 | 1) - } - f.out = f.out.append(c.p, f1.out) - return f -} - -func (c *compiler) star(f1 frag, nongreedy bool) frag { - f := c.inst(InstAlt) - i := &c.p.Inst[f.i] - if nongreedy { - i.Arg = f1.i - f.out = patchList(f.i << 1) - } else { - i.Out = f1.i - f.out = patchList(f.i<<1 | 1) - } - f1.out.patch(c.p, f.i) - return f -} - -func (c *compiler) plus(f1 frag, nongreedy bool) frag { - return frag{f1.i, c.star(f1, nongreedy).out} -} - -func (c *compiler) empty(op EmptyOp) frag { - f := c.inst(InstEmptyWidth) - c.p.Inst[f.i].Arg = uint32(op) - f.out = patchList(f.i << 1) - return f -} - -func (c *compiler) rune(rune []int) frag { - f := c.inst(InstRune) - c.p.Inst[f.i].Rune = rune - f.out = patchList(f.i << 1) - return f -} diff --git a/libgo/go/exp/regexp/syntax/parse.go b/libgo/go/exp/regexp/syntax/parse.go deleted file mode 100644 index 4eed182..0000000 --- a/libgo/go/exp/regexp/syntax/parse.go +++ /dev/null @@ -1,1797 +0,0 @@ -// Copyright 2011 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package syntax - -import ( - "os" - "sort" - "strings" - "unicode" - "utf8" -) - -// An Error describes a failure to parse a regular expression -// and gives the offending expression. -type Error struct { - Code ErrorCode - Expr string -} - -func (e *Error) String() string { - return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`" -} - -// An ErrorCode describes a failure to parse a regular expression. -type ErrorCode string - -const ( - // Unexpected error - ErrInternalError ErrorCode = "regexp/syntax: internal error" - - // Parse errors - ErrInvalidCharClass ErrorCode = "invalid character class" - ErrInvalidCharRange ErrorCode = "invalid character class range" - ErrInvalidEscape ErrorCode = "invalid escape sequence" - ErrInvalidNamedCapture ErrorCode = "invalid named capture" - ErrInvalidPerlOp ErrorCode = "invalid or unsupported Perl syntax" - ErrInvalidRepeatOp ErrorCode = "invalid nested repetition operator" - ErrInvalidRepeatSize ErrorCode = "invalid repeat count" - ErrInvalidUTF8 ErrorCode = "invalid UTF-8" - ErrMissingBracket ErrorCode = "missing closing ]" - ErrMissingParen ErrorCode = "missing closing )" - ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator" - ErrTrailingBackslash ErrorCode = "trailing backslash at end of expression" -) - -func (e ErrorCode) String() string { - return string(e) -} - -// Flags control the behavior of the parser and record information about regexp context. -type Flags uint16 - -const ( - FoldCase Flags = 1 << iota // case-insensitive match - Literal // treat pattern as literal string - ClassNL // allow character classes like [^a-z] and [[:space:]] to match newline - DotNL // allow . to match newline - OneLine // treat ^ and $ as only matching at beginning and end of text - NonGreedy // make repetition operators default to non-greedy - PerlX // allow Perl extensions - UnicodeGroups // allow \p{Han}, \P{Han} for Unicode group and negation - WasDollar // regexp OpEndText was $, not \z - Simple // regexp contains no counted repetition - - MatchNL = ClassNL | DotNL - - Perl = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible - POSIX Flags = 0 // POSIX syntax -) - -// Pseudo-ops for parsing stack. -const ( - opLeftParen = opPseudo + iota - opVerticalBar -) - -type parser struct { - flags Flags // parse mode flags - stack []*Regexp // stack of parsed expressions - free *Regexp - numCap int // number of capturing groups seen - wholeRegexp string - tmpClass []int // temporary char class work space -} - -func (p *parser) newRegexp(op Op) *Regexp { - re := p.free - if re != nil { - p.free = re.Sub0[0] - *re = Regexp{} - } else { - re = new(Regexp) - } - re.Op = op - return re -} - -func (p *parser) reuse(re *Regexp) { - re.Sub0[0] = p.free - p.free = re -} - -// Parse stack manipulation. - -// push pushes the regexp re onto the parse stack and returns the regexp. -func (p *parser) push(re *Regexp) *Regexp { - if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] { - // Single rune. - if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) { - return nil - } - re.Op = OpLiteral - re.Rune = re.Rune[:1] - re.Flags = p.flags &^ FoldCase - } else if re.Op == OpCharClass && len(re.Rune) == 4 && - re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] && - unicode.SimpleFold(re.Rune[0]) == re.Rune[2] && - unicode.SimpleFold(re.Rune[2]) == re.Rune[0] || - re.Op == OpCharClass && len(re.Rune) == 2 && - re.Rune[0]+1 == re.Rune[1] && - unicode.SimpleFold(re.Rune[0]) == re.Rune[1] && - unicode.SimpleFold(re.Rune[1]) == re.Rune[0] { - // Case-insensitive rune like [Aa] or [Δδ]. - if p.maybeConcat(re.Rune[0], p.flags|FoldCase) { - return nil - } - - // Rewrite as (case-insensitive) literal. - re.Op = OpLiteral - re.Rune = re.Rune[:1] - re.Flags = p.flags | FoldCase - } else { - // Incremental concatenation. - p.maybeConcat(-1, 0) - } - - p.stack = append(p.stack, re) - return re -} - -// maybeConcat implements incremental concatenation -// of literal runes into string nodes. The parser calls this -// before each push, so only the top fragment of the stack -// might need processing. Since this is called before a push, -// the topmost literal is no longer subject to operators like * -// (Otherwise ab* would turn into (ab)*.) -// If r >= 0 and there's a node left over, maybeConcat uses it -// to push r with the given flags. -// maybeConcat reports whether r was pushed. -func (p *parser) maybeConcat(r int, flags Flags) bool { - n := len(p.stack) - if n < 2 { - return false - } - - re1 := p.stack[n-1] - re2 := p.stack[n-2] - if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase { - return false - } - - // Push re1 into re2. - re2.Rune = append(re2.Rune, re1.Rune...) - - // Reuse re1 if possible. - if r >= 0 { - re1.Rune = re1.Rune0[:1] - re1.Rune[0] = r - re1.Flags = flags - return true - } - - p.stack = p.stack[:n-1] - p.reuse(re1) - return false // did not push r -} - -// newLiteral returns a new OpLiteral Regexp with the given flags -func (p *parser) newLiteral(r int, flags Flags) *Regexp { - re := p.newRegexp(OpLiteral) - re.Flags = flags - re.Rune0[0] = r - re.Rune = re.Rune0[:1] - return re -} - -// literal pushes a literal regexp for the rune r on the stack -// and returns that regexp. -func (p *parser) literal(r int) { - p.push(p.newLiteral(r, p.flags)) -} - -// op pushes a regexp with the given op onto the stack -// and returns that regexp. -func (p *parser) op(op Op) *Regexp { - re := p.newRegexp(op) - re.Flags = p.flags - return p.push(re) -} - -// repeat replaces the top stack element with itself repeated -// according to op. -func (p *parser) repeat(op Op, min, max int, opstr, t, lastRepeat string) (string, os.Error) { - flags := p.flags - if p.flags&PerlX != 0 { - if len(t) > 0 && t[0] == '?' { - t = t[1:] - flags ^= NonGreedy - } - if lastRepeat != "" { - // In Perl it is not allowed to stack repetition operators: - // a** is a syntax error, not a doubled star, and a++ means - // something else entirely, which we don't support! - return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(t)]} - } - } - n := len(p.stack) - if n == 0 { - return "", &Error{ErrMissingRepeatArgument, opstr} - } - sub := p.stack[n-1] - re := p.newRegexp(op) - re.Min = min - re.Max = max - re.Flags = flags - re.Sub = re.Sub0[:1] - re.Sub[0] = sub - p.stack[n-1] = re - return t, nil -} - -// concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation. -func (p *parser) concat() *Regexp { - p.maybeConcat(-1, 0) - - // Scan down to find pseudo-operator | or (. - i := len(p.stack) - for i > 0 && p.stack[i-1].Op < opPseudo { - i-- - } - subs := p.stack[i:] - p.stack = p.stack[:i] - - // Empty concatenation is special case. - if len(subs) == 0 { - return p.push(p.newRegexp(OpEmptyMatch)) - } - - return p.push(p.collapse(subs, OpConcat)) -} - -// alternate replaces the top of the stack (above the topmost '(') with its alternation. -func (p *parser) alternate() *Regexp { - // Scan down to find pseudo-operator (. - // There are no | above (. - i := len(p.stack) - for i > 0 && p.stack[i-1].Op < opPseudo { - i-- - } - subs := p.stack[i:] - p.stack = p.stack[:i] - - // Make sure top class is clean. - // All the others already are (see swapVerticalBar). - if len(subs) > 0 { - cleanAlt(subs[len(subs)-1]) - } - - // Empty alternate is special case - // (shouldn't happen but easy to handle). - if len(subs) == 0 { - return p.push(p.newRegexp(OpNoMatch)) - } - - return p.push(p.collapse(subs, OpAlternate)) -} - -// cleanAlt cleans re for eventual inclusion in an alternation. -func cleanAlt(re *Regexp) { - switch re.Op { - case OpCharClass: - re.Rune = cleanClass(&re.Rune) - if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune { - re.Rune = nil - re.Op = OpAnyChar - return - } - if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune { - re.Rune = nil - re.Op = OpAnyCharNotNL - return - } - if cap(re.Rune)-len(re.Rune) > 100 { - // re.Rune will not grow any more. - // Make a copy or inline to reclaim storage. - re.Rune = append(re.Rune0[:0], re.Rune...) - } - } -} - -// collapse returns the result of applying op to sub. -// If sub contains op nodes, they all get hoisted up -// so that there is never a concat of a concat or an -// alternate of an alternate. -func (p *parser) collapse(subs []*Regexp, op Op) *Regexp { - if len(subs) == 1 { - return subs[0] - } - re := p.newRegexp(op) - re.Sub = re.Sub0[:0] - for _, sub := range subs { - if sub.Op == op { - re.Sub = append(re.Sub, sub.Sub...) - p.reuse(sub) - } else { - re.Sub = append(re.Sub, sub) - } - } - if op == OpAlternate { - re.Sub = p.factor(re.Sub, re.Flags) - if len(re.Sub) == 1 { - old := re - re = re.Sub[0] - p.reuse(old) - } - } - return re -} - -// factor factors common prefixes from the alternation list sub. -// It returns a replacement list that reuses the same storage and -// frees (passes to p.reuse) any removed *Regexps. -// -// For example, -// ABC|ABD|AEF|BCX|BCY -// simplifies by literal prefix extraction to -// A(B(C|D)|EF)|BC(X|Y) -// which simplifies by character class introduction to -// A(B[CD]|EF)|BC[XY] -// -func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp { - if len(sub) < 2 { - return sub - } - - // Round 1: Factor out common literal prefixes. - var str []int - var strflags Flags - start := 0 - out := sub[:0] - for i := 0; i <= len(sub); i++ { - // Invariant: the Regexps that were in sub[0:start] have been - // used or marked for reuse, and the slice space has been reused - // for out (len(out) <= start). - // - // Invariant: sub[start:i] consists of regexps that all begin - // with str as modified by strflags. - var istr []int - var iflags Flags - if i < len(sub) { - istr, iflags = p.leadingString(sub[i]) - if iflags == strflags { - same := 0 - for same < len(str) && same < len(istr) && str[same] == istr[same] { - same++ - } - if same > 0 { - // Matches at least one rune in current range. - // Keep going around. - str = str[:same] - continue - } - } - } - - // Found end of a run with common leading literal string: - // sub[start:i] all begin with str[0:len(str)], but sub[i] - // does not even begin with str[0]. - // - // Factor out common string and append factored expression to out. - if i == start { - // Nothing to do - run of length 0. - } else if i == start+1 { - // Just one: don't bother factoring. - out = append(out, sub[start]) - } else { - // Construct factored form: prefix(suffix1|suffix2|...) - prefix := p.newRegexp(OpLiteral) - prefix.Flags = strflags - prefix.Rune = append(prefix.Rune[:0], str...) - - for j := start; j < i; j++ { - sub[j] = p.removeLeadingString(sub[j], len(str)) - } - suffix := p.collapse(sub[start:i], OpAlternate) // recurse - - re := p.newRegexp(OpConcat) - re.Sub = append(re.Sub[:0], prefix, suffix) - out = append(out, re) - } - - // Prepare for next iteration. - start = i - str = istr - strflags = iflags - } - sub = out - - // Round 2: Factor out common complex prefixes, - // just the first piece of each concatenation, - // whatever it is. This is good enough a lot of the time. - start = 0 - out = sub[:0] - var first *Regexp - for i := 0; i <= len(sub); i++ { - // Invariant: the Regexps that were in sub[0:start] have been - // used or marked for reuse, and the slice space has been reused - // for out (len(out) <= start). - // - // Invariant: sub[start:i] consists of regexps that all begin - // with str as modified by strflags. - var ifirst *Regexp - if i < len(sub) { - ifirst = p.leadingRegexp(sub[i]) - if first != nil && first.Equal(ifirst) { - continue - } - } - - // Found end of a run with common leading regexp: - // sub[start:i] all begin with first but sub[i] does not. - // - // Factor out common regexp and append factored expression to out. - if i == start { - // Nothing to do - run of length 0. - } else if i == start+1 { - // Just one: don't bother factoring. - out = append(out, sub[start]) - } else { - // Construct factored form: prefix(suffix1|suffix2|...) - prefix := first - - for j := start; j < i; j++ { - reuse := j != start // prefix came from sub[start] - sub[j] = p.removeLeadingRegexp(sub[j], reuse) - } - suffix := p.collapse(sub[start:i], OpAlternate) // recurse - - re := p.newRegexp(OpConcat) - re.Sub = append(re.Sub[:0], prefix, suffix) - out = append(out, re) - } - - // Prepare for next iteration. - start = i - first = ifirst - } - sub = out - - // Round 3: Collapse runs of single literals into character classes. - start = 0 - out = sub[:0] - for i := 0; i <= len(sub); i++ { - // Invariant: the Regexps that were in sub[0:start] have been - // used or marked for reuse, and the slice space has been reused - // for out (len(out) <= start). - // - // Invariant: sub[start:i] consists of regexps that are either - // literal runes or character classes. - if i < len(sub) && isCharClass(sub[i]) { - continue - } - - // sub[i] is not a char or char class; - // emit char class for sub[start:i]... - if i == start { - // Nothing to do - run of length 0. - } else if i == start+1 { - out = append(out, sub[start]) - } else { - // Make new char class. - // Start with most complex regexp in sub[start]. - max := start - for j := start + 1; j < i; j++ { - if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) { - max = j - } - } - sub[start], sub[max] = sub[max], sub[start] - - for j := start + 1; j < i; j++ { - mergeCharClass(sub[start], sub[j]) - p.reuse(sub[j]) - } - cleanAlt(sub[start]) - out = append(out, sub[start]) - } - - // ... and then emit sub[i]. - if i < len(sub) { - out = append(out, sub[i]) - } - start = i + 1 - } - sub = out - - // Round 4: Collapse runs of empty matches into a single empty match. - start = 0 - out = sub[:0] - for i := range sub { - if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch { - continue - } - out = append(out, sub[i]) - } - sub = out - - return sub -} - -// leadingString returns the leading literal string that re begins with. -// The string refers to storage in re or its children. -func (p *parser) leadingString(re *Regexp) ([]int, Flags) { - if re.Op == OpConcat && len(re.Sub) > 0 { - re = re.Sub[0] - } - if re.Op != OpLiteral { - return nil, 0 - } - return re.Rune, re.Flags & FoldCase -} - -// removeLeadingString removes the first n leading runes -// from the beginning of re. It returns the replacement for re. -func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp { - if re.Op == OpConcat && len(re.Sub) > 0 { - // Removing a leading string in a concatenation - // might simplify the concatenation. - sub := re.Sub[0] - sub = p.removeLeadingString(sub, n) - re.Sub[0] = sub - if sub.Op == OpEmptyMatch { - p.reuse(sub) - switch len(re.Sub) { - case 0, 1: - // Impossible but handle. - re.Op = OpEmptyMatch - re.Sub = nil - case 2: - old := re - re = re.Sub[1] - p.reuse(old) - default: - copy(re.Sub, re.Sub[1:]) - re.Sub = re.Sub[:len(re.Sub)-1] - } - } - return re - } - - if re.Op == OpLiteral { - re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])] - if len(re.Rune) == 0 { - re.Op = OpEmptyMatch - } - } - return re -} - -// leadingRegexp returns the leading regexp that re begins with. -// The regexp refers to storage in re or its children. -func (p *parser) leadingRegexp(re *Regexp) *Regexp { - if re.Op == OpEmptyMatch { - return nil - } - if re.Op == OpConcat && len(re.Sub) > 0 { - sub := re.Sub[0] - if sub.Op == OpEmptyMatch { - return nil - } - return sub - } - return re -} - -// removeLeadingRegexp removes the leading regexp in re. -// It returns the replacement for re. -// If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse. -func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp { - if re.Op == OpConcat && len(re.Sub) > 0 { - if reuse { - p.reuse(re.Sub[0]) - } - re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])] - switch len(re.Sub) { - case 0: - re.Op = OpEmptyMatch - re.Sub = nil - case 1: - old := re - re = re.Sub[0] - p.reuse(old) - } - return re - } - re.Op = OpEmptyMatch - return re -} - -func literalRegexp(s string, flags Flags) *Regexp { - re := &Regexp{Op: OpLiteral} - re.Flags = flags - re.Rune = re.Rune0[:0] // use local storage for small strings - for _, c := range s { - if len(re.Rune) >= cap(re.Rune) { - // string is too long to fit in Rune0. let Go handle it - re.Rune = []int(s) - break - } - re.Rune = append(re.Rune, c) - } - return re -} - -// Parsing. - -func Parse(s string, flags Flags) (*Regexp, os.Error) { - if flags&Literal != 0 { - // Trivial parser for literal string. - if err := checkUTF8(s); err != nil { - return nil, err - } - return literalRegexp(s, flags), nil - } - - // Otherwise, must do real work. - var ( - p parser - err os.Error - c int - op Op - lastRepeat string - min, max int - ) - p.flags = flags - p.wholeRegexp = s - t := s - for t != "" { - repeat := "" - BigSwitch: - switch t[0] { - default: - if c, t, err = nextRune(t); err != nil { - return nil, err - } - p.literal(c) - - case '(': - if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' { - // Flag changes and non-capturing groups. - if t, err = p.parsePerlFlags(t); err != nil { - return nil, err - } - break - } - p.numCap++ - p.op(opLeftParen).Cap = p.numCap - t = t[1:] - case '|': - if err = p.parseVerticalBar(); err != nil { - return nil, err - } - t = t[1:] - case ')': - if err = p.parseRightParen(); err != nil { - return nil, err - } - t = t[1:] - case '^': - if p.flags&OneLine != 0 { - p.op(OpBeginText) - } else { - p.op(OpBeginLine) - } - t = t[1:] - case '$': - if p.flags&OneLine != 0 { - p.op(OpEndText).Flags |= WasDollar - } else { - p.op(OpEndLine) - } - t = t[1:] - case '.': - if p.flags&DotNL != 0 { - p.op(OpAnyChar) - } else { - p.op(OpAnyCharNotNL) - } - t = t[1:] - case '[': - if t, err = p.parseClass(t); err != nil { - return nil, err - } - case '*', '+', '?': - switch t[0] { - case '*': - op = OpStar - case '+': - op = OpPlus - case '?': - op = OpQuest - } - if t, err = p.repeat(op, min, max, t[:1], t[1:], lastRepeat); err != nil { - return nil, err - } - case '{': - op = OpRepeat - min, max, tt, ok := p.parseRepeat(t) - if !ok { - // If the repeat cannot be parsed, { is a literal. - p.literal('{') - t = t[1:] - break - } - if t, err = p.repeat(op, min, max, t[:len(t)-len(tt)], tt, lastRepeat); err != nil { - return nil, err - } - case '\\': - if p.flags&PerlX != 0 && len(t) >= 2 { - switch t[1] { - case 'A': - p.op(OpBeginText) - t = t[2:] - break BigSwitch - case 'b': - p.op(OpWordBoundary) - t = t[2:] - break BigSwitch - case 'B': - p.op(OpNoWordBoundary) - t = t[2:] - break BigSwitch - case 'C': - // any byte; not supported - return nil, &Error{ErrInvalidEscape, t[:2]} - case 'Q': - // \Q ... \E: the ... is always literals - var lit string - if i := strings.Index(t, `\E`); i < 0 { - lit = t[2:] - t = "" - } else { - lit = t[2:i] - t = t[i+2:] - } - p.push(literalRegexp(lit, p.flags)) - break BigSwitch - case 'z': - p.op(OpEndText) - t = t[2:] - break BigSwitch - } - } - - re := p.newRegexp(OpCharClass) - re.Flags = p.flags - - // Look for Unicode character group like \p{Han} - if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') { - r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0]) - if err != nil { - return nil, err - } - if r != nil { - re.Rune = r - t = rest - p.push(re) - break BigSwitch - } - } - - // Perl character class escape. - if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil { - re.Rune = r - t = rest - p.push(re) - break BigSwitch - } - p.reuse(re) - - // Ordinary single-character escape. - if c, t, err = p.parseEscape(t); err != nil { - return nil, err - } - p.literal(c) - } - lastRepeat = repeat - } - - p.concat() - if p.swapVerticalBar() { - // pop vertical bar - p.stack = p.stack[:len(p.stack)-1] - } - p.alternate() - - n := len(p.stack) - if n != 1 { - return nil, &Error{ErrMissingParen, s} - } - return p.stack[0], nil -} - -// parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}. -// If s is not of that form, it returns ok == false. -func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) { - if s == "" || s[0] != '{' { - return - } - s = s[1:] - if min, s, ok = p.parseInt(s); !ok { - return - } - if s == "" { - return - } - if s[0] != ',' { - max = min - } else { - s = s[1:] - if s == "" { - return - } - if s[0] == '}' { - max = -1 - } else if max, s, ok = p.parseInt(s); !ok { - return - } - } - if s == "" || s[0] != '}' { - return - } - rest = s[1:] - ok = true - return -} - -// parsePerlFlags parses a Perl flag setting or non-capturing group or both, -// like (?i) or (?: or (?i:. It removes the prefix from s and updates the parse state. -// The caller must have ensured that s begins with "(?". -func (p *parser) parsePerlFlags(s string) (rest string, err os.Error) { - t := s - - // Check for named captures, first introduced in Python's regexp library. - // As usual, there are three slightly different syntaxes: - // - // (?P<name>expr) the original, introduced by Python - // (?<name>expr) the .NET alteration, adopted by Perl 5.10 - // (?'name'expr) another .NET alteration, adopted by Perl 5.10 - // - // Perl 5.10 gave in and implemented the Python version too, - // but they claim that the last two are the preferred forms. - // PCRE and languages based on it (specifically, PHP and Ruby) - // support all three as well. EcmaScript 4 uses only the Python form. - // - // In both the open source world (via Code Search) and the - // Google source tree, (?P<expr>name) is the dominant form, - // so that's the one we implement. One is enough. - if len(t) > 4 && t[2] == 'P' && t[3] == '<' { - // Pull out name. - end := strings.IndexRune(t, '>') - if end < 0 { - if err = checkUTF8(t); err != nil { - return "", err - } - return "", &Error{ErrInvalidNamedCapture, s} - } - - capture := t[:end+1] // "(?P<name>" - name := t[4:end] // "name" - if err = checkUTF8(name); err != nil { - return "", err - } - if !isValidCaptureName(name) { - return "", &Error{ErrInvalidNamedCapture, capture} - } - - // Like ordinary capture, but named. - p.numCap++ - re := p.op(opLeftParen) - re.Cap = p.numCap - re.Name = name - return t[end+1:], nil - } - - // Non-capturing group. Might also twiddle Perl flags. - var c int - t = t[2:] // skip (? - flags := p.flags - sign := +1 - sawFlag := false -Loop: - for t != "" { - if c, t, err = nextRune(t); err != nil { - return "", err - } - switch c { - default: - break Loop - - // Flags. - case 'i': - flags |= FoldCase - sawFlag = true - case 'm': - flags &^= OneLine - sawFlag = true - case 's': - flags |= DotNL - sawFlag = true - case 'U': - flags |= NonGreedy - sawFlag = true - - // Switch to negation. - case '-': - if sign < 0 { - break Loop - } - sign = -1 - // Invert flags so that | above turn into &^ and vice versa. - // We'll invert flags again before using it below. - flags = ^flags - sawFlag = false - - // End of flags, starting group or not. - case ':', ')': - if sign < 0 { - if !sawFlag { - break Loop - } - flags = ^flags - } - if c == ':' { - // Open new group - p.op(opLeftParen) - } - p.flags = flags - return t, nil - } - } - - return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]} -} - -// isValidCaptureName reports whether name -// is a valid capture name: [A-Za-z0-9_]+. -// PCRE limits names to 32 bytes. -// Python rejects names starting with digits. -// We don't enforce either of those. -func isValidCaptureName(name string) bool { - if name == "" { - return false - } - for _, c := range name { - if c != '_' && !isalnum(c) { - return false - } - } - return true -} - -// parseInt parses a decimal integer. -func (p *parser) parseInt(s string) (n int, rest string, ok bool) { - if s == "" || s[0] < '0' || '9' < s[0] { - return - } - // Disallow leading zeros. - if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' { - return - } - for s != "" && '0' <= s[0] && s[0] <= '9' { - // Avoid overflow. - if n >= 1e8 { - return - } - n = n*10 + int(s[0]) - '0' - s = s[1:] - } - rest = s - ok = true - return -} - -// can this be represented as a character class? -// single-rune literal string, char class, ., and .|\n. -func isCharClass(re *Regexp) bool { - return re.Op == OpLiteral && len(re.Rune) == 1 || - re.Op == OpCharClass || - re.Op == OpAnyCharNotNL || - re.Op == OpAnyChar -} - -// does re match r? -func matchRune(re *Regexp, r int) bool { - switch re.Op { - case OpLiteral: - return len(re.Rune) == 1 && re.Rune[0] == r - case OpCharClass: - for i := 0; i < len(re.Rune); i += 2 { - if re.Rune[i] <= r && r <= re.Rune[i+1] { - return true - } - } - return false - case OpAnyCharNotNL: - return r != '\n' - case OpAnyChar: - return true - } - return false -} - -// parseVerticalBar handles a | in the input. -func (p *parser) parseVerticalBar() os.Error { - p.concat() - - // The concatenation we just parsed is on top of the stack. - // If it sits above an opVerticalBar, swap it below - // (things below an opVerticalBar become an alternation). - // Otherwise, push a new vertical bar. - if !p.swapVerticalBar() { - p.op(opVerticalBar) - } - - return nil -} - -// mergeCharClass makes dst = dst|src. -// The caller must ensure that dst.Op >= src.Op, -// to reduce the amount of copying. -func mergeCharClass(dst, src *Regexp) { - switch dst.Op { - case OpAnyChar: - // src doesn't add anything. - case OpAnyCharNotNL: - // src might add \n - if matchRune(src, '\n') { - dst.Op = OpAnyChar - } - case OpCharClass: - // src is simpler, so either literal or char class - if src.Op == OpLiteral { - dst.Rune = appendRange(dst.Rune, src.Rune[0], src.Rune[0]) - } else { - dst.Rune = appendClass(dst.Rune, src.Rune) - } - case OpLiteral: - // both literal - if src.Rune[0] == dst.Rune[0] { - break - } - dst.Op = OpCharClass - dst.Rune = append(dst.Rune, dst.Rune[0]) - dst.Rune = appendRange(dst.Rune, src.Rune[0], src.Rune[0]) - } -} - -// If the top of the stack is an element followed by an opVerticalBar -// swapVerticalBar swaps the two and returns true. -// Otherwise it returns false. -func (p *parser) swapVerticalBar() bool { - // If above and below vertical bar are literal or char class, - // can merge into a single char class. - n := len(p.stack) - if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) { - re1 := p.stack[n-1] - re3 := p.stack[n-3] - // Make re3 the more complex of the two. - if re1.Op > re3.Op { - re1, re3 = re3, re1 - p.stack[n-3] = re3 - } - mergeCharClass(re3, re1) - p.reuse(re1) - p.stack = p.stack[:n-1] - return true - } - - if n >= 2 { - re1 := p.stack[n-1] - re2 := p.stack[n-2] - if re2.Op == opVerticalBar { - if n >= 3 { - // Now out of reach. - // Clean opportunistically. - cleanAlt(p.stack[n-3]) - } - p.stack[n-2] = re1 - p.stack[n-1] = re2 - return true - } - } - return false -} - -// parseRightParen handles a ) in the input. -func (p *parser) parseRightParen() os.Error { - p.concat() - if p.swapVerticalBar() { - // pop vertical bar - p.stack = p.stack[:len(p.stack)-1] - } - p.alternate() - - n := len(p.stack) - if n < 2 { - return &Error{ErrInternalError, ""} - } - re1 := p.stack[n-1] - re2 := p.stack[n-2] - p.stack = p.stack[:n-2] - if re2.Op != opLeftParen { - return &Error{ErrMissingParen, p.wholeRegexp} - } - if re2.Cap == 0 { - // Just for grouping. - p.push(re1) - } else { - re2.Op = OpCapture - re2.Sub = re2.Sub0[:1] - re2.Sub[0] = re1 - p.push(re2) - } - return nil -} - -// parseEscape parses an escape sequence at the beginning of s -// and returns the rune. -func (p *parser) parseEscape(s string) (r int, rest string, err os.Error) { - t := s[1:] - if t == "" { - return 0, "", &Error{ErrTrailingBackslash, ""} - } - c, t, err := nextRune(t) - if err != nil { - return 0, "", err - } - -Switch: - switch c { - default: - if c < utf8.RuneSelf && !isalnum(c) { - // Escaped non-word characters are always themselves. - // PCRE is not quite so rigorous: it accepts things like - // \q, but we don't. We once rejected \_, but too many - // programs and people insist on using it, so allow \_. - return c, t, nil - } - - // Octal escapes. - case '1', '2', '3', '4', '5', '6', '7': - // Single non-zero digit is a backreference; not supported - if t == "" || t[0] < '0' || t[0] > '7' { - break - } - fallthrough - case '0': - // Consume up to three octal digits; already have one. - r = c - '0' - for i := 1; i < 3; i++ { - if t == "" || t[0] < '0' || t[0] > '7' { - break - } - r = r*8 + int(t[0]) - '0' - t = t[1:] - } - return r, t, nil - - // Hexadecimal escapes. - case 'x': - if t == "" { - break - } - if c, t, err = nextRune(t); err != nil { - return 0, "", err - } - if c == '{' { - // Any number of digits in braces. - // Perl accepts any text at all; it ignores all text - // after the first non-hex digit. We require only hex digits, - // and at least one. - nhex := 0 - r = 0 - for { - if t == "" { - break Switch - } - if c, t, err = nextRune(t); err != nil { - return 0, "", err - } - if c == '}' { - break - } - v := unhex(c) - if v < 0 { - break Switch - } - r = r*16 + v - if r > unicode.MaxRune { - break Switch - } - nhex++ - } - if nhex == 0 { - break Switch - } - return r, t, nil - } - - // Easy case: two hex digits. - x := unhex(c) - if c, t, err = nextRune(t); err != nil { - return 0, "", err - } - y := unhex(c) - if x < 0 || y < 0 { - break - } - return x*16 + y, t, nil - - // C escapes. There is no case 'b', to avoid misparsing - // the Perl word-boundary \b as the C backspace \b - // when in POSIX mode. In Perl, /\b/ means word-boundary - // but /[\b]/ means backspace. We don't support that. - // If you want a backspace, embed a literal backspace - // character or use \x08. - case 'a': - return '\a', t, err - case 'f': - return '\f', t, err - case 'n': - return '\n', t, err - case 'r': - return '\r', t, err - case 't': - return '\t', t, err - case 'v': - return '\v', t, err - } - return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]} -} - -// parseClassChar parses a character class character at the beginning of s -// and returns it. -func (p *parser) parseClassChar(s, wholeClass string) (r int, rest string, err os.Error) { - if s == "" { - return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass} - } - - // Allow regular escape sequences even though - // many need not be escaped in this context. - if s[0] == '\\' { - return p.parseEscape(s) - } - - return nextRune(s) -} - -type charGroup struct { - sign int - class []int -} - -// parsePerlClassEscape parses a leading Perl character class escape like \d -// from the beginning of s. If one is present, it appends the characters to r -// and returns the new slice r and the remainder of the string. -func (p *parser) parsePerlClassEscape(s string, r []int) (out []int, rest string) { - if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' { - return - } - g := perlGroup[s[0:2]] - if g.sign == 0 { - return - } - return p.appendGroup(r, g), s[2:] -} - -// parseNamedClass parses a leading POSIX named character class like [:alnum:] -// from the beginning of s. If one is present, it appends the characters to r -// and returns the new slice r and the remainder of the string. -func (p *parser) parseNamedClass(s string, r []int) (out []int, rest string, err os.Error) { - if len(s) < 2 || s[0] != '[' || s[1] != ':' { - return - } - - i := strings.Index(s[2:], ":]") - if i < 0 { - return - } - i += 2 - name, s := s[0:i+2], s[i+2:] - g := posixGroup[name] - if g.sign == 0 { - return nil, "", &Error{ErrInvalidCharRange, name} - } - return p.appendGroup(r, g), s, nil -} - -func (p *parser) appendGroup(r []int, g charGroup) []int { - if p.flags&FoldCase == 0 { - if g.sign < 0 { - r = appendNegatedClass(r, g.class) - } else { - r = appendClass(r, g.class) - } - } else { - tmp := p.tmpClass[:0] - tmp = appendFoldedClass(tmp, g.class) - p.tmpClass = tmp - tmp = cleanClass(&p.tmpClass) - if g.sign < 0 { - r = appendNegatedClass(r, tmp) - } else { - r = appendClass(r, tmp) - } - } - return r -} - -// unicodeTable returns the unicode.RangeTable identified by name -// and the table of additional fold-equivalent code points. -func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) { - if t := unicode.Categories[name]; t != nil { - return t, unicode.FoldCategory[name] - } - if t := unicode.Scripts[name]; t != nil { - return t, unicode.FoldScript[name] - } - return nil, nil -} - -// parseUnicodeClass parses a leading Unicode character class like \p{Han} -// from the beginning of s. If one is present, it appends the characters to r -// and returns the new slice r and the remainder of the string. -func (p *parser) parseUnicodeClass(s string, r []int) (out []int, rest string, err os.Error) { - if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' { - return - } - - // Committed to parse or return error. - sign := +1 - if s[1] == 'P' { - sign = -1 - } - t := s[2:] - c, t, err := nextRune(t) - if err != nil { - return - } - var seq, name string - if c != '{' { - // Single-letter name. - seq = s[:len(s)-len(t)] - name = seq[2:] - } else { - // Name is in braces. - end := strings.IndexRune(s, '}') - if end < 0 { - if err = checkUTF8(s); err != nil { - return - } - return nil, "", &Error{ErrInvalidCharRange, s} - } - seq, t = s[:end+1], s[end+1:] - name = s[3:end] - if err = checkUTF8(name); err != nil { - return - } - } - - // Group can have leading negation too. \p{^Han} == \P{Han}, \P{^Han} == \p{Han}. - if name != "" && name[0] == '^' { - sign = -sign - name = name[1:] - } - - tab, fold := unicodeTable(name) - if tab == nil { - return nil, "", &Error{ErrInvalidCharRange, seq} - } - - if p.flags&FoldCase == 0 || fold == nil { - if sign > 0 { - r = appendTable(r, tab) - } else { - r = appendNegatedTable(r, tab) - } - } else { - // Merge and clean tab and fold in a temporary buffer. - // This is necessary for the negative case and just tidy - // for the positive case. - tmp := p.tmpClass[:0] - tmp = appendTable(tmp, tab) - tmp = appendTable(tmp, fold) - p.tmpClass = tmp - tmp = cleanClass(&p.tmpClass) - if sign > 0 { - r = appendClass(r, tmp) - } else { - r = appendNegatedClass(r, tmp) - } - } - return r, t, nil -} - -// parseClass parses a character class at the beginning of s -// and pushes it onto the parse stack. -func (p *parser) parseClass(s string) (rest string, err os.Error) { - t := s[1:] // chop [ - re := p.newRegexp(OpCharClass) - re.Flags = p.flags - re.Rune = re.Rune0[:0] - - sign := +1 - if t != "" && t[0] == '^' { - sign = -1 - t = t[1:] - - // If character class does not match \n, add it here, - // so that negation later will do the right thing. - if p.flags&ClassNL == 0 { - re.Rune = append(re.Rune, '\n', '\n') - } - } - - class := re.Rune - first := true // ] and - are okay as first char in class - for t == "" || t[0] != ']' || first { - // POSIX: - is only okay unescaped as first or last in class. - // Perl: - is okay anywhere. - if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') { - _, size := utf8.DecodeRuneInString(t[1:]) - return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]} - } - first = false - - // Look for POSIX [:alnum:] etc. - if len(t) > 2 && t[0] == '[' && t[1] == ':' { - nclass, nt, err := p.parseNamedClass(t, class) - if err != nil { - return "", err - } - if nclass != nil { - class, t = nclass, nt - continue - } - } - - // Look for Unicode character group like \p{Han}. - nclass, nt, err := p.parseUnicodeClass(t, class) - if err != nil { - return "", err - } - if nclass != nil { - class, t = nclass, nt - continue - } - - // Look for Perl character class symbols (extension). - if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil { - class, t = nclass, nt - continue - } - - // Single character or simple range. - rng := t - var lo, hi int - if lo, t, err = p.parseClassChar(t, s); err != nil { - return "", err - } - hi = lo - // [a-] means (a|-) so check for final ]. - if len(t) >= 2 && t[0] == '-' && t[1] != ']' { - t = t[1:] - if hi, t, err = p.parseClassChar(t, s); err != nil { - return "", err - } - if hi < lo { - rng = rng[:len(rng)-len(t)] - return "", &Error{Code: ErrInvalidCharRange, Expr: rng} - } - } - if p.flags&FoldCase == 0 { - class = appendRange(class, lo, hi) - } else { - class = appendFoldedRange(class, lo, hi) - } - } - t = t[1:] // chop ] - - // Use &re.Rune instead of &class to avoid allocation. - re.Rune = class - class = cleanClass(&re.Rune) - if sign < 0 { - class = negateClass(class) - } - re.Rune = class - p.push(re) - return t, nil -} - -// cleanClass sorts the ranges (pairs of elements of r), -// merges them, and eliminates duplicates. -func cleanClass(rp *[]int) []int { - - // Sort by lo increasing, hi decreasing to break ties. - sort.Sort(ranges{rp}) - - r := *rp - if len(r) < 2 { - return r - } - - // Merge abutting, overlapping. - w := 2 // write index - for i := 2; i < len(r); i += 2 { - lo, hi := r[i], r[i+1] - if lo <= r[w-1]+1 { - // merge with previous range - if hi > r[w-1] { - r[w-1] = hi - } - continue - } - // new disjoint range - r[w] = lo - r[w+1] = hi - w += 2 - } - - return r[:w] -} - -// appendRange returns the result of appending the range lo-hi to the class r. -func appendRange(r []int, lo, hi int) []int { - // Expand last range or next to last range if it overlaps or abuts. - // Checking two ranges helps when appending case-folded - // alphabets, so that one range can be expanding A-Z and the - // other expanding a-z. - n := len(r) - for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4 - if n >= i { - rlo, rhi := r[n-i], r[n-i+1] - if lo <= rhi+1 && rlo <= hi+1 { - if lo < rlo { - r[n-i] = lo - } - if hi > rhi { - r[n-i+1] = hi - } - return r - } - } - } - - return append(r, lo, hi) -} - -const ( - // minimum and maximum runes involved in folding. - // checked during test. - minFold = 0x0041 - maxFold = 0x1044f -) - -// appendFoldedRange returns the result of appending the range lo-hi -// and its case folding-equivalent runes to the class r. -func appendFoldedRange(r []int, lo, hi int) []int { - // Optimizations. - if lo <= minFold && hi >= maxFold { - // Range is full: folding can't add more. - return appendRange(r, lo, hi) - } - if hi < minFold || lo > maxFold { - // Range is outside folding possibilities. - return appendRange(r, lo, hi) - } - if lo < minFold { - // [lo, minFold-1] needs no folding. - r = appendRange(r, lo, minFold-1) - lo = minFold - } - if hi > maxFold { - // [maxFold+1, hi] needs no folding. - r = appendRange(r, maxFold+1, hi) - hi = maxFold - } - - // Brute force. Depend on appendRange to coalesce ranges on the fly. - for c := lo; c <= hi; c++ { - r = appendRange(r, c, c) - f := unicode.SimpleFold(c) - for f != c { - r = appendRange(r, f, f) - f = unicode.SimpleFold(f) - } - } - return r -} - -// appendClass returns the result of appending the class x to the class r. -// It assume x is clean. -func appendClass(r []int, x []int) []int { - for i := 0; i < len(x); i += 2 { - r = appendRange(r, x[i], x[i+1]) - } - return r -} - -// appendFolded returns the result of appending the case folding of the class x to the class r. -func appendFoldedClass(r []int, x []int) []int { - for i := 0; i < len(x); i += 2 { - r = appendFoldedRange(r, x[i], x[i+1]) - } - return r -} - -// appendNegatedClass returns the result of appending the negation of the class x to the class r. -// It assumes x is clean. -func appendNegatedClass(r []int, x []int) []int { - nextLo := 0 - for i := 0; i < len(x); i += 2 { - lo, hi := x[i], x[i+1] - if nextLo <= lo-1 { - r = appendRange(r, nextLo, lo-1) - } - nextLo = hi + 1 - } - if nextLo <= unicode.MaxRune { - r = appendRange(r, nextLo, unicode.MaxRune) - } - return r -} - -// appendTable returns the result of appending x to the class r. -func appendTable(r []int, x *unicode.RangeTable) []int { - for _, xr := range x.R16 { - lo, hi, stride := int(xr.Lo), int(xr.Hi), int(xr.Stride) - if stride == 1 { - r = appendRange(r, lo, hi) - continue - } - for c := lo; c <= hi; c += stride { - r = appendRange(r, c, c) - } - } - for _, xr := range x.R32 { - lo, hi, stride := int(xr.Lo), int(xr.Hi), int(xr.Stride) - if stride == 1 { - r = appendRange(r, lo, hi) - continue - } - for c := lo; c <= hi; c += stride { - r = appendRange(r, c, c) - } - } - return r -} - -// appendNegatedTable returns the result of appending the negation of x to the class r. -func appendNegatedTable(r []int, x *unicode.RangeTable) []int { - nextLo := 0 // lo end of next class to add - for _, xr := range x.R16 { - lo, hi, stride := int(xr.Lo), int(xr.Hi), int(xr.Stride) - if stride == 1 { - if nextLo <= lo-1 { - r = appendRange(r, nextLo, lo-1) - } - nextLo = hi + 1 - continue - } - for c := lo; c <= hi; c += stride { - if nextLo <= c-1 { - r = appendRange(r, nextLo, c-1) - } - nextLo = c + 1 - } - } - for _, xr := range x.R32 { - lo, hi, stride := int(xr.Lo), int(xr.Hi), int(xr.Stride) - if stride == 1 { - if nextLo <= lo-1 { - r = appendRange(r, nextLo, lo-1) - } - nextLo = hi + 1 - continue - } - for c := lo; c <= hi; c += stride { - if nextLo <= c-1 { - r = appendRange(r, nextLo, c-1) - } - nextLo = c + 1 - } - } - if nextLo <= unicode.MaxRune { - r = appendRange(r, nextLo, unicode.MaxRune) - } - return r -} - -// negateClass overwrites r and returns r's negation. -// It assumes the class r is already clean. -func negateClass(r []int) []int { - nextLo := 0 // lo end of next class to add - w := 0 // write index - for i := 0; i < len(r); i += 2 { - lo, hi := r[i], r[i+1] - if nextLo <= lo-1 { - r[w] = nextLo - r[w+1] = lo - 1 - w += 2 - } - nextLo = hi + 1 - } - r = r[:w] - if nextLo <= unicode.MaxRune { - // It's possible for the negation to have one more - // range - this one - than the original class, so use append. - r = append(r, nextLo, unicode.MaxRune) - } - return r -} - -// ranges implements sort.Interface on a []rune. -// The choice of receiver type definition is strange -// but avoids an allocation since we already have -// a *[]int. -type ranges struct { - p *[]int -} - -func (ra ranges) Less(i, j int) bool { - p := *ra.p - i *= 2 - j *= 2 - return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1] -} - -func (ra ranges) Len() int { - return len(*ra.p) / 2 -} - -func (ra ranges) Swap(i, j int) { - p := *ra.p - i *= 2 - j *= 2 - p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1] -} - -func checkUTF8(s string) os.Error { - for s != "" { - rune, size := utf8.DecodeRuneInString(s) - if rune == utf8.RuneError && size == 1 { - return &Error{Code: ErrInvalidUTF8, Expr: s} - } - s = s[size:] - } - return nil -} - -func nextRune(s string) (c int, t string, err os.Error) { - c, size := utf8.DecodeRuneInString(s) - if c == utf8.RuneError && size == 1 { - return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s} - } - return c, s[size:], nil -} - -func isalnum(c int) bool { - return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' -} - -func unhex(c int) int { - if '0' <= c && c <= '9' { - return c - '0' - } - if 'a' <= c && c <= 'f' { - return c - 'a' + 10 - } - if 'A' <= c && c <= 'F' { - return c - 'A' + 10 - } - return -1 -} diff --git a/libgo/go/exp/regexp/syntax/parse_test.go b/libgo/go/exp/regexp/syntax/parse_test.go deleted file mode 100644 index 779b9af..0000000 --- a/libgo/go/exp/regexp/syntax/parse_test.go +++ /dev/null @@ -1,350 +0,0 @@ -// Copyright 2011 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package syntax - -import ( - "bytes" - "fmt" - "testing" - "unicode" -) - -var parseTests = []struct { - Regexp string - Dump string -}{ - // Base cases - {`a`, `lit{a}`}, - {`a.`, `cat{lit{a}dot{}}`}, - {`a.b`, `cat{lit{a}dot{}lit{b}}`}, - {`ab`, `str{ab}`}, - {`a.b.c`, `cat{lit{a}dot{}lit{b}dot{}lit{c}}`}, - {`abc`, `str{abc}`}, - {`a|^`, `alt{lit{a}bol{}}`}, - {`a|b`, `cc{0x61-0x62}`}, - {`(a)`, `cap{lit{a}}`}, - {`(a)|b`, `alt{cap{lit{a}}lit{b}}`}, - {`a*`, `star{lit{a}}`}, - {`a+`, `plus{lit{a}}`}, - {`a?`, `que{lit{a}}`}, - {`a{2}`, `rep{2,2 lit{a}}`}, - {`a{2,3}`, `rep{2,3 lit{a}}`}, - {`a{2,}`, `rep{2,-1 lit{a}}`}, - {`a*?`, `nstar{lit{a}}`}, - {`a+?`, `nplus{lit{a}}`}, - {`a??`, `nque{lit{a}}`}, - {`a{2}?`, `nrep{2,2 lit{a}}`}, - {`a{2,3}?`, `nrep{2,3 lit{a}}`}, - {`a{2,}?`, `nrep{2,-1 lit{a}}`}, - {``, `emp{}`}, - {`|`, `emp{}`}, // alt{emp{}emp{}} but got factored - {`|x|`, `alt{emp{}lit{x}emp{}}`}, - {`.`, `dot{}`}, - {`^`, `bol{}`}, - {`$`, `eol{}`}, - {`\|`, `lit{|}`}, - {`\(`, `lit{(}`}, - {`\)`, `lit{)}`}, - {`\*`, `lit{*}`}, - {`\+`, `lit{+}`}, - {`\?`, `lit{?}`}, - {`{`, `lit{{}`}, - {`}`, `lit{}}`}, - {`\.`, `lit{.}`}, - {`\^`, `lit{^}`}, - {`\$`, `lit{$}`}, - {`\\`, `lit{\}`}, - {`[ace]`, `cc{0x61 0x63 0x65}`}, - {`[abc]`, `cc{0x61-0x63}`}, - {`[a-z]`, `cc{0x61-0x7a}`}, - {`[a]`, `lit{a}`}, - {`\-`, `lit{-}`}, - {`-`, `lit{-}`}, - {`\_`, `lit{_}`}, - {`abc`, `str{abc}`}, - {`abc|def`, `alt{str{abc}str{def}}`}, - {`abc|def|ghi`, `alt{str{abc}str{def}str{ghi}}`}, - - // Posix and Perl extensions - {`[[:lower:]]`, `cc{0x61-0x7a}`}, - {`[a-z]`, `cc{0x61-0x7a}`}, - {`[^[:lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`}, - {`[[:^lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`}, - {`(?i)[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`}, - {`(?i)[a-z]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`}, - {`(?i)[^[:lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, - {`(?i)[[:^lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, - {`\d`, `cc{0x30-0x39}`}, - {`\D`, `cc{0x0-0x2f 0x3a-0x10ffff}`}, - {`\s`, `cc{0x9-0xa 0xc-0xd 0x20}`}, - {`\S`, `cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}`}, - {`\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}`}, - {`\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}`}, - {`(?i)\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}`}, - {`(?i)\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, - {`[^\\]`, `cc{0x0-0x5b 0x5d-0x10ffff}`}, - // { `\C`, `byte{}` }, // probably never - - // Unicode, negatives, and a double negative. - {`\p{Braille}`, `cc{0x2800-0x28ff}`}, - {`\P{Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, - {`\p{^Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, - {`\P{^Braille}`, `cc{0x2800-0x28ff}`}, - {`\pZ`, `cc{0x20 0xa0 0x1680 0x180e 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`}, - {`[\p{Braille}]`, `cc{0x2800-0x28ff}`}, - {`[\P{Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, - {`[\p{^Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, - {`[\P{^Braille}]`, `cc{0x2800-0x28ff}`}, - {`[\pZ]`, `cc{0x20 0xa0 0x1680 0x180e 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`}, - {`\p{Lu}`, mkCharClass(unicode.IsUpper)}, - {`[\p{Lu}]`, mkCharClass(unicode.IsUpper)}, - {`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)}, - - // Hex, octal. - {`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`}, - {`[\x{41}-\x7a]\x61`, `cat{cc{0x41-0x7a}lit{a}}`}, - - // More interesting regular expressions. - {`a{,2}`, `str{a{,2}}`}, - {`\.\^\$\\`, `str{.^$\}`}, - {`[a-zABC]`, `cc{0x41-0x43 0x61-0x7a}`}, - {`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`}, - {`[α-ε☺]`, `cc{0x3b1-0x3b5 0x263a}`}, // utf-8 - {`a*{`, `cat{star{lit{a}}lit{{}}`}, - - // Test precedences - {`(?:ab)*`, `star{str{ab}}`}, - {`(ab)*`, `star{cap{str{ab}}}`}, - {`ab|cd`, `alt{str{ab}str{cd}}`}, - {`a(b|c)d`, `cat{lit{a}cap{cc{0x62-0x63}}lit{d}}`}, - - // Test flattening. - {`(?:a)`, `lit{a}`}, - {`(?:ab)(?:cd)`, `str{abcd}`}, - {`(?:a+b+)(?:c+d+)`, `cat{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`}, - {`(?:a+|b+)|(?:c+|d+)`, `alt{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`}, - {`(?:a|b)|(?:c|d)`, `cc{0x61-0x64}`}, - {`a|.`, `dot{}`}, - {`.|a`, `dot{}`}, - {`(?:[abc]|A|Z|hello|world)`, `alt{cc{0x41 0x5a 0x61-0x63}str{hello}str{world}}`}, - {`(?:[abc]|A|Z)`, `cc{0x41 0x5a 0x61-0x63}`}, - - // Test Perl quoted literals - {`\Q+|*?{[\E`, `str{+|*?{[}`}, - {`\Q+\E+`, `plus{lit{+}}`}, - {`\Q\\E`, `lit{\}`}, - {`\Q\\\E`, `str{\\}`}, - - // Test Perl \A and \z - {`(?m)^`, `bol{}`}, - {`(?m)$`, `eol{}`}, - {`(?-m)^`, `bot{}`}, - {`(?-m)$`, `eot{}`}, - {`(?m)\A`, `bot{}`}, - {`(?m)\z`, `eot{\z}`}, - {`(?-m)\A`, `bot{}`}, - {`(?-m)\z`, `eot{\z}`}, - - // Test named captures - {`(?P<name>a)`, `cap{name:lit{a}}`}, - - // Case-folded literals - {`[Aa]`, `litfold{A}`}, - {`[\x{100}\x{101}]`, `litfold{Ā}`}, - {`[Δδ]`, `litfold{Δ}`}, - - // Strings - {`abcde`, `str{abcde}`}, - {`[Aa][Bb]cd`, `cat{strfold{AB}str{cd}}`}, - - // Factoring. - {`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`}, - {`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}cc{0x79-0x7a}}cat{plus{lit{y}}lit{w}}}}`}, -} - -const testFlags = MatchNL | PerlX | UnicodeGroups - -// Test Parse -> Dump. -func TestParseDump(t *testing.T) { - for _, tt := range parseTests { - re, err := Parse(tt.Regexp, testFlags) - if err != nil { - t.Errorf("Parse(%#q): %v", tt.Regexp, err) - continue - } - d := dump(re) - if d != tt.Dump { - t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump) - } - } -} - -// dump prints a string representation of the regexp showing -// the structure explicitly. -func dump(re *Regexp) string { - var b bytes.Buffer - dumpRegexp(&b, re) - return b.String() -} - -var opNames = []string{ - OpNoMatch: "no", - OpEmptyMatch: "emp", - OpLiteral: "lit", - OpCharClass: "cc", - OpAnyCharNotNL: "dnl", - OpAnyChar: "dot", - OpBeginLine: "bol", - OpEndLine: "eol", - OpBeginText: "bot", - OpEndText: "eot", - OpWordBoundary: "wb", - OpNoWordBoundary: "nwb", - OpCapture: "cap", - OpStar: "star", - OpPlus: "plus", - OpQuest: "que", - OpRepeat: "rep", - OpConcat: "cat", - OpAlternate: "alt", -} - -// dumpRegexp writes an encoding of the syntax tree for the regexp re to b. -// It is used during testing to distinguish between parses that might print -// the same using re's String method. -func dumpRegexp(b *bytes.Buffer, re *Regexp) { - if int(re.Op) >= len(opNames) || opNames[re.Op] == "" { - fmt.Fprintf(b, "op%d", re.Op) - } else { - switch re.Op { - default: - b.WriteString(opNames[re.Op]) - case OpStar, OpPlus, OpQuest, OpRepeat: - if re.Flags&NonGreedy != 0 { - b.WriteByte('n') - } - b.WriteString(opNames[re.Op]) - case OpLiteral: - if len(re.Rune) > 1 { - b.WriteString("str") - } else { - b.WriteString("lit") - } - if re.Flags&FoldCase != 0 { - for _, r := range re.Rune { - if unicode.SimpleFold(r) != r { - b.WriteString("fold") - break - } - } - } - } - } - b.WriteByte('{') - switch re.Op { - case OpEndText: - if re.Flags&WasDollar == 0 { - b.WriteString(`\z`) - } - case OpLiteral: - for _, r := range re.Rune { - b.WriteRune(r) - } - case OpConcat, OpAlternate: - for _, sub := range re.Sub { - dumpRegexp(b, sub) - } - case OpStar, OpPlus, OpQuest: - dumpRegexp(b, re.Sub[0]) - case OpRepeat: - fmt.Fprintf(b, "%d,%d ", re.Min, re.Max) - dumpRegexp(b, re.Sub[0]) - case OpCapture: - if re.Name != "" { - b.WriteString(re.Name) - b.WriteByte(':') - } - dumpRegexp(b, re.Sub[0]) - case OpCharClass: - sep := "" - for i := 0; i < len(re.Rune); i += 2 { - b.WriteString(sep) - sep = " " - lo, hi := re.Rune[i], re.Rune[i+1] - if lo == hi { - fmt.Fprintf(b, "%#x", lo) - } else { - fmt.Fprintf(b, "%#x-%#x", lo, hi) - } - } - } - b.WriteByte('}') -} - -func mkCharClass(f func(int) bool) string { - re := &Regexp{Op: OpCharClass} - lo := -1 - for i := 0; i <= unicode.MaxRune; i++ { - if f(i) { - if lo < 0 { - lo = i - } - } else { - if lo >= 0 { - re.Rune = append(re.Rune, lo, i-1) - lo = -1 - } - } - } - if lo >= 0 { - re.Rune = append(re.Rune, lo, unicode.MaxRune) - } - return dump(re) -} - -func isUpperFold(rune int) bool { - if unicode.IsUpper(rune) { - return true - } - c := unicode.SimpleFold(rune) - for c != rune { - if unicode.IsUpper(c) { - return true - } - c = unicode.SimpleFold(c) - } - return false -} - -func TestFoldConstants(t *testing.T) { - last := -1 - for i := 0; i <= unicode.MaxRune; i++ { - if unicode.SimpleFold(i) == i { - continue - } - if last == -1 && minFold != i { - t.Errorf("minFold=%#U should be %#U", minFold, i) - } - last = i - } - if maxFold != last { - t.Errorf("maxFold=%#U should be %#U", maxFold, last) - } -} - -func TestAppendRangeCollapse(t *testing.T) { - // AppendRange should collapse each of the new ranges - // into the earlier ones (it looks back two ranges), so that - // the slice never grows very large. - // Note that we are not calling cleanClass. - var r []int - for i := 'A'; i <= 'Z'; i++ { - r = appendRange(r, i, i) - r = appendRange(r, i+'a'-'A', i+'a'-'A') - } - if string(r) != "AZaz" { - t.Errorf("appendRange interlaced A-Z a-z = %s, want AZaz", string(r)) - } -} diff --git a/libgo/go/exp/regexp/syntax/perl_groups.go b/libgo/go/exp/regexp/syntax/perl_groups.go deleted file mode 100644 index 05b392c..0000000 --- a/libgo/go/exp/regexp/syntax/perl_groups.go +++ /dev/null @@ -1,130 +0,0 @@ -// GENERATED BY make_perl_groups.pl; DO NOT EDIT. -// make_perl_groups.pl >perl_groups.go - -package syntax - -var code1 = []int{ /* \d */ - 0x30, 0x39, -} - -var code2 = []int{ /* \s */ - 0x9, 0xa, - 0xc, 0xd, - 0x20, 0x20, -} - -var code3 = []int{ /* \w */ - 0x30, 0x39, - 0x41, 0x5a, - 0x5f, 0x5f, - 0x61, 0x7a, -} - -var perlGroup = map[string]charGroup{ - `\d`: {+1, code1}, - `\D`: {-1, code1}, - `\s`: {+1, code2}, - `\S`: {-1, code2}, - `\w`: {+1, code3}, - `\W`: {-1, code3}, -} -var code4 = []int{ /* [:alnum:] */ - 0x30, 0x39, - 0x41, 0x5a, - 0x61, 0x7a, -} - -var code5 = []int{ /* [:alpha:] */ - 0x41, 0x5a, - 0x61, 0x7a, -} - -var code6 = []int{ /* [:ascii:] */ - 0x0, 0x7f, -} - -var code7 = []int{ /* [:blank:] */ - 0x9, 0x9, - 0x20, 0x20, -} - -var code8 = []int{ /* [:cntrl:] */ - 0x0, 0x1f, - 0x7f, 0x7f, -} - -var code9 = []int{ /* [:digit:] */ - 0x30, 0x39, -} - -var code10 = []int{ /* [:graph:] */ - 0x21, 0x7e, -} - -var code11 = []int{ /* [:lower:] */ - 0x61, 0x7a, -} - -var code12 = []int{ /* [:print:] */ - 0x20, 0x7e, -} - -var code13 = []int{ /* [:punct:] */ - 0x21, 0x2f, - 0x3a, 0x40, - 0x5b, 0x60, - 0x7b, 0x7e, -} - -var code14 = []int{ /* [:space:] */ - 0x9, 0xd, - 0x20, 0x20, -} - -var code15 = []int{ /* [:upper:] */ - 0x41, 0x5a, -} - -var code16 = []int{ /* [:word:] */ - 0x30, 0x39, - 0x41, 0x5a, - 0x5f, 0x5f, - 0x61, 0x7a, -} - -var code17 = []int{ /* [:xdigit:] */ - 0x30, 0x39, - 0x41, 0x46, - 0x61, 0x66, -} - -var posixGroup = map[string]charGroup{ - `[:alnum:]`: {+1, code4}, - `[:^alnum:]`: {-1, code4}, - `[:alpha:]`: {+1, code5}, - `[:^alpha:]`: {-1, code5}, - `[:ascii:]`: {+1, code6}, - `[:^ascii:]`: {-1, code6}, - `[:blank:]`: {+1, code7}, - `[:^blank:]`: {-1, code7}, - `[:cntrl:]`: {+1, code8}, - `[:^cntrl:]`: {-1, code8}, - `[:digit:]`: {+1, code9}, - `[:^digit:]`: {-1, code9}, - `[:graph:]`: {+1, code10}, - `[:^graph:]`: {-1, code10}, - `[:lower:]`: {+1, code11}, - `[:^lower:]`: {-1, code11}, - `[:print:]`: {+1, code12}, - `[:^print:]`: {-1, code12}, - `[:punct:]`: {+1, code13}, - `[:^punct:]`: {-1, code13}, - `[:space:]`: {+1, code14}, - `[:^space:]`: {-1, code14}, - `[:upper:]`: {+1, code15}, - `[:^upper:]`: {-1, code15}, - `[:word:]`: {+1, code16}, - `[:^word:]`: {-1, code16}, - `[:xdigit:]`: {+1, code17}, - `[:^xdigit:]`: {-1, code17}, -} diff --git a/libgo/go/exp/regexp/syntax/prog.go b/libgo/go/exp/regexp/syntax/prog.go deleted file mode 100644 index bf85b72..0000000 --- a/libgo/go/exp/regexp/syntax/prog.go +++ /dev/null @@ -1,237 +0,0 @@ -package syntax - -import ( - "bytes" - "strconv" -) - -// Compiled program. -// May not belong in this package, but convenient for now. - -// A Prog is a compiled regular expression program. -type Prog struct { - Inst []Inst - Start int // index of start instruction - NumCap int // number of InstCapture insts in re -} - -// An InstOp is an instruction opcode. -type InstOp uint8 - -const ( - InstAlt InstOp = iota - InstAltMatch - InstCapture - InstEmptyWidth - InstMatch - InstFail - InstNop - InstRune -) - -// An EmptyOp specifies a kind or mixture of zero-width assertions. -type EmptyOp uint8 - -const ( - EmptyBeginLine EmptyOp = 1 << iota - EmptyEndLine - EmptyBeginText - EmptyEndText - EmptyWordBoundary - EmptyNoWordBoundary -) - -// An Inst is a single instruction in a regular expression program. -type Inst struct { - Op InstOp - Out uint32 // all but InstMatch, InstFail - Arg uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth - Rune []int -} - -func (p *Prog) String() string { - var b bytes.Buffer - dumpProg(&b, p) - return b.String() -} - -// skipNop follows any no-op or capturing instructions -// and returns the resulting pc. -func (p *Prog) skipNop(pc uint32) *Inst { - i := &p.Inst[pc] - for i.Op == InstNop || i.Op == InstCapture { - pc = i.Out - i = &p.Inst[pc] - } - return i -} - -// Prefix returns a literal string that all matches for the -// regexp must start with. Complete is true if the prefix -// is the entire match. -func (p *Prog) Prefix() (prefix string, complete bool) { - i := p.skipNop(uint32(p.Start)) - - // Avoid allocation of buffer if prefix is empty. - if i.Op != InstRune || len(i.Rune) != 1 { - return "", i.Op == InstMatch - } - - // Have prefix; gather characters. - var buf bytes.Buffer - for i.Op == InstRune && len(i.Rune) == 1 { - buf.WriteRune(i.Rune[0]) - i = p.skipNop(i.Out) - } - return buf.String(), i.Op == InstMatch -} - -// StartCond returns the leading empty-width conditions that must -// be true in any match. It returns ^EmptyOp(0) if no matches are possible. -func (p *Prog) StartCond() EmptyOp { - var flag EmptyOp - pc := uint32(p.Start) - i := &p.Inst[pc] -Loop: - for { - switch i.Op { - case InstEmptyWidth: - flag |= EmptyOp(i.Arg) - case InstFail: - return ^EmptyOp(0) - case InstCapture, InstNop: - // skip - default: - break Loop - } - pc = i.Out - i = &p.Inst[pc] - } - return flag -} - -// MatchRune returns true if the instruction matches (and consumes) r. -// It should only be called when i.Op == InstRune. -func (i *Inst) MatchRune(r int) bool { - rune := i.Rune - - // Special case: single-rune slice is from literal string, not char class. - // TODO: Case folding. - if len(rune) == 1 { - return r == rune[0] - } - - // Peek at the first few pairs. - // Should handle ASCII well. - for j := 0; j < len(rune) && j <= 8; j += 2 { - if r < rune[j] { - return false - } - if r <= rune[j+1] { - return true - } - } - - // Otherwise binary search. - lo := 0 - hi := len(rune) / 2 - for lo < hi { - m := lo + (hi-lo)/2 - if c := rune[2*m]; c <= r { - if r <= rune[2*m+1] { - return true - } - lo = m + 1 - } else { - hi = m - } - } - return false -} - -// As per re2's Prog::IsWordChar. Determines whether rune is an ASCII word char. -// Since we act on runes, it would be easy to support Unicode here. -func wordRune(rune int) bool { - return rune == '_' || - ('A' <= rune && rune <= 'Z') || - ('a' <= rune && rune <= 'z') || - ('0' <= rune && rune <= '9') -} - -// MatchEmptyWidth returns true if the instruction matches -// an empty string between the runes before and after. -// It should only be called when i.Op == InstEmptyWidth. -func (i *Inst) MatchEmptyWidth(before int, after int) bool { - switch EmptyOp(i.Arg) { - case EmptyBeginLine: - return before == '\n' || before == -1 - case EmptyEndLine: - return after == '\n' || after == -1 - case EmptyBeginText: - return before == -1 - case EmptyEndText: - return after == -1 - case EmptyWordBoundary: - return wordRune(before) != wordRune(after) - case EmptyNoWordBoundary: - return wordRune(before) == wordRune(after) - } - panic("unknown empty width arg") -} - -func (i *Inst) String() string { - var b bytes.Buffer - dumpInst(&b, i) - return b.String() -} - -func bw(b *bytes.Buffer, args ...string) { - for _, s := range args { - b.WriteString(s) - } -} - -func dumpProg(b *bytes.Buffer, p *Prog) { - for j := range p.Inst { - i := &p.Inst[j] - pc := strconv.Itoa(j) - if len(pc) < 3 { - b.WriteString(" "[len(pc):]) - } - if j == p.Start { - pc += "*" - } - bw(b, pc, "\t") - dumpInst(b, i) - bw(b, "\n") - } -} - -func u32(i uint32) string { - return strconv.Uitoa64(uint64(i)) -} - -func dumpInst(b *bytes.Buffer, i *Inst) { - switch i.Op { - case InstAlt: - bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg)) - case InstAltMatch: - bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg)) - case InstCapture: - bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out)) - case InstEmptyWidth: - bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out)) - case InstMatch: - bw(b, "match") - case InstFail: - bw(b, "fail") - case InstNop: - bw(b, "nop -> ", u32(i.Out)) - case InstRune: - if i.Rune == nil { - // shouldn't happen - bw(b, "rune <nil>") - } - bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out)) - } -} diff --git a/libgo/go/exp/regexp/syntax/prog_test.go b/libgo/go/exp/regexp/syntax/prog_test.go deleted file mode 100644 index 7be4281..0000000 --- a/libgo/go/exp/regexp/syntax/prog_test.go +++ /dev/null @@ -1,91 +0,0 @@ -package syntax - -import ( - "testing" -) - -var compileTests = []struct { - Regexp string - Prog string -}{ - {"a", ` 0 fail - 1* rune "a" -> 2 - 2 match -`}, - {"[A-M][n-z]", ` 0 fail - 1* rune "AM" -> 2 - 2 rune "nz" -> 3 - 3 match -`}, - {"", ` 0 fail - 1* nop -> 2 - 2 match -`}, - {"a?", ` 0 fail - 1 rune "a" -> 3 - 2* alt -> 1, 3 - 3 match -`}, - {"a??", ` 0 fail - 1 rune "a" -> 3 - 2* alt -> 3, 1 - 3 match -`}, - {"a+", ` 0 fail - 1* rune "a" -> 2 - 2 alt -> 1, 3 - 3 match -`}, - {"a+?", ` 0 fail - 1* rune "a" -> 2 - 2 alt -> 3, 1 - 3 match -`}, - {"a*", ` 0 fail - 1 rune "a" -> 2 - 2* alt -> 1, 3 - 3 match -`}, - {"a*?", ` 0 fail - 1 rune "a" -> 2 - 2* alt -> 3, 1 - 3 match -`}, - {"a+b+", ` 0 fail - 1* rune "a" -> 2 - 2 alt -> 1, 3 - 3 rune "b" -> 4 - 4 alt -> 3, 5 - 5 match -`}, - {"(a+)(b+)", ` 0 fail - 1* cap 2 -> 2 - 2 rune "a" -> 3 - 3 alt -> 2, 4 - 4 cap 3 -> 5 - 5 cap 4 -> 6 - 6 rune "b" -> 7 - 7 alt -> 6, 8 - 8 cap 5 -> 9 - 9 match -`}, - {"a+|b+", ` 0 fail - 1 rune "a" -> 2 - 2 alt -> 1, 6 - 3 rune "b" -> 4 - 4 alt -> 3, 6 - 5* alt -> 1, 3 - 6 match -`}, -} - -func TestCompile(t *testing.T) { - for _, tt := range compileTests { - re, _ := Parse(tt.Regexp, Perl) - p, _ := Compile(re) - s := p.String() - if s != tt.Prog { - t.Errorf("compiled %#q:\n--- have\n%s---\n--- want\n%s---", tt.Regexp, s, tt.Prog) - } - } -} diff --git a/libgo/go/exp/regexp/syntax/regexp.go b/libgo/go/exp/regexp/syntax/regexp.go deleted file mode 100644 index 00a4add..0000000 --- a/libgo/go/exp/regexp/syntax/regexp.go +++ /dev/null @@ -1,284 +0,0 @@ -// Copyright 2011 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package syntax parses regular expressions into syntax trees. -// WORK IN PROGRESS. -package syntax - -// Note to implementers: -// In this package, re is always a *Regexp and r is always a rune. - -import ( - "bytes" - "strconv" - "strings" - "unicode" -) - -// A Regexp is a node in a regular expression syntax tree. -type Regexp struct { - Op Op // operator - Flags Flags - Sub []*Regexp // subexpressions, if any - Sub0 [1]*Regexp // storage for short Sub - Rune []int // matched runes, for OpLiteral, OpCharClass - Rune0 [2]int // storage for short Rune - Min, Max int // min, max for OpRepeat - Cap int // capturing index, for OpCapture - Name string // capturing name, for OpCapture -} - -// An Op is a single regular expression operator. -type Op uint8 - -// Operators are listed in precedence order, tightest binding to weakest. -// Character class operators are listed simplest to most complex -// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar). - -const ( - OpNoMatch Op = 1 + iota // matches no strings - OpEmptyMatch // matches empty string - OpLiteral // matches Runes sequence - OpCharClass // matches Runes interpreted as range pair list - OpAnyCharNotNL // matches any character - OpAnyChar // matches any character - OpBeginLine // matches empty string at beginning of line - OpEndLine // matches empty string at end of line - OpBeginText // matches empty string at beginning of text - OpEndText // matches empty string at end of text - OpWordBoundary // matches word boundary `\b` - OpNoWordBoundary // matches word non-boundary `\B` - OpCapture // capturing subexpression with index Cap, optional name Name - OpStar // matches Sub[0] zero or more times - OpPlus // matches Sub[0] one or more times - OpQuest // matches Sub[0] zero or one times - OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit) - OpConcat // matches concatenation of Subs - OpAlternate // matches alternation of Subs -) - -const opPseudo Op = 128 // where pseudo-ops start - -// Equal returns true if x and y have identical structure. -func (x *Regexp) Equal(y *Regexp) bool { - if x == nil || y == nil { - return x == y - } - if x.Op != y.Op { - return false - } - switch x.Op { - case OpEndText: - // The parse flags remember whether this is \z or \Z. - if x.Flags&WasDollar != y.Flags&WasDollar { - return false - } - - case OpLiteral, OpCharClass: - if len(x.Rune) != len(y.Rune) { - return false - } - for i, r := range x.Rune { - if r != y.Rune[i] { - return false - } - } - - case OpAlternate, OpConcat: - if len(x.Sub) != len(y.Sub) { - return false - } - for i, sub := range x.Sub { - if !sub.Equal(y.Sub[i]) { - return false - } - } - - case OpStar, OpPlus, OpQuest: - if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) { - return false - } - - case OpRepeat: - if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) { - return false - } - - case OpCapture: - if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) { - return false - } - } - return true -} - -// writeRegexp writes the Perl syntax for the regular expression re to b. -func writeRegexp(b *bytes.Buffer, re *Regexp) { - switch re.Op { - default: - b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">") - case OpNoMatch: - b.WriteString(`[^\x00-\x{10FFFF}]`) - case OpEmptyMatch: - b.WriteString(`(?:)`) - case OpLiteral: - if re.Flags&FoldCase != 0 { - b.WriteString(`(?i:`) - } - for _, r := range re.Rune { - escape(b, r, false) - } - if re.Flags&FoldCase != 0 { - b.WriteString(`)`) - } - case OpCharClass: - if len(re.Rune)%2 != 0 { - b.WriteString(`[invalid char class]`) - break - } - b.WriteRune('[') - if len(re.Rune) == 0 { - b.WriteString(`^\x00-\x{10FFFF}`) - } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune { - // Contains 0 and MaxRune. Probably a negated class. - // Print the gaps. - b.WriteRune('^') - for i := 1; i < len(re.Rune)-1; i += 2 { - lo, hi := re.Rune[i]+1, re.Rune[i+1]-1 - escape(b, lo, lo == '-') - if lo != hi { - b.WriteRune('-') - escape(b, hi, hi == '-') - } - } - } else { - for i := 0; i < len(re.Rune); i += 2 { - lo, hi := re.Rune[i], re.Rune[i+1] - escape(b, lo, lo == '-') - if lo != hi { - b.WriteRune('-') - escape(b, hi, hi == '-') - } - } - } - b.WriteRune(']') - case OpAnyCharNotNL: - b.WriteString(`[^\n]`) - case OpAnyChar: - b.WriteRune('.') - case OpBeginLine: - b.WriteRune('^') - case OpEndLine: - b.WriteRune('$') - case OpBeginText: - b.WriteString(`\A`) - case OpEndText: - b.WriteString(`\z`) - case OpWordBoundary: - b.WriteString(`\b`) - case OpNoWordBoundary: - b.WriteString(`\B`) - case OpCapture: - if re.Name != "" { - b.WriteString(`(?P<`) - b.WriteString(re.Name) - b.WriteRune('>') - } else { - b.WriteRune('(') - } - if re.Sub[0].Op != OpEmptyMatch { - writeRegexp(b, re.Sub[0]) - } - b.WriteRune(')') - case OpStar, OpPlus, OpQuest, OpRepeat: - if sub := re.Sub[0]; sub.Op > OpCapture { - b.WriteString(`(?:`) - writeRegexp(b, sub) - b.WriteString(`)`) - } else { - writeRegexp(b, sub) - } - switch re.Op { - case OpStar: - b.WriteRune('*') - case OpPlus: - b.WriteRune('+') - case OpQuest: - b.WriteRune('?') - case OpRepeat: - b.WriteRune('{') - b.WriteString(strconv.Itoa(re.Min)) - if re.Max != re.Min { - b.WriteRune(',') - if re.Max >= 0 { - b.WriteString(strconv.Itoa(re.Max)) - } - } - b.WriteRune('}') - } - case OpConcat: - for _, sub := range re.Sub { - if sub.Op == OpAlternate { - b.WriteString(`(?:`) - writeRegexp(b, sub) - b.WriteString(`)`) - } else { - writeRegexp(b, sub) - } - } - case OpAlternate: - for i, sub := range re.Sub { - if i > 0 { - b.WriteRune('|') - } - writeRegexp(b, sub) - } - } -} - -func (re *Regexp) String() string { - var b bytes.Buffer - writeRegexp(&b, re) - return b.String() -} - -const meta = `\.+*?()|[]{}^$` - -func escape(b *bytes.Buffer, r int, force bool) { - if unicode.IsPrint(r) { - if strings.IndexRune(meta, r) >= 0 || force { - b.WriteRune('\\') - } - b.WriteRune(r) - return - } - - switch r { - case '\a': - b.WriteString(`\a`) - case '\f': - b.WriteString(`\f`) - case '\n': - b.WriteString(`\n`) - case '\r': - b.WriteString(`\r`) - case '\t': - b.WriteString(`\t`) - case '\v': - b.WriteString(`\v`) - default: - if r < 0x100 { - b.WriteString(`\x`) - s := strconv.Itob(r, 16) - if len(s) == 1 { - b.WriteRune('0') - } - b.WriteString(s) - break - } - b.WriteString(`\x{`) - b.WriteString(strconv.Itob(r, 16)) - b.WriteString(`}`) - } -} diff --git a/libgo/go/exp/regexp/syntax/simplify.go b/libgo/go/exp/regexp/syntax/simplify.go deleted file mode 100644 index 7239041..0000000 --- a/libgo/go/exp/regexp/syntax/simplify.go +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright 2011 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package syntax - -// Simplify returns a regexp equivalent to re but without counted repetitions -// and with various other simplifications, such as rewriting /(?:a+)+/ to /a+/. -// The resulting regexp will execute correctly but its string representation -// will not produce the same parse tree, because capturing parentheses -// may have been duplicated or removed. For example, the simplified form -// for /(x){1,2}/ is /(x)(x)?/ but both parentheses capture as $1. -// The returned regexp may share structure with or be the original. -func (re *Regexp) Simplify() *Regexp { - if re == nil { - return nil - } - switch re.Op { - case OpCapture, OpConcat, OpAlternate: - // Simplify children, building new Regexp if children change. - nre := re - for i, sub := range re.Sub { - nsub := sub.Simplify() - if nre == re && nsub != sub { - // Start a copy. - nre = new(Regexp) - *nre = *re - nre.Rune = nil - nre.Sub = append(nre.Sub0[:0], re.Sub[:i]...) - } - if nre != re { - nre.Sub = append(nre.Sub, nsub) - } - } - return nre - - case OpStar, OpPlus, OpQuest: - sub := re.Sub[0].Simplify() - return simplify1(re.Op, re.Flags, sub, re) - - case OpRepeat: - // Special special case: x{0} matches the empty string - // and doesn't even need to consider x. - if re.Min == 0 && re.Max == 0 { - return &Regexp{Op: OpEmptyMatch} - } - - // The fun begins. - sub := re.Sub[0].Simplify() - - // x{n,} means at least n matches of x. - if re.Max == -1 { - // Special case: x{0,} is x*. - if re.Min == 0 { - return simplify1(OpStar, re.Flags, sub, nil) - } - - // Special case: x{1,} is x+. - if re.Min == 1 { - return simplify1(OpPlus, re.Flags, sub, nil) - } - - // General case: x{4,} is xxxx+. - nre := &Regexp{Op: OpConcat} - nre.Sub = nre.Sub0[:0] - for i := 0; i < re.Min-1; i++ { - nre.Sub = append(nre.Sub, sub) - } - nre.Sub = append(nre.Sub, simplify1(OpPlus, re.Flags, sub, nil)) - return nre - } - - // Special case x{0} handled above. - - // Special case: x{1} is just x. - if re.Min == 1 && re.Max == 1 { - return sub - } - - // General case: x{n,m} means n copies of x and m copies of x? - // The machine will do less work if we nest the final m copies, - // so that x{2,5} = xx(x(x(x)?)?)? - - // Build leading prefix: xx. - var prefix *Regexp - if re.Min > 0 { - prefix = &Regexp{Op: OpConcat} - prefix.Sub = prefix.Sub0[:0] - for i := 0; i < re.Min; i++ { - prefix.Sub = append(prefix.Sub, sub) - } - } - - // Build and attach suffix: (x(x(x)?)?)? - if re.Max > re.Min { - suffix := simplify1(OpQuest, re.Flags, sub, nil) - for i := re.Min + 1; i < re.Max; i++ { - nre2 := &Regexp{Op: OpConcat} - nre2.Sub = append(nre2.Sub0[:0], sub, suffix) - suffix = simplify1(OpQuest, re.Flags, nre2, nil) - } - if prefix == nil { - return suffix - } - prefix.Sub = append(prefix.Sub, suffix) - } - if prefix != nil { - return prefix - } - - // Some degenerate case like min > max or min < max < 0. - // Handle as impossible match. - return &Regexp{Op: OpNoMatch} - } - - return re -} - -// simplify1 implements Simplify for the unary OpStar, -// OpPlus, and OpQuest operators. It returns the simple regexp -// equivalent to -// -// Regexp{Op: op, Flags: flags, Sub: {sub}} -// -// under the assumption that sub is already simple, and -// without first allocating that structure. If the regexp -// to be returned turns out to be equivalent to re, simplify1 -// returns re instead. -// -// simplify1 is factored out of Simplify because the implementation -// for other operators generates these unary expressions. -// Letting them call simplify1 makes sure the expressions they -// generate are simple. -func simplify1(op Op, flags Flags, sub, re *Regexp) *Regexp { - // Special case: repeat the empty string as much as - // you want, but it's still the empty string. - if sub.Op == OpEmptyMatch { - return sub - } - // The operators are idempotent if the flags match. - if op == sub.Op && flags&NonGreedy == sub.Flags&NonGreedy { - return sub - } - if re != nil && re.Op == op && re.Flags&NonGreedy == flags&NonGreedy && sub == re.Sub[0] { - return re - } - - re = &Regexp{Op: op, Flags: flags} - re.Sub = append(re.Sub0[:0], sub) - return re -} diff --git a/libgo/go/exp/regexp/syntax/simplify_test.go b/libgo/go/exp/regexp/syntax/simplify_test.go deleted file mode 100644 index c8cec21..0000000 --- a/libgo/go/exp/regexp/syntax/simplify_test.go +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright 2011 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package syntax - -import "testing" - -var simplifyTests = []struct { - Regexp string - Simple string -}{ - // Already-simple constructs - {`a`, `a`}, - {`ab`, `ab`}, - {`a|b`, `[a-b]`}, - {`ab|cd`, `ab|cd`}, - {`(ab)*`, `(ab)*`}, - {`(ab)+`, `(ab)+`}, - {`(ab)?`, `(ab)?`}, - {`.`, `.`}, - {`^`, `^`}, - {`$`, `$`}, - {`[ac]`, `[ac]`}, - {`[^ac]`, `[^ac]`}, - - // Posix character classes - {`[[:alnum:]]`, `[0-9A-Za-z]`}, - {`[[:alpha:]]`, `[A-Za-z]`}, - {`[[:blank:]]`, `[\t ]`}, - {`[[:cntrl:]]`, `[\x00-\x1f\x7f]`}, - {`[[:digit:]]`, `[0-9]`}, - {`[[:graph:]]`, `[!-~]`}, - {`[[:lower:]]`, `[a-z]`}, - {`[[:print:]]`, `[ -~]`}, - {`[[:punct:]]`, "[!-/:-@\\[-`\\{-~]"}, - {`[[:space:]]`, `[\t-\r ]`}, - {`[[:upper:]]`, `[A-Z]`}, - {`[[:xdigit:]]`, `[0-9A-Fa-f]`}, - - // Perl character classes - {`\d`, `[0-9]`}, - {`\s`, `[\t-\n\f-\r ]`}, - {`\w`, `[0-9A-Z_a-z]`}, - {`\D`, `[^0-9]`}, - {`\S`, `[^\t-\n\f-\r ]`}, - {`\W`, `[^0-9A-Z_a-z]`}, - {`[\d]`, `[0-9]`}, - {`[\s]`, `[\t-\n\f-\r ]`}, - {`[\w]`, `[0-9A-Z_a-z]`}, - {`[\D]`, `[^0-9]`}, - {`[\S]`, `[^\t-\n\f-\r ]`}, - {`[\W]`, `[^0-9A-Z_a-z]`}, - - // Posix repetitions - {`a{1}`, `a`}, - {`a{2}`, `aa`}, - {`a{5}`, `aaaaa`}, - {`a{0,1}`, `a?`}, - // The next three are illegible because Simplify inserts (?:) - // parens instead of () parens to avoid creating extra - // captured subexpressions. The comments show a version with fewer parens. - {`(a){0,2}`, `(?:(a)(a)?)?`}, // (aa?)? - {`(a){0,4}`, `(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // (a(a(aa?)?)?)? - {`(a){2,6}`, `(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // aa(a(a(aa?)?)?)? - {`a{0,2}`, `(?:aa?)?`}, // (aa?)? - {`a{0,4}`, `(?:a(?:a(?:aa?)?)?)?`}, // (a(a(aa?)?)?)? - {`a{2,6}`, `aa(?:a(?:a(?:aa?)?)?)?`}, // aa(a(a(aa?)?)?)? - {`a{0,}`, `a*`}, - {`a{1,}`, `a+`}, - {`a{2,}`, `aa+`}, - {`a{5,}`, `aaaaa+`}, - - // Test that operators simplify their arguments. - {`(?:a{1,}){1,}`, `a+`}, - {`(a{1,}b{1,})`, `(a+b+)`}, - {`a{1,}|b{1,}`, `a+|b+`}, - {`(?:a{1,})*`, `(?:a+)*`}, - {`(?:a{1,})+`, `a+`}, - {`(?:a{1,})?`, `(?:a+)?`}, - {``, `(?:)`}, - {`a{0}`, `(?:)`}, - - // Character class simplification - {`[ab]`, `[a-b]`}, - {`[a-za-za-z]`, `[a-z]`}, - {`[A-Za-zA-Za-z]`, `[A-Za-z]`}, - {`[ABCDEFGH]`, `[A-H]`}, - {`[AB-CD-EF-GH]`, `[A-H]`}, - {`[W-ZP-XE-R]`, `[E-Z]`}, - {`[a-ee-gg-m]`, `[a-m]`}, - {`[a-ea-ha-m]`, `[a-m]`}, - {`[a-ma-ha-e]`, `[a-m]`}, - {`[a-zA-Z0-9 -~]`, `[ -~]`}, - - // Empty character classes - {`[^[:cntrl:][:^cntrl:]]`, `[^\x00-\x{10FFFF}]`}, - - // Full character classes - {`[[:cntrl:][:^cntrl:]]`, `.`}, - - // Unicode case folding. - {`(?i)A`, `(?i:A)`}, - {`(?i)a`, `(?i:a)`}, - {`(?i)[A]`, `(?i:A)`}, - {`(?i)[a]`, `(?i:A)`}, - {`(?i)K`, `(?i:K)`}, - {`(?i)k`, `(?i:k)`}, - {`(?i)\x{212a}`, "(?i:\u212A)"}, - {`(?i)[K]`, "[Kk\u212A]"}, - {`(?i)[k]`, "[Kk\u212A]"}, - {`(?i)[\x{212a}]`, "[Kk\u212A]"}, - {`(?i)[a-z]`, "[A-Za-z\u017F\u212A]"}, - {`(?i)[\x00-\x{FFFD}]`, "[\\x00-\uFFFD]"}, - {`(?i)[\x00-\x{10FFFF}]`, `.`}, - - // Empty string as a regular expression. - // The empty string must be preserved inside parens in order - // to make submatches work right, so these tests are less - // interesting than they might otherwise be. String inserts - // explicit (?:) in place of non-parenthesized empty strings, - // to make them easier to spot for other parsers. - {`(a|b|)`, `([a-b]|(?:))`}, - {`(|)`, `()`}, - {`a()`, `a()`}, - {`(()|())`, `(()|())`}, - {`(a|)`, `(a|(?:))`}, - {`ab()cd()`, `ab()cd()`}, - {`()`, `()`}, - {`()*`, `()*`}, - {`()+`, `()+`}, - {`()?`, `()?`}, - {`(){0}`, `(?:)`}, - {`(){1}`, `()`}, - {`(){1,}`, `()+`}, - {`(){0,2}`, `(?:()()?)?`}, -} - -func TestSimplify(t *testing.T) { - for _, tt := range simplifyTests { - re, err := Parse(tt.Regexp, MatchNL|Perl&^OneLine) - if err != nil { - t.Errorf("Parse(%#q) = error %v", tt.Regexp, err) - continue - } - s := re.Simplify().String() - if s != tt.Simple { - t.Errorf("Simplify(%#q) = %#q, want %#q", tt.Regexp, s, tt.Simple) - } - } -} |