Diffstat (limited to 'libgo/go/exp/regexp')
-rw-r--r-- | libgo/go/exp/regexp/all_test.go             |  429
-rw-r--r-- | libgo/go/exp/regexp/exec.go                 |  295
-rw-r--r-- | libgo/go/exp/regexp/find_test.go            |  472
-rw-r--r-- | libgo/go/exp/regexp/regexp.go               |  795
-rw-r--r-- | libgo/go/exp/regexp/syntax/compile.go       |  269
-rw-r--r-- | libgo/go/exp/regexp/syntax/parse.go         | 1797
-rw-r--r-- | libgo/go/exp/regexp/syntax/parse_test.go    |  350
-rw-r--r-- | libgo/go/exp/regexp/syntax/perl_groups.go   |  130
-rw-r--r-- | libgo/go/exp/regexp/syntax/prog.go          |  237
-rw-r--r-- | libgo/go/exp/regexp/syntax/prog_test.go     |   91
-rw-r--r-- | libgo/go/exp/regexp/syntax/regexp.go        |  284
-rw-r--r-- | libgo/go/exp/regexp/syntax/simplify.go      |  151
-rw-r--r-- | libgo/go/exp/regexp/syntax/simplify_test.go |  151
13 files changed, 5451 insertions, 0 deletions
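
The patch below adds the experimental RE2-based regexp engine (exp/regexp) to libgo. As a quick orientation, and not part of the commit itself, here is a minimal usage sketch of the API defined in regexp.go; the pattern and input strings are made up, and it targets the pre-Go-1, os.Error-era API used in this tree:

package main

import (
	"exp/regexp" // the package added by this patch
	"fmt"
)

func main() {
	// MustCompile panics on a bad pattern; Compile returns an os.Error instead.
	re := regexp.MustCompile(`a(b*)c`)

	// Boolean match.
	fmt.Println(re.MatchString("xxabbbcyy")) // true

	// The Find(All)?(String)?(Submatch)?(Index)? family documented in regexp.go:
	// 'Index' results are pairs of byte offsets, with -1 marking a
	// subexpression that did not match.
	fmt.Println(re.FindStringSubmatchIndex("xxabbbcyy")) // [2 7 3 6]

	// ReplaceAllString has no \1/$1 expansion in this version; each match is
	// replaced by the literal replacement string.
	fmt.Println(re.ReplaceAllString("abc abbc", "X")) // X X
}
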
diff --git a/libgo/go/exp/regexp/all_test.go b/libgo/go/exp/regexp/all_test.go new file mode 100644 index 0000000..77f32ca --- /dev/null +++ b/libgo/go/exp/regexp/all_test.go @@ -0,0 +1,429 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package regexp + +import ( + "os" + "strings" + "testing" +) + +var good_re = []string{ + ``, + `.`, + `^.$`, + `a`, + `a*`, + `a+`, + `a?`, + `a|b`, + `a*|b*`, + `(a*|b)(c*|d)`, + `[a-z]`, + `[a-abc-c\-\]\[]`, + `[a-z]+`, + `[abc]`, + `[^1234]`, + `[^\n]`, + `\!\\`, +} + +/* +type stringError struct { + re string + err os.Error +} + +var bad_re = []stringError{ + {`*`, ErrBareClosure}, + {`+`, ErrBareClosure}, + {`?`, ErrBareClosure}, + {`(abc`, ErrUnmatchedLpar}, + {`abc)`, ErrUnmatchedRpar}, + {`x[a-z`, ErrUnmatchedLbkt}, + {`abc]`, ErrUnmatchedRbkt}, + {`[z-a]`, ErrBadRange}, + {`abc\`, ErrExtraneousBackslash}, + {`a**`, ErrBadClosure}, + {`a*+`, ErrBadClosure}, + {`a??`, ErrBadClosure}, + {`\x`, ErrBadBackslash}, +} +*/ + +func compileTest(t *testing.T, expr string, error os.Error) *Regexp { + re, err := Compile(expr) + if err != error { + t.Error("compiling `", expr, "`; unexpected error: ", err.String()) + } + return re +} + +func TestGoodCompile(t *testing.T) { + for i := 0; i < len(good_re); i++ { + compileTest(t, good_re[i], nil) + } +} + +/* +func TestBadCompile(t *testing.T) { + for i := 0; i < len(bad_re); i++ { + compileTest(t, bad_re[i].re, bad_re[i].err) + } +} +*/ + +func matchTest(t *testing.T, test *FindTest) { + re := compileTest(t, test.pat, nil) + if re == nil { + return + } + m := re.MatchString(test.text) + if m != (len(test.matches) > 0) { + t.Errorf("MatchString failure on %s: %t should be %t", test, m, len(test.matches) > 0) + } + // now try bytes + m = re.Match([]byte(test.text)) + if m != (len(test.matches) > 0) { + t.Errorf("Match failure on %s: %t should be %t", test, m, len(test.matches) > 0) + } +} + +func TestMatch(t *testing.T) { + for _, test := range findTests { + matchTest(t, &test) + } +} + +func matchFunctionTest(t *testing.T, test *FindTest) { + m, err := MatchString(test.pat, test.text) + if err == nil { + return + } + if m != (len(test.matches) > 0) { + t.Errorf("Match failure on %s: %t should be %t", test, m, len(test.matches) > 0) + } +} + +func TestMatchFunction(t *testing.T) { + for _, test := range findTests { + matchFunctionTest(t, &test) + } +} + +type ReplaceTest struct { + pattern, replacement, input, output string +} + +var replaceTests = []ReplaceTest{ + // Test empty input and/or replacement, with pattern that matches the empty string. + {"", "", "", ""}, + {"", "x", "", "x"}, + {"", "", "abc", "abc"}, + {"", "x", "abc", "xaxbxcx"}, + + // Test empty input and/or replacement, with pattern that does not match the empty string. + {"b", "", "", ""}, + {"b", "x", "", ""}, + {"b", "", "abc", "ac"}, + {"b", "x", "abc", "axc"}, + {"y", "", "", ""}, + {"y", "x", "", ""}, + {"y", "", "abc", "abc"}, + {"y", "x", "abc", "abc"}, + + // Multibyte characters -- verify that we don't try to match in the middle + // of a character. + {"[a-c]*", "x", "\u65e5", "x\u65e5x"}, + {"[^\u65e5]", "x", "abc\u65e5def", "xxx\u65e5xxx"}, + + // Start and end of a string. 
+ {"^[a-c]*", "x", "abcdabc", "xdabc"}, + {"[a-c]*$", "x", "abcdabc", "abcdx"}, + {"^[a-c]*$", "x", "abcdabc", "abcdabc"}, + {"^[a-c]*", "x", "abc", "x"}, + {"[a-c]*$", "x", "abc", "x"}, + {"^[a-c]*$", "x", "abc", "x"}, + {"^[a-c]*", "x", "dabce", "xdabce"}, + {"[a-c]*$", "x", "dabce", "dabcex"}, + {"^[a-c]*$", "x", "dabce", "dabce"}, + {"^[a-c]*", "x", "", "x"}, + {"[a-c]*$", "x", "", "x"}, + {"^[a-c]*$", "x", "", "x"}, + + {"^[a-c]+", "x", "abcdabc", "xdabc"}, + {"[a-c]+$", "x", "abcdabc", "abcdx"}, + {"^[a-c]+$", "x", "abcdabc", "abcdabc"}, + {"^[a-c]+", "x", "abc", "x"}, + {"[a-c]+$", "x", "abc", "x"}, + {"^[a-c]+$", "x", "abc", "x"}, + {"^[a-c]+", "x", "dabce", "dabce"}, + {"[a-c]+$", "x", "dabce", "dabce"}, + {"^[a-c]+$", "x", "dabce", "dabce"}, + {"^[a-c]+", "x", "", ""}, + {"[a-c]+$", "x", "", ""}, + {"^[a-c]+$", "x", "", ""}, + + // Other cases. + {"abc", "def", "abcdefg", "defdefg"}, + {"bc", "BC", "abcbcdcdedef", "aBCBCdcdedef"}, + {"abc", "", "abcdabc", "d"}, + {"x", "xXx", "xxxXxxx", "xXxxXxxXxXxXxxXxxXx"}, + {"abc", "d", "", ""}, + {"abc", "d", "abc", "d"}, + {".+", "x", "abc", "x"}, + {"[a-c]*", "x", "def", "xdxexfx"}, + {"[a-c]+", "x", "abcbcdcdedef", "xdxdedef"}, + {"[a-c]*", "x", "abcbcdcdedef", "xdxdxexdxexfx"}, +} + +type ReplaceFuncTest struct { + pattern string + replacement func(string) string + input, output string +} + +var replaceFuncTests = []ReplaceFuncTest{ + {"[a-c]", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxayxbyxcydef"}, + {"[a-c]+", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxabcydef"}, + {"[a-c]*", func(s string) string { return "x" + s + "y" }, "defabcdef", "xydxyexyfxabcydxyexyfxy"}, +} + +func TestReplaceAll(t *testing.T) { + for _, tc := range replaceTests { + re, err := Compile(tc.pattern) + if err != nil { + t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err) + continue + } + actual := re.ReplaceAllString(tc.input, tc.replacement) + if actual != tc.output { + t.Errorf("%q.Replace(%q,%q) = %q; want %q", + tc.pattern, tc.input, tc.replacement, actual, tc.output) + } + // now try bytes + actual = string(re.ReplaceAll([]byte(tc.input), []byte(tc.replacement))) + if actual != tc.output { + t.Errorf("%q.Replace(%q,%q) = %q; want %q", + tc.pattern, tc.input, tc.replacement, actual, tc.output) + } + } +} + +func TestReplaceAllFunc(t *testing.T) { + for _, tc := range replaceFuncTests { + re, err := Compile(tc.pattern) + if err != nil { + t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err) + continue + } + actual := re.ReplaceAllStringFunc(tc.input, tc.replacement) + if actual != tc.output { + t.Errorf("%q.ReplaceFunc(%q,%q) = %q; want %q", + tc.pattern, tc.input, tc.replacement, actual, tc.output) + } + // now try bytes + actual = string(re.ReplaceAllFunc([]byte(tc.input), func(s []byte) []byte { return []byte(tc.replacement(string(s))) })) + if actual != tc.output { + t.Errorf("%q.ReplaceFunc(%q,%q) = %q; want %q", + tc.pattern, tc.input, tc.replacement, actual, tc.output) + } + } +} + +type MetaTest struct { + pattern, output, literal string + isLiteral bool +} + +var metaTests = []MetaTest{ + {``, ``, ``, true}, + {`foo`, `foo`, `foo`, true}, + {`foo\.\$`, `foo\\\.\\\$`, `foo.$`, true}, // has meta but no operator + {`foo.\$`, `foo\.\\\$`, `foo`, false}, // has escaped operators and real operators + {`!@#$%^&*()_+-=[{]}\|,<.>/?~`, `!@#\$%\^&\*\(\)_\+-=\[\{\]\}\\\|,<\.>/\?~`, `!@#`, false}, +} + +func TestQuoteMeta(t *testing.T) { + for _, tc := range metaTests { + // Verify that 
QuoteMeta returns the expected string. + quoted := QuoteMeta(tc.pattern) + if quoted != tc.output { + t.Errorf("QuoteMeta(`%s`) = `%s`; want `%s`", + tc.pattern, quoted, tc.output) + continue + } + + // Verify that the quoted string is in fact treated as expected + // by Compile -- i.e. that it matches the original, unquoted string. + if tc.pattern != "" { + re, err := Compile(quoted) + if err != nil { + t.Errorf("Unexpected error compiling QuoteMeta(`%s`): %v", tc.pattern, err) + continue + } + src := "abc" + tc.pattern + "def" + repl := "xyz" + replaced := re.ReplaceAllString(src, repl) + expected := "abcxyzdef" + if replaced != expected { + t.Errorf("QuoteMeta(`%s`).Replace(`%s`,`%s`) = `%s`; want `%s`", + tc.pattern, src, repl, replaced, expected) + } + } + } +} + +func TestLiteralPrefix(t *testing.T) { + for _, tc := range metaTests { + // Literal method needs to scan the pattern. + re := MustCompile(tc.pattern) + str, complete := re.LiteralPrefix() + if complete != tc.isLiteral { + t.Errorf("LiteralPrefix(`%s`) = %t; want %t", tc.pattern, complete, tc.isLiteral) + } + if str != tc.literal { + t.Errorf("LiteralPrefix(`%s`) = `%s`; want `%s`", tc.pattern, str, tc.literal) + } + } +} + +type numSubexpCase struct { + input string + expected int +} + +var numSubexpCases = []numSubexpCase{ + {``, 0}, + {`.*`, 0}, + {`abba`, 0}, + {`ab(b)a`, 1}, + {`ab(.*)a`, 1}, + {`(.*)ab(.*)a`, 2}, + {`(.*)(ab)(.*)a`, 3}, + {`(.*)((a)b)(.*)a`, 4}, + {`(.*)(\(ab)(.*)a`, 3}, + {`(.*)(\(a\)b)(.*)a`, 3}, +} + +func TestNumSubexp(t *testing.T) { + for _, c := range numSubexpCases { + re := MustCompile(c.input) + n := re.NumSubexp() + if n != c.expected { + t.Errorf("NumSubexp for %q returned %d, expected %d", c.input, n, c.expected) + } + } +} + +func BenchmarkLiteral(b *testing.B) { + x := strings.Repeat("x", 50) + "y" + b.StopTimer() + re := MustCompile("y") + b.StartTimer() + for i := 0; i < b.N; i++ { + if !re.MatchString(x) { + println("no match!") + break + } + } +} + +func BenchmarkNotLiteral(b *testing.B) { + x := strings.Repeat("x", 50) + "y" + b.StopTimer() + re := MustCompile(".y") + b.StartTimer() + for i := 0; i < b.N; i++ { + if !re.MatchString(x) { + println("no match!") + break + } + } +} + +func BenchmarkMatchClass(b *testing.B) { + b.StopTimer() + x := strings.Repeat("xxxx", 20) + "w" + re := MustCompile("[abcdw]") + b.StartTimer() + for i := 0; i < b.N; i++ { + if !re.MatchString(x) { + println("no match!") + break + } + } +} + +func BenchmarkMatchClass_InRange(b *testing.B) { + b.StopTimer() + // 'b' is between 'a' and 'c', so the charclass + // range checking is no help here. + x := strings.Repeat("bbbb", 20) + "c" + re := MustCompile("[ac]") + b.StartTimer() + for i := 0; i < b.N; i++ { + if !re.MatchString(x) { + println("no match!") + break + } + } +} + +func BenchmarkReplaceAll(b *testing.B) { + x := "abcdefghijklmnopqrstuvwxyz" + b.StopTimer() + re := MustCompile("[cjrw]") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.ReplaceAllString(x, "") + } +} + +func BenchmarkAnchoredLiteralShortNonMatch(b *testing.B) { + b.StopTimer() + x := []byte("abcdefghijklmnopqrstuvwxyz") + re := MustCompile("^zbc(d|e)") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkAnchoredLiteralLongNonMatch(b *testing.B) { + b.StopTimer() + x := []byte("abcdefghijklmnopqrstuvwxyz") + for i := 0; i < 15; i++ { + x = append(x, x...) 
+ } + re := MustCompile("^zbc(d|e)") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkAnchoredShortMatch(b *testing.B) { + b.StopTimer() + x := []byte("abcdefghijklmnopqrstuvwxyz") + re := MustCompile("^.bc(d|e)") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkAnchoredLongMatch(b *testing.B) { + b.StopTimer() + x := []byte("abcdefghijklmnopqrstuvwxyz") + for i := 0; i < 15; i++ { + x = append(x, x...) + } + re := MustCompile("^.bc(d|e)") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} diff --git a/libgo/go/exp/regexp/exec.go b/libgo/go/exp/regexp/exec.go new file mode 100644 index 0000000..0670bb9 --- /dev/null +++ b/libgo/go/exp/regexp/exec.go @@ -0,0 +1,295 @@ +package regexp + +import "exp/regexp/syntax" + +// A queue is a 'sparse array' holding pending threads of execution. +// See http://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html +type queue struct { + sparse []uint32 + dense []entry +} + +// A entry is an entry on a queue. +// It holds both the instruction pc and the actual thread. +// Some queue entries are just place holders so that the machine +// knows it has considered that pc. Such entries have t == nil. +type entry struct { + pc uint32 + t *thread +} + +// A thread is the state of a single path through the machine: +// an instruction and a corresponding capture array. +// See http://swtch.com/~rsc/regexp/regexp2.html +type thread struct { + inst *syntax.Inst + cap []int +} + +// A machine holds all the state during an NFA simulation for p. +type machine struct { + re *Regexp // corresponding Regexp + p *syntax.Prog // compiled program + q0, q1 queue // two queues for runq, nextq + pool []*thread // pool of available threads + matched bool // whether a match was found + matchcap []int // capture information for the match +} + +// progMachine returns a new machine running the prog p. +func progMachine(p *syntax.Prog) *machine { + m := &machine{p: p} + n := len(m.p.Inst) + m.q0 = queue{make([]uint32, n), make([]entry, 0, n)} + m.q1 = queue{make([]uint32, n), make([]entry, 0, n)} + ncap := p.NumCap + if ncap < 2 { + ncap = 2 + } + m.matchcap = make([]int, ncap) + return m +} + +// alloc allocates a new thread with the given instruction. +// It uses the free pool if possible. +func (m *machine) alloc(i *syntax.Inst) *thread { + var t *thread + if n := len(m.pool); n > 0 { + t = m.pool[n-1] + m.pool = m.pool[:n-1] + } else { + t = new(thread) + t.cap = make([]int, cap(m.matchcap)) + } + t.cap = t.cap[:len(m.matchcap)] + t.inst = i + return t +} + +// free returns t to the free pool. +func (m *machine) free(t *thread) { + m.pool = append(m.pool, t) +} + +// match runs the machine over the input starting at pos. +// It reports whether a match was found. +// If so, m.matchcap holds the submatch information. +func (m *machine) match(i input, pos int) bool { + startCond := m.re.cond + if startCond == ^syntax.EmptyOp(0) { // impossible + return false + } + m.matched = false + for i := range m.matchcap { + m.matchcap[i] = -1 + } + runq, nextq := &m.q0, &m.q1 + rune, rune1 := endOfText, endOfText + width, width1 := 0, 0 + rune, width = i.step(pos) + if rune != endOfText { + rune1, width1 = i.step(pos + width) + } + // TODO: Let caller specify the initial flag setting. + // For now assume pos == 0 is beginning of text and + // pos != 0 is not even beginning of line. + // TODO: Word boundary. 
+ var flag syntax.EmptyOp + if pos == 0 { + flag = syntax.EmptyBeginText | syntax.EmptyBeginLine + } + + // Update flag using lookahead rune. + if rune1 == '\n' { + flag |= syntax.EmptyEndLine + } + if rune1 == endOfText { + flag |= syntax.EmptyEndText + } + + for { + if len(runq.dense) == 0 { + if startCond&syntax.EmptyBeginText != 0 && pos != 0 { + // Anchored match, past beginning of text. + break + } + if m.matched { + // Have match; finished exploring alternatives. + break + } + if len(m.re.prefix) > 0 && rune1 != m.re.prefixRune && i.canCheckPrefix() { + // Match requires literal prefix; fast search for it. + advance := i.index(m.re, pos) + if advance < 0 { + break + } + pos += advance + rune, width = i.step(pos) + rune1, width1 = i.step(pos + width) + } + } + if !m.matched { + if len(m.matchcap) > 0 { + m.matchcap[0] = pos + } + m.add(runq, uint32(m.p.Start), pos, m.matchcap, flag) + } + // TODO: word boundary + flag = 0 + if rune == '\n' { + flag |= syntax.EmptyBeginLine + } + if rune1 == '\n' { + flag |= syntax.EmptyEndLine + } + if rune1 == endOfText { + flag |= syntax.EmptyEndText + } + m.step(runq, nextq, pos, pos+width, rune, flag) + if width == 0 { + break + } + pos += width + rune, width = rune1, width1 + if rune != endOfText { + rune1, width1 = i.step(pos + width) + } + runq, nextq = nextq, runq + } + m.clear(nextq) + return m.matched +} + +// clear frees all threads on the thread queue. +func (m *machine) clear(q *queue) { + for _, d := range q.dense { + if d.t != nil { + m.free(d.t) + } + } + q.dense = q.dense[:0] +} + +// step executes one step of the machine, running each of the threads +// on runq and appending new threads to nextq. +// The step processes the rune c (which may be endOfText), +// which starts at position pos and ends at nextPos. +// nextCond gives the setting for the empty-width flags after c. +func (m *machine) step(runq, nextq *queue, pos, nextPos, c int, nextCond syntax.EmptyOp) { + for j := 0; j < len(runq.dense); j++ { + d := &runq.dense[j] + t := d.t + if t == nil { + continue + } + /* + * If we support leftmost-longest matching: + if longest && matched && match[0] < t.cap[0] { + m.free(t) + continue + } + */ + + i := t.inst + switch i.Op { + default: + panic("bad inst") + + case syntax.InstMatch: + if len(t.cap) > 0 { + t.cap[1] = pos + copy(m.matchcap, t.cap) + } + m.matched = true + for _, d := range runq.dense[j+1:] { + if d.t != nil { + m.free(d.t) + } + } + runq.dense = runq.dense[:0] + + case syntax.InstRune: + if i.MatchRune(c) { + m.add(nextq, i.Out, nextPos, t.cap, nextCond) + } + } + m.free(t) + } + runq.dense = runq.dense[:0] +} + +// add adds an entry to q for pc, unless the q already has such an entry. +// It also recursively adds an entry for all instructions reachable from pc by following +// empty-width conditions satisfied by cond. pos gives the current position +// in the input. 
+func (m *machine) add(q *queue, pc uint32, pos int, cap []int, cond syntax.EmptyOp) { + if pc == 0 { + return + } + if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc { + return + } + + j := len(q.dense) + q.dense = q.dense[:j+1] + d := &q.dense[j] + d.t = nil + d.pc = pc + q.sparse[pc] = uint32(j) + + i := &m.p.Inst[pc] + switch i.Op { + default: + panic("unhandled") + case syntax.InstFail: + // nothing + case syntax.InstAlt, syntax.InstAltMatch: + m.add(q, i.Out, pos, cap, cond) + m.add(q, i.Arg, pos, cap, cond) + case syntax.InstEmptyWidth: + if syntax.EmptyOp(i.Arg)&^cond == 0 { + m.add(q, i.Out, pos, cap, cond) + } + case syntax.InstNop: + m.add(q, i.Out, pos, cap, cond) + case syntax.InstCapture: + if int(i.Arg) < len(cap) { + opos := cap[i.Arg] + cap[i.Arg] = pos + m.add(q, i.Out, pos, cap, cond) + cap[i.Arg] = opos + } else { + m.add(q, i.Out, pos, cap, cond) + } + case syntax.InstMatch, syntax.InstRune: + t := m.alloc(i) + if len(t.cap) > 0 { + copy(t.cap, cap) + } + d.t = t + } +} + +// empty is a non-nil 0-element slice, +// so doExecute can avoid an allocation +// when 0 captures are requested from a successful match. +var empty = make([]int, 0) + +// doExecute finds the leftmost match in the input and returns +// the position of its subexpressions. +func (re *Regexp) doExecute(i input, pos int, ncap int) []int { + m := re.get() + m.matchcap = m.matchcap[:ncap] + if !m.match(i, pos) { + re.put(m) + return nil + } + if ncap == 0 { + re.put(m) + return empty // empty but not nil + } + cap := make([]int, ncap) + copy(cap, m.matchcap) + re.put(m) + return cap +} diff --git a/libgo/go/exp/regexp/find_test.go b/libgo/go/exp/regexp/find_test.go new file mode 100644 index 0000000..dddc348 --- /dev/null +++ b/libgo/go/exp/regexp/find_test.go @@ -0,0 +1,472 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package regexp + +import ( + "fmt" + "strings" + "testing" +) + +// For each pattern/text pair, what is the expected output of each function? +// We can derive the textual results from the indexed results, the non-submatch +// results from the submatched results, the single results from the 'all' results, +// and the byte results from the string results. Therefore the table includes +// only the FindAllStringSubmatchIndex result. 
+type FindTest struct { + pat string + text string + matches [][]int +} + +func (t FindTest) String() string { + return fmt.Sprintf("pat: %#q text: %#q", t.pat, t.text) +} + +var findTests = []FindTest{ + {``, ``, build(1, 0, 0)}, + {`^abcdefg`, "abcdefg", build(1, 0, 7)}, + {`a+`, "baaab", build(1, 1, 4)}, + {"abcd..", "abcdef", build(1, 0, 6)}, + {`a`, "a", build(1, 0, 1)}, + {`x`, "y", nil}, + {`b`, "abc", build(1, 1, 2)}, + {`.`, "a", build(1, 0, 1)}, + {`.*`, "abcdef", build(1, 0, 6)}, + {`^`, "abcde", build(1, 0, 0)}, + {`$`, "abcde", build(1, 5, 5)}, + {`^abcd$`, "abcd", build(1, 0, 4)}, + {`^bcd'`, "abcdef", nil}, + {`^abcd$`, "abcde", nil}, + {`a+`, "baaab", build(1, 1, 4)}, + {`a*`, "baaab", build(3, 0, 0, 1, 4, 5, 5)}, + {`[a-z]+`, "abcd", build(1, 0, 4)}, + {`[^a-z]+`, "ab1234cd", build(1, 2, 6)}, + {`[a\-\]z]+`, "az]-bcz", build(2, 0, 4, 6, 7)}, + {`[^\n]+`, "abcd\n", build(1, 0, 4)}, + {`[日本語]+`, "日本語日本語", build(1, 0, 18)}, + {`日本語+`, "日本語", build(1, 0, 9)}, + {`日本語+`, "日本語語語語", build(1, 0, 18)}, + {`()`, "", build(1, 0, 0, 0, 0)}, + {`(a)`, "a", build(1, 0, 1, 0, 1)}, + {`(.)(.)`, "日a", build(1, 0, 4, 0, 3, 3, 4)}, + {`(.*)`, "", build(1, 0, 0, 0, 0)}, + {`(.*)`, "abcd", build(1, 0, 4, 0, 4)}, + {`(..)(..)`, "abcd", build(1, 0, 4, 0, 2, 2, 4)}, + {`(([^xyz]*)(d))`, "abcd", build(1, 0, 4, 0, 4, 0, 3, 3, 4)}, + {`((a|b|c)*(d))`, "abcd", build(1, 0, 4, 0, 4, 2, 3, 3, 4)}, + {`(((a|b|c)*)(d))`, "abcd", build(1, 0, 4, 0, 4, 0, 3, 2, 3, 3, 4)}, + {`\a\f\n\r\t\v`, "\a\f\n\r\t\v", build(1, 0, 6)}, + {`[\a\f\n\r\t\v]+`, "\a\f\n\r\t\v", build(1, 0, 6)}, + + {`a*(|(b))c*`, "aacc", build(1, 0, 4, 2, 2, -1, -1)}, + {`(.*).*`, "ab", build(1, 0, 2, 0, 2)}, + {`[.]`, ".", build(1, 0, 1)}, + {`/$`, "/abc/", build(1, 4, 5)}, + {`/$`, "/abc", nil}, + + // multiple matches + {`.`, "abc", build(3, 0, 1, 1, 2, 2, 3)}, + {`(.)`, "abc", build(3, 0, 1, 0, 1, 1, 2, 1, 2, 2, 3, 2, 3)}, + {`.(.)`, "abcd", build(2, 0, 2, 1, 2, 2, 4, 3, 4)}, + {`ab*`, "abbaab", build(3, 0, 3, 3, 4, 4, 6)}, + {`a(b*)`, "abbaab", build(3, 0, 3, 1, 3, 3, 4, 4, 4, 4, 6, 5, 6)}, + + // fixed bugs + {`ab$`, "cab", build(1, 1, 3)}, + {`axxb$`, "axxcb", nil}, + {`data`, "daXY data", build(1, 5, 9)}, + {`da(.)a$`, "daXY data", build(1, 5, 9, 7, 8)}, + {`zx+`, "zzx", build(1, 1, 3)}, + + // can backslash-escape any punctuation + {`\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~`, + `!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)}, + {`[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~]+`, + `!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)}, + {"\\`", "`", build(1, 0, 1)}, + {"[\\`]+", "`", build(1, 0, 1)}, + + // long set of matches (longer than startSize) + { + ".", + "qwertyuiopasdfghjklzxcvbnm1234567890", + build(36, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, + 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, + 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36), + }, +} + +// build is a helper to construct a [][]int by extracting n sequences from x. +// This represents n matches with len(x)/n submatches each. +func build(n int, x ...int) [][]int { + ret := make([][]int, n) + runLength := len(x) / n + j := 0 + for i := range ret { + ret[i] = make([]int, runLength) + copy(ret[i], x[j:]) + j += runLength + if j > len(x) { + panic("invalid build entry") + } + } + return ret +} + +// First the simple cases. 
+ +func TestFind(t *testing.T) { + for _, test := range findTests { + re := MustCompile(test.pat) + if re.String() != test.pat { + t.Errorf("String() = `%s`; should be `%s`", re.String(), test.pat) + } + result := re.Find([]byte(test.text)) + switch { + case len(test.matches) == 0 && len(result) == 0: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + expect := test.text[test.matches[0][0]:test.matches[0][1]] + if expect != string(result) { + t.Errorf("expected %q got %q: %s", expect, result, test) + } + } + } +} + +func TestFindString(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindString(test.text) + switch { + case len(test.matches) == 0 && len(result) == 0: + // ok + case test.matches == nil && result != "": + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == "": + // Tricky because an empty result has two meanings: no match or empty match. + if test.matches[0][0] != test.matches[0][1] { + t.Errorf("expected match; got none: %s", test) + } + case test.matches != nil && result != "": + expect := test.text[test.matches[0][0]:test.matches[0][1]] + if expect != result { + t.Errorf("expected %q got %q: %s", expect, result, test) + } + } + } +} + +func testFindIndex(test *FindTest, result []int, t *testing.T) { + switch { + case len(test.matches) == 0 && len(result) == 0: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + expect := test.matches[0] + if expect[0] != result[0] || expect[1] != result[1] { + t.Errorf("expected %v got %v: %s", expect, result, test) + } + } +} + +func TestFindIndex(t *testing.T) { + for _, test := range findTests { + testFindIndex(&test, MustCompile(test.pat).FindIndex([]byte(test.text)), t) + } +} + +func TestFindStringIndex(t *testing.T) { + for _, test := range findTests { + testFindIndex(&test, MustCompile(test.pat).FindStringIndex(test.text), t) + } +} + +func TestFindReaderIndex(t *testing.T) { + for _, test := range findTests { + testFindIndex(&test, MustCompile(test.pat).FindReaderIndex(strings.NewReader(test.text)), t) + } +} + +// Now come the simple All cases. 
+ +func TestFindAll(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindAll([]byte(test.text), -1) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Fatalf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + if len(test.matches) != len(result) { + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + continue + } + for k, e := range test.matches { + expect := test.text[e[0]:e[1]] + if expect != string(result[k]) { + t.Errorf("match %d: expected %q got %q: %s", k, expect, result[k], test) + } + } + } + } +} + +func TestFindAllString(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindAllString(test.text, -1) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + if len(test.matches) != len(result) { + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + continue + } + for k, e := range test.matches { + expect := test.text[e[0]:e[1]] + if expect != result[k] { + t.Errorf("expected %q got %q: %s", expect, result, test) + } + } + } + } +} + +func testFindAllIndex(test *FindTest, result [][]int, t *testing.T) { + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + if len(test.matches) != len(result) { + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + return + } + for k, e := range test.matches { + if e[0] != result[k][0] || e[1] != result[k][1] { + t.Errorf("match %d: expected %v got %v: %s", k, e, result[k], test) + } + } + } +} + +func TestFindAllIndex(t *testing.T) { + for _, test := range findTests { + testFindAllIndex(&test, MustCompile(test.pat).FindAllIndex([]byte(test.text), -1), t) + } +} + +func TestFindAllStringIndex(t *testing.T) { + for _, test := range findTests { + testFindAllIndex(&test, MustCompile(test.pat).FindAllStringIndex(test.text, -1), t) + } +} + +// Now come the Submatch cases. 
+ +func testSubmatchBytes(test *FindTest, n int, submatches []int, result [][]byte, t *testing.T) { + if len(submatches) != len(result)*2 { + t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test) + return + } + for k := 0; k < len(submatches); k += 2 { + if submatches[k] == -1 { + if result[k/2] != nil { + t.Errorf("match %d: expected nil got %q: %s", n, result, test) + } + continue + } + expect := test.text[submatches[k]:submatches[k+1]] + if expect != string(result[k/2]) { + t.Errorf("match %d: expected %q got %q: %s", n, expect, result, test) + return + } + } +} + +func TestFindSubmatch(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindSubmatch([]byte(test.text)) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + testSubmatchBytes(&test, 0, test.matches[0], result, t) + } + } +} + +func testSubmatchString(test *FindTest, n int, submatches []int, result []string, t *testing.T) { + if len(submatches) != len(result)*2 { + t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test) + return + } + for k := 0; k < len(submatches); k += 2 { + if submatches[k] == -1 { + if result[k/2] != "" { + t.Errorf("match %d: expected nil got %q: %s", n, result, test) + } + continue + } + expect := test.text[submatches[k]:submatches[k+1]] + if expect != result[k/2] { + t.Errorf("match %d: expected %q got %q: %s", n, expect, result, test) + return + } + } +} + +func TestFindStringSubmatch(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindStringSubmatch(test.text) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + testSubmatchString(&test, 0, test.matches[0], result, t) + } + } +} + +func testSubmatchIndices(test *FindTest, n int, expect, result []int, t *testing.T) { + if len(expect) != len(result) { + t.Errorf("match %d: expected %d matches; got %d: %s", n, len(expect)/2, len(result)/2, test) + return + } + for k, e := range expect { + if e != result[k] { + t.Errorf("match %d: submatch error: expected %v got %v: %s", n, expect, result, test) + } + } +} + +func testFindSubmatchIndex(test *FindTest, result []int, t *testing.T) { + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + testSubmatchIndices(test, 0, test.matches[0], result, t) + } +} + +func TestFindSubmatchIndex(t *testing.T) { + for _, test := range findTests { + testFindSubmatchIndex(&test, MustCompile(test.pat).FindSubmatchIndex([]byte(test.text)), t) + } +} + +func TestFindStringSubmatchIndex(t *testing.T) { + for _, test := range findTests { + testFindSubmatchIndex(&test, MustCompile(test.pat).FindStringSubmatchIndex(test.text), t) + } +} + +func TestFindReaderSubmatchIndex(t *testing.T) { + for _, test := range findTests { + 
testFindSubmatchIndex(&test, MustCompile(test.pat).FindReaderSubmatchIndex(strings.NewReader(test.text)), t) + } +} + +// Now come the monster AllSubmatch cases. + +func TestFindAllSubmatch(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindAllSubmatch([]byte(test.text), -1) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case len(test.matches) != len(result): + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + case test.matches != nil && result != nil: + for k, match := range test.matches { + testSubmatchBytes(&test, k, match, result[k], t) + } + } + } +} + +func TestFindAllStringSubmatch(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindAllStringSubmatch(test.text, -1) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case len(test.matches) != len(result): + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + case test.matches != nil && result != nil: + for k, match := range test.matches { + testSubmatchString(&test, k, match, result[k], t) + } + } + } +} + +func testFindAllSubmatchIndex(test *FindTest, result [][]int, t *testing.T) { + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case len(test.matches) != len(result): + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + case test.matches != nil && result != nil: + for k, match := range test.matches { + testSubmatchIndices(test, k, match, result[k], t) + } + } +} + +func TestFindAllSubmatchIndex(t *testing.T) { + for _, test := range findTests { + testFindAllSubmatchIndex(&test, MustCompile(test.pat).FindAllSubmatchIndex([]byte(test.text), -1), t) + } +} + +func TestFindAllStringSubmatchIndex(t *testing.T) { + for _, test := range findTests { + testFindAllSubmatchIndex(&test, MustCompile(test.pat).FindAllStringSubmatchIndex(test.text, -1), t) + } +} diff --git a/libgo/go/exp/regexp/regexp.go b/libgo/go/exp/regexp/regexp.go new file mode 100644 index 0000000..1b75900 --- /dev/null +++ b/libgo/go/exp/regexp/regexp.go @@ -0,0 +1,795 @@ +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package regexp implements a simple regular expression library. +// +// The syntax of the regular expressions accepted is the same +// general syntax used by Perl, Python, and other languages. +// More precisely, it is the syntax accepted by RE2 and described at +// http://code.google.com/p/re2/wiki/Syntax, except for \C. +// +// All characters are UTF-8-encoded code points. +// +// There are 16 methods of Regexp that match a regular expression and identify +// the matched text. Their names are matched by this regular expression: +// +// Find(All)?(String)?(Submatch)?(Index)? +// +// If 'All' is present, the routine matches successive non-overlapping +// matches of the entire expression. 
Empty matches abutting a preceding +// match are ignored. The return value is a slice containing the successive +// return values of the corresponding non-'All' routine. These routines take +// an extra integer argument, n; if n >= 0, the function returns at most n +// matches/submatches. +// +// If 'String' is present, the argument is a string; otherwise it is a slice +// of bytes; return values are adjusted as appropriate. +// +// If 'Submatch' is present, the return value is a slice identifying the +// successive submatches of the expression. Submatches are matches of +// parenthesized subexpressions within the regular expression, numbered from +// left to right in order of opening parenthesis. Submatch 0 is the match of +// the entire expression, submatch 1 the match of the first parenthesized +// subexpression, and so on. +// +// If 'Index' is present, matches and submatches are identified by byte index +// pairs within the input string: result[2*n:2*n+1] identifies the indexes of +// the nth submatch. The pair for n==0 identifies the match of the entire +// expression. If 'Index' is not present, the match is identified by the +// text of the match/submatch. If an index is negative, it means that +// subexpression did not match any string in the input. +// +// There is also a subset of the methods that can be applied to text read +// from a RuneReader: +// +// MatchReader, FindReaderIndex, FindReaderSubmatchIndex +// +// This set may grow. Note that regular expression matches may need to +// examine text beyond the text returned by a match, so the methods that +// match text from a RuneReader may read arbitrarily far into the input +// before returning. +// +// (There are a few other methods that do not match this pattern.) +// +package regexp + +import ( + "bytes" + "exp/regexp/syntax" + "io" + "os" + "strings" + "sync" + "utf8" +) + +var debug = false + +// Error is the local type for a parsing error. +type Error string + +func (e Error) String() string { + return string(e) +} + +// Regexp is the representation of a compiled regular expression. +// The public interface is entirely through methods. +// A Regexp is safe for concurrent use by multiple goroutines. +type Regexp struct { + // read-only after Compile + expr string // as passed to Compile + prog *syntax.Prog // compiled program + prefix string // required prefix in unanchored matches + prefixBytes []byte // prefix, as a []byte + prefixComplete bool // prefix is the entire regexp + prefixRune int // first rune in prefix + cond syntax.EmptyOp // empty-width conditions required at start of match + + // cache of machines for running regexp + mu sync.Mutex + machine []*machine +} + +// String returns the source text used to compile the regular expression. +func (re *Regexp) String() string { + return re.expr +} + +// Compile parses a regular expression and returns, if successful, a Regexp +// object that can be used to match against text. +func Compile(expr string) (*Regexp, os.Error) { + re, err := syntax.Parse(expr, syntax.Perl) + if err != nil { + return nil, err + } + prog, err := syntax.Compile(re) + if err != nil { + return nil, err + } + regexp := &Regexp{ + expr: expr, + prog: prog, + } + regexp.prefix, regexp.prefixComplete = prog.Prefix() + if regexp.prefix != "" { + // TODO(rsc): Remove this allocation by adding + // IndexString to package bytes. 
+ regexp.prefixBytes = []byte(regexp.prefix) + regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix) + } + regexp.cond = prog.StartCond() + return regexp, nil +} + +// get returns a machine to use for matching re. +// It uses the re's machine cache if possible, to avoid +// unnecessary allocation. +func (re *Regexp) get() *machine { + re.mu.Lock() + if n := len(re.machine); n > 0 { + z := re.machine[n-1] + re.machine = re.machine[:n-1] + re.mu.Unlock() + return z + } + re.mu.Unlock() + z := progMachine(re.prog) + z.re = re + return z +} + +// put returns a machine to the re's machine cache. +// There is no attempt to limit the size of the cache, so it will +// grow to the maximum number of simultaneous matches +// run using re. (The cache empties when re gets garbage collected.) +func (re *Regexp) put(z *machine) { + re.mu.Lock() + re.machine = append(re.machine, z) + re.mu.Unlock() +} + +// MustCompile is like Compile but panics if the expression cannot be parsed. +// It simplifies safe initialization of global variables holding compiled regular +// expressions. +func MustCompile(str string) *Regexp { + regexp, error := Compile(str) + if error != nil { + panic(`regexp: compiling "` + str + `": ` + error.String()) + } + return regexp +} + +// NumSubexp returns the number of parenthesized subexpressions in this Regexp. +func (re *Regexp) NumSubexp() int { + // NumCap/2 because captures count ( and ) separately. + // -1 because NumCap counts $0 but NumSubexp does not. + return re.prog.NumCap/2 - 1 +} + +const endOfText = -1 + +// input abstracts different representations of the input text. It provides +// one-character lookahead. +type input interface { + step(pos int) (rune int, width int) // advance one rune + canCheckPrefix() bool // can we look ahead without losing info? + hasPrefix(re *Regexp) bool + index(re *Regexp, pos int) int +} + +// inputString scans a string. +type inputString struct { + str string +} + +func newInputString(str string) *inputString { + return &inputString{str: str} +} + +func (i *inputString) step(pos int) (int, int) { + if pos < len(i.str) { + return utf8.DecodeRuneInString(i.str[pos:len(i.str)]) + } + return endOfText, 0 +} + +func (i *inputString) canCheckPrefix() bool { + return true +} + +func (i *inputString) hasPrefix(re *Regexp) bool { + return strings.HasPrefix(i.str, re.prefix) +} + +func (i *inputString) index(re *Regexp, pos int) int { + return strings.Index(i.str[pos:], re.prefix) +} + +// inputBytes scans a byte slice. +type inputBytes struct { + str []byte +} + +func newInputBytes(str []byte) *inputBytes { + return &inputBytes{str: str} +} + +func (i *inputBytes) step(pos int) (int, int) { + if pos < len(i.str) { + return utf8.DecodeRune(i.str[pos:len(i.str)]) + } + return endOfText, 0 +} + +func (i *inputBytes) canCheckPrefix() bool { + return true +} + +func (i *inputBytes) hasPrefix(re *Regexp) bool { + return bytes.HasPrefix(i.str, re.prefixBytes) +} + +func (i *inputBytes) index(re *Regexp, pos int) int { + return bytes.Index(i.str[pos:], re.prefixBytes) +} + +// inputReader scans a RuneReader. 
+type inputReader struct { + r io.RuneReader + atEOT bool + pos int +} + +func newInputReader(r io.RuneReader) *inputReader { + return &inputReader{r: r} +} + +func (i *inputReader) step(pos int) (int, int) { + if !i.atEOT && pos != i.pos { + return endOfText, 0 + + } + r, w, err := i.r.ReadRune() + if err != nil { + i.atEOT = true + return endOfText, 0 + } + i.pos += w + return r, w +} + +func (i *inputReader) canCheckPrefix() bool { + return false +} + +func (i *inputReader) hasPrefix(re *Regexp) bool { + return false +} + +func (i *inputReader) index(re *Regexp, pos int) int { + return -1 +} + +// LiteralPrefix returns a literal string that must begin any match +// of the regular expression re. It returns the boolean true if the +// literal string comprises the entire regular expression. +func (re *Regexp) LiteralPrefix() (prefix string, complete bool) { + return re.prefix, re.prefixComplete +} + +// MatchReader returns whether the Regexp matches the text read by the +// RuneReader. The return value is a boolean: true for match, false for no +// match. +func (re *Regexp) MatchReader(r io.RuneReader) bool { + return re.doExecute(newInputReader(r), 0, 0) != nil +} + +// MatchString returns whether the Regexp matches the string s. +// The return value is a boolean: true for match, false for no match. +func (re *Regexp) MatchString(s string) bool { + return re.doExecute(newInputString(s), 0, 0) != nil +} + +// Match returns whether the Regexp matches the byte slice b. +// The return value is a boolean: true for match, false for no match. +func (re *Regexp) Match(b []byte) bool { + return re.doExecute(newInputBytes(b), 0, 0) != nil +} + +// MatchReader checks whether a textual regular expression matches the text +// read by the RuneReader. More complicated queries need to use Compile and +// the full Regexp interface. +func MatchReader(pattern string, r io.RuneReader) (matched bool, error os.Error) { + re, err := Compile(pattern) + if err != nil { + return false, err + } + return re.MatchReader(r), nil +} + +// MatchString checks whether a textual regular expression +// matches a string. More complicated queries need +// to use Compile and the full Regexp interface. +func MatchString(pattern string, s string) (matched bool, error os.Error) { + re, err := Compile(pattern) + if err != nil { + return false, err + } + return re.MatchString(s), nil +} + +// Match checks whether a textual regular expression +// matches a byte slice. More complicated queries need +// to use Compile and the full Regexp interface. +func Match(pattern string, b []byte) (matched bool, error os.Error) { + re, err := Compile(pattern) + if err != nil { + return false, err + } + return re.Match(b), nil +} + +// ReplaceAllString returns a copy of src in which all matches for the Regexp +// have been replaced by repl. No support is provided for expressions +// (e.g. \1 or $1) in the replacement string. +func (re *Regexp) ReplaceAllString(src, repl string) string { + return re.ReplaceAllStringFunc(src, func(string) string { return repl }) +} + +// ReplaceAllStringFunc returns a copy of src in which all matches for the +// Regexp have been replaced by the return value of of function repl (whose +// first argument is the matched string). No support is provided for +// expressions (e.g. \1 or $1) in the replacement string. 
+func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string { + lastMatchEnd := 0 // end position of the most recent match + searchPos := 0 // position where we next look for a match + buf := new(bytes.Buffer) + for searchPos <= len(src) { + a := re.doExecute(newInputString(src), searchPos, 2) + if len(a) == 0 { + break // no more matches + } + + // Copy the unmatched characters before this match. + io.WriteString(buf, src[lastMatchEnd:a[0]]) + + // Now insert a copy of the replacement string, but not for a + // match of the empty string immediately after another match. + // (Otherwise, we get double replacement for patterns that + // match both empty and nonempty strings.) + if a[1] > lastMatchEnd || a[0] == 0 { + io.WriteString(buf, repl(src[a[0]:a[1]])) + } + lastMatchEnd = a[1] + + // Advance past this match; always advance at least one character. + _, width := utf8.DecodeRuneInString(src[searchPos:]) + if searchPos+width > a[1] { + searchPos += width + } else if searchPos+1 > a[1] { + // This clause is only needed at the end of the input + // string. In that case, DecodeRuneInString returns width=0. + searchPos++ + } else { + searchPos = a[1] + } + } + + // Copy the unmatched characters after the last match. + io.WriteString(buf, src[lastMatchEnd:]) + + return buf.String() +} + +// ReplaceAll returns a copy of src in which all matches for the Regexp +// have been replaced by repl. No support is provided for expressions +// (e.g. \1 or $1) in the replacement text. +func (re *Regexp) ReplaceAll(src, repl []byte) []byte { + return re.ReplaceAllFunc(src, func([]byte) []byte { return repl }) +} + +// ReplaceAllFunc returns a copy of src in which all matches for the +// Regexp have been replaced by the return value of of function repl (whose +// first argument is the matched []byte). No support is provided for +// expressions (e.g. \1 or $1) in the replacement string. +func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { + lastMatchEnd := 0 // end position of the most recent match + searchPos := 0 // position where we next look for a match + buf := new(bytes.Buffer) + for searchPos <= len(src) { + a := re.doExecute(newInputBytes(src), searchPos, 2) + if len(a) == 0 { + break // no more matches + } + + // Copy the unmatched characters before this match. + buf.Write(src[lastMatchEnd:a[0]]) + + // Now insert a copy of the replacement string, but not for a + // match of the empty string immediately after another match. + // (Otherwise, we get double replacement for patterns that + // match both empty and nonempty strings.) + if a[1] > lastMatchEnd || a[0] == 0 { + buf.Write(repl(src[a[0]:a[1]])) + } + lastMatchEnd = a[1] + + // Advance past this match; always advance at least one character. + _, width := utf8.DecodeRune(src[searchPos:]) + if searchPos+width > a[1] { + searchPos += width + } else if searchPos+1 > a[1] { + // This clause is only needed at the end of the input + // string. In that case, DecodeRuneInString returns width=0. + searchPos++ + } else { + searchPos = a[1] + } + } + + // Copy the unmatched characters after the last match. + buf.Write(src[lastMatchEnd:]) + + return buf.Bytes() +} + +var specialBytes = []byte(`\.+*?()|[]{}^$`) + +func special(b byte) bool { + return bytes.IndexByte(specialBytes, b) >= 0 +} + +// QuoteMeta returns a string that quotes all regular expression metacharacters +// inside the argument text; the returned string is a regular expression matching +// the literal text. 
For example, QuoteMeta(`[foo]`) returns `\[foo\]`. +func QuoteMeta(s string) string { + b := make([]byte, 2*len(s)) + + // A byte loop is correct because all metacharacters are ASCII. + j := 0 + for i := 0; i < len(s); i++ { + if special(s[i]) { + b[j] = '\\' + j++ + } + b[j] = s[i] + j++ + } + return string(b[0:j]) +} + +// Find matches in slice b if b is non-nil, otherwise find matches in string s. +func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) { + var end int + if b == nil { + end = len(s) + } else { + end = len(b) + } + + for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; { + var in input + if b == nil { + in = newInputString(s) + } else { + in = newInputBytes(b) + } + matches := re.doExecute(in, pos, re.prog.NumCap) + if len(matches) == 0 { + break + } + + accept := true + if matches[1] == pos { + // We've found an empty match. + if matches[0] == prevMatchEnd { + // We don't allow an empty match right + // after a previous match, so ignore it. + accept = false + } + var width int + // TODO: use step() + if b == nil { + _, width = utf8.DecodeRuneInString(s[pos:end]) + } else { + _, width = utf8.DecodeRune(b[pos:end]) + } + if width > 0 { + pos += width + } else { + pos = end + 1 + } + } else { + pos = matches[1] + } + prevMatchEnd = matches[1] + + if accept { + deliver(matches) + i++ + } + } +} + +// Find returns a slice holding the text of the leftmost match in b of the regular expression. +// A return value of nil indicates no match. +func (re *Regexp) Find(b []byte) []byte { + a := re.doExecute(newInputBytes(b), 0, 2) + if a == nil { + return nil + } + return b[a[0]:a[1]] +} + +// FindIndex returns a two-element slice of integers defining the location of +// the leftmost match in b of the regular expression. The match itself is at +// b[loc[0]:loc[1]]. +// A return value of nil indicates no match. +func (re *Regexp) FindIndex(b []byte) (loc []int) { + a := re.doExecute(newInputBytes(b), 0, 2) + if a == nil { + return nil + } + return a[0:2] +} + +// FindString returns a string holding the text of the leftmost match in s of the regular +// expression. If there is no match, the return value is an empty string, +// but it will also be empty if the regular expression successfully matches +// an empty string. Use FindStringIndex or FindStringSubmatch if it is +// necessary to distinguish these cases. +func (re *Regexp) FindString(s string) string { + a := re.doExecute(newInputString(s), 0, 2) + if a == nil { + return "" + } + return s[a[0]:a[1]] +} + +// FindStringIndex returns a two-element slice of integers defining the +// location of the leftmost match in s of the regular expression. The match +// itself is at s[loc[0]:loc[1]]. +// A return value of nil indicates no match. +func (re *Regexp) FindStringIndex(s string) []int { + a := re.doExecute(newInputString(s), 0, 2) + if a == nil { + return nil + } + return a[0:2] +} + +// FindReaderIndex returns a two-element slice of integers defining the +// location of the leftmost match of the regular expression in text read from +// the RuneReader. The match itself is at s[loc[0]:loc[1]]. A return +// value of nil indicates no match. 
+func (re *Regexp) FindReaderIndex(r io.RuneReader) []int { + a := re.doExecute(newInputReader(r), 0, 2) + if a == nil { + return nil + } + return a[0:2] +} + +// FindSubmatch returns a slice of slices holding the text of the leftmost +// match of the regular expression in b and the matches, if any, of its +// subexpressions, as defined by the 'Submatch' descriptions in the package +// comment. +// A return value of nil indicates no match. +func (re *Regexp) FindSubmatch(b []byte) [][]byte { + a := re.doExecute(newInputBytes(b), 0, re.prog.NumCap) + if a == nil { + return nil + } + ret := make([][]byte, len(a)/2) + for i := range ret { + if a[2*i] >= 0 { + ret[i] = b[a[2*i]:a[2*i+1]] + } + } + return ret +} + +// FindSubmatchIndex returns a slice holding the index pairs identifying the +// leftmost match of the regular expression in b and the matches, if any, of +// its subexpressions, as defined by the 'Submatch' and 'Index' descriptions +// in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindSubmatchIndex(b []byte) []int { + return re.doExecute(newInputBytes(b), 0, re.prog.NumCap) +} + +// FindStringSubmatch returns a slice of strings holding the text of the +// leftmost match of the regular expression in s and the matches, if any, of +// its subexpressions, as defined by the 'Submatch' description in the +// package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindStringSubmatch(s string) []string { + a := re.doExecute(newInputString(s), 0, re.prog.NumCap) + if a == nil { + return nil + } + ret := make([]string, len(a)/2) + for i := range ret { + if a[2*i] >= 0 { + ret[i] = s[a[2*i]:a[2*i+1]] + } + } + return ret +} + +// FindStringSubmatchIndex returns a slice holding the index pairs +// identifying the leftmost match of the regular expression in s and the +// matches, if any, of its subexpressions, as defined by the 'Submatch' and +// 'Index' descriptions in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindStringSubmatchIndex(s string) []int { + return re.doExecute(newInputString(s), 0, re.prog.NumCap) +} + +// FindReaderSubmatchIndex returns a slice holding the index pairs +// identifying the leftmost match of the regular expression of text read by +// the RuneReader, and the matches, if any, of its subexpressions, as defined +// by the 'Submatch' and 'Index' descriptions in the package comment. A +// return value of nil indicates no match. +func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int { + return re.doExecute(newInputReader(r), 0, re.prog.NumCap) +} + +const startSize = 10 // The size at which to start a slice in the 'All' routines. + +// FindAll is the 'All' version of Find; it returns a slice of all successive +// matches of the expression, as defined by the 'All' description in the +// package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAll(b []byte, n int) [][]byte { + if n < 0 { + n = len(b) + 1 + } + result := make([][]byte, 0, startSize) + re.allMatches("", b, n, func(match []int) { + result = append(result, b[match[0]:match[1]]) + }) + if len(result) == 0 { + return nil + } + return result +} + +// FindAllIndex is the 'All' version of FindIndex; it returns a slice of all +// successive matches of the expression, as defined by the 'All' description +// in the package comment. +// A return value of nil indicates no match. 
+func (re *Regexp) FindAllIndex(b []byte, n int) [][]int { + if n < 0 { + n = len(b) + 1 + } + result := make([][]int, 0, startSize) + re.allMatches("", b, n, func(match []int) { + result = append(result, match[0:2]) + }) + if len(result) == 0 { + return nil + } + return result +} + +// FindAllString is the 'All' version of FindString; it returns a slice of all +// successive matches of the expression, as defined by the 'All' description +// in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllString(s string, n int) []string { + if n < 0 { + n = len(s) + 1 + } + result := make([]string, 0, startSize) + re.allMatches(s, nil, n, func(match []int) { + result = append(result, s[match[0]:match[1]]) + }) + if len(result) == 0 { + return nil + } + return result +} + +// FindAllStringIndex is the 'All' version of FindStringIndex; it returns a +// slice of all successive matches of the expression, as defined by the 'All' +// description in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllStringIndex(s string, n int) [][]int { + if n < 0 { + n = len(s) + 1 + } + result := make([][]int, 0, startSize) + re.allMatches(s, nil, n, func(match []int) { + result = append(result, match[0:2]) + }) + if len(result) == 0 { + return nil + } + return result +} + +// FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice +// of all successive matches of the expression, as defined by the 'All' +// description in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte { + if n < 0 { + n = len(b) + 1 + } + result := make([][][]byte, 0, startSize) + re.allMatches("", b, n, func(match []int) { + slice := make([][]byte, len(match)/2) + for j := range slice { + if match[2*j] >= 0 { + slice[j] = b[match[2*j]:match[2*j+1]] + } + } + result = append(result, slice) + }) + if len(result) == 0 { + return nil + } + return result +} + +// FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns +// a slice of all successive matches of the expression, as defined by the +// 'All' description in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int { + if n < 0 { + n = len(b) + 1 + } + result := make([][]int, 0, startSize) + re.allMatches("", b, n, func(match []int) { + result = append(result, match) + }) + if len(result) == 0 { + return nil + } + return result +} + +// FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it +// returns a slice of all successive matches of the expression, as defined by +// the 'All' description in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string { + if n < 0 { + n = len(s) + 1 + } + result := make([][]string, 0, startSize) + re.allMatches(s, nil, n, func(match []int) { + slice := make([]string, len(match)/2) + for j := range slice { + if match[2*j] >= 0 { + slice[j] = s[match[2*j]:match[2*j+1]] + } + } + result = append(result, slice) + }) + if len(result) == 0 { + return nil + } + return result +} + +// FindAllStringSubmatchIndex is the 'All' version of +// FindStringSubmatchIndex; it returns a slice of all successive matches of +// the expression, as defined by the 'All' description in the package +// comment. +// A return value of nil indicates no match. 
+func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int { + if n < 0 { + n = len(s) + 1 + } + result := make([][]int, 0, startSize) + re.allMatches(s, nil, n, func(match []int) { + result = append(result, match) + }) + if len(result) == 0 { + return nil + } + return result +} diff --git a/libgo/go/exp/regexp/syntax/compile.go b/libgo/go/exp/regexp/syntax/compile.go new file mode 100644 index 0000000..5ea2425 --- /dev/null +++ b/libgo/go/exp/regexp/syntax/compile.go @@ -0,0 +1,269 @@ +package syntax + +import ( + "os" + "unicode" +) + +// A patchList is a list of instruction pointers that need to be filled in (patched). +// Because the pointers haven't been filled in yet, we can reuse their storage +// to hold the list. It's kind of sleazy, but works well in practice. +// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration. +// +// These aren't really pointers: they're integers, so we can reinterpret them +// this way without using package unsafe. A value l denotes +// p.inst[l>>1].Out (l&1==0) or .Arg (l&1==1). +// l == 0 denotes the empty list, okay because we start every program +// with a fail instruction, so we'll never want to point at its output link. +type patchList uint32 + +func (l patchList) next(p *Prog) patchList { + i := &p.Inst[l>>1] + if l&1 == 0 { + return patchList(i.Out) + } + return patchList(i.Arg) +} + +func (l patchList) patch(p *Prog, val uint32) { + for l != 0 { + i := &p.Inst[l>>1] + if l&1 == 0 { + l = patchList(i.Out) + i.Out = val + } else { + l = patchList(i.Arg) + i.Arg = val + } + } +} + +func (l1 patchList) append(p *Prog, l2 patchList) patchList { + if l1 == 0 { + return l2 + } + if l2 == 0 { + return l1 + } + + last := l1 + for { + next := last.next(p) + if next == 0 { + break + } + last = next + } + + i := &p.Inst[last>>1] + if last&1 == 0 { + i.Out = uint32(l2) + } else { + i.Arg = uint32(l2) + } + return l1 +} + +// A frag represents a compiled program fragment. +type frag struct { + i uint32 // index of first instruction + out patchList // where to record end instruction +} + +type compiler struct { + p *Prog +} + +// Compile compiles the regexp into a program to be executed. 
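+// A minimal usage sketch, using Parse and the Perl flags defined in this
+// package's parse.go (the switch in compile below has no OpRepeat case, so
+// counted repetitions must be rewritten away before compiling):
+//
+//	re, err := Parse(`a+b`, Perl)
+//	if err == nil {
+//		prog, _ := Compile(re)
+//		println(prog.String())
+//	}
+//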
+func Compile(re *Regexp) (*Prog, os.Error) { + var c compiler + c.init() + f := c.compile(re) + f.out.patch(c.p, c.inst(InstMatch).i) + c.p.Start = int(f.i) + return c.p, nil +} + +func (c *compiler) init() { + c.p = new(Prog) + c.p.NumCap = 2 // implicit ( and ) for whole match $0 + c.inst(InstFail) +} + +var anyRuneNotNL = []int{0, '\n' - 1, '\n' - 1, unicode.MaxRune} +var anyRune = []int{0, unicode.MaxRune} + +func (c *compiler) compile(re *Regexp) frag { + switch re.Op { + case OpNoMatch: + return c.fail() + case OpEmptyMatch: + return c.nop() + case OpLiteral: + if len(re.Rune) == 0 { + return c.nop() + } + var f frag + for j := range re.Rune { + f1 := c.rune(re.Rune[j : j+1]) + if j == 0 { + f = f1 + } else { + f = c.cat(f, f1) + } + } + return f + case OpCharClass: + return c.rune(re.Rune) + case OpAnyCharNotNL: + return c.rune(anyRuneNotNL) + case OpAnyChar: + return c.rune(anyRune) + case OpBeginLine: + return c.empty(EmptyBeginLine) + case OpEndLine: + return c.empty(EmptyEndLine) + case OpBeginText: + return c.empty(EmptyBeginText) + case OpEndText: + return c.empty(EmptyEndText) + case OpWordBoundary: + return c.empty(EmptyWordBoundary) + case OpNoWordBoundary: + return c.empty(EmptyNoWordBoundary) + case OpCapture: + bra := c.cap(uint32(re.Cap << 1)) + sub := c.compile(re.Sub[0]) + ket := c.cap(uint32(re.Cap<<1 | 1)) + return c.cat(c.cat(bra, sub), ket) + case OpStar: + return c.star(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0) + case OpPlus: + return c.plus(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0) + case OpQuest: + return c.quest(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0) + case OpConcat: + if len(re.Sub) == 0 { + return c.nop() + } + var f frag + for i, sub := range re.Sub { + if i == 0 { + f = c.compile(sub) + } else { + f = c.cat(f, c.compile(sub)) + } + } + return f + case OpAlternate: + var f frag + for _, sub := range re.Sub { + f = c.alt(f, c.compile(sub)) + } + return f + } + panic("regexp: unhandled case in compile") +} + +func (c *compiler) inst(op InstOp) frag { + // TODO: impose length limit + f := frag{i: uint32(len(c.p.Inst))} + c.p.Inst = append(c.p.Inst, Inst{Op: op}) + return f +} + +func (c *compiler) nop() frag { + f := c.inst(InstNop) + f.out = patchList(f.i << 1) + return f +} + +func (c *compiler) fail() frag { + return frag{} +} + +func (c *compiler) cap(arg uint32) frag { + f := c.inst(InstCapture) + f.out = patchList(f.i << 1) + c.p.Inst[f.i].Arg = arg + + if c.p.NumCap < int(arg)+1 { + c.p.NumCap = int(arg) + 1 + } + return f +} + +func (c *compiler) cat(f1, f2 frag) frag { + // concat of failure is failure + if f1.i == 0 || f2.i == 0 { + return frag{} + } + + // TODO: elide nop + + f1.out.patch(c.p, f2.i) + return frag{f1.i, f2.out} +} + +func (c *compiler) alt(f1, f2 frag) frag { + // alt of failure is other + if f1.i == 0 { + return f2 + } + if f2.i == 0 { + return f1 + } + + f := c.inst(InstAlt) + i := &c.p.Inst[f.i] + i.Out = f1.i + i.Arg = f2.i + f.out = f1.out.append(c.p, f2.out) + return f +} + +func (c *compiler) quest(f1 frag, nongreedy bool) frag { + f := c.inst(InstAlt) + i := &c.p.Inst[f.i] + if nongreedy { + i.Arg = f1.i + f.out = patchList(f.i << 1) + } else { + i.Out = f1.i + f.out = patchList(f.i<<1 | 1) + } + f.out = f.out.append(c.p, f1.out) + return f +} + +func (c *compiler) star(f1 frag, nongreedy bool) frag { + f := c.inst(InstAlt) + i := &c.p.Inst[f.i] + if nongreedy { + i.Arg = f1.i + f.out = patchList(f.i << 1) + } else { + i.Out = f1.i + f.out = patchList(f.i<<1 | 1) + } + f1.out.patch(c.p, f.i) + return f +} + 
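+// plus builds e+ by reusing the star machinery: the returned fragment enters
+// f1 first, and the InstAlt emitted by star loops back to f1, so the machine
+// makes one pass through f1 and then zero or more repeats. A rough sketch of
+// the instructions for a+ (numbering is illustrative only; 0 is the initial
+// InstFail):
+//
+//	1: rune "a" -> 2
+//	2: alt -> 1, out   (out is left dangling for the caller to patch)
+//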
+func (c *compiler) plus(f1 frag, nongreedy bool) frag { + return frag{f1.i, c.star(f1, nongreedy).out} +} + +func (c *compiler) empty(op EmptyOp) frag { + f := c.inst(InstEmptyWidth) + c.p.Inst[f.i].Arg = uint32(op) + f.out = patchList(f.i << 1) + return f +} + +func (c *compiler) rune(rune []int) frag { + f := c.inst(InstRune) + c.p.Inst[f.i].Rune = rune + f.out = patchList(f.i << 1) + return f +} diff --git a/libgo/go/exp/regexp/syntax/parse.go b/libgo/go/exp/regexp/syntax/parse.go new file mode 100644 index 0000000..4eed182 --- /dev/null +++ b/libgo/go/exp/regexp/syntax/parse.go @@ -0,0 +1,1797 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package syntax + +import ( + "os" + "sort" + "strings" + "unicode" + "utf8" +) + +// An Error describes a failure to parse a regular expression +// and gives the offending expression. +type Error struct { + Code ErrorCode + Expr string +} + +func (e *Error) String() string { + return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`" +} + +// An ErrorCode describes a failure to parse a regular expression. +type ErrorCode string + +const ( + // Unexpected error + ErrInternalError ErrorCode = "regexp/syntax: internal error" + + // Parse errors + ErrInvalidCharClass ErrorCode = "invalid character class" + ErrInvalidCharRange ErrorCode = "invalid character class range" + ErrInvalidEscape ErrorCode = "invalid escape sequence" + ErrInvalidNamedCapture ErrorCode = "invalid named capture" + ErrInvalidPerlOp ErrorCode = "invalid or unsupported Perl syntax" + ErrInvalidRepeatOp ErrorCode = "invalid nested repetition operator" + ErrInvalidRepeatSize ErrorCode = "invalid repeat count" + ErrInvalidUTF8 ErrorCode = "invalid UTF-8" + ErrMissingBracket ErrorCode = "missing closing ]" + ErrMissingParen ErrorCode = "missing closing )" + ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator" + ErrTrailingBackslash ErrorCode = "trailing backslash at end of expression" +) + +func (e ErrorCode) String() string { + return string(e) +} + +// Flags control the behavior of the parser and record information about regexp context. +type Flags uint16 + +const ( + FoldCase Flags = 1 << iota // case-insensitive match + Literal // treat pattern as literal string + ClassNL // allow character classes like [^a-z] and [[:space:]] to match newline + DotNL // allow . to match newline + OneLine // treat ^ and $ as only matching at beginning and end of text + NonGreedy // make repetition operators default to non-greedy + PerlX // allow Perl extensions + UnicodeGroups // allow \p{Han}, \P{Han} for Unicode group and negation + WasDollar // regexp OpEndText was $, not \z + Simple // regexp contains no counted repetition + + MatchNL = ClassNL | DotNL + + Perl = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible + POSIX Flags = 0 // POSIX syntax +) + +// Pseudo-ops for parsing stack. 
+const ( + opLeftParen = opPseudo + iota + opVerticalBar +) + +type parser struct { + flags Flags // parse mode flags + stack []*Regexp // stack of parsed expressions + free *Regexp + numCap int // number of capturing groups seen + wholeRegexp string + tmpClass []int // temporary char class work space +} + +func (p *parser) newRegexp(op Op) *Regexp { + re := p.free + if re != nil { + p.free = re.Sub0[0] + *re = Regexp{} + } else { + re = new(Regexp) + } + re.Op = op + return re +} + +func (p *parser) reuse(re *Regexp) { + re.Sub0[0] = p.free + p.free = re +} + +// Parse stack manipulation. + +// push pushes the regexp re onto the parse stack and returns the regexp. +func (p *parser) push(re *Regexp) *Regexp { + if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] { + // Single rune. + if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) { + return nil + } + re.Op = OpLiteral + re.Rune = re.Rune[:1] + re.Flags = p.flags &^ FoldCase + } else if re.Op == OpCharClass && len(re.Rune) == 4 && + re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] && + unicode.SimpleFold(re.Rune[0]) == re.Rune[2] && + unicode.SimpleFold(re.Rune[2]) == re.Rune[0] || + re.Op == OpCharClass && len(re.Rune) == 2 && + re.Rune[0]+1 == re.Rune[1] && + unicode.SimpleFold(re.Rune[0]) == re.Rune[1] && + unicode.SimpleFold(re.Rune[1]) == re.Rune[0] { + // Case-insensitive rune like [Aa] or [Δδ]. + if p.maybeConcat(re.Rune[0], p.flags|FoldCase) { + return nil + } + + // Rewrite as (case-insensitive) literal. + re.Op = OpLiteral + re.Rune = re.Rune[:1] + re.Flags = p.flags | FoldCase + } else { + // Incremental concatenation. + p.maybeConcat(-1, 0) + } + + p.stack = append(p.stack, re) + return re +} + +// maybeConcat implements incremental concatenation +// of literal runes into string nodes. The parser calls this +// before each push, so only the top fragment of the stack +// might need processing. Since this is called before a push, +// the topmost literal is no longer subject to operators like * +// (Otherwise ab* would turn into (ab)*.) +// If r >= 0 and there's a node left over, maybeConcat uses it +// to push r with the given flags. +// maybeConcat reports whether r was pushed. +func (p *parser) maybeConcat(r int, flags Flags) bool { + n := len(p.stack) + if n < 2 { + return false + } + + re1 := p.stack[n-1] + re2 := p.stack[n-2] + if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase { + return false + } + + // Push re1 into re2. + re2.Rune = append(re2.Rune, re1.Rune...) + + // Reuse re1 if possible. + if r >= 0 { + re1.Rune = re1.Rune0[:1] + re1.Rune[0] = r + re1.Flags = flags + return true + } + + p.stack = p.stack[:n-1] + p.reuse(re1) + return false // did not push r +} + +// newLiteral returns a new OpLiteral Regexp with the given flags +func (p *parser) newLiteral(r int, flags Flags) *Regexp { + re := p.newRegexp(OpLiteral) + re.Flags = flags + re.Rune0[0] = r + re.Rune = re.Rune0[:1] + return re +} + +// literal pushes a literal regexp for the rune r on the stack +// and returns that regexp. +func (p *parser) literal(r int) { + p.push(p.newLiteral(r, p.flags)) +} + +// op pushes a regexp with the given op onto the stack +// and returns that regexp. +func (p *parser) op(op Op) *Regexp { + re := p.newRegexp(op) + re.Flags = p.flags + return p.push(re) +} + +// repeat replaces the top stack element with itself repeated +// according to op. 
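+// For example, in the dump notation used by parse_test.go, a{2,3} turns the
+// lit{a} on top of the stack into a repeat node, and a trailing ? (with
+// PerlX) only flips the NonGreedy flag:
+//
+//	a{2,3}   ->  rep{2,3 lit{a}}
+//	a{2,3}?  ->  nrep{2,3 lit{a}}
+//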
+func (p *parser) repeat(op Op, min, max int, opstr, t, lastRepeat string) (string, os.Error) { + flags := p.flags + if p.flags&PerlX != 0 { + if len(t) > 0 && t[0] == '?' { + t = t[1:] + flags ^= NonGreedy + } + if lastRepeat != "" { + // In Perl it is not allowed to stack repetition operators: + // a** is a syntax error, not a doubled star, and a++ means + // something else entirely, which we don't support! + return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(t)]} + } + } + n := len(p.stack) + if n == 0 { + return "", &Error{ErrMissingRepeatArgument, opstr} + } + sub := p.stack[n-1] + re := p.newRegexp(op) + re.Min = min + re.Max = max + re.Flags = flags + re.Sub = re.Sub0[:1] + re.Sub[0] = sub + p.stack[n-1] = re + return t, nil +} + +// concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation. +func (p *parser) concat() *Regexp { + p.maybeConcat(-1, 0) + + // Scan down to find pseudo-operator | or (. + i := len(p.stack) + for i > 0 && p.stack[i-1].Op < opPseudo { + i-- + } + subs := p.stack[i:] + p.stack = p.stack[:i] + + // Empty concatenation is special case. + if len(subs) == 0 { + return p.push(p.newRegexp(OpEmptyMatch)) + } + + return p.push(p.collapse(subs, OpConcat)) +} + +// alternate replaces the top of the stack (above the topmost '(') with its alternation. +func (p *parser) alternate() *Regexp { + // Scan down to find pseudo-operator (. + // There are no | above (. + i := len(p.stack) + for i > 0 && p.stack[i-1].Op < opPseudo { + i-- + } + subs := p.stack[i:] + p.stack = p.stack[:i] + + // Make sure top class is clean. + // All the others already are (see swapVerticalBar). + if len(subs) > 0 { + cleanAlt(subs[len(subs)-1]) + } + + // Empty alternate is special case + // (shouldn't happen but easy to handle). + if len(subs) == 0 { + return p.push(p.newRegexp(OpNoMatch)) + } + + return p.push(p.collapse(subs, OpAlternate)) +} + +// cleanAlt cleans re for eventual inclusion in an alternation. +func cleanAlt(re *Regexp) { + switch re.Op { + case OpCharClass: + re.Rune = cleanClass(&re.Rune) + if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune { + re.Rune = nil + re.Op = OpAnyChar + return + } + if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune { + re.Rune = nil + re.Op = OpAnyCharNotNL + return + } + if cap(re.Rune)-len(re.Rune) > 100 { + // re.Rune will not grow any more. + // Make a copy or inline to reclaim storage. + re.Rune = append(re.Rune0[:0], re.Rune...) + } + } +} + +// collapse returns the result of applying op to sub. +// If sub contains op nodes, they all get hoisted up +// so that there is never a concat of a concat or an +// alternate of an alternate. +func (p *parser) collapse(subs []*Regexp, op Op) *Regexp { + if len(subs) == 1 { + return subs[0] + } + re := p.newRegexp(op) + re.Sub = re.Sub0[:0] + for _, sub := range subs { + if sub.Op == op { + re.Sub = append(re.Sub, sub.Sub...) + p.reuse(sub) + } else { + re.Sub = append(re.Sub, sub) + } + } + if op == OpAlternate { + re.Sub = p.factor(re.Sub, re.Flags) + if len(re.Sub) == 1 { + old := re + re = re.Sub[0] + p.reuse(old) + } + } + return re +} + +// factor factors common prefixes from the alternation list sub. +// It returns a replacement list that reuses the same storage and +// frees (passes to p.reuse) any removed *Regexps. 
+// +// For example, +// ABC|ABD|AEF|BCX|BCY +// simplifies by literal prefix extraction to +// A(B(C|D)|EF)|BC(X|Y) +// which simplifies by character class introduction to +// A(B[CD]|EF)|BC[XY] +// +func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp { + if len(sub) < 2 { + return sub + } + + // Round 1: Factor out common literal prefixes. + var str []int + var strflags Flags + start := 0 + out := sub[:0] + for i := 0; i <= len(sub); i++ { + // Invariant: the Regexps that were in sub[0:start] have been + // used or marked for reuse, and the slice space has been reused + // for out (len(out) <= start). + // + // Invariant: sub[start:i] consists of regexps that all begin + // with str as modified by strflags. + var istr []int + var iflags Flags + if i < len(sub) { + istr, iflags = p.leadingString(sub[i]) + if iflags == strflags { + same := 0 + for same < len(str) && same < len(istr) && str[same] == istr[same] { + same++ + } + if same > 0 { + // Matches at least one rune in current range. + // Keep going around. + str = str[:same] + continue + } + } + } + + // Found end of a run with common leading literal string: + // sub[start:i] all begin with str[0:len(str)], but sub[i] + // does not even begin with str[0]. + // + // Factor out common string and append factored expression to out. + if i == start { + // Nothing to do - run of length 0. + } else if i == start+1 { + // Just one: don't bother factoring. + out = append(out, sub[start]) + } else { + // Construct factored form: prefix(suffix1|suffix2|...) + prefix := p.newRegexp(OpLiteral) + prefix.Flags = strflags + prefix.Rune = append(prefix.Rune[:0], str...) + + for j := start; j < i; j++ { + sub[j] = p.removeLeadingString(sub[j], len(str)) + } + suffix := p.collapse(sub[start:i], OpAlternate) // recurse + + re := p.newRegexp(OpConcat) + re.Sub = append(re.Sub[:0], prefix, suffix) + out = append(out, re) + } + + // Prepare for next iteration. + start = i + str = istr + strflags = iflags + } + sub = out + + // Round 2: Factor out common complex prefixes, + // just the first piece of each concatenation, + // whatever it is. This is good enough a lot of the time. + start = 0 + out = sub[:0] + var first *Regexp + for i := 0; i <= len(sub); i++ { + // Invariant: the Regexps that were in sub[0:start] have been + // used or marked for reuse, and the slice space has been reused + // for out (len(out) <= start). + // + // Invariant: sub[start:i] consists of regexps that all begin + // with str as modified by strflags. + var ifirst *Regexp + if i < len(sub) { + ifirst = p.leadingRegexp(sub[i]) + if first != nil && first.Equal(ifirst) { + continue + } + } + + // Found end of a run with common leading regexp: + // sub[start:i] all begin with first but sub[i] does not. + // + // Factor out common regexp and append factored expression to out. + if i == start { + // Nothing to do - run of length 0. + } else if i == start+1 { + // Just one: don't bother factoring. + out = append(out, sub[start]) + } else { + // Construct factored form: prefix(suffix1|suffix2|...) + prefix := first + + for j := start; j < i; j++ { + reuse := j != start // prefix came from sub[start] + sub[j] = p.removeLeadingRegexp(sub[j], reuse) + } + suffix := p.collapse(sub[start:i], OpAlternate) // recurse + + re := p.newRegexp(OpConcat) + re.Sub = append(re.Sub[:0], prefix, suffix) + out = append(out, re) + } + + // Prepare for next iteration. + start = i + first = ifirst + } + sub = out + + // Round 3: Collapse runs of single literals into character classes. 
+ start = 0 + out = sub[:0] + for i := 0; i <= len(sub); i++ { + // Invariant: the Regexps that were in sub[0:start] have been + // used or marked for reuse, and the slice space has been reused + // for out (len(out) <= start). + // + // Invariant: sub[start:i] consists of regexps that are either + // literal runes or character classes. + if i < len(sub) && isCharClass(sub[i]) { + continue + } + + // sub[i] is not a char or char class; + // emit char class for sub[start:i]... + if i == start { + // Nothing to do - run of length 0. + } else if i == start+1 { + out = append(out, sub[start]) + } else { + // Make new char class. + // Start with most complex regexp in sub[start]. + max := start + for j := start + 1; j < i; j++ { + if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) { + max = j + } + } + sub[start], sub[max] = sub[max], sub[start] + + for j := start + 1; j < i; j++ { + mergeCharClass(sub[start], sub[j]) + p.reuse(sub[j]) + } + cleanAlt(sub[start]) + out = append(out, sub[start]) + } + + // ... and then emit sub[i]. + if i < len(sub) { + out = append(out, sub[i]) + } + start = i + 1 + } + sub = out + + // Round 4: Collapse runs of empty matches into a single empty match. + start = 0 + out = sub[:0] + for i := range sub { + if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch { + continue + } + out = append(out, sub[i]) + } + sub = out + + return sub +} + +// leadingString returns the leading literal string that re begins with. +// The string refers to storage in re or its children. +func (p *parser) leadingString(re *Regexp) ([]int, Flags) { + if re.Op == OpConcat && len(re.Sub) > 0 { + re = re.Sub[0] + } + if re.Op != OpLiteral { + return nil, 0 + } + return re.Rune, re.Flags & FoldCase +} + +// removeLeadingString removes the first n leading runes +// from the beginning of re. It returns the replacement for re. +func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp { + if re.Op == OpConcat && len(re.Sub) > 0 { + // Removing a leading string in a concatenation + // might simplify the concatenation. + sub := re.Sub[0] + sub = p.removeLeadingString(sub, n) + re.Sub[0] = sub + if sub.Op == OpEmptyMatch { + p.reuse(sub) + switch len(re.Sub) { + case 0, 1: + // Impossible but handle. + re.Op = OpEmptyMatch + re.Sub = nil + case 2: + old := re + re = re.Sub[1] + p.reuse(old) + default: + copy(re.Sub, re.Sub[1:]) + re.Sub = re.Sub[:len(re.Sub)-1] + } + } + return re + } + + if re.Op == OpLiteral { + re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])] + if len(re.Rune) == 0 { + re.Op = OpEmptyMatch + } + } + return re +} + +// leadingRegexp returns the leading regexp that re begins with. +// The regexp refers to storage in re or its children. +func (p *parser) leadingRegexp(re *Regexp) *Regexp { + if re.Op == OpEmptyMatch { + return nil + } + if re.Op == OpConcat && len(re.Sub) > 0 { + sub := re.Sub[0] + if sub.Op == OpEmptyMatch { + return nil + } + return sub + } + return re +} + +// removeLeadingRegexp removes the leading regexp in re. +// It returns the replacement for re. +// If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse. 
+func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp { + if re.Op == OpConcat && len(re.Sub) > 0 { + if reuse { + p.reuse(re.Sub[0]) + } + re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])] + switch len(re.Sub) { + case 0: + re.Op = OpEmptyMatch + re.Sub = nil + case 1: + old := re + re = re.Sub[0] + p.reuse(old) + } + return re + } + re.Op = OpEmptyMatch + return re +} + +func literalRegexp(s string, flags Flags) *Regexp { + re := &Regexp{Op: OpLiteral} + re.Flags = flags + re.Rune = re.Rune0[:0] // use local storage for small strings + for _, c := range s { + if len(re.Rune) >= cap(re.Rune) { + // string is too long to fit in Rune0. let Go handle it + re.Rune = []int(s) + break + } + re.Rune = append(re.Rune, c) + } + return re +} + +// Parsing. + +func Parse(s string, flags Flags) (*Regexp, os.Error) { + if flags&Literal != 0 { + // Trivial parser for literal string. + if err := checkUTF8(s); err != nil { + return nil, err + } + return literalRegexp(s, flags), nil + } + + // Otherwise, must do real work. + var ( + p parser + err os.Error + c int + op Op + lastRepeat string + min, max int + ) + p.flags = flags + p.wholeRegexp = s + t := s + for t != "" { + repeat := "" + BigSwitch: + switch t[0] { + default: + if c, t, err = nextRune(t); err != nil { + return nil, err + } + p.literal(c) + + case '(': + if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' { + // Flag changes and non-capturing groups. + if t, err = p.parsePerlFlags(t); err != nil { + return nil, err + } + break + } + p.numCap++ + p.op(opLeftParen).Cap = p.numCap + t = t[1:] + case '|': + if err = p.parseVerticalBar(); err != nil { + return nil, err + } + t = t[1:] + case ')': + if err = p.parseRightParen(); err != nil { + return nil, err + } + t = t[1:] + case '^': + if p.flags&OneLine != 0 { + p.op(OpBeginText) + } else { + p.op(OpBeginLine) + } + t = t[1:] + case '$': + if p.flags&OneLine != 0 { + p.op(OpEndText).Flags |= WasDollar + } else { + p.op(OpEndLine) + } + t = t[1:] + case '.': + if p.flags&DotNL != 0 { + p.op(OpAnyChar) + } else { + p.op(OpAnyCharNotNL) + } + t = t[1:] + case '[': + if t, err = p.parseClass(t); err != nil { + return nil, err + } + case '*', '+', '?': + switch t[0] { + case '*': + op = OpStar + case '+': + op = OpPlus + case '?': + op = OpQuest + } + if t, err = p.repeat(op, min, max, t[:1], t[1:], lastRepeat); err != nil { + return nil, err + } + case '{': + op = OpRepeat + min, max, tt, ok := p.parseRepeat(t) + if !ok { + // If the repeat cannot be parsed, { is a literal. + p.literal('{') + t = t[1:] + break + } + if t, err = p.repeat(op, min, max, t[:len(t)-len(tt)], tt, lastRepeat); err != nil { + return nil, err + } + case '\\': + if p.flags&PerlX != 0 && len(t) >= 2 { + switch t[1] { + case 'A': + p.op(OpBeginText) + t = t[2:] + break BigSwitch + case 'b': + p.op(OpWordBoundary) + t = t[2:] + break BigSwitch + case 'B': + p.op(OpNoWordBoundary) + t = t[2:] + break BigSwitch + case 'C': + // any byte; not supported + return nil, &Error{ErrInvalidEscape, t[:2]} + case 'Q': + // \Q ... \E: the ... 
is always literals + var lit string + if i := strings.Index(t, `\E`); i < 0 { + lit = t[2:] + t = "" + } else { + lit = t[2:i] + t = t[i+2:] + } + p.push(literalRegexp(lit, p.flags)) + break BigSwitch + case 'z': + p.op(OpEndText) + t = t[2:] + break BigSwitch + } + } + + re := p.newRegexp(OpCharClass) + re.Flags = p.flags + + // Look for Unicode character group like \p{Han} + if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') { + r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0]) + if err != nil { + return nil, err + } + if r != nil { + re.Rune = r + t = rest + p.push(re) + break BigSwitch + } + } + + // Perl character class escape. + if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil { + re.Rune = r + t = rest + p.push(re) + break BigSwitch + } + p.reuse(re) + + // Ordinary single-character escape. + if c, t, err = p.parseEscape(t); err != nil { + return nil, err + } + p.literal(c) + } + lastRepeat = repeat + } + + p.concat() + if p.swapVerticalBar() { + // pop vertical bar + p.stack = p.stack[:len(p.stack)-1] + } + p.alternate() + + n := len(p.stack) + if n != 1 { + return nil, &Error{ErrMissingParen, s} + } + return p.stack[0], nil +} + +// parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}. +// If s is not of that form, it returns ok == false. +func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) { + if s == "" || s[0] != '{' { + return + } + s = s[1:] + if min, s, ok = p.parseInt(s); !ok { + return + } + if s == "" { + return + } + if s[0] != ',' { + max = min + } else { + s = s[1:] + if s == "" { + return + } + if s[0] == '}' { + max = -1 + } else if max, s, ok = p.parseInt(s); !ok { + return + } + } + if s == "" || s[0] != '}' { + return + } + rest = s[1:] + ok = true + return +} + +// parsePerlFlags parses a Perl flag setting or non-capturing group or both, +// like (?i) or (?: or (?i:. It removes the prefix from s and updates the parse state. +// The caller must have ensured that s begins with "(?". +func (p *parser) parsePerlFlags(s string) (rest string, err os.Error) { + t := s + + // Check for named captures, first introduced in Python's regexp library. + // As usual, there are three slightly different syntaxes: + // + // (?P<name>expr) the original, introduced by Python + // (?<name>expr) the .NET alteration, adopted by Perl 5.10 + // (?'name'expr) another .NET alteration, adopted by Perl 5.10 + // + // Perl 5.10 gave in and implemented the Python version too, + // but they claim that the last two are the preferred forms. + // PCRE and languages based on it (specifically, PHP and Ruby) + // support all three as well. EcmaScript 4 uses only the Python form. + // + // In both the open source world (via Code Search) and the + // Google source tree, (?P<expr>name) is the dominant form, + // so that's the one we implement. One is enough. + if len(t) > 4 && t[2] == 'P' && t[3] == '<' { + // Pull out name. + end := strings.IndexRune(t, '>') + if end < 0 { + if err = checkUTF8(t); err != nil { + return "", err + } + return "", &Error{ErrInvalidNamedCapture, s} + } + + capture := t[:end+1] // "(?P<name>" + name := t[4:end] // "name" + if err = checkUTF8(name); err != nil { + return "", err + } + if !isValidCaptureName(name) { + return "", &Error{ErrInvalidNamedCapture, capture} + } + + // Like ordinary capture, but named. + p.numCap++ + re := p.op(opLeftParen) + re.Cap = p.numCap + re.Name = name + return t[end+1:], nil + } + + // Non-capturing group. Might also twiddle Perl flags. + var c int + t = t[2:] // skip (? 
+ flags := p.flags + sign := +1 + sawFlag := false +Loop: + for t != "" { + if c, t, err = nextRune(t); err != nil { + return "", err + } + switch c { + default: + break Loop + + // Flags. + case 'i': + flags |= FoldCase + sawFlag = true + case 'm': + flags &^= OneLine + sawFlag = true + case 's': + flags |= DotNL + sawFlag = true + case 'U': + flags |= NonGreedy + sawFlag = true + + // Switch to negation. + case '-': + if sign < 0 { + break Loop + } + sign = -1 + // Invert flags so that | above turn into &^ and vice versa. + // We'll invert flags again before using it below. + flags = ^flags + sawFlag = false + + // End of flags, starting group or not. + case ':', ')': + if sign < 0 { + if !sawFlag { + break Loop + } + flags = ^flags + } + if c == ':' { + // Open new group + p.op(opLeftParen) + } + p.flags = flags + return t, nil + } + } + + return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]} +} + +// isValidCaptureName reports whether name +// is a valid capture name: [A-Za-z0-9_]+. +// PCRE limits names to 32 bytes. +// Python rejects names starting with digits. +// We don't enforce either of those. +func isValidCaptureName(name string) bool { + if name == "" { + return false + } + for _, c := range name { + if c != '_' && !isalnum(c) { + return false + } + } + return true +} + +// parseInt parses a decimal integer. +func (p *parser) parseInt(s string) (n int, rest string, ok bool) { + if s == "" || s[0] < '0' || '9' < s[0] { + return + } + // Disallow leading zeros. + if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' { + return + } + for s != "" && '0' <= s[0] && s[0] <= '9' { + // Avoid overflow. + if n >= 1e8 { + return + } + n = n*10 + int(s[0]) - '0' + s = s[1:] + } + rest = s + ok = true + return +} + +// can this be represented as a character class? +// single-rune literal string, char class, ., and .|\n. +func isCharClass(re *Regexp) bool { + return re.Op == OpLiteral && len(re.Rune) == 1 || + re.Op == OpCharClass || + re.Op == OpAnyCharNotNL || + re.Op == OpAnyChar +} + +// does re match r? +func matchRune(re *Regexp, r int) bool { + switch re.Op { + case OpLiteral: + return len(re.Rune) == 1 && re.Rune[0] == r + case OpCharClass: + for i := 0; i < len(re.Rune); i += 2 { + if re.Rune[i] <= r && r <= re.Rune[i+1] { + return true + } + } + return false + case OpAnyCharNotNL: + return r != '\n' + case OpAnyChar: + return true + } + return false +} + +// parseVerticalBar handles a | in the input. +func (p *parser) parseVerticalBar() os.Error { + p.concat() + + // The concatenation we just parsed is on top of the stack. + // If it sits above an opVerticalBar, swap it below + // (things below an opVerticalBar become an alternation). + // Otherwise, push a new vertical bar. + if !p.swapVerticalBar() { + p.op(opVerticalBar) + } + + return nil +} + +// mergeCharClass makes dst = dst|src. +// The caller must ensure that dst.Op >= src.Op, +// to reduce the amount of copying. +func mergeCharClass(dst, src *Regexp) { + switch dst.Op { + case OpAnyChar: + // src doesn't add anything. 
+ case OpAnyCharNotNL: + // src might add \n + if matchRune(src, '\n') { + dst.Op = OpAnyChar + } + case OpCharClass: + // src is simpler, so either literal or char class + if src.Op == OpLiteral { + dst.Rune = appendRange(dst.Rune, src.Rune[0], src.Rune[0]) + } else { + dst.Rune = appendClass(dst.Rune, src.Rune) + } + case OpLiteral: + // both literal + if src.Rune[0] == dst.Rune[0] { + break + } + dst.Op = OpCharClass + dst.Rune = append(dst.Rune, dst.Rune[0]) + dst.Rune = appendRange(dst.Rune, src.Rune[0], src.Rune[0]) + } +} + +// If the top of the stack is an element followed by an opVerticalBar +// swapVerticalBar swaps the two and returns true. +// Otherwise it returns false. +func (p *parser) swapVerticalBar() bool { + // If above and below vertical bar are literal or char class, + // can merge into a single char class. + n := len(p.stack) + if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) { + re1 := p.stack[n-1] + re3 := p.stack[n-3] + // Make re3 the more complex of the two. + if re1.Op > re3.Op { + re1, re3 = re3, re1 + p.stack[n-3] = re3 + } + mergeCharClass(re3, re1) + p.reuse(re1) + p.stack = p.stack[:n-1] + return true + } + + if n >= 2 { + re1 := p.stack[n-1] + re2 := p.stack[n-2] + if re2.Op == opVerticalBar { + if n >= 3 { + // Now out of reach. + // Clean opportunistically. + cleanAlt(p.stack[n-3]) + } + p.stack[n-2] = re1 + p.stack[n-1] = re2 + return true + } + } + return false +} + +// parseRightParen handles a ) in the input. +func (p *parser) parseRightParen() os.Error { + p.concat() + if p.swapVerticalBar() { + // pop vertical bar + p.stack = p.stack[:len(p.stack)-1] + } + p.alternate() + + n := len(p.stack) + if n < 2 { + return &Error{ErrInternalError, ""} + } + re1 := p.stack[n-1] + re2 := p.stack[n-2] + p.stack = p.stack[:n-2] + if re2.Op != opLeftParen { + return &Error{ErrMissingParen, p.wholeRegexp} + } + if re2.Cap == 0 { + // Just for grouping. + p.push(re1) + } else { + re2.Op = OpCapture + re2.Sub = re2.Sub0[:1] + re2.Sub[0] = re1 + p.push(re2) + } + return nil +} + +// parseEscape parses an escape sequence at the beginning of s +// and returns the rune. +func (p *parser) parseEscape(s string) (r int, rest string, err os.Error) { + t := s[1:] + if t == "" { + return 0, "", &Error{ErrTrailingBackslash, ""} + } + c, t, err := nextRune(t) + if err != nil { + return 0, "", err + } + +Switch: + switch c { + default: + if c < utf8.RuneSelf && !isalnum(c) { + // Escaped non-word characters are always themselves. + // PCRE is not quite so rigorous: it accepts things like + // \q, but we don't. We once rejected \_, but too many + // programs and people insist on using it, so allow \_. + return c, t, nil + } + + // Octal escapes. + case '1', '2', '3', '4', '5', '6', '7': + // Single non-zero digit is a backreference; not supported + if t == "" || t[0] < '0' || t[0] > '7' { + break + } + fallthrough + case '0': + // Consume up to three octal digits; already have one. + r = c - '0' + for i := 1; i < 3; i++ { + if t == "" || t[0] < '0' || t[0] > '7' { + break + } + r = r*8 + int(t[0]) - '0' + t = t[1:] + } + return r, t, nil + + // Hexadecimal escapes. + case 'x': + if t == "" { + break + } + if c, t, err = nextRune(t); err != nil { + return 0, "", err + } + if c == '{' { + // Any number of digits in braces. + // Perl accepts any text at all; it ignores all text + // after the first non-hex digit. We require only hex digits, + // and at least one. 
+ nhex := 0 + r = 0 + for { + if t == "" { + break Switch + } + if c, t, err = nextRune(t); err != nil { + return 0, "", err + } + if c == '}' { + break + } + v := unhex(c) + if v < 0 { + break Switch + } + r = r*16 + v + if r > unicode.MaxRune { + break Switch + } + nhex++ + } + if nhex == 0 { + break Switch + } + return r, t, nil + } + + // Easy case: two hex digits. + x := unhex(c) + if c, t, err = nextRune(t); err != nil { + return 0, "", err + } + y := unhex(c) + if x < 0 || y < 0 { + break + } + return x*16 + y, t, nil + + // C escapes. There is no case 'b', to avoid misparsing + // the Perl word-boundary \b as the C backspace \b + // when in POSIX mode. In Perl, /\b/ means word-boundary + // but /[\b]/ means backspace. We don't support that. + // If you want a backspace, embed a literal backspace + // character or use \x08. + case 'a': + return '\a', t, err + case 'f': + return '\f', t, err + case 'n': + return '\n', t, err + case 'r': + return '\r', t, err + case 't': + return '\t', t, err + case 'v': + return '\v', t, err + } + return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]} +} + +// parseClassChar parses a character class character at the beginning of s +// and returns it. +func (p *parser) parseClassChar(s, wholeClass string) (r int, rest string, err os.Error) { + if s == "" { + return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass} + } + + // Allow regular escape sequences even though + // many need not be escaped in this context. + if s[0] == '\\' { + return p.parseEscape(s) + } + + return nextRune(s) +} + +type charGroup struct { + sign int + class []int +} + +// parsePerlClassEscape parses a leading Perl character class escape like \d +// from the beginning of s. If one is present, it appends the characters to r +// and returns the new slice r and the remainder of the string. +func (p *parser) parsePerlClassEscape(s string, r []int) (out []int, rest string) { + if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' { + return + } + g := perlGroup[s[0:2]] + if g.sign == 0 { + return + } + return p.appendGroup(r, g), s[2:] +} + +// parseNamedClass parses a leading POSIX named character class like [:alnum:] +// from the beginning of s. If one is present, it appends the characters to r +// and returns the new slice r and the remainder of the string. +func (p *parser) parseNamedClass(s string, r []int) (out []int, rest string, err os.Error) { + if len(s) < 2 || s[0] != '[' || s[1] != ':' { + return + } + + i := strings.Index(s[2:], ":]") + if i < 0 { + return + } + i += 2 + name, s := s[0:i+2], s[i+2:] + g := posixGroup[name] + if g.sign == 0 { + return nil, "", &Error{ErrInvalidCharRange, name} + } + return p.appendGroup(r, g), s, nil +} + +func (p *parser) appendGroup(r []int, g charGroup) []int { + if p.flags&FoldCase == 0 { + if g.sign < 0 { + r = appendNegatedClass(r, g.class) + } else { + r = appendClass(r, g.class) + } + } else { + tmp := p.tmpClass[:0] + tmp = appendFoldedClass(tmp, g.class) + p.tmpClass = tmp + tmp = cleanClass(&p.tmpClass) + if g.sign < 0 { + r = appendNegatedClass(r, tmp) + } else { + r = appendClass(r, tmp) + } + } + return r +} + +// unicodeTable returns the unicode.RangeTable identified by name +// and the table of additional fold-equivalent code points. 
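+// For example, unicodeTable("Lu") finds the uppercase-letter category in
+// unicode.Categories, unicodeTable("Han") falls through to unicode.Scripts,
+// and an unrecognized name returns nil, nil.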
+func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) { + if t := unicode.Categories[name]; t != nil { + return t, unicode.FoldCategory[name] + } + if t := unicode.Scripts[name]; t != nil { + return t, unicode.FoldScript[name] + } + return nil, nil +} + +// parseUnicodeClass parses a leading Unicode character class like \p{Han} +// from the beginning of s. If one is present, it appends the characters to r +// and returns the new slice r and the remainder of the string. +func (p *parser) parseUnicodeClass(s string, r []int) (out []int, rest string, err os.Error) { + if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' { + return + } + + // Committed to parse or return error. + sign := +1 + if s[1] == 'P' { + sign = -1 + } + t := s[2:] + c, t, err := nextRune(t) + if err != nil { + return + } + var seq, name string + if c != '{' { + // Single-letter name. + seq = s[:len(s)-len(t)] + name = seq[2:] + } else { + // Name is in braces. + end := strings.IndexRune(s, '}') + if end < 0 { + if err = checkUTF8(s); err != nil { + return + } + return nil, "", &Error{ErrInvalidCharRange, s} + } + seq, t = s[:end+1], s[end+1:] + name = s[3:end] + if err = checkUTF8(name); err != nil { + return + } + } + + // Group can have leading negation too. \p{^Han} == \P{Han}, \P{^Han} == \p{Han}. + if name != "" && name[0] == '^' { + sign = -sign + name = name[1:] + } + + tab, fold := unicodeTable(name) + if tab == nil { + return nil, "", &Error{ErrInvalidCharRange, seq} + } + + if p.flags&FoldCase == 0 || fold == nil { + if sign > 0 { + r = appendTable(r, tab) + } else { + r = appendNegatedTable(r, tab) + } + } else { + // Merge and clean tab and fold in a temporary buffer. + // This is necessary for the negative case and just tidy + // for the positive case. + tmp := p.tmpClass[:0] + tmp = appendTable(tmp, tab) + tmp = appendTable(tmp, fold) + p.tmpClass = tmp + tmp = cleanClass(&p.tmpClass) + if sign > 0 { + r = appendClass(r, tmp) + } else { + r = appendNegatedClass(r, tmp) + } + } + return r, t, nil +} + +// parseClass parses a character class at the beginning of s +// and pushes it onto the parse stack. +func (p *parser) parseClass(s string) (rest string, err os.Error) { + t := s[1:] // chop [ + re := p.newRegexp(OpCharClass) + re.Flags = p.flags + re.Rune = re.Rune0[:0] + + sign := +1 + if t != "" && t[0] == '^' { + sign = -1 + t = t[1:] + + // If character class does not match \n, add it here, + // so that negation later will do the right thing. + if p.flags&ClassNL == 0 { + re.Rune = append(re.Rune, '\n', '\n') + } + } + + class := re.Rune + first := true // ] and - are okay as first char in class + for t == "" || t[0] != ']' || first { + // POSIX: - is only okay unescaped as first or last in class. + // Perl: - is okay anywhere. + if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') { + _, size := utf8.DecodeRuneInString(t[1:]) + return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]} + } + first = false + + // Look for POSIX [:alnum:] etc. + if len(t) > 2 && t[0] == '[' && t[1] == ':' { + nclass, nt, err := p.parseNamedClass(t, class) + if err != nil { + return "", err + } + if nclass != nil { + class, t = nclass, nt + continue + } + } + + // Look for Unicode character group like \p{Han}. + nclass, nt, err := p.parseUnicodeClass(t, class) + if err != nil { + return "", err + } + if nclass != nil { + class, t = nclass, nt + continue + } + + // Look for Perl character class symbols (extension). 
+ if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil { + class, t = nclass, nt + continue + } + + // Single character or simple range. + rng := t + var lo, hi int + if lo, t, err = p.parseClassChar(t, s); err != nil { + return "", err + } + hi = lo + // [a-] means (a|-) so check for final ]. + if len(t) >= 2 && t[0] == '-' && t[1] != ']' { + t = t[1:] + if hi, t, err = p.parseClassChar(t, s); err != nil { + return "", err + } + if hi < lo { + rng = rng[:len(rng)-len(t)] + return "", &Error{Code: ErrInvalidCharRange, Expr: rng} + } + } + if p.flags&FoldCase == 0 { + class = appendRange(class, lo, hi) + } else { + class = appendFoldedRange(class, lo, hi) + } + } + t = t[1:] // chop ] + + // Use &re.Rune instead of &class to avoid allocation. + re.Rune = class + class = cleanClass(&re.Rune) + if sign < 0 { + class = negateClass(class) + } + re.Rune = class + p.push(re) + return t, nil +} + +// cleanClass sorts the ranges (pairs of elements of r), +// merges them, and eliminates duplicates. +func cleanClass(rp *[]int) []int { + + // Sort by lo increasing, hi decreasing to break ties. + sort.Sort(ranges{rp}) + + r := *rp + if len(r) < 2 { + return r + } + + // Merge abutting, overlapping. + w := 2 // write index + for i := 2; i < len(r); i += 2 { + lo, hi := r[i], r[i+1] + if lo <= r[w-1]+1 { + // merge with previous range + if hi > r[w-1] { + r[w-1] = hi + } + continue + } + // new disjoint range + r[w] = lo + r[w+1] = hi + w += 2 + } + + return r[:w] +} + +// appendRange returns the result of appending the range lo-hi to the class r. +func appendRange(r []int, lo, hi int) []int { + // Expand last range or next to last range if it overlaps or abuts. + // Checking two ranges helps when appending case-folded + // alphabets, so that one range can be expanding A-Z and the + // other expanding a-z. + n := len(r) + for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4 + if n >= i { + rlo, rhi := r[n-i], r[n-i+1] + if lo <= rhi+1 && rlo <= hi+1 { + if lo < rlo { + r[n-i] = lo + } + if hi > rhi { + r[n-i+1] = hi + } + return r + } + } + } + + return append(r, lo, hi) +} + +const ( + // minimum and maximum runes involved in folding. + // checked during test. + minFold = 0x0041 + maxFold = 0x1044f +) + +// appendFoldedRange returns the result of appending the range lo-hi +// and its case folding-equivalent runes to the class r. +func appendFoldedRange(r []int, lo, hi int) []int { + // Optimizations. + if lo <= minFold && hi >= maxFold { + // Range is full: folding can't add more. + return appendRange(r, lo, hi) + } + if hi < minFold || lo > maxFold { + // Range is outside folding possibilities. + return appendRange(r, lo, hi) + } + if lo < minFold { + // [lo, minFold-1] needs no folding. + r = appendRange(r, lo, minFold-1) + lo = minFold + } + if hi > maxFold { + // [maxFold+1, hi] needs no folding. + r = appendRange(r, maxFold+1, hi) + hi = maxFold + } + + // Brute force. Depend on appendRange to coalesce ranges on the fly. + for c := lo; c <= hi; c++ { + r = appendRange(r, c, c) + f := unicode.SimpleFold(c) + for f != c { + r = appendRange(r, f, f) + f = unicode.SimpleFold(f) + } + } + return r +} + +// appendClass returns the result of appending the class x to the class r. +// It assume x is clean. +func appendClass(r []int, x []int) []int { + for i := 0; i < len(x); i += 2 { + r = appendRange(r, x[i], x[i+1]) + } + return r +} + +// appendFolded returns the result of appending the case folding of the class x to the class r. 
+func appendFoldedClass(r []int, x []int) []int { + for i := 0; i < len(x); i += 2 { + r = appendFoldedRange(r, x[i], x[i+1]) + } + return r +} + +// appendNegatedClass returns the result of appending the negation of the class x to the class r. +// It assumes x is clean. +func appendNegatedClass(r []int, x []int) []int { + nextLo := 0 + for i := 0; i < len(x); i += 2 { + lo, hi := x[i], x[i+1] + if nextLo <= lo-1 { + r = appendRange(r, nextLo, lo-1) + } + nextLo = hi + 1 + } + if nextLo <= unicode.MaxRune { + r = appendRange(r, nextLo, unicode.MaxRune) + } + return r +} + +// appendTable returns the result of appending x to the class r. +func appendTable(r []int, x *unicode.RangeTable) []int { + for _, xr := range x.R16 { + lo, hi, stride := int(xr.Lo), int(xr.Hi), int(xr.Stride) + if stride == 1 { + r = appendRange(r, lo, hi) + continue + } + for c := lo; c <= hi; c += stride { + r = appendRange(r, c, c) + } + } + for _, xr := range x.R32 { + lo, hi, stride := int(xr.Lo), int(xr.Hi), int(xr.Stride) + if stride == 1 { + r = appendRange(r, lo, hi) + continue + } + for c := lo; c <= hi; c += stride { + r = appendRange(r, c, c) + } + } + return r +} + +// appendNegatedTable returns the result of appending the negation of x to the class r. +func appendNegatedTable(r []int, x *unicode.RangeTable) []int { + nextLo := 0 // lo end of next class to add + for _, xr := range x.R16 { + lo, hi, stride := int(xr.Lo), int(xr.Hi), int(xr.Stride) + if stride == 1 { + if nextLo <= lo-1 { + r = appendRange(r, nextLo, lo-1) + } + nextLo = hi + 1 + continue + } + for c := lo; c <= hi; c += stride { + if nextLo <= c-1 { + r = appendRange(r, nextLo, c-1) + } + nextLo = c + 1 + } + } + for _, xr := range x.R32 { + lo, hi, stride := int(xr.Lo), int(xr.Hi), int(xr.Stride) + if stride == 1 { + if nextLo <= lo-1 { + r = appendRange(r, nextLo, lo-1) + } + nextLo = hi + 1 + continue + } + for c := lo; c <= hi; c += stride { + if nextLo <= c-1 { + r = appendRange(r, nextLo, c-1) + } + nextLo = c + 1 + } + } + if nextLo <= unicode.MaxRune { + r = appendRange(r, nextLo, unicode.MaxRune) + } + return r +} + +// negateClass overwrites r and returns r's negation. +// It assumes the class r is already clean. +func negateClass(r []int) []int { + nextLo := 0 // lo end of next class to add + w := 0 // write index + for i := 0; i < len(r); i += 2 { + lo, hi := r[i], r[i+1] + if nextLo <= lo-1 { + r[w] = nextLo + r[w+1] = lo - 1 + w += 2 + } + nextLo = hi + 1 + } + r = r[:w] + if nextLo <= unicode.MaxRune { + // It's possible for the negation to have one more + // range - this one - than the original class, so use append. + r = append(r, nextLo, unicode.MaxRune) + } + return r +} + +// ranges implements sort.Interface on a []rune. +// The choice of receiver type definition is strange +// but avoids an allocation since we already have +// a *[]int. 
+type ranges struct { + p *[]int +} + +func (ra ranges) Less(i, j int) bool { + p := *ra.p + i *= 2 + j *= 2 + return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1] +} + +func (ra ranges) Len() int { + return len(*ra.p) / 2 +} + +func (ra ranges) Swap(i, j int) { + p := *ra.p + i *= 2 + j *= 2 + p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1] +} + +func checkUTF8(s string) os.Error { + for s != "" { + rune, size := utf8.DecodeRuneInString(s) + if rune == utf8.RuneError && size == 1 { + return &Error{Code: ErrInvalidUTF8, Expr: s} + } + s = s[size:] + } + return nil +} + +func nextRune(s string) (c int, t string, err os.Error) { + c, size := utf8.DecodeRuneInString(s) + if c == utf8.RuneError && size == 1 { + return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s} + } + return c, s[size:], nil +} + +func isalnum(c int) bool { + return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' +} + +func unhex(c int) int { + if '0' <= c && c <= '9' { + return c - '0' + } + if 'a' <= c && c <= 'f' { + return c - 'a' + 10 + } + if 'A' <= c && c <= 'F' { + return c - 'A' + 10 + } + return -1 +} diff --git a/libgo/go/exp/regexp/syntax/parse_test.go b/libgo/go/exp/regexp/syntax/parse_test.go new file mode 100644 index 0000000..779b9af --- /dev/null +++ b/libgo/go/exp/regexp/syntax/parse_test.go @@ -0,0 +1,350 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package syntax + +import ( + "bytes" + "fmt" + "testing" + "unicode" +) + +var parseTests = []struct { + Regexp string + Dump string +}{ + // Base cases + {`a`, `lit{a}`}, + {`a.`, `cat{lit{a}dot{}}`}, + {`a.b`, `cat{lit{a}dot{}lit{b}}`}, + {`ab`, `str{ab}`}, + {`a.b.c`, `cat{lit{a}dot{}lit{b}dot{}lit{c}}`}, + {`abc`, `str{abc}`}, + {`a|^`, `alt{lit{a}bol{}}`}, + {`a|b`, `cc{0x61-0x62}`}, + {`(a)`, `cap{lit{a}}`}, + {`(a)|b`, `alt{cap{lit{a}}lit{b}}`}, + {`a*`, `star{lit{a}}`}, + {`a+`, `plus{lit{a}}`}, + {`a?`, `que{lit{a}}`}, + {`a{2}`, `rep{2,2 lit{a}}`}, + {`a{2,3}`, `rep{2,3 lit{a}}`}, + {`a{2,}`, `rep{2,-1 lit{a}}`}, + {`a*?`, `nstar{lit{a}}`}, + {`a+?`, `nplus{lit{a}}`}, + {`a??`, `nque{lit{a}}`}, + {`a{2}?`, `nrep{2,2 lit{a}}`}, + {`a{2,3}?`, `nrep{2,3 lit{a}}`}, + {`a{2,}?`, `nrep{2,-1 lit{a}}`}, + {``, `emp{}`}, + {`|`, `emp{}`}, // alt{emp{}emp{}} but got factored + {`|x|`, `alt{emp{}lit{x}emp{}}`}, + {`.`, `dot{}`}, + {`^`, `bol{}`}, + {`$`, `eol{}`}, + {`\|`, `lit{|}`}, + {`\(`, `lit{(}`}, + {`\)`, `lit{)}`}, + {`\*`, `lit{*}`}, + {`\+`, `lit{+}`}, + {`\?`, `lit{?}`}, + {`{`, `lit{{}`}, + {`}`, `lit{}}`}, + {`\.`, `lit{.}`}, + {`\^`, `lit{^}`}, + {`\$`, `lit{$}`}, + {`\\`, `lit{\}`}, + {`[ace]`, `cc{0x61 0x63 0x65}`}, + {`[abc]`, `cc{0x61-0x63}`}, + {`[a-z]`, `cc{0x61-0x7a}`}, + {`[a]`, `lit{a}`}, + {`\-`, `lit{-}`}, + {`-`, `lit{-}`}, + {`\_`, `lit{_}`}, + {`abc`, `str{abc}`}, + {`abc|def`, `alt{str{abc}str{def}}`}, + {`abc|def|ghi`, `alt{str{abc}str{def}str{ghi}}`}, + + // Posix and Perl extensions + {`[[:lower:]]`, `cc{0x61-0x7a}`}, + {`[a-z]`, `cc{0x61-0x7a}`}, + {`[^[:lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`}, + {`[[:^lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`}, + {`(?i)[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`}, + {`(?i)[a-z]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`}, + {`(?i)[^[:lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, + {`(?i)[[:^lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, + {`\d`, `cc{0x30-0x39}`}, + {`\D`, 
`cc{0x0-0x2f 0x3a-0x10ffff}`}, + {`\s`, `cc{0x9-0xa 0xc-0xd 0x20}`}, + {`\S`, `cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}`}, + {`\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}`}, + {`\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}`}, + {`(?i)\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}`}, + {`(?i)\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, + {`[^\\]`, `cc{0x0-0x5b 0x5d-0x10ffff}`}, + // { `\C`, `byte{}` }, // probably never + + // Unicode, negatives, and a double negative. + {`\p{Braille}`, `cc{0x2800-0x28ff}`}, + {`\P{Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, + {`\p{^Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, + {`\P{^Braille}`, `cc{0x2800-0x28ff}`}, + {`\pZ`, `cc{0x20 0xa0 0x1680 0x180e 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`}, + {`[\p{Braille}]`, `cc{0x2800-0x28ff}`}, + {`[\P{Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, + {`[\p{^Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, + {`[\P{^Braille}]`, `cc{0x2800-0x28ff}`}, + {`[\pZ]`, `cc{0x20 0xa0 0x1680 0x180e 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`}, + {`\p{Lu}`, mkCharClass(unicode.IsUpper)}, + {`[\p{Lu}]`, mkCharClass(unicode.IsUpper)}, + {`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)}, + + // Hex, octal. + {`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`}, + {`[\x{41}-\x7a]\x61`, `cat{cc{0x41-0x7a}lit{a}}`}, + + // More interesting regular expressions. + {`a{,2}`, `str{a{,2}}`}, + {`\.\^\$\\`, `str{.^$\}`}, + {`[a-zABC]`, `cc{0x41-0x43 0x61-0x7a}`}, + {`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`}, + {`[α-ε☺]`, `cc{0x3b1-0x3b5 0x263a}`}, // utf-8 + {`a*{`, `cat{star{lit{a}}lit{{}}`}, + + // Test precedences + {`(?:ab)*`, `star{str{ab}}`}, + {`(ab)*`, `star{cap{str{ab}}}`}, + {`ab|cd`, `alt{str{ab}str{cd}}`}, + {`a(b|c)d`, `cat{lit{a}cap{cc{0x62-0x63}}lit{d}}`}, + + // Test flattening. + {`(?:a)`, `lit{a}`}, + {`(?:ab)(?:cd)`, `str{abcd}`}, + {`(?:a+b+)(?:c+d+)`, `cat{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`}, + {`(?:a+|b+)|(?:c+|d+)`, `alt{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`}, + {`(?:a|b)|(?:c|d)`, `cc{0x61-0x64}`}, + {`a|.`, `dot{}`}, + {`.|a`, `dot{}`}, + {`(?:[abc]|A|Z|hello|world)`, `alt{cc{0x41 0x5a 0x61-0x63}str{hello}str{world}}`}, + {`(?:[abc]|A|Z)`, `cc{0x41 0x5a 0x61-0x63}`}, + + // Test Perl quoted literals + {`\Q+|*?{[\E`, `str{+|*?{[}`}, + {`\Q+\E+`, `plus{lit{+}}`}, + {`\Q\\E`, `lit{\}`}, + {`\Q\\\E`, `str{\\}`}, + + // Test Perl \A and \z + {`(?m)^`, `bol{}`}, + {`(?m)$`, `eol{}`}, + {`(?-m)^`, `bot{}`}, + {`(?-m)$`, `eot{}`}, + {`(?m)\A`, `bot{}`}, + {`(?m)\z`, `eot{\z}`}, + {`(?-m)\A`, `bot{}`}, + {`(?-m)\z`, `eot{\z}`}, + + // Test named captures + {`(?P<name>a)`, `cap{name:lit{a}}`}, + + // Case-folded literals + {`[Aa]`, `litfold{A}`}, + {`[\x{100}\x{101}]`, `litfold{Ā}`}, + {`[Δδ]`, `litfold{Δ}`}, + + // Strings + {`abcde`, `str{abcde}`}, + {`[Aa][Bb]cd`, `cat{strfold{AB}str{cd}}`}, + + // Factoring. + {`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`}, + {`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}cc{0x79-0x7a}}cat{plus{lit{y}}lit{w}}}}`}, +} + +const testFlags = MatchNL | PerlX | UnicodeGroups + +// Test Parse -> Dump. 
+func TestParseDump(t *testing.T) { + for _, tt := range parseTests { + re, err := Parse(tt.Regexp, testFlags) + if err != nil { + t.Errorf("Parse(%#q): %v", tt.Regexp, err) + continue + } + d := dump(re) + if d != tt.Dump { + t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump) + } + } +} + +// dump prints a string representation of the regexp showing +// the structure explicitly. +func dump(re *Regexp) string { + var b bytes.Buffer + dumpRegexp(&b, re) + return b.String() +} + +var opNames = []string{ + OpNoMatch: "no", + OpEmptyMatch: "emp", + OpLiteral: "lit", + OpCharClass: "cc", + OpAnyCharNotNL: "dnl", + OpAnyChar: "dot", + OpBeginLine: "bol", + OpEndLine: "eol", + OpBeginText: "bot", + OpEndText: "eot", + OpWordBoundary: "wb", + OpNoWordBoundary: "nwb", + OpCapture: "cap", + OpStar: "star", + OpPlus: "plus", + OpQuest: "que", + OpRepeat: "rep", + OpConcat: "cat", + OpAlternate: "alt", +} + +// dumpRegexp writes an encoding of the syntax tree for the regexp re to b. +// It is used during testing to distinguish between parses that might print +// the same using re's String method. +func dumpRegexp(b *bytes.Buffer, re *Regexp) { + if int(re.Op) >= len(opNames) || opNames[re.Op] == "" { + fmt.Fprintf(b, "op%d", re.Op) + } else { + switch re.Op { + default: + b.WriteString(opNames[re.Op]) + case OpStar, OpPlus, OpQuest, OpRepeat: + if re.Flags&NonGreedy != 0 { + b.WriteByte('n') + } + b.WriteString(opNames[re.Op]) + case OpLiteral: + if len(re.Rune) > 1 { + b.WriteString("str") + } else { + b.WriteString("lit") + } + if re.Flags&FoldCase != 0 { + for _, r := range re.Rune { + if unicode.SimpleFold(r) != r { + b.WriteString("fold") + break + } + } + } + } + } + b.WriteByte('{') + switch re.Op { + case OpEndText: + if re.Flags&WasDollar == 0 { + b.WriteString(`\z`) + } + case OpLiteral: + for _, r := range re.Rune { + b.WriteRune(r) + } + case OpConcat, OpAlternate: + for _, sub := range re.Sub { + dumpRegexp(b, sub) + } + case OpStar, OpPlus, OpQuest: + dumpRegexp(b, re.Sub[0]) + case OpRepeat: + fmt.Fprintf(b, "%d,%d ", re.Min, re.Max) + dumpRegexp(b, re.Sub[0]) + case OpCapture: + if re.Name != "" { + b.WriteString(re.Name) + b.WriteByte(':') + } + dumpRegexp(b, re.Sub[0]) + case OpCharClass: + sep := "" + for i := 0; i < len(re.Rune); i += 2 { + b.WriteString(sep) + sep = " " + lo, hi := re.Rune[i], re.Rune[i+1] + if lo == hi { + fmt.Fprintf(b, "%#x", lo) + } else { + fmt.Fprintf(b, "%#x-%#x", lo, hi) + } + } + } + b.WriteByte('}') +} + +func mkCharClass(f func(int) bool) string { + re := &Regexp{Op: OpCharClass} + lo := -1 + for i := 0; i <= unicode.MaxRune; i++ { + if f(i) { + if lo < 0 { + lo = i + } + } else { + if lo >= 0 { + re.Rune = append(re.Rune, lo, i-1) + lo = -1 + } + } + } + if lo >= 0 { + re.Rune = append(re.Rune, lo, unicode.MaxRune) + } + return dump(re) +} + +func isUpperFold(rune int) bool { + if unicode.IsUpper(rune) { + return true + } + c := unicode.SimpleFold(rune) + for c != rune { + if unicode.IsUpper(c) { + return true + } + c = unicode.SimpleFold(c) + } + return false +} + +func TestFoldConstants(t *testing.T) { + last := -1 + for i := 0; i <= unicode.MaxRune; i++ { + if unicode.SimpleFold(i) == i { + continue + } + if last == -1 && minFold != i { + t.Errorf("minFold=%#U should be %#U", minFold, i) + } + last = i + } + if maxFold != last { + t.Errorf("maxFold=%#U should be %#U", maxFold, last) + } +} + +func TestAppendRangeCollapse(t *testing.T) { + // AppendRange should collapse each of the new ranges + // into the earlier ones (it looks back 
two ranges), so that + // the slice never grows very large. + // Note that we are not calling cleanClass. + var r []int + for i := 'A'; i <= 'Z'; i++ { + r = appendRange(r, i, i) + r = appendRange(r, i+'a'-'A', i+'a'-'A') + } + if string(r) != "AZaz" { + t.Errorf("appendRange interlaced A-Z a-z = %s, want AZaz", string(r)) + } +} diff --git a/libgo/go/exp/regexp/syntax/perl_groups.go b/libgo/go/exp/regexp/syntax/perl_groups.go new file mode 100644 index 0000000..05b392c --- /dev/null +++ b/libgo/go/exp/regexp/syntax/perl_groups.go @@ -0,0 +1,130 @@ +// GENERATED BY make_perl_groups.pl; DO NOT EDIT. +// make_perl_groups.pl >perl_groups.go + +package syntax + +var code1 = []int{ /* \d */ + 0x30, 0x39, +} + +var code2 = []int{ /* \s */ + 0x9, 0xa, + 0xc, 0xd, + 0x20, 0x20, +} + +var code3 = []int{ /* \w */ + 0x30, 0x39, + 0x41, 0x5a, + 0x5f, 0x5f, + 0x61, 0x7a, +} + +var perlGroup = map[string]charGroup{ + `\d`: {+1, code1}, + `\D`: {-1, code1}, + `\s`: {+1, code2}, + `\S`: {-1, code2}, + `\w`: {+1, code3}, + `\W`: {-1, code3}, +} +var code4 = []int{ /* [:alnum:] */ + 0x30, 0x39, + 0x41, 0x5a, + 0x61, 0x7a, +} + +var code5 = []int{ /* [:alpha:] */ + 0x41, 0x5a, + 0x61, 0x7a, +} + +var code6 = []int{ /* [:ascii:] */ + 0x0, 0x7f, +} + +var code7 = []int{ /* [:blank:] */ + 0x9, 0x9, + 0x20, 0x20, +} + +var code8 = []int{ /* [:cntrl:] */ + 0x0, 0x1f, + 0x7f, 0x7f, +} + +var code9 = []int{ /* [:digit:] */ + 0x30, 0x39, +} + +var code10 = []int{ /* [:graph:] */ + 0x21, 0x7e, +} + +var code11 = []int{ /* [:lower:] */ + 0x61, 0x7a, +} + +var code12 = []int{ /* [:print:] */ + 0x20, 0x7e, +} + +var code13 = []int{ /* [:punct:] */ + 0x21, 0x2f, + 0x3a, 0x40, + 0x5b, 0x60, + 0x7b, 0x7e, +} + +var code14 = []int{ /* [:space:] */ + 0x9, 0xd, + 0x20, 0x20, +} + +var code15 = []int{ /* [:upper:] */ + 0x41, 0x5a, +} + +var code16 = []int{ /* [:word:] */ + 0x30, 0x39, + 0x41, 0x5a, + 0x5f, 0x5f, + 0x61, 0x7a, +} + +var code17 = []int{ /* [:xdigit:] */ + 0x30, 0x39, + 0x41, 0x46, + 0x61, 0x66, +} + +var posixGroup = map[string]charGroup{ + `[:alnum:]`: {+1, code4}, + `[:^alnum:]`: {-1, code4}, + `[:alpha:]`: {+1, code5}, + `[:^alpha:]`: {-1, code5}, + `[:ascii:]`: {+1, code6}, + `[:^ascii:]`: {-1, code6}, + `[:blank:]`: {+1, code7}, + `[:^blank:]`: {-1, code7}, + `[:cntrl:]`: {+1, code8}, + `[:^cntrl:]`: {-1, code8}, + `[:digit:]`: {+1, code9}, + `[:^digit:]`: {-1, code9}, + `[:graph:]`: {+1, code10}, + `[:^graph:]`: {-1, code10}, + `[:lower:]`: {+1, code11}, + `[:^lower:]`: {-1, code11}, + `[:print:]`: {+1, code12}, + `[:^print:]`: {-1, code12}, + `[:punct:]`: {+1, code13}, + `[:^punct:]`: {-1, code13}, + `[:space:]`: {+1, code14}, + `[:^space:]`: {-1, code14}, + `[:upper:]`: {+1, code15}, + `[:^upper:]`: {-1, code15}, + `[:word:]`: {+1, code16}, + `[:^word:]`: {-1, code16}, + `[:xdigit:]`: {+1, code17}, + `[:^xdigit:]`: {-1, code17}, +} diff --git a/libgo/go/exp/regexp/syntax/prog.go b/libgo/go/exp/regexp/syntax/prog.go new file mode 100644 index 0000000..bf85b72 --- /dev/null +++ b/libgo/go/exp/regexp/syntax/prog.go @@ -0,0 +1,237 @@ +package syntax + +import ( + "bytes" + "strconv" +) + +// Compiled program. +// May not belong in this package, but convenient for now. + +// A Prog is a compiled regular expression program. +type Prog struct { + Inst []Inst + Start int // index of start instruction + NumCap int // number of InstCapture insts in re +} + +// An InstOp is an instruction opcode. 
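+// The opcodes follow re2's machine: InstAlt and InstAltMatch branch to
+// the two successors Out and Arg, InstCapture records the current input
+// position in capture slot Arg, InstEmptyWidth asserts the zero-width
+// conditions encoded in Arg (see EmptyOp), InstMatch and InstFail stop
+// with success or failure, InstNop only forwards to Out, and InstRune
+// matches the next input rune against the instruction's Rune ranges.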
+type InstOp uint8 + +const ( + InstAlt InstOp = iota + InstAltMatch + InstCapture + InstEmptyWidth + InstMatch + InstFail + InstNop + InstRune +) + +// An EmptyOp specifies a kind or mixture of zero-width assertions. +type EmptyOp uint8 + +const ( + EmptyBeginLine EmptyOp = 1 << iota + EmptyEndLine + EmptyBeginText + EmptyEndText + EmptyWordBoundary + EmptyNoWordBoundary +) + +// An Inst is a single instruction in a regular expression program. +type Inst struct { + Op InstOp + Out uint32 // all but InstMatch, InstFail + Arg uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth + Rune []int +} + +func (p *Prog) String() string { + var b bytes.Buffer + dumpProg(&b, p) + return b.String() +} + +// skipNop follows any no-op or capturing instructions +// and returns the resulting pc. +func (p *Prog) skipNop(pc uint32) *Inst { + i := &p.Inst[pc] + for i.Op == InstNop || i.Op == InstCapture { + pc = i.Out + i = &p.Inst[pc] + } + return i +} + +// Prefix returns a literal string that all matches for the +// regexp must start with. Complete is true if the prefix +// is the entire match. +func (p *Prog) Prefix() (prefix string, complete bool) { + i := p.skipNop(uint32(p.Start)) + + // Avoid allocation of buffer if prefix is empty. + if i.Op != InstRune || len(i.Rune) != 1 { + return "", i.Op == InstMatch + } + + // Have prefix; gather characters. + var buf bytes.Buffer + for i.Op == InstRune && len(i.Rune) == 1 { + buf.WriteRune(i.Rune[0]) + i = p.skipNop(i.Out) + } + return buf.String(), i.Op == InstMatch +} + +// StartCond returns the leading empty-width conditions that must +// be true in any match. It returns ^EmptyOp(0) if no matches are possible. +func (p *Prog) StartCond() EmptyOp { + var flag EmptyOp + pc := uint32(p.Start) + i := &p.Inst[pc] +Loop: + for { + switch i.Op { + case InstEmptyWidth: + flag |= EmptyOp(i.Arg) + case InstFail: + return ^EmptyOp(0) + case InstCapture, InstNop: + // skip + default: + break Loop + } + pc = i.Out + i = &p.Inst[pc] + } + return flag +} + +// MatchRune returns true if the instruction matches (and consumes) r. +// It should only be called when i.Op == InstRune. +func (i *Inst) MatchRune(r int) bool { + rune := i.Rune + + // Special case: single-rune slice is from literal string, not char class. + // TODO: Case folding. + if len(rune) == 1 { + return r == rune[0] + } + + // Peek at the first few pairs. + // Should handle ASCII well. + for j := 0; j < len(rune) && j <= 8; j += 2 { + if r < rune[j] { + return false + } + if r <= rune[j+1] { + return true + } + } + + // Otherwise binary search. + lo := 0 + hi := len(rune) / 2 + for lo < hi { + m := lo + (hi-lo)/2 + if c := rune[2*m]; c <= r { + if r <= rune[2*m+1] { + return true + } + lo = m + 1 + } else { + hi = m + } + } + return false +} + +// As per re2's Prog::IsWordChar. Determines whether rune is an ASCII word char. +// Since we act on runes, it would be easy to support Unicode here. +func wordRune(rune int) bool { + return rune == '_' || + ('A' <= rune && rune <= 'Z') || + ('a' <= rune && rune <= 'z') || + ('0' <= rune && rune <= '9') +} + +// MatchEmptyWidth returns true if the instruction matches +// an empty string between the runes before and after. +// It should only be called when i.Op == InstEmptyWidth. 
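+// A value of -1 for before or after stands for the boundary of the text,
+// so EmptyBeginText requires before == -1 and EmptyEndText requires
+// after == -1, while EmptyWordBoundary holds when exactly one of the two
+// runes is an ASCII word character as classified by wordRune.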
+func (i *Inst) MatchEmptyWidth(before int, after int) bool { + switch EmptyOp(i.Arg) { + case EmptyBeginLine: + return before == '\n' || before == -1 + case EmptyEndLine: + return after == '\n' || after == -1 + case EmptyBeginText: + return before == -1 + case EmptyEndText: + return after == -1 + case EmptyWordBoundary: + return wordRune(before) != wordRune(after) + case EmptyNoWordBoundary: + return wordRune(before) == wordRune(after) + } + panic("unknown empty width arg") +} + +func (i *Inst) String() string { + var b bytes.Buffer + dumpInst(&b, i) + return b.String() +} + +func bw(b *bytes.Buffer, args ...string) { + for _, s := range args { + b.WriteString(s) + } +} + +func dumpProg(b *bytes.Buffer, p *Prog) { + for j := range p.Inst { + i := &p.Inst[j] + pc := strconv.Itoa(j) + if len(pc) < 3 { + b.WriteString(" "[len(pc):]) + } + if j == p.Start { + pc += "*" + } + bw(b, pc, "\t") + dumpInst(b, i) + bw(b, "\n") + } +} + +func u32(i uint32) string { + return strconv.Uitoa64(uint64(i)) +} + +func dumpInst(b *bytes.Buffer, i *Inst) { + switch i.Op { + case InstAlt: + bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg)) + case InstAltMatch: + bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg)) + case InstCapture: + bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out)) + case InstEmptyWidth: + bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out)) + case InstMatch: + bw(b, "match") + case InstFail: + bw(b, "fail") + case InstNop: + bw(b, "nop -> ", u32(i.Out)) + case InstRune: + if i.Rune == nil { + // shouldn't happen + bw(b, "rune <nil>") + } + bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out)) + } +} diff --git a/libgo/go/exp/regexp/syntax/prog_test.go b/libgo/go/exp/regexp/syntax/prog_test.go new file mode 100644 index 0000000..7be4281 --- /dev/null +++ b/libgo/go/exp/regexp/syntax/prog_test.go @@ -0,0 +1,91 @@ +package syntax + +import ( + "testing" +) + +var compileTests = []struct { + Regexp string + Prog string +}{ + {"a", ` 0 fail + 1* rune "a" -> 2 + 2 match +`}, + {"[A-M][n-z]", ` 0 fail + 1* rune "AM" -> 2 + 2 rune "nz" -> 3 + 3 match +`}, + {"", ` 0 fail + 1* nop -> 2 + 2 match +`}, + {"a?", ` 0 fail + 1 rune "a" -> 3 + 2* alt -> 1, 3 + 3 match +`}, + {"a??", ` 0 fail + 1 rune "a" -> 3 + 2* alt -> 3, 1 + 3 match +`}, + {"a+", ` 0 fail + 1* rune "a" -> 2 + 2 alt -> 1, 3 + 3 match +`}, + {"a+?", ` 0 fail + 1* rune "a" -> 2 + 2 alt -> 3, 1 + 3 match +`}, + {"a*", ` 0 fail + 1 rune "a" -> 2 + 2* alt -> 1, 3 + 3 match +`}, + {"a*?", ` 0 fail + 1 rune "a" -> 2 + 2* alt -> 3, 1 + 3 match +`}, + {"a+b+", ` 0 fail + 1* rune "a" -> 2 + 2 alt -> 1, 3 + 3 rune "b" -> 4 + 4 alt -> 3, 5 + 5 match +`}, + {"(a+)(b+)", ` 0 fail + 1* cap 2 -> 2 + 2 rune "a" -> 3 + 3 alt -> 2, 4 + 4 cap 3 -> 5 + 5 cap 4 -> 6 + 6 rune "b" -> 7 + 7 alt -> 6, 8 + 8 cap 5 -> 9 + 9 match +`}, + {"a+|b+", ` 0 fail + 1 rune "a" -> 2 + 2 alt -> 1, 6 + 3 rune "b" -> 4 + 4 alt -> 3, 6 + 5* alt -> 1, 3 + 6 match +`}, +} + +func TestCompile(t *testing.T) { + for _, tt := range compileTests { + re, _ := Parse(tt.Regexp, Perl) + p, _ := Compile(re) + s := p.String() + if s != tt.Prog { + t.Errorf("compiled %#q:\n--- have\n%s---\n--- want\n%s---", tt.Regexp, s, tt.Prog) + } + } +} diff --git a/libgo/go/exp/regexp/syntax/regexp.go b/libgo/go/exp/regexp/syntax/regexp.go new file mode 100644 index 0000000..00a4add --- /dev/null +++ b/libgo/go/exp/regexp/syntax/regexp.go @@ -0,0 +1,284 @@ +// Copyright 2011 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package syntax parses regular expressions into syntax trees. +// WORK IN PROGRESS. +package syntax + +// Note to implementers: +// In this package, re is always a *Regexp and r is always a rune. + +import ( + "bytes" + "strconv" + "strings" + "unicode" +) + +// A Regexp is a node in a regular expression syntax tree. +type Regexp struct { + Op Op // operator + Flags Flags + Sub []*Regexp // subexpressions, if any + Sub0 [1]*Regexp // storage for short Sub + Rune []int // matched runes, for OpLiteral, OpCharClass + Rune0 [2]int // storage for short Rune + Min, Max int // min, max for OpRepeat + Cap int // capturing index, for OpCapture + Name string // capturing name, for OpCapture +} + +// An Op is a single regular expression operator. +type Op uint8 + +// Operators are listed in precedence order, tightest binding to weakest. +// Character class operators are listed simplest to most complex +// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar). + +const ( + OpNoMatch Op = 1 + iota // matches no strings + OpEmptyMatch // matches empty string + OpLiteral // matches Runes sequence + OpCharClass // matches Runes interpreted as range pair list + OpAnyCharNotNL // matches any character + OpAnyChar // matches any character + OpBeginLine // matches empty string at beginning of line + OpEndLine // matches empty string at end of line + OpBeginText // matches empty string at beginning of text + OpEndText // matches empty string at end of text + OpWordBoundary // matches word boundary `\b` + OpNoWordBoundary // matches word non-boundary `\B` + OpCapture // capturing subexpression with index Cap, optional name Name + OpStar // matches Sub[0] zero or more times + OpPlus // matches Sub[0] one or more times + OpQuest // matches Sub[0] zero or one times + OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit) + OpConcat // matches concatenation of Subs + OpAlternate // matches alternation of Subs +) + +const opPseudo Op = 128 // where pseudo-ops start + +// Equal returns true if x and y have identical structure. +func (x *Regexp) Equal(y *Regexp) bool { + if x == nil || y == nil { + return x == y + } + if x.Op != y.Op { + return false + } + switch x.Op { + case OpEndText: + // The parse flags remember whether this is \z or \Z. + if x.Flags&WasDollar != y.Flags&WasDollar { + return false + } + + case OpLiteral, OpCharClass: + if len(x.Rune) != len(y.Rune) { + return false + } + for i, r := range x.Rune { + if r != y.Rune[i] { + return false + } + } + + case OpAlternate, OpConcat: + if len(x.Sub) != len(y.Sub) { + return false + } + for i, sub := range x.Sub { + if !sub.Equal(y.Sub[i]) { + return false + } + } + + case OpStar, OpPlus, OpQuest: + if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) { + return false + } + + case OpRepeat: + if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) { + return false + } + + case OpCapture: + if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) { + return false + } + } + return true +} + +// writeRegexp writes the Perl syntax for the regular expression re to b. 
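+// A repetition operator wraps its operand in a non-capturing (?: ) group
+// when the operand is itself a repetition, concatenation, or alternation,
+// so the tree for `(?:ab|cd)+` prints back as `(?:ab|cd)+`, while a
+// repeated literal or character class such as `a+` or `[a-z]*` needs no
+// extra parentheses; a concatenation likewise parenthesizes any
+// alternation among its elements.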
+func writeRegexp(b *bytes.Buffer, re *Regexp) { + switch re.Op { + default: + b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">") + case OpNoMatch: + b.WriteString(`[^\x00-\x{10FFFF}]`) + case OpEmptyMatch: + b.WriteString(`(?:)`) + case OpLiteral: + if re.Flags&FoldCase != 0 { + b.WriteString(`(?i:`) + } + for _, r := range re.Rune { + escape(b, r, false) + } + if re.Flags&FoldCase != 0 { + b.WriteString(`)`) + } + case OpCharClass: + if len(re.Rune)%2 != 0 { + b.WriteString(`[invalid char class]`) + break + } + b.WriteRune('[') + if len(re.Rune) == 0 { + b.WriteString(`^\x00-\x{10FFFF}`) + } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune { + // Contains 0 and MaxRune. Probably a negated class. + // Print the gaps. + b.WriteRune('^') + for i := 1; i < len(re.Rune)-1; i += 2 { + lo, hi := re.Rune[i]+1, re.Rune[i+1]-1 + escape(b, lo, lo == '-') + if lo != hi { + b.WriteRune('-') + escape(b, hi, hi == '-') + } + } + } else { + for i := 0; i < len(re.Rune); i += 2 { + lo, hi := re.Rune[i], re.Rune[i+1] + escape(b, lo, lo == '-') + if lo != hi { + b.WriteRune('-') + escape(b, hi, hi == '-') + } + } + } + b.WriteRune(']') + case OpAnyCharNotNL: + b.WriteString(`[^\n]`) + case OpAnyChar: + b.WriteRune('.') + case OpBeginLine: + b.WriteRune('^') + case OpEndLine: + b.WriteRune('$') + case OpBeginText: + b.WriteString(`\A`) + case OpEndText: + b.WriteString(`\z`) + case OpWordBoundary: + b.WriteString(`\b`) + case OpNoWordBoundary: + b.WriteString(`\B`) + case OpCapture: + if re.Name != "" { + b.WriteString(`(?P<`) + b.WriteString(re.Name) + b.WriteRune('>') + } else { + b.WriteRune('(') + } + if re.Sub[0].Op != OpEmptyMatch { + writeRegexp(b, re.Sub[0]) + } + b.WriteRune(')') + case OpStar, OpPlus, OpQuest, OpRepeat: + if sub := re.Sub[0]; sub.Op > OpCapture { + b.WriteString(`(?:`) + writeRegexp(b, sub) + b.WriteString(`)`) + } else { + writeRegexp(b, sub) + } + switch re.Op { + case OpStar: + b.WriteRune('*') + case OpPlus: + b.WriteRune('+') + case OpQuest: + b.WriteRune('?') + case OpRepeat: + b.WriteRune('{') + b.WriteString(strconv.Itoa(re.Min)) + if re.Max != re.Min { + b.WriteRune(',') + if re.Max >= 0 { + b.WriteString(strconv.Itoa(re.Max)) + } + } + b.WriteRune('}') + } + case OpConcat: + for _, sub := range re.Sub { + if sub.Op == OpAlternate { + b.WriteString(`(?:`) + writeRegexp(b, sub) + b.WriteString(`)`) + } else { + writeRegexp(b, sub) + } + } + case OpAlternate: + for i, sub := range re.Sub { + if i > 0 { + b.WriteRune('|') + } + writeRegexp(b, sub) + } + } +} + +func (re *Regexp) String() string { + var b bytes.Buffer + writeRegexp(&b, re) + return b.String() +} + +const meta = `\.+*?()|[]{}^$` + +func escape(b *bytes.Buffer, r int, force bool) { + if unicode.IsPrint(r) { + if strings.IndexRune(meta, r) >= 0 || force { + b.WriteRune('\\') + } + b.WriteRune(r) + return + } + + switch r { + case '\a': + b.WriteString(`\a`) + case '\f': + b.WriteString(`\f`) + case '\n': + b.WriteString(`\n`) + case '\r': + b.WriteString(`\r`) + case '\t': + b.WriteString(`\t`) + case '\v': + b.WriteString(`\v`) + default: + if r < 0x100 { + b.WriteString(`\x`) + s := strconv.Itob(r, 16) + if len(s) == 1 { + b.WriteRune('0') + } + b.WriteString(s) + break + } + b.WriteString(`\x{`) + b.WriteString(strconv.Itob(r, 16)) + b.WriteString(`}`) + } +} diff --git a/libgo/go/exp/regexp/syntax/simplify.go b/libgo/go/exp/regexp/syntax/simplify.go new file mode 100644 index 0000000..7239041 --- /dev/null +++ b/libgo/go/exp/regexp/syntax/simplify.go @@ -0,0 +1,151 @@ +// 
Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package syntax + +// Simplify returns a regexp equivalent to re but without counted repetitions +// and with various other simplifications, such as rewriting /(?:a+)+/ to /a+/. +// The resulting regexp will execute correctly but its string representation +// will not produce the same parse tree, because capturing parentheses +// may have been duplicated or removed. For example, the simplified form +// for /(x){1,2}/ is /(x)(x)?/ but both parentheses capture as $1. +// The returned regexp may share structure with or be the original. +func (re *Regexp) Simplify() *Regexp { + if re == nil { + return nil + } + switch re.Op { + case OpCapture, OpConcat, OpAlternate: + // Simplify children, building new Regexp if children change. + nre := re + for i, sub := range re.Sub { + nsub := sub.Simplify() + if nre == re && nsub != sub { + // Start a copy. + nre = new(Regexp) + *nre = *re + nre.Rune = nil + nre.Sub = append(nre.Sub0[:0], re.Sub[:i]...) + } + if nre != re { + nre.Sub = append(nre.Sub, nsub) + } + } + return nre + + case OpStar, OpPlus, OpQuest: + sub := re.Sub[0].Simplify() + return simplify1(re.Op, re.Flags, sub, re) + + case OpRepeat: + // Special special case: x{0} matches the empty string + // and doesn't even need to consider x. + if re.Min == 0 && re.Max == 0 { + return &Regexp{Op: OpEmptyMatch} + } + + // The fun begins. + sub := re.Sub[0].Simplify() + + // x{n,} means at least n matches of x. + if re.Max == -1 { + // Special case: x{0,} is x*. + if re.Min == 0 { + return simplify1(OpStar, re.Flags, sub, nil) + } + + // Special case: x{1,} is x+. + if re.Min == 1 { + return simplify1(OpPlus, re.Flags, sub, nil) + } + + // General case: x{4,} is xxxx+. + nre := &Regexp{Op: OpConcat} + nre.Sub = nre.Sub0[:0] + for i := 0; i < re.Min-1; i++ { + nre.Sub = append(nre.Sub, sub) + } + nre.Sub = append(nre.Sub, simplify1(OpPlus, re.Flags, sub, nil)) + return nre + } + + // Special case x{0} handled above. + + // Special case: x{1} is just x. + if re.Min == 1 && re.Max == 1 { + return sub + } + + // General case: x{n,m} means n copies of x and m copies of x? + // The machine will do less work if we nest the final m copies, + // so that x{2,5} = xx(x(x(x)?)?)? + + // Build leading prefix: xx. + var prefix *Regexp + if re.Min > 0 { + prefix = &Regexp{Op: OpConcat} + prefix.Sub = prefix.Sub0[:0] + for i := 0; i < re.Min; i++ { + prefix.Sub = append(prefix.Sub, sub) + } + } + + // Build and attach suffix: (x(x(x)?)?)? + if re.Max > re.Min { + suffix := simplify1(OpQuest, re.Flags, sub, nil) + for i := re.Min + 1; i < re.Max; i++ { + nre2 := &Regexp{Op: OpConcat} + nre2.Sub = append(nre2.Sub0[:0], sub, suffix) + suffix = simplify1(OpQuest, re.Flags, nre2, nil) + } + if prefix == nil { + return suffix + } + prefix.Sub = append(prefix.Sub, suffix) + } + if prefix != nil { + return prefix + } + + // Some degenerate case like min > max or min < max < 0. + // Handle as impossible match. + return &Regexp{Op: OpNoMatch} + } + + return re +} + +// simplify1 implements Simplify for the unary OpStar, +// OpPlus, and OpQuest operators. It returns the simple regexp +// equivalent to +// +// Regexp{Op: op, Flags: flags, Sub: {sub}} +// +// under the assumption that sub is already simple, and +// without first allocating that structure. If the regexp +// to be returned turns out to be equivalent to re, simplify1 +// returns re instead. 
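+// For example, when Simplify rewrites (?:a+)+ it calls simplify1 with
+// OpPlus and the already-simplified operand a+, and the idempotence
+// check in the body returns that a+ unchanged instead of allocating a
+// nested plus.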
+// +// simplify1 is factored out of Simplify because the implementation +// for other operators generates these unary expressions. +// Letting them call simplify1 makes sure the expressions they +// generate are simple. +func simplify1(op Op, flags Flags, sub, re *Regexp) *Regexp { + // Special case: repeat the empty string as much as + // you want, but it's still the empty string. + if sub.Op == OpEmptyMatch { + return sub + } + // The operators are idempotent if the flags match. + if op == sub.Op && flags&NonGreedy == sub.Flags&NonGreedy { + return sub + } + if re != nil && re.Op == op && re.Flags&NonGreedy == flags&NonGreedy && sub == re.Sub[0] { + return re + } + + re = &Regexp{Op: op, Flags: flags} + re.Sub = append(re.Sub0[:0], sub) + return re +} diff --git a/libgo/go/exp/regexp/syntax/simplify_test.go b/libgo/go/exp/regexp/syntax/simplify_test.go new file mode 100644 index 0000000..c8cec21 --- /dev/null +++ b/libgo/go/exp/regexp/syntax/simplify_test.go @@ -0,0 +1,151 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package syntax + +import "testing" + +var simplifyTests = []struct { + Regexp string + Simple string +}{ + // Already-simple constructs + {`a`, `a`}, + {`ab`, `ab`}, + {`a|b`, `[a-b]`}, + {`ab|cd`, `ab|cd`}, + {`(ab)*`, `(ab)*`}, + {`(ab)+`, `(ab)+`}, + {`(ab)?`, `(ab)?`}, + {`.`, `.`}, + {`^`, `^`}, + {`$`, `$`}, + {`[ac]`, `[ac]`}, + {`[^ac]`, `[^ac]`}, + + // Posix character classes + {`[[:alnum:]]`, `[0-9A-Za-z]`}, + {`[[:alpha:]]`, `[A-Za-z]`}, + {`[[:blank:]]`, `[\t ]`}, + {`[[:cntrl:]]`, `[\x00-\x1f\x7f]`}, + {`[[:digit:]]`, `[0-9]`}, + {`[[:graph:]]`, `[!-~]`}, + {`[[:lower:]]`, `[a-z]`}, + {`[[:print:]]`, `[ -~]`}, + {`[[:punct:]]`, "[!-/:-@\\[-`\\{-~]"}, + {`[[:space:]]`, `[\t-\r ]`}, + {`[[:upper:]]`, `[A-Z]`}, + {`[[:xdigit:]]`, `[0-9A-Fa-f]`}, + + // Perl character classes + {`\d`, `[0-9]`}, + {`\s`, `[\t-\n\f-\r ]`}, + {`\w`, `[0-9A-Z_a-z]`}, + {`\D`, `[^0-9]`}, + {`\S`, `[^\t-\n\f-\r ]`}, + {`\W`, `[^0-9A-Z_a-z]`}, + {`[\d]`, `[0-9]`}, + {`[\s]`, `[\t-\n\f-\r ]`}, + {`[\w]`, `[0-9A-Z_a-z]`}, + {`[\D]`, `[^0-9]`}, + {`[\S]`, `[^\t-\n\f-\r ]`}, + {`[\W]`, `[^0-9A-Z_a-z]`}, + + // Posix repetitions + {`a{1}`, `a`}, + {`a{2}`, `aa`}, + {`a{5}`, `aaaaa`}, + {`a{0,1}`, `a?`}, + // The next three are illegible because Simplify inserts (?:) + // parens instead of () parens to avoid creating extra + // captured subexpressions. The comments show a version with fewer parens. + {`(a){0,2}`, `(?:(a)(a)?)?`}, // (aa?)? + {`(a){0,4}`, `(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // (a(a(aa?)?)?)? + {`(a){2,6}`, `(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // aa(a(a(aa?)?)?)? + {`a{0,2}`, `(?:aa?)?`}, // (aa?)? + {`a{0,4}`, `(?:a(?:a(?:aa?)?)?)?`}, // (a(a(aa?)?)?)? + {`a{2,6}`, `aa(?:a(?:a(?:aa?)?)?)?`}, // aa(a(a(aa?)?)?)? + {`a{0,}`, `a*`}, + {`a{1,}`, `a+`}, + {`a{2,}`, `aa+`}, + {`a{5,}`, `aaaaa+`}, + + // Test that operators simplify their arguments. 
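+	// Note that (?:a{1,})* stays (?:a+)* rather than collapsing further,
+	// because simplify1 only drops a repetition whose operand uses the
+	// same operator with the same greediness.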
+ {`(?:a{1,}){1,}`, `a+`}, + {`(a{1,}b{1,})`, `(a+b+)`}, + {`a{1,}|b{1,}`, `a+|b+`}, + {`(?:a{1,})*`, `(?:a+)*`}, + {`(?:a{1,})+`, `a+`}, + {`(?:a{1,})?`, `(?:a+)?`}, + {``, `(?:)`}, + {`a{0}`, `(?:)`}, + + // Character class simplification + {`[ab]`, `[a-b]`}, + {`[a-za-za-z]`, `[a-z]`}, + {`[A-Za-zA-Za-z]`, `[A-Za-z]`}, + {`[ABCDEFGH]`, `[A-H]`}, + {`[AB-CD-EF-GH]`, `[A-H]`}, + {`[W-ZP-XE-R]`, `[E-Z]`}, + {`[a-ee-gg-m]`, `[a-m]`}, + {`[a-ea-ha-m]`, `[a-m]`}, + {`[a-ma-ha-e]`, `[a-m]`}, + {`[a-zA-Z0-9 -~]`, `[ -~]`}, + + // Empty character classes + {`[^[:cntrl:][:^cntrl:]]`, `[^\x00-\x{10FFFF}]`}, + + // Full character classes + {`[[:cntrl:][:^cntrl:]]`, `.`}, + + // Unicode case folding. + {`(?i)A`, `(?i:A)`}, + {`(?i)a`, `(?i:a)`}, + {`(?i)[A]`, `(?i:A)`}, + {`(?i)[a]`, `(?i:A)`}, + {`(?i)K`, `(?i:K)`}, + {`(?i)k`, `(?i:k)`}, + {`(?i)\x{212a}`, "(?i:\u212A)"}, + {`(?i)[K]`, "[Kk\u212A]"}, + {`(?i)[k]`, "[Kk\u212A]"}, + {`(?i)[\x{212a}]`, "[Kk\u212A]"}, + {`(?i)[a-z]`, "[A-Za-z\u017F\u212A]"}, + {`(?i)[\x00-\x{FFFD}]`, "[\\x00-\uFFFD]"}, + {`(?i)[\x00-\x{10FFFF}]`, `.`}, + + // Empty string as a regular expression. + // The empty string must be preserved inside parens in order + // to make submatches work right, so these tests are less + // interesting than they might otherwise be. String inserts + // explicit (?:) in place of non-parenthesized empty strings, + // to make them easier to spot for other parsers. + {`(a|b|)`, `([a-b]|(?:))`}, + {`(|)`, `()`}, + {`a()`, `a()`}, + {`(()|())`, `(()|())`}, + {`(a|)`, `(a|(?:))`}, + {`ab()cd()`, `ab()cd()`}, + {`()`, `()`}, + {`()*`, `()*`}, + {`()+`, `()+`}, + {`()?`, `()?`}, + {`(){0}`, `(?:)`}, + {`(){1}`, `()`}, + {`(){1,}`, `()+`}, + {`(){0,2}`, `(?:()()?)?`}, +} + +func TestSimplify(t *testing.T) { + for _, tt := range simplifyTests { + re, err := Parse(tt.Regexp, MatchNL|Perl&^OneLine) + if err != nil { + t.Errorf("Parse(%#q) = error %v", tt.Regexp, err) + continue + } + s := re.Simplify().String() + if s != tt.Simple { + t.Errorf("Simplify(%#q) = %#q, want %#q", tt.Regexp, s, tt.Simple) + } + } +} |
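To see how the pieces added in this patch fit together, here is a minimal sketch of driving the syntax package by hand. Parse, Simplify, Compile, and the Regexp/Prog String methods are the ones shown in the diff above; the import path exp/regexp/syntax and the printed simplified form are assumptions based on this tree's layout and on the simplify test table.

package main

import (
	"fmt"

	"exp/regexp/syntax" // assumed import path for this experimental tree
)

func main() {
	// Parse a Perl-flavored pattern into a syntax tree.
	re, err := syntax.Parse("a{2,4}", syntax.Perl)
	if err != nil {
		fmt.Println("parse error:", err)
		return
	}

	// Simplify rewrites the counted repetition; following the pattern of
	// the a{2,6} test case above, a{2,4} should print as aa(?:aa?)?.
	simple := re.Simplify()
	fmt.Println(simple)

	// Compile the simplified tree into a Prog and dump its instructions
	// in the same format used by prog_test.go.
	prog, err := syntax.Compile(simple)
	if err != nil {
		fmt.Println("compile error:", err)
		return
	}
	fmt.Print(prog)
}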