diff options
Diffstat (limited to 'libgo/go/regexp')
-rw-r--r-- | libgo/go/regexp/all_test.go | 80 | ||||
-rw-r--r-- | libgo/go/regexp/find_test.go | 7 | ||||
-rw-r--r-- | libgo/go/regexp/regexp.go | 132 |
3 files changed, 159 insertions, 60 deletions
diff --git a/libgo/go/regexp/all_test.go b/libgo/go/regexp/all_test.go index d5a0e7d..aed7330 100644 --- a/libgo/go/regexp/all_test.go +++ b/libgo/go/regexp/all_test.go @@ -38,6 +38,8 @@ type stringError struct { var bad_re = []stringError{ {`*`, ErrBareClosure}, + {`+`, ErrBareClosure}, + {`?`, ErrBareClosure}, {`(abc`, ErrUnmatchedLpar}, {`abc)`, ErrUnmatchedRpar}, {`x[a-z`, ErrUnmatchedLbkt}, @@ -47,7 +49,6 @@ var bad_re = []stringError{ {`a**`, ErrBadClosure}, {`a*+`, ErrBadClosure}, {`a??`, ErrBadClosure}, - {`*`, ErrBareClosure}, {`\x`, ErrBadBackslash}, } @@ -229,18 +230,21 @@ func TestReplaceAllFunc(t *testing.T) { } } -type QuoteMetaTest struct { - pattern, output string +type MetaTest struct { + pattern, output, literal string + isLiteral bool } -var quoteMetaTests = []QuoteMetaTest{ - {``, ``}, - {`foo`, `foo`}, - {`!@#$%^&*()_+-=[{]}\|,<.>/?~`, `!@#\$%\^&\*\(\)_\+-=\[{\]}\\\|,<\.>/\?~`}, +var metaTests = []MetaTest{ + {``, ``, ``, true}, + {`foo`, `foo`, `foo`, true}, + {`foo\.\$`, `foo\\\.\\\$`, `foo.$`, true}, // has meta but no operator + {`foo.\$`, `foo\.\\\$`, `foo`, false}, // has escaped operators and real operators + {`!@#$%^&*()_+-=[{]}\|,<.>/?~`, `!@#\$%\^&\*\(\)_\+-=\[{\]}\\\|,<\.>/\?~`, `!@#`, false}, } func TestQuoteMeta(t *testing.T) { - for _, tc := range quoteMetaTests { + for _, tc := range metaTests { // Verify that QuoteMeta returns the expected string. quoted := QuoteMeta(tc.pattern) if quoted != tc.output { @@ -269,6 +273,20 @@ func TestQuoteMeta(t *testing.T) { } } +func TestLiteralPrefix(t *testing.T) { + for _, tc := range metaTests { + // Literal method needs to scan the pattern. + re := MustCompile(tc.pattern) + str, complete := re.LiteralPrefix() + if complete != tc.isLiteral { + t.Errorf("LiteralPrefix(`%s`) = %t; want %t", tc.pattern, complete, tc.isLiteral) + } + if str != tc.literal { + t.Errorf("LiteralPrefix(`%s`) = `%s`; want `%s`", tc.pattern, str, tc.literal) + } + } +} + type numSubexpCase struct { input string expected int @@ -360,3 +378,49 @@ func BenchmarkReplaceAll(b *testing.B) { re.ReplaceAllString(x, "") } } + +func BenchmarkAnchoredLiteralShortNonMatch(b *testing.B) { + b.StopTimer() + x := []byte("abcdefghijklmnopqrstuvwxyz") + re := MustCompile("^zbc(d|e)") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkAnchoredLiteralLongNonMatch(b *testing.B) { + b.StopTimer() + x := []byte("abcdefghijklmnopqrstuvwxyz") + for i := 0; i < 15; i++ { + x = append(x, x...) + } + re := MustCompile("^zbc(d|e)") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkAnchoredShortMatch(b *testing.B) { + b.StopTimer() + x := []byte("abcdefghijklmnopqrstuvwxyz") + re := MustCompile("^.bc(d|e)") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkAnchoredLongMatch(b *testing.B) { + b.StopTimer() + x := []byte("abcdefghijklmnopqrstuvwxyz") + for i := 0; i < 15; i++ { + x = append(x, x...) + } + re := MustCompile("^.bc(d|e)") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} diff --git a/libgo/go/regexp/find_test.go b/libgo/go/regexp/find_test.go index 07f5586..1690711 100644 --- a/libgo/go/regexp/find_test.go +++ b/libgo/go/regexp/find_test.go @@ -78,6 +78,7 @@ var findTests = []FindTest{ {`axxb$`, "axxcb", nil}, {`data`, "daXY data", build(1, 5, 9)}, {`da(.)a$`, "daXY data", build(1, 5, 9, 7, 8)}, + {`zx+`, "zzx", build(1, 1, 3)}, // can backslash-escape any punctuation {`\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~`, @@ -119,7 +120,11 @@ func build(n int, x ...int) [][]int { func TestFind(t *testing.T) { for _, test := range findTests { - result := MustCompile(test.pat).Find([]byte(test.text)) + re := MustCompile(test.pat) + if re.String() != test.pat { + t.Errorf("String() = `%s`; should be `%s`", re.String(), test.pat) + } + result := re.Find([]byte(test.text)) switch { case len(test.matches) == 0 && len(result) == 0: // ok diff --git a/libgo/go/regexp/regexp.go b/libgo/go/regexp/regexp.go index 2c041cd..d274ccd 100644 --- a/libgo/go/regexp/regexp.go +++ b/libgo/go/regexp/regexp.go @@ -283,6 +283,24 @@ func escape(c int) int { return -1 } +func (p *parser) checkBackslash() int { + c := p.c() + if c == '\\' { + c = p.nextc() + switch { + case c == endOfFile: + p.error(ErrExtraneousBackslash) + case ispunct(c): + // c is as delivered + case escape(c) >= 0: + c = int(escaped[escape(c)]) + default: + p.error(ErrBadBackslash) + } + } + return c +} + func (p *parser) charClass() *instr { i := newCharClass() cc := i.cclass @@ -314,20 +332,8 @@ func (p *parser) charClass() *instr { return i case '-': // do this before backslash processing p.error(ErrBadRange) - case '\\': - c = p.nextc() - switch { - case c == endOfFile: - p.error(ErrExtraneousBackslash) - case ispunct(c): - // c is as delivered - case escape(c) >= 0: - c = int(escaped[escape(c)]) - default: - p.error(ErrBadBackslash) - } - fallthrough default: + c = p.checkBackslash() p.nextc() switch { case left < 0: // first of pair @@ -345,14 +351,14 @@ func (p *parser) charClass() *instr { } } } - return nil + panic("unreachable") } func (p *parser) term() (start, end *instr) { switch c := p.c(); c { case '|', endOfFile: return nil, nil - case '*', '+': + case '*', '+', '?': p.error(ErrBareClosure) case ')': if p.nlpar == 0 { @@ -407,20 +413,8 @@ func (p *parser) term() (start, end *instr) { } bra.next = start return bra, ebra - case '\\': - c = p.nextc() - switch { - case c == endOfFile: - p.error(ErrExtraneousBackslash) - case ispunct(c): - // c is as delivered - case escape(c) >= 0: - c = int(escaped[escape(c)]) - default: - p.error(ErrBadBackslash) - } - fallthrough default: + c = p.checkBackslash() p.nextc() start = &instr{kind: iChar, char: c} p.re.add(start) @@ -571,15 +565,20 @@ func (re *Regexp) doParse() { } } -// Extract regular text from the beginning of the pattern. +// Extract regular text from the beginning of the pattern, +// possibly after a leading iBOT. // That text can be used by doExecute to speed up matching. func (re *Regexp) setPrefix() { var b []byte var utf = make([]byte, utf8.UTFMax) var inst *instr - // First instruction is start; skip that. + // First instruction is start; skip that. Also skip any initial iBOT. + inst = re.inst[0].next + for inst.kind == iBOT { + inst = inst.next + } Loop: - for inst = re.inst[0].next; inst.kind != iEnd; inst = inst.next { + for ; inst.kind != iEnd; inst = inst.next { // stop if this is not a char if inst.kind != iChar { break @@ -590,7 +589,7 @@ Loop: case iBOT, iEOT, iAlt: break Loop } - n := utf8.EncodeRune(inst.char, utf) + n := utf8.EncodeRune(utf, inst.char) b = append(b, utf[0:n]...) } // point prefixStart instruction to first non-CHAR after prefix @@ -599,6 +598,11 @@ Loop: re.prefix = string(b) } +// String returns the source text used to compile the regular expression. +func (re *Regexp) String() string { + return re.expr +} + // Compile parses a regular expression and returns, if successful, a Regexp // object that can be used to match against text. func Compile(str string) (regexp *Regexp, error os.Error) { @@ -743,34 +747,46 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int { if bytestr != nil { end = len(bytestr) } + anchored := re.inst[0].next.kind == iBOT + if anchored && pos > 0 { + return nil + } // fast check for initial plain substring - prefixed := false // has this iteration begun by skipping a prefix? if re.prefix != "" { - var advance int - if bytestr == nil { - advance = strings.Index(str[pos:], re.prefix) + advance := 0 + if anchored { + if bytestr == nil { + if !strings.HasPrefix(str, re.prefix) { + return nil + } + } else { + if !bytes.HasPrefix(bytestr, re.prefixBytes) { + return nil + } + } } else { - advance = bytes.Index(bytestr[pos:], re.prefixBytes) + if bytestr == nil { + advance = strings.Index(str[pos:], re.prefix) + } else { + advance = bytes.Index(bytestr[pos:], re.prefixBytes) + } } if advance == -1 { return nil } - pos += advance + len(re.prefix) - prefixed = true + pos += advance } arena := &matchArena{nil, 2 * (re.nbra + 1)} - for pos <= end { - if !found { + for startPos := pos; pos <= end; { + if !found && (pos == startPos || !anchored) { // prime the pump if we haven't seen a match yet match := arena.noMatch() match.m[0] = pos - if prefixed { - s[out] = arena.addState(s[out], re.prefixStart, true, match, pos, end) - prefixed = false // next iteration should start at beginning of machine. - } else { - s[out] = arena.addState(s[out], re.start.next, false, match, pos, end) - } + s[out] = arena.addState(s[out], re.start.next, false, match, pos, end) arena.free(match) // if addState saved it, ref was incremented + } else if len(s[out]) == 0 { + // machine has completed + break } in, out = out, in // old out state is new in state // clear out old state @@ -779,10 +795,6 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int { arena.free(state.match) } s[out] = old[0:0] // truncate state vector - if found && len(s[in]) == 0 { - // machine has completed - break - } charwidth := 1 c := endOfFile if pos < end { @@ -844,6 +856,24 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int { return final.match.m } +// LiteralPrefix returns a literal string that must begin any match +// of the regular expression re. It returns the boolean true if the +// literal string comprises the entire regular expression. +func (re *Regexp) LiteralPrefix() (prefix string, complete bool) { + c := make([]int, len(re.inst)-2) // minus start and end. + // First instruction is start; skip that. + i := 0 + for inst := re.inst[0].next; inst.kind != iEnd; inst = inst.next { + // stop if this is not a char + if inst.kind != iChar { + return string(c[:i]), false + } + c[i] = inst.char + i++ + } + return string(c[:i]), true +} + // MatchString returns whether the Regexp matches the string s. // The return value is a boolean: true for match, false for no match. func (re *Regexp) MatchString(s string) bool { return len(re.doExecute(s, nil, 0)) > 0 } |