aboutsummaryrefslogtreecommitdiff
path: root/libgo/go/regexp
diff options
context:
space:
mode:
authorIan Lance Taylor <ian@gcc.gnu.org>2011-03-16 23:05:44 +0000
committerIan Lance Taylor <ian@gcc.gnu.org>2011-03-16 23:05:44 +0000
commit5133f00ef8baab894d92de1e8b8baae59815a8b6 (patch)
tree44176975832a3faf1626836e70c97d5edd674122 /libgo/go/regexp
parentf617201f55938fc89b532f2240bdf77bea946471 (diff)
downloadgcc-5133f00ef8baab894d92de1e8b8baae59815a8b6.zip
gcc-5133f00ef8baab894d92de1e8b8baae59815a8b6.tar.gz
gcc-5133f00ef8baab894d92de1e8b8baae59815a8b6.tar.bz2
Update to current version of Go library (revision 94d654be2064).
From-SVN: r171076
Diffstat (limited to 'libgo/go/regexp')
-rw-r--r--libgo/go/regexp/all_test.go8
-rw-r--r--libgo/go/regexp/find_test.go17
-rw-r--r--libgo/go/regexp/regexp.go296
3 files changed, 242 insertions, 79 deletions
diff --git a/libgo/go/regexp/all_test.go b/libgo/go/regexp/all_test.go
index aed7330..c7ee4c8 100644
--- a/libgo/go/regexp/all_test.go
+++ b/libgo/go/regexp/all_test.go
@@ -316,9 +316,9 @@ func TestNumSubexp(t *testing.T) {
}
func BenchmarkLiteral(b *testing.B) {
- x := strings.Repeat("x", 50)
+ x := strings.Repeat("x", 50) + "y"
b.StopTimer()
- re := MustCompile(x)
+ re := MustCompile("y")
b.StartTimer()
for i := 0; i < b.N; i++ {
if !re.MatchString(x) {
@@ -329,9 +329,9 @@ func BenchmarkLiteral(b *testing.B) {
}
func BenchmarkNotLiteral(b *testing.B) {
- x := strings.Repeat("x", 49)
+ x := strings.Repeat("x", 50) + "y"
b.StopTimer()
- re := MustCompile("^" + x)
+ re := MustCompile(".y")
b.StartTimer()
for i := 0; i < b.N; i++ {
if !re.MatchString(x) {
diff --git a/libgo/go/regexp/find_test.go b/libgo/go/regexp/find_test.go
index 1690711..83b249e 100644
--- a/libgo/go/regexp/find_test.go
+++ b/libgo/go/regexp/find_test.go
@@ -6,6 +6,7 @@ package regexp
import (
"fmt"
+ "strings"
"testing"
)
@@ -191,6 +192,12 @@ func TestFindStringIndex(t *testing.T) {
}
}
+func TestFindReaderIndex(t *testing.T) {
+ for _, test := range findTests {
+ testFindIndex(&test, MustCompile(test.pat).FindReaderIndex(strings.NewReader(test.text)), t)
+ }
+}
+
// Now come the simple All cases.
func TestFindAll(t *testing.T) {
@@ -381,12 +388,18 @@ func TestFindSubmatchIndex(t *testing.T) {
}
}
-func TestFindStringSubmatchndex(t *testing.T) {
+func TestFindStringSubmatchIndex(t *testing.T) {
for _, test := range findTests {
testFindSubmatchIndex(&test, MustCompile(test.pat).FindStringSubmatchIndex(test.text), t)
}
}
+func TestFindReaderSubmatchIndex(t *testing.T) {
+ for _, test := range findTests {
+ testFindSubmatchIndex(&test, MustCompile(test.pat).FindReaderSubmatchIndex(strings.NewReader(test.text)), t)
+ }
+}
+
// Now come the monster AllSubmatch cases.
func TestFindAllSubmatch(t *testing.T) {
@@ -452,7 +465,7 @@ func TestFindAllSubmatchIndex(t *testing.T) {
}
}
-func TestFindAllStringSubmatchndex(t *testing.T) {
+func TestFindAllStringSubmatchIndex(t *testing.T) {
for _, test := range findTests {
testFindAllSubmatchIndex(&test, MustCompile(test.pat).FindAllStringSubmatchIndex(test.text, -1), t)
}
diff --git a/libgo/go/regexp/regexp.go b/libgo/go/regexp/regexp.go
index d274ccd..e3221ac 100644
--- a/libgo/go/regexp/regexp.go
+++ b/libgo/go/regexp/regexp.go
@@ -54,6 +54,16 @@
// text of the match/submatch. If an index is negative, it means that
// subexpression did not match any string in the input.
//
+// There is also a subset of the methods that can be applied to text read
+// from a RuneReader:
+//
+// MatchReader, FindReaderIndex, FindReaderSubmatchIndex
+//
+// This set may grow. Note that regular expression matches may need to
+// examine text beyond the text returned by a match, so the methods that
+// match text from a RuneReader may read arbitrarily far into the input
+// before returning.
+//
// (There are a few other methods that do not match this pattern.)
//
package regexp
@@ -231,13 +241,13 @@ func (p *parser) error(err Error) {
panic(err)
}
-const endOfFile = -1
+const endOfText = -1
func (p *parser) c() int { return p.ch }
func (p *parser) nextc() int {
if p.pos >= len(p.re.expr) {
- p.ch = endOfFile
+ p.ch = endOfText
} else {
c, w := utf8.DecodeRuneInString(p.re.expr[p.pos:])
p.ch = c
@@ -288,7 +298,7 @@ func (p *parser) checkBackslash() int {
if c == '\\' {
c = p.nextc()
switch {
- case c == endOfFile:
+ case c == endOfText:
p.error(ErrExtraneousBackslash)
case ispunct(c):
// c is as delivered
@@ -311,7 +321,7 @@ func (p *parser) charClass() *instr {
left := -1
for {
switch c := p.c(); c {
- case ']', endOfFile:
+ case ']', endOfText:
if left >= 0 {
p.error(ErrBadRange)
}
@@ -356,7 +366,7 @@ func (p *parser) charClass() *instr {
func (p *parser) term() (start, end *instr) {
switch c := p.c(); c {
- case '|', endOfFile:
+ case '|', endOfText:
return nil, nil
case '*', '+', '?':
p.error(ErrBareClosure)
@@ -638,8 +648,11 @@ func (re *Regexp) NumSubexp() int { return re.nbra }
// match vectors away as we execute. Matches are ref counted and returned
// to a free list when no longer active. Increases a simple benchmark by 22X.
type matchArena struct {
- head *matchVec
- len int // length of match vector
+ head *matchVec
+ len int // length of match vector
+ pos int
+ atBOT bool // whether we're at beginning of text
+ atEOT bool // whether we're at end of text
}
type matchVec struct {
@@ -699,21 +712,21 @@ type state struct {
// Append new state to to-do list. Leftmost-longest wins so avoid
// adding a state that's already active. The matchVec will be inc-ref'ed
// if it is assigned to a state.
-func (a *matchArena) addState(s []state, inst *instr, prefixed bool, match *matchVec, pos, end int) []state {
+func (a *matchArena) addState(s []state, inst *instr, prefixed bool, match *matchVec) []state {
switch inst.kind {
case iBOT:
- if pos == 0 {
- s = a.addState(s, inst.next, prefixed, match, pos, end)
+ if a.atBOT {
+ s = a.addState(s, inst.next, prefixed, match)
}
return s
case iEOT:
- if pos == end {
- s = a.addState(s, inst.next, prefixed, match, pos, end)
+ if a.atEOT {
+ s = a.addState(s, inst.next, prefixed, match)
}
return s
case iBra:
- match.m[inst.braNum] = pos
- s = a.addState(s, inst.next, prefixed, match, pos, end)
+ match.m[inst.braNum] = a.pos
+ s = a.addState(s, inst.next, prefixed, match)
return s
}
l := len(s)
@@ -727,62 +740,157 @@ func (a *matchArena) addState(s []state, inst *instr, prefixed bool, match *matc
s = append(s, state{inst, prefixed, match})
match.ref++
if inst.kind == iAlt {
- s = a.addState(s, inst.left, prefixed, a.copy(match), pos, end)
+ s = a.addState(s, inst.left, prefixed, a.copy(match))
// give other branch a copy of this match vector
- s = a.addState(s, inst.next, prefixed, a.copy(match), pos, end)
+ s = a.addState(s, inst.next, prefixed, a.copy(match))
}
return s
}
-// Accepts either string or bytes - the logic is identical either way.
-// If bytes == nil, scan str.
-func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
+// input abstracts different representations of the input text. It provides
+// one-character lookahead.
+type input interface {
+ step(pos int) (rune int, width int) // advance one rune
+ canCheckPrefix() bool // can we look ahead without losing info?
+ hasPrefix(re *Regexp) bool
+ index(re *Regexp, pos int) int
+}
+
+// inputString scans a string.
+type inputString struct {
+ str string
+}
+
+func newInputString(str string) *inputString {
+ return &inputString{str: str}
+}
+
+func (i *inputString) step(pos int) (int, int) {
+ if pos < len(i.str) {
+ return utf8.DecodeRuneInString(i.str[pos:len(i.str)])
+ }
+ return endOfText, 0
+}
+
+func (i *inputString) canCheckPrefix() bool {
+ return true
+}
+
+func (i *inputString) hasPrefix(re *Regexp) bool {
+ return strings.HasPrefix(i.str, re.prefix)
+}
+
+func (i *inputString) index(re *Regexp, pos int) int {
+ return strings.Index(i.str[pos:], re.prefix)
+}
+
+// inputBytes scans a byte slice.
+type inputBytes struct {
+ str []byte
+}
+
+func newInputBytes(str []byte) *inputBytes {
+ return &inputBytes{str: str}
+}
+
+func (i *inputBytes) step(pos int) (int, int) {
+ if pos < len(i.str) {
+ return utf8.DecodeRune(i.str[pos:len(i.str)])
+ }
+ return endOfText, 0
+}
+
+func (i *inputBytes) canCheckPrefix() bool {
+ return true
+}
+
+func (i *inputBytes) hasPrefix(re *Regexp) bool {
+ return bytes.HasPrefix(i.str, re.prefixBytes)
+}
+
+func (i *inputBytes) index(re *Regexp, pos int) int {
+ return bytes.Index(i.str[pos:], re.prefixBytes)
+}
+
+// inputReader scans a RuneReader.
+type inputReader struct {
+ r io.RuneReader
+ atEOT bool
+ pos int
+}
+
+func newInputReader(r io.RuneReader) *inputReader {
+ return &inputReader{r: r}
+}
+
+func (i *inputReader) step(pos int) (int, int) {
+ if !i.atEOT && pos != i.pos {
+ return endOfText, 0
+
+ }
+ r, w, err := i.r.ReadRune()
+ if err != nil {
+ i.atEOT = true
+ return endOfText, 0
+ }
+ i.pos += w
+ return r, w
+}
+
+func (i *inputReader) canCheckPrefix() bool {
+ return false
+}
+
+func (i *inputReader) hasPrefix(re *Regexp) bool {
+ return false
+}
+
+func (i *inputReader) index(re *Regexp, pos int) int {
+ return -1
+}
+
+// Search match starting from pos bytes into the input.
+func (re *Regexp) doExecute(i input, pos int) []int {
var s [2][]state
s[0] = make([]state, 0, 10)
s[1] = make([]state, 0, 10)
in, out := 0, 1
var final state
found := false
- end := len(str)
- if bytestr != nil {
- end = len(bytestr)
- }
anchored := re.inst[0].next.kind == iBOT
if anchored && pos > 0 {
return nil
}
// fast check for initial plain substring
- if re.prefix != "" {
+ if i.canCheckPrefix() && re.prefix != "" {
advance := 0
if anchored {
- if bytestr == nil {
- if !strings.HasPrefix(str, re.prefix) {
- return nil
- }
- } else {
- if !bytes.HasPrefix(bytestr, re.prefixBytes) {
- return nil
- }
+ if !i.hasPrefix(re) {
+ return nil
}
} else {
- if bytestr == nil {
- advance = strings.Index(str[pos:], re.prefix)
- } else {
- advance = bytes.Index(bytestr[pos:], re.prefixBytes)
+ advance = i.index(re, pos)
+ if advance == -1 {
+ return nil
}
}
- if advance == -1 {
- return nil
- }
pos += advance
}
- arena := &matchArena{nil, 2 * (re.nbra + 1)}
- for startPos := pos; pos <= end; {
+ // We look one character ahead so we can match $, which checks whether
+ // we are at EOT.
+ nextChar, nextWidth := i.step(pos)
+ arena := &matchArena{
+ len: 2 * (re.nbra + 1),
+ pos: pos,
+ atBOT: pos == 0,
+ atEOT: nextChar == endOfText,
+ }
+ for c, startPos := 0, pos; c != endOfText; {
if !found && (pos == startPos || !anchored) {
// prime the pump if we haven't seen a match yet
match := arena.noMatch()
match.m[0] = pos
- s[out] = arena.addState(s[out], re.start.next, false, match, pos, end)
+ s[out] = arena.addState(s[out], re.start.next, false, match)
arena.free(match) // if addState saved it, ref was incremented
} else if len(s[out]) == 0 {
// machine has completed
@@ -795,35 +903,32 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
arena.free(state.match)
}
s[out] = old[0:0] // truncate state vector
- charwidth := 1
- c := endOfFile
- if pos < end {
- if bytestr == nil {
- c, charwidth = utf8.DecodeRuneInString(str[pos:end])
- } else {
- c, charwidth = utf8.DecodeRune(bytestr[pos:end])
- }
- }
- pos += charwidth
+ c = nextChar
+ thisPos := pos
+ pos += nextWidth
+ nextChar, nextWidth = i.step(pos)
+ arena.atEOT = nextChar == endOfText
+ arena.atBOT = false
+ arena.pos = pos
for _, st := range s[in] {
switch st.inst.kind {
case iBOT:
case iEOT:
case iChar:
if c == st.inst.char {
- s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end)
+ s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match)
}
case iCharClass:
if st.inst.cclass.matches(c) {
- s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end)
+ s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match)
}
case iAny:
- if c != endOfFile {
- s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end)
+ if c != endOfText {
+ s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match)
}
case iNotNL:
- if c != endOfFile && c != '\n' {
- s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end)
+ if c != endOfText && c != '\n' {
+ s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match)
}
case iBra:
case iAlt:
@@ -831,13 +936,13 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
// choose leftmost longest
if !found || // first
st.match.m[0] < final.match.m[0] || // leftmost
- (st.match.m[0] == final.match.m[0] && pos-charwidth > final.match.m[1]) { // longest
+ (st.match.m[0] == final.match.m[0] && thisPos > final.match.m[1]) { // longest
if final.match != nil {
arena.free(final.match)
}
final = st
final.match.ref++
- final.match.m[1] = pos - charwidth
+ final.match.m[1] = thisPos
}
found = true
default:
@@ -874,14 +979,31 @@ func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
return string(c[:i]), true
}
+// MatchReader returns whether the Regexp matches the text read by the
+// RuneReader. The return value is a boolean: true for match, false for no
+// match.
+func (re *Regexp) MatchReader(r io.RuneReader) bool {
+ return len(re.doExecute(newInputReader(r), 0)) > 0
+}
+
// MatchString returns whether the Regexp matches the string s.
// The return value is a boolean: true for match, false for no match.
-func (re *Regexp) MatchString(s string) bool { return len(re.doExecute(s, nil, 0)) > 0 }
+func (re *Regexp) MatchString(s string) bool { return len(re.doExecute(newInputString(s), 0)) > 0 }
// Match returns whether the Regexp matches the byte slice b.
// The return value is a boolean: true for match, false for no match.
-func (re *Regexp) Match(b []byte) bool { return len(re.doExecute("", b, 0)) > 0 }
+func (re *Regexp) Match(b []byte) bool { return len(re.doExecute(newInputBytes(b), 0)) > 0 }
+// MatchReader checks whether a textual regular expression matches the text
+// read by the RuneReader. More complicated queries need to use Compile and
+// the full Regexp interface.
+func MatchReader(pattern string, r io.RuneReader) (matched bool, error os.Error) {
+ re, err := Compile(pattern)
+ if err != nil {
+ return false, err
+ }
+ return re.MatchReader(r), nil
+}
// MatchString checks whether a textual regular expression
// matches a string. More complicated queries need
@@ -921,7 +1043,7 @@ func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) str
searchPos := 0 // position where we next look for a match
buf := new(bytes.Buffer)
for searchPos <= len(src) {
- a := re.doExecute(src, nil, searchPos)
+ a := re.doExecute(newInputString(src), searchPos)
if len(a) == 0 {
break // no more matches
}
@@ -973,7 +1095,7 @@ func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
searchPos := 0 // position where we next look for a match
buf := new(bytes.Buffer)
for searchPos <= len(src) {
- a := re.doExecute("", src, searchPos)
+ a := re.doExecute(newInputBytes(src), searchPos)
if len(a) == 0 {
break // no more matches
}
@@ -1038,7 +1160,13 @@ func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
}
for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; {
- matches := re.doExecute(s, b, pos)
+ var in input
+ if b == nil {
+ in = newInputString(s)
+ } else {
+ in = newInputBytes(b)
+ }
+ matches := re.doExecute(in, pos)
if len(matches) == 0 {
break
}
@@ -1052,6 +1180,7 @@ func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
accept = false
}
var width int
+ // TODO: use step()
if b == nil {
_, width = utf8.DecodeRuneInString(s[pos:end])
} else {
@@ -1077,7 +1206,7 @@ func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
// Find returns a slice holding the text of the leftmost match in b of the regular expression.
// A return value of nil indicates no match.
func (re *Regexp) Find(b []byte) []byte {
- a := re.doExecute("", b, 0)
+ a := re.doExecute(newInputBytes(b), 0)
if a == nil {
return nil
}
@@ -1089,7 +1218,7 @@ func (re *Regexp) Find(b []byte) []byte {
// b[loc[0]:loc[1]].
// A return value of nil indicates no match.
func (re *Regexp) FindIndex(b []byte) (loc []int) {
- a := re.doExecute("", b, 0)
+ a := re.doExecute(newInputBytes(b), 0)
if a == nil {
return nil
}
@@ -1102,7 +1231,7 @@ func (re *Regexp) FindIndex(b []byte) (loc []int) {
// an empty string. Use FindStringIndex or FindStringSubmatch if it is
// necessary to distinguish these cases.
func (re *Regexp) FindString(s string) string {
- a := re.doExecute(s, nil, 0)
+ a := re.doExecute(newInputString(s), 0)
if a == nil {
return ""
}
@@ -1114,7 +1243,19 @@ func (re *Regexp) FindString(s string) string {
// itself is at s[loc[0]:loc[1]].
// A return value of nil indicates no match.
func (re *Regexp) FindStringIndex(s string) []int {
- a := re.doExecute(s, nil, 0)
+ a := re.doExecute(newInputString(s), 0)
+ if a == nil {
+ return nil
+ }
+ return a[0:2]
+}
+
+// FindReaderIndex returns a two-element slice of integers defining the
+// location of the leftmost match of the regular expression in text read from
+// the RuneReader. The match itself is at s[loc[0]:loc[1]]. A return
+// value of nil indicates no match.
+func (re *Regexp) FindReaderIndex(r io.RuneReader) []int {
+ a := re.doExecute(newInputReader(r), 0)
if a == nil {
return nil
}
@@ -1127,7 +1268,7 @@ func (re *Regexp) FindStringIndex(s string) []int {
// comment.
// A return value of nil indicates no match.
func (re *Regexp) FindSubmatch(b []byte) [][]byte {
- a := re.doExecute("", b, 0)
+ a := re.doExecute(newInputBytes(b), 0)
if a == nil {
return nil
}
@@ -1146,7 +1287,7 @@ func (re *Regexp) FindSubmatch(b []byte) [][]byte {
// in the package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindSubmatchIndex(b []byte) []int {
- return re.doExecute("", b, 0)
+ return re.doExecute(newInputBytes(b), 0)
}
// FindStringSubmatch returns a slice of strings holding the text of the
@@ -1155,7 +1296,7 @@ func (re *Regexp) FindSubmatchIndex(b []byte) []int {
// package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindStringSubmatch(s string) []string {
- a := re.doExecute(s, nil, 0)
+ a := re.doExecute(newInputString(s), 0)
if a == nil {
return nil
}
@@ -1174,7 +1315,16 @@ func (re *Regexp) FindStringSubmatch(s string) []string {
// 'Index' descriptions in the package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindStringSubmatchIndex(s string) []int {
- return re.doExecute(s, nil, 0)
+ return re.doExecute(newInputString(s), 0)
+}
+
+// FindReaderSubmatchIndex returns a slice holding the index pairs
+// identifying the leftmost match of the regular expression of text read by
+// the RuneReader, and the matches, if any, of its subexpressions, as defined
+// by the 'Submatch' and 'Index' descriptions in the package comment. A
+// return value of nil indicates no match.
+func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
+ return re.doExecute(newInputReader(r), 0)
}
const startSize = 10 // The size at which to start a slice in the 'All' routines.