diff options
Diffstat (limited to 'libgo/go/regexp/regexp.go')
-rw-r--r-- | libgo/go/regexp/regexp.go | 284 |
1 file changed, 220 insertions, 64 deletions
diff --git a/libgo/go/regexp/regexp.go b/libgo/go/regexp/regexp.go index 7aebd37..54c5377 100644 --- a/libgo/go/regexp/regexp.go +++ b/libgo/go/regexp/regexp.go @@ -61,6 +61,7 @@ import ( "strconv" "strings" "sync" + "unicode" "unicode/utf8" ) @@ -416,41 +417,79 @@ func Match(pattern string, b []byte) (matched bool, error error) { return re.Match(b), nil } -// ReplaceAllString returns a copy of src in which all matches for the Regexp -// have been replaced by repl. No support is provided for expressions -// (e.g. \1 or $1) in the replacement string. +// ReplaceAllString returns a copy of src, replacing matches of the Regexp +// with the replacement string repl. Inside repl, $ signs are interpreted as +// in Expand, so for instance $1 represents the text of the first submatch. func (re *Regexp) ReplaceAllString(src, repl string) string { - return re.ReplaceAllStringFunc(src, func(string) string { return repl }) + n := 2 + if strings.Index(repl, "$") >= 0 { + n = 2 * (re.numSubexp + 1) + } + b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte { + return re.expand(dst, repl, nil, src, match) + }) + return string(b) } -// ReplaceAllStringFunc returns a copy of src in which all matches for the -// Regexp have been replaced by the return value of of function repl (whose -// first argument is the matched string). No support is provided for -// expressions (e.g. \1 or $1) in the replacement string. +// ReplaceAllLiteralString returns a copy of src, replacing matches of the Regexp +// with the replacement string repl. The replacement repl is substituted directly, +// without using Expand. +func (re *Regexp) ReplaceAllLiteralString(src, repl string) string { + return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { + return append(dst, repl...) + })) +} + +// ReplaceAllStringFunc returns a copy of src in which all matches of the +// Regexp have been replaced by the return value of function repl applied +// to the matched substring. 
The replacement returned by repl is substituted +// directly, without using Expand. func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string { + b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { + return append(dst, repl(src[match[0]:match[1]])...) + }) + return string(b) +} + +func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte { lastMatchEnd := 0 // end position of the most recent match searchPos := 0 // position where we next look for a match - buf := new(bytes.Buffer) - for searchPos <= len(src) { - a := re.doExecute(nil, nil, src, searchPos, 2) + var buf []byte + var endPos int + if bsrc != nil { + endPos = len(bsrc) + } else { + endPos = len(src) + } + for searchPos <= endPos { + a := re.doExecute(nil, bsrc, src, searchPos, nmatch) if len(a) == 0 { break // no more matches } // Copy the unmatched characters before this match. - io.WriteString(buf, src[lastMatchEnd:a[0]]) + if bsrc != nil { + buf = append(buf, bsrc[lastMatchEnd:a[0]]...) + } else { + buf = append(buf, src[lastMatchEnd:a[0]]...) + } // Now insert a copy of the replacement string, but not for a // match of the empty string immediately after another match. // (Otherwise, we get double replacement for patterns that // match both empty and nonempty strings.) if a[1] > lastMatchEnd || a[0] == 0 { - io.WriteString(buf, repl(src[a[0]:a[1]])) + buf = repl(buf, a) } lastMatchEnd = a[1] // Advance past this match; always advance at least one character. - _, width := utf8.DecodeRuneInString(src[searchPos:]) + var width int + if bsrc != nil { + _, width = utf8.DecodeRune(bsrc[searchPos:]) + } else { + _, width = utf8.DecodeRuneInString(src[searchPos:]) + } if searchPos+width > a[1] { searchPos += width } else if searchPos+1 > a[1] { @@ -463,61 +502,50 @@ func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) str } // Copy the unmatched characters after the last match. 
- io.WriteString(buf, src[lastMatchEnd:]) + if bsrc != nil { + buf = append(buf, bsrc[lastMatchEnd:]...) + } else { + buf = append(buf, src[lastMatchEnd:]...) + } - return buf.String() + return buf } -// ReplaceAll returns a copy of src in which all matches for the Regexp -// have been replaced by repl. No support is provided for expressions -// (e.g. \1 or $1) in the replacement text. +// ReplaceAll returns a copy of src, replacing matches of the Regexp +// with the replacement string repl. Inside repl, $ signs are interpreted as +// in Expand, so for instance $1 represents the text of the first submatch. func (re *Regexp) ReplaceAll(src, repl []byte) []byte { - return re.ReplaceAllFunc(src, func([]byte) []byte { return repl }) -} - -// ReplaceAllFunc returns a copy of src in which all matches for the -// Regexp have been replaced by the return value of of function repl (whose -// first argument is the matched []byte). No support is provided for -// expressions (e.g. \1 or $1) in the replacement string. -func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { - lastMatchEnd := 0 // end position of the most recent match - searchPos := 0 // position where we next look for a match - buf := new(bytes.Buffer) - for searchPos <= len(src) { - a := re.doExecute(nil, src, "", searchPos, 2) - if len(a) == 0 { - break // no more matches - } - - // Copy the unmatched characters before this match. - buf.Write(src[lastMatchEnd:a[0]]) - - // Now insert a copy of the replacement string, but not for a - // match of the empty string immediately after another match. - // (Otherwise, we get double replacement for patterns that - // match both empty and nonempty strings.) - if a[1] > lastMatchEnd || a[0] == 0 { - buf.Write(repl(src[a[0]:a[1]])) - } - lastMatchEnd = a[1] - - // Advance past this match; always advance at least one character. 
- _, width := utf8.DecodeRune(src[searchPos:]) - if searchPos+width > a[1] { - searchPos += width - } else if searchPos+1 > a[1] { - // This clause is only needed at the end of the input - // string. In that case, DecodeRuneInString returns width=0. - searchPos++ - } else { - searchPos = a[1] - } + n := 2 + if bytes.IndexByte(repl, '$') >= 0 { + n = 2 * (re.numSubexp + 1) } + srepl := "" + b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte { + if len(srepl) != len(repl) { + srepl = string(repl) + } + return re.expand(dst, srepl, src, "", match) + }) + return b +} - // Copy the unmatched characters after the last match. - buf.Write(src[lastMatchEnd:]) +// ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp +// with the replacement bytes repl. The replacement repl is substituted directly, +// without using Expand. +func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte { + return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { + return append(dst, repl...) + }) +} - return buf.Bytes() +// ReplaceAllFunc returns a copy of src in which all matches of the +// Regexp have been replaced by the return value of function repl applied +// to the matched byte slice. The replacement returned by repl is substituted +// directly, without using Expand. +func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { + return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { + return append(dst, repl(src[match[0]:match[1]])...) + }) } var specialBytes = []byte(`\.+*?()|[]{}^$`) @@ -648,7 +676,7 @@ func (re *Regexp) FindString(s string) string { // location of the leftmost match in s of the regular expression. The match // itself is at s[loc[0]:loc[1]]. // A return value of nil indicates no match. 
-func (re *Regexp) FindStringIndex(s string) []int { +func (re *Regexp) FindStringIndex(s string) (loc []int) { a := re.doExecute(nil, nil, s, 0, 2) if a == nil { return nil @@ -660,7 +688,7 @@ func (re *Regexp) FindStringIndex(s string) []int { // location of the leftmost match of the regular expression in text read from // the RuneReader. The match itself is at s[loc[0]:loc[1]]. A return // value of nil indicates no match. -func (re *Regexp) FindReaderIndex(r io.RuneReader) []int { +func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) { a := re.doExecute(r, nil, "", 0, 2) if a == nil { return nil @@ -687,6 +715,134 @@ func (re *Regexp) FindSubmatch(b []byte) [][]byte { return ret } +// Expand appends template to dst and returns the result; during the +// append, Expand replaces variables in the template with corresponding +// matches drawn from src. The match slice should have been returned by +// FindSubmatchIndex. +// +// In the template, a variable is denoted by a substring of the form +// $name or ${name}, where name is a non-empty sequence of letters, +// digits, and underscores. A purely numeric name like $1 refers to +// the submatch with the corresponding index; other names refer to +// capturing parentheses named with the (?P<name>...) syntax. A +// reference to an out of range or unmatched index or a name that is not +// present in the regular expression is replaced with an empty string. +// +// In the $name form, name is taken to be as long as possible: $1x is +// equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0. +// +// To insert a literal $ in the output, use $$ in the template. +func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte { + return re.expand(dst, string(template), src, "", match) +} + +// ExpandString is like Expand but the template and source are strings. +// It appends to and returns a byte slice in order to give the calling +// code control over allocation. 
+func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte { + return re.expand(dst, template, nil, src, match) +} + +func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte { + for len(template) > 0 { + i := strings.Index(template, "$") + if i < 0 { + break + } + dst = append(dst, template[:i]...) + template = template[i:] + if len(template) > 1 && template[1] == '$' { + // Treat $$ as $. + dst = append(dst, '$') + template = template[2:] + continue + } + name, num, rest, ok := extract(template) + if !ok { + // Malformed; treat $ as raw text. + dst = append(dst, '$') + template = template[1:] + continue + } + template = rest + if num >= 0 { + if 2*num+1 < len(match) { + if bsrc != nil { + dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...) + } else { + dst = append(dst, src[match[2*num]:match[2*num+1]]...) + } + } + } else { + for i, namei := range re.subexpNames { + if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 { + if bsrc != nil { + dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...) + } else { + dst = append(dst, src[match[2*i]:match[2*i+1]]...) + } + break + } + } + } + } + dst = append(dst, template...) + return dst +} + +// extract returns the name from a leading "$name" or "${name}" in str. +// If it is a number, extract returns num set to that number; otherwise num = -1. +func extract(str string) (name string, num int, rest string, ok bool) { + if len(str) < 2 || str[0] != '$' { + return + } + brace := false + if str[1] == '{' { + brace = true + str = str[2:] + } else { + str = str[1:] + } + i := 0 + for i < len(str) { + rune, size := utf8.DecodeRuneInString(str[i:]) + if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' { + break + } + i += size + } + if i == 0 { + // empty name is not okay + return + } + name = str[:i] + if brace { + if i >= len(str) || str[i] != '}' { + // missing closing brace + return + } + i++ + } + + // Parse number. 
+ num = 0 + for i := 0; i < len(name); i++ { + if name[i] < '0' || '9' < name[i] || num >= 1e8 { + num = -1 + break + } + num = num*10 + int(name[i]) - '0' + } + // Disallow leading zeros. + if name[0] == '0' && len(name) > 1 { + num = -1 + } + + rest = str[i:] + ok = true + return +} + // FindSubmatchIndex returns a slice holding the index pairs identifying the // leftmost match of the regular expression in b and the matches, if any, of // its subexpressions, as defined by the 'Submatch' and 'Index' descriptions |