Diffstat (limited to 'libgo/go/strings')
-rw-r--r-- | libgo/go/strings/export_test.go  |  36
-rw-r--r-- | libgo/go/strings/replace.go      | 416
-rw-r--r-- | libgo/go/strings/replace_test.go | 437
-rw-r--r-- | libgo/go/strings/search.go       | 124
-rw-r--r-- | libgo/go/strings/search_test.go  |  90
-rw-r--r-- | libgo/go/strings/strings_test.go |  44
6 files changed, 999 insertions, 148 deletions
diff --git a/libgo/go/strings/export_test.go b/libgo/go/strings/export_test.go index dcfec51..17c806a 100644 --- a/libgo/go/strings/export_test.go +++ b/libgo/go/strings/export_test.go @@ -7,3 +7,39 @@ package strings func (r *Replacer) Replacer() interface{} { return r.r } + +func (r *Replacer) PrintTrie() string { + gen := r.r.(*genericReplacer) + return gen.printNode(&gen.root, 0) +} + +func (r *genericReplacer) printNode(t *trieNode, depth int) (s string) { + if t.priority > 0 { + s += "+" + } else { + s += "-" + } + s += "\n" + + if t.prefix != "" { + s += Repeat(".", depth) + t.prefix + s += r.printNode(t.next, depth+len(t.prefix)) + } else if t.table != nil { + for b, m := range r.mapping { + if int(m) != r.tableSize && t.table[m] != nil { + s += Repeat(".", depth) + string([]byte{byte(b)}) + s += r.printNode(t.table[m], depth+1) + } + } + } + return +} + +func StringFind(pattern, text string) int { + return makeStringFinder(pattern).next(text) +} + +func DumpTables(pattern string) ([]int, []int) { + finder := makeStringFinder(pattern) + return finder.badCharSkip[:], finder.goodSuffixSkip +} diff --git a/libgo/go/strings/replace.go b/libgo/go/strings/replace.go index f53a96e..f63b179 100644 --- a/libgo/go/strings/replace.go +++ b/libgo/go/strings/replace.go @@ -33,47 +33,45 @@ func NewReplacer(oldnew ...string) *Replacer { panic("strings.NewReplacer: odd argument count") } - // Possible implementations. - var ( - bb byteReplacer - bs byteStringReplacer - gen genericReplacer - ) - - allOldBytes, allNewBytes := true, true - for len(oldnew) > 0 { - old, new := oldnew[0], oldnew[1] - oldnew = oldnew[2:] - if len(old) != 1 { - allOldBytes = false + if len(oldnew) == 2 && len(oldnew[0]) > 1 { + return &Replacer{r: makeSingleStringReplacer(oldnew[0], oldnew[1])} + } + + allNewBytes := true + for i := 0; i < len(oldnew); i += 2 { + if len(oldnew[i]) != 1 { + return &Replacer{r: makeGenericReplacer(oldnew)} } - if len(new) != 1 { + if len(oldnew[i+1]) != 1 { allNewBytes = false } + } - // generic - gen.p = append(gen.p, pair{old, new}) - - // byte -> string - if allOldBytes { - bs.old.set(old[0]) - bs.new[old[0]] = []byte(new) - } - - // byte -> byte - if allOldBytes && allNewBytes { - bb.old.set(old[0]) - bb.new[old[0]] = new[0] + if allNewBytes { + bb := &byteReplacer{} + for i := 0; i < len(oldnew); i += 2 { + o, n := oldnew[i][0], oldnew[i+1][0] + if bb.old[o>>5]&uint32(1<<(o&31)) != 0 { + // Later old->new maps do not override previous ones with the same old string. + continue + } + bb.old.set(o) + bb.new[o] = n } + return &Replacer{r: bb} } - if allOldBytes && allNewBytes { - return &Replacer{r: &bb} - } - if allOldBytes { - return &Replacer{r: &bs} + bs := &byteStringReplacer{} + for i := 0; i < len(oldnew); i += 2 { + o, new := oldnew[i][0], oldnew[i+1] + if bs.old[o>>5]&uint32(1<<(o&31)) != 0 { + // Later old->new maps do not override previous ones with the same old string. + continue + } + bs.old.set(o) + bs.new[o] = []byte(new) } - return &Replacer{r: &gen} + return &Replacer{r: bs} } // Replace returns a copy of s with all replacements performed. @@ -86,79 +84,326 @@ func (r *Replacer) WriteString(w io.Writer, s string) (n int, err error) { return r.r.WriteString(w, s) } -// genericReplacer is the fully generic (and least optimized) algorithm. +// trieNode is a node in a lookup trie for prioritized key/value pairs. Keys +// and values may be empty. 
For example, the trie containing keys "ax", "ay", +// "bcbc", "x" and "xy" could have eight nodes: +// +// n0 - +// n1 a- +// n2 .x+ +// n3 .y+ +// n4 b- +// n5 .cbc+ +// n6 x+ +// n7 .y+ +// +// n0 is the root node, and its children are n1, n4 and n6; n1's children are +// n2 and n3; n4's child is n5; n6's child is n7. Nodes n0, n1 and n4 (marked +// with a trailing "-") are partial keys, and nodes n2, n3, n5, n6 and n7 +// (marked with a trailing "+") are complete keys. +type trieNode struct { + // value is the value of the trie node's key/value pair. It is empty if + // this node is not a complete key. + value string + // priority is the priority (higher is more important) of the trie node's + // key/value pair; keys are not necessarily matched shortest- or longest- + // first. Priority is positive if this node is a complete key, and zero + // otherwise. In the example above, positive/zero priorities are marked + // with a trailing "+" or "-". + priority int + + // A trie node may have zero, one or more child nodes: + // * if the remaining fields are zero, there are no children. + // * if prefix and next are non-zero, there is one child in next. + // * if table is non-zero, it defines all the children. + // + // Prefixes are preferred over tables when there is one child, but the + // root node always uses a table for lookup efficiency. + + // prefix is the difference in keys between this trie node and the next. + // In the example above, node n4 has prefix "cbc" and n4's next node is n5. + // Node n5 has no children and so has zero prefix, next and table fields. + prefix string + next *trieNode + + // table is a lookup table indexed by the next byte in the key, after + // remapping that byte through genericReplacer.mapping to create a dense + // index. In the example above, the keys only use 'a', 'b', 'c', 'x' and + // 'y', which remap to 0, 1, 2, 3 and 4. All other bytes remap to 5, and + // genericReplacer.tableSize will be 5. Node n0's table will be + // []*trieNode{ 0:n1, 1:n4, 3:n6 }, where the 0, 1 and 3 are the remapped + // 'a', 'b' and 'x'. + table []*trieNode +} + +func (t *trieNode) add(key, val string, priority int, r *genericReplacer) { + if key == "" { + if t.priority == 0 { + t.value = val + t.priority = priority + } + return + } + + if t.prefix != "" { + // Need to split the prefix among multiple nodes. + var n int // length of the longest common prefix + for ; n < len(t.prefix) && n < len(key); n++ { + if t.prefix[n] != key[n] { + break + } + } + if n == len(t.prefix) { + t.next.add(key[n:], val, priority, r) + } else if n == 0 { + // First byte differs, start a new lookup table here. Looking up + // what is currently t.prefix[0] will lead to prefixNode, and + // looking up key[0] will lead to keyNode. + var prefixNode *trieNode + if len(t.prefix) == 1 { + prefixNode = t.next + } else { + prefixNode = &trieNode{ + prefix: t.prefix[1:], + next: t.next, + } + } + keyNode := new(trieNode) + t.table = make([]*trieNode, r.tableSize) + t.table[r.mapping[t.prefix[0]]] = prefixNode + t.table[r.mapping[key[0]]] = keyNode + t.prefix = "" + t.next = nil + keyNode.add(key[1:], val, priority, r) + } else { + // Insert new node after the common section of the prefix. + next := &trieNode{ + prefix: t.prefix[n:], + next: t.next, + } + t.prefix = t.prefix[:n] + t.next = next + next.add(key[n:], val, priority, r) + } + } else if t.table != nil { + // Insert into existing table. 
+ m := r.mapping[key[0]] + if t.table[m] == nil { + t.table[m] = new(trieNode) + } + t.table[m].add(key[1:], val, priority, r) + } else { + t.prefix = key + t.next = new(trieNode) + t.next.add("", val, priority, r) + } +} + +func (r *genericReplacer) lookup(s string, ignoreRoot bool) (val string, keylen int, found bool) { + // Iterate down the trie to the end, and grab the value and keylen with + // the highest priority. + bestPriority := 0 + node := &r.root + n := 0 + for node != nil { + if node.priority > bestPriority && !(ignoreRoot && node == &r.root) { + bestPriority = node.priority + val = node.value + keylen = n + found = true + } + + if s == "" { + break + } + if node.table != nil { + index := r.mapping[s[0]] + if int(index) == r.tableSize { + break + } + node = node.table[index] + s = s[1:] + n++ + } else if node.prefix != "" && HasPrefix(s, node.prefix) { + n += len(node.prefix) + s = s[len(node.prefix):] + node = node.next + } else { + break + } + } + return +} + +// genericReplacer is the fully generic algorithm. // It's used as a fallback when nothing faster can be used. type genericReplacer struct { - p []pair + root trieNode + // tableSize is the size of a trie node's lookup table. It is the number + // of unique key bytes. + tableSize int + // mapping maps from key bytes to a dense index for trieNode.table. + mapping [256]byte } -type pair struct{ old, new string } +func makeGenericReplacer(oldnew []string) *genericReplacer { + r := new(genericReplacer) + // Find each byte used, then assign them each an index. + for i := 0; i < len(oldnew); i += 2 { + key := oldnew[i] + for j := 0; j < len(key); j++ { + r.mapping[key[j]] = 1 + } + } + + for _, b := range r.mapping { + r.tableSize += int(b) + } + + var index byte + for i, b := range r.mapping { + if b == 0 { + r.mapping[i] = byte(r.tableSize) + } else { + r.mapping[i] = index + index++ + } + } + // Ensure root node uses a lookup table (for performance). + r.root.table = make([]*trieNode, r.tableSize) -type appendSliceWriter struct { - b []byte + for i := 0; i < len(oldnew); i += 2 { + r.root.add(oldnew[i], oldnew[i+1], len(oldnew)-i, r) + } + return r } +type appendSliceWriter []byte + +// Write writes to the buffer to satisfy io.Writer. func (w *appendSliceWriter) Write(p []byte) (int, error) { - w.b = append(w.b, p...) + *w = append(*w, p...) return len(p), nil } +// WriteString writes to the buffer without string->[]byte->string allocations. +func (w *appendSliceWriter) WriteString(s string) (int, error) { + *w = append(*w, s...) 
+ return len(s), nil +} + +type stringWriterIface interface { + WriteString(string) (int, error) +} + +type stringWriter struct { + w io.Writer +} + +func (w stringWriter) WriteString(s string) (int, error) { + return w.w.Write([]byte(s)) +} + +func getStringWriter(w io.Writer) stringWriterIface { + sw, ok := w.(stringWriterIface) + if !ok { + sw = stringWriter{w} + } + return sw +} + func (r *genericReplacer) Replace(s string) string { - // TODO(bradfitz): optimized version - n, _ := r.WriteString(discard, s) - w := appendSliceWriter{make([]byte, 0, n)} - r.WriteString(&w, s) - return string(w.b) + buf := make(appendSliceWriter, 0, len(s)) + r.WriteString(&buf, s) + return string(buf) } func (r *genericReplacer) WriteString(w io.Writer, s string) (n int, err error) { - lastEmpty := false // the last replacement was of the empty string -Input: - // TODO(bradfitz): optimized version - for i := 0; i < len(s); { - for _, p := range r.p { - if p.old == "" && lastEmpty { - // Don't let old match twice in a row. - // (it doesn't advance the input and - // would otherwise loop forever) - continue + sw := getStringWriter(w) + var last, wn int + var prevMatchEmpty bool + for i := 0; i <= len(s); { + // Ignore the empty match iff the previous loop found the empty match. + val, keylen, match := r.lookup(s[i:], prevMatchEmpty) + prevMatchEmpty = match && keylen == 0 + if match { + wn, err = sw.WriteString(s[last:i]) + n += wn + if err != nil { + return } - if HasPrefix(s[i:], p.old) { - if p.new != "" { - wn, err := w.Write([]byte(p.new)) - n += wn - if err != nil { - return n, err - } - } - i += len(p.old) - lastEmpty = p.old == "" - continue Input + wn, err = sw.WriteString(val) + n += wn + if err != nil { + return } - } - wn, err := w.Write([]byte{s[i]}) - n += wn - if err != nil { - return n, err + i += keylen + last = i + continue } i++ } + if last != len(s) { + wn, err = sw.WriteString(s[last:]) + n += wn + } + return +} - // Final empty match at end. - for _, p := range r.p { - if p.old == "" { - if p.new != "" { - wn, err := w.Write([]byte(p.new)) - n += wn - if err != nil { - return n, err - } - } +// singleStringReplacer is the implementation that's used when there is only +// one string to replace (and that string has more than one byte). +type singleStringReplacer struct { + finder *stringFinder + // value is the new string that replaces that pattern when it's found. + value string +} + +func makeSingleStringReplacer(pattern string, value string) *singleStringReplacer { + return &singleStringReplacer{finder: makeStringFinder(pattern), value: value} +} + +func (r *singleStringReplacer) Replace(s string) string { + var buf []byte + i := 0 + for { + match := r.finder.next(s[i:]) + if match == -1 { break } + buf = append(buf, s[i:i+match]...) + buf = append(buf, r.value...) + i += match + len(r.finder.pattern) + } + if buf == nil { + return s } + buf = append(buf, s[i:]...) 
+ return string(buf) +} - return n, nil +func (r *singleStringReplacer) WriteString(w io.Writer, s string) (n int, err error) { + sw := getStringWriter(w) + var i, wn int + for { + match := r.finder.next(s[i:]) + if match == -1 { + break + } + wn, err = sw.WriteString(s[i : i+match]) + n += wn + if err != nil { + return + } + wn, err = sw.WriteString(r.value) + n += wn + if err != nil { + return + } + i += match + len(r.finder.pattern) + } + wn, err = sw.WriteString(s[i:]) + n += wn + return } // byteReplacer is the implementation that's used when all the "old" @@ -301,12 +546,3 @@ func (r *byteStringReplacer) WriteString(w io.Writer, s string) (n int, err erro } return n, nil } - -// strings is too low-level to import io/ioutil -var discard io.Writer = devNull(0) - -type devNull int - -func (devNull) Write(p []byte) (int, error) { - return len(p), nil -} diff --git a/libgo/go/strings/replace_test.go b/libgo/go/strings/replace_test.go index 23c7e2e..d33dea9 100644 --- a/libgo/go/strings/replace_test.go +++ b/libgo/go/strings/replace_test.go @@ -7,105 +7,390 @@ package strings_test import ( "bytes" "fmt" - "log" . "strings" "testing" ) -var _ = log.Printf - -type ReplacerTest struct { - r *Replacer - in string - out string -} +var htmlEscaper = NewReplacer( + "&", "&", + "<", "<", + ">", ">", + `"`, """, + "'", "'", +) -var htmlEscaper = NewReplacer("&", "&", "<", "<", ">", ">", "\"", """) +var htmlUnescaper = NewReplacer( + "&", "&", + "<", "<", + ">", ">", + """, `"`, + "'", "'", +) // The http package's old HTML escaping function. -func oldhtmlEscape(s string) string { +func oldHTMLEscape(s string) string { s = Replace(s, "&", "&", -1) s = Replace(s, "<", "<", -1) s = Replace(s, ">", ">", -1) - s = Replace(s, "\"", """, -1) + s = Replace(s, `"`, """, -1) s = Replace(s, "'", "'", -1) return s } -var replacer = NewReplacer("aaa", "3[aaa]", "aa", "2[aa]", "a", "1[a]", "i", "i", - "longerst", "most long", "longer", "medium", "long", "short", - "X", "Y", "Y", "Z") - var capitalLetters = NewReplacer("a", "A", "b", "B") -var blankToXReplacer = NewReplacer("", "X", "o", "O") +// TestReplacer tests the replacer implementations. +func TestReplacer(t *testing.T) { + type testCase struct { + r *Replacer + in, out string + } + var testCases []testCase -var ReplacerTests = []ReplacerTest{ - // byte->string - {htmlEscaper, "No changes", "No changes"}, - {htmlEscaper, "I <3 escaping & stuff", "I <3 escaping & stuff"}, - {htmlEscaper, "&&&", "&&&"}, + // str converts 0xff to "\xff". This isn't just string(b) since that converts to UTF-8. + str := func(b byte) string { + return string([]byte{b}) + } + var s []string - // generic - {replacer, "fooaaabar", "foo3[aaa]b1[a]r"}, - {replacer, "long, longerst, longer", "short, most long, medium"}, - {replacer, "XiX", "YiY"}, + // inc maps "\x00"->"\x01", ..., "a"->"b", "b"->"c", ..., "\xff"->"\x00". + s = nil + for i := 0; i < 256; i++ { + s = append(s, str(byte(i)), str(byte(i+1))) + } + inc := NewReplacer(s...) - // byte->byte - {capitalLetters, "brad", "BrAd"}, - {capitalLetters, Repeat("a", (32<<10)+123), Repeat("A", (32<<10)+123)}, + // Test cases with 1-byte old strings, 1-byte new strings. 
+ testCases = append(testCases, + testCase{capitalLetters, "brad", "BrAd"}, + testCase{capitalLetters, Repeat("a", (32<<10)+123), Repeat("A", (32<<10)+123)}, + testCase{capitalLetters, "", ""}, - // hitting "" special case - {blankToXReplacer, "oo", "XOXOX"}, -} + testCase{inc, "brad", "csbe"}, + testCase{inc, "\x00\xff", "\x01\x00"}, + testCase{inc, "", ""}, -func TestReplacer(t *testing.T) { - for i, tt := range ReplacerTests { - if s := tt.r.Replace(tt.in); s != tt.out { - t.Errorf("%d. Replace(%q) = %q, want %q", i, tt.in, s, tt.out) + testCase{NewReplacer("a", "1", "a", "2"), "brad", "br1d"}, + ) + + // repeat maps "a"->"a", "b"->"bb", "c"->"ccc", ... + s = nil + for i := 0; i < 256; i++ { + n := i + 1 - 'a' + if n < 1 { + n = 1 + } + s = append(s, str(byte(i)), Repeat(str(byte(i)), n)) + } + repeat := NewReplacer(s...) + + // Test cases with 1-byte old strings, variable length new strings. + testCases = append(testCases, + testCase{htmlEscaper, "No changes", "No changes"}, + testCase{htmlEscaper, "I <3 escaping & stuff", "I <3 escaping & stuff"}, + testCase{htmlEscaper, "&&&", "&&&"}, + testCase{htmlEscaper, "", ""}, + + testCase{repeat, "brad", "bbrrrrrrrrrrrrrrrrrradddd"}, + testCase{repeat, "abba", "abbbba"}, + testCase{repeat, "", ""}, + + testCase{NewReplacer("a", "11", "a", "22"), "brad", "br11d"}, + ) + + // The remaining test cases have variable length old strings. + + testCases = append(testCases, + testCase{htmlUnescaper, "&amp;", "&"}, + testCase{htmlUnescaper, "<b>HTML's neat</b>", "<b>HTML's neat</b>"}, + testCase{htmlUnescaper, "", ""}, + + testCase{NewReplacer("a", "1", "a", "2", "xxx", "xxx"), "brad", "br1d"}, + + testCase{NewReplacer("a", "1", "aa", "2", "aaa", "3"), "aaaa", "1111"}, + + testCase{NewReplacer("aaa", "3", "aa", "2", "a", "1"), "aaaa", "31"}, + ) + + // gen1 has multiple old strings of variable length. There is no + // overall non-empty common prefix, but some pairwise common prefixes. + gen1 := NewReplacer( + "aaa", "3[aaa]", + "aa", "2[aa]", + "a", "1[a]", + "i", "i", + "longerst", "most long", + "longer", "medium", + "long", "short", + "xx", "xx", + "x", "X", + "X", "Y", + "Y", "Z", + ) + testCases = append(testCases, + testCase{gen1, "fooaaabar", "foo3[aaa]b1[a]r"}, + testCase{gen1, "long, longerst, longer", "short, most long, medium"}, + testCase{gen1, "xxxxx", "xxxxX"}, + testCase{gen1, "XiX", "YiY"}, + testCase{gen1, "", ""}, + ) + + // gen2 has multiple old strings with no pairwise common prefix. + gen2 := NewReplacer( + "roses", "red", + "violets", "blue", + "sugar", "sweet", + ) + testCases = append(testCases, + testCase{gen2, "roses are red, violets are blue...", "red are red, blue are blue..."}, + testCase{gen2, "", ""}, + ) + + // gen3 has multiple old strings with an overall common prefix. + gen3 := NewReplacer( + "abracadabra", "poof", + "abracadabrakazam", "splat", + "abraham", "lincoln", + "abrasion", "scrape", + "abraham", "isaac", + ) + testCases = append(testCases, + testCase{gen3, "abracadabrakazam abraham", "poofkazam lincoln"}, + testCase{gen3, "abrasion abracad", "scrape abracad"}, + testCase{gen3, "abba abram abrasive", "abba abram abrasive"}, + testCase{gen3, "", ""}, + ) + + // foo{1,2,3,4} have multiple old strings with an overall common prefix + // and 1- or 2- byte extensions from the common prefix. 
+ foo1 := NewReplacer( + "foo1", "A", + "foo2", "B", + "foo3", "C", + ) + foo2 := NewReplacer( + "foo1", "A", + "foo2", "B", + "foo31", "C", + "foo32", "D", + ) + foo3 := NewReplacer( + "foo11", "A", + "foo12", "B", + "foo31", "C", + "foo32", "D", + ) + foo4 := NewReplacer( + "foo12", "B", + "foo32", "D", + ) + testCases = append(testCases, + testCase{foo1, "fofoofoo12foo32oo", "fofooA2C2oo"}, + testCase{foo1, "", ""}, + + testCase{foo2, "fofoofoo12foo32oo", "fofooA2Doo"}, + testCase{foo2, "", ""}, + + testCase{foo3, "fofoofoo12foo32oo", "fofooBDoo"}, + testCase{foo3, "", ""}, + + testCase{foo4, "fofoofoo12foo32oo", "fofooBDoo"}, + testCase{foo4, "", ""}, + ) + + // genAll maps "\x00\x01\x02...\xfe\xff" to "[all]", amongst other things. + allBytes := make([]byte, 256) + for i := range allBytes { + allBytes[i] = byte(i) + } + allString := string(allBytes) + genAll := NewReplacer( + allString, "[all]", + "\xff", "[ff]", + "\x00", "[00]", + ) + testCases = append(testCases, + testCase{genAll, allString, "[all]"}, + testCase{genAll, "a\xff" + allString + "\x00", "a[ff][all][00]"}, + testCase{genAll, "", ""}, + ) + + // Test cases with empty old strings. + + blankToX1 := NewReplacer("", "X") + blankToX2 := NewReplacer("", "X", "", "") + blankHighPriority := NewReplacer("", "X", "o", "O") + blankLowPriority := NewReplacer("o", "O", "", "X") + blankNoOp1 := NewReplacer("", "") + blankNoOp2 := NewReplacer("", "", "", "A") + blankFoo := NewReplacer("", "X", "foobar", "R", "foobaz", "Z") + testCases = append(testCases, + testCase{blankToX1, "foo", "XfXoXoX"}, + testCase{blankToX1, "", "X"}, + + testCase{blankToX2, "foo", "XfXoXoX"}, + testCase{blankToX2, "", "X"}, + + testCase{blankHighPriority, "oo", "XOXOX"}, + testCase{blankHighPriority, "ii", "XiXiX"}, + testCase{blankHighPriority, "oiio", "XOXiXiXOX"}, + testCase{blankHighPriority, "iooi", "XiXOXOXiX"}, + testCase{blankHighPriority, "", "X"}, + + testCase{blankLowPriority, "oo", "OOX"}, + testCase{blankLowPriority, "ii", "XiXiX"}, + testCase{blankLowPriority, "oiio", "OXiXiOX"}, + testCase{blankLowPriority, "iooi", "XiOOXiX"}, + testCase{blankLowPriority, "", "X"}, + + testCase{blankNoOp1, "foo", "foo"}, + testCase{blankNoOp1, "", ""}, + + testCase{blankNoOp2, "foo", "foo"}, + testCase{blankNoOp2, "", ""}, + + testCase{blankFoo, "foobarfoobaz", "XRXZX"}, + testCase{blankFoo, "foobar-foobaz", "XRX-XZX"}, + testCase{blankFoo, "", "X"}, + ) + + // single string replacer + + abcMatcher := NewReplacer("abc", "[match]") + + testCases = append(testCases, + testCase{abcMatcher, "", ""}, + testCase{abcMatcher, "ab", "ab"}, + testCase{abcMatcher, "abcd", "[match]d"}, + testCase{abcMatcher, "cabcabcdabca", "c[match][match]d[match]a"}, + ) + + // No-arg test cases. + + nop := NewReplacer() + testCases = append(testCases, + testCase{nop, "abc", "abc"}, + testCase{nop, "", ""}, + ) + + // Run the test cases. + + for i, tc := range testCases { + if s := tc.r.Replace(tc.in); s != tc.out { + t.Errorf("%d. Replace(%q) = %q, want %q", i, tc.in, s, tc.out) } var buf bytes.Buffer - n, err := tt.r.WriteString(&buf, tt.in) + n, err := tc.r.WriteString(&buf, tc.in) if err != nil { t.Errorf("%d. WriteString: %v", i, err) continue } got := buf.String() - if got != tt.out { - t.Errorf("%d. WriteString(%q) wrote %q, want %q", i, tt.in, got, tt.out) + if got != tc.out { + t.Errorf("%d. WriteString(%q) wrote %q, want %q", i, tc.in, got, tc.out) continue } - if n != len(tt.out) { + if n != len(tc.out) { t.Errorf("%d. 
WriteString(%q) wrote correct string but reported %d bytes; want %d (%q)", - i, tt.in, n, len(tt.out), tt.out) + i, tc.in, n, len(tc.out), tc.out) } } } -// pickAlgorithmTest is a test that verifies that given input for a -// Replacer that we pick the correct algorithm. -type pickAlgorithmTest struct { - r *Replacer - want string // name of algorithm +// TestPickAlgorithm tests that NewReplacer picks the correct algorithm. +func TestPickAlgorithm(t *testing.T) { + testCases := []struct { + r *Replacer + want string + }{ + {capitalLetters, "*strings.byteReplacer"}, + {htmlEscaper, "*strings.byteStringReplacer"}, + {NewReplacer("12", "123"), "*strings.singleStringReplacer"}, + {NewReplacer("1", "12"), "*strings.byteStringReplacer"}, + {NewReplacer("", "X"), "*strings.genericReplacer"}, + {NewReplacer("a", "1", "b", "12", "cde", "123"), "*strings.genericReplacer"}, + } + for i, tc := range testCases { + got := fmt.Sprintf("%T", tc.r.Replacer()) + if got != tc.want { + t.Errorf("%d. algorithm = %s, want %s", i, got, tc.want) + } + } } -var pickAlgorithmTests = []pickAlgorithmTest{ - {capitalLetters, "*strings.byteReplacer"}, - {NewReplacer("12", "123"), "*strings.genericReplacer"}, - {NewReplacer("1", "12"), "*strings.byteStringReplacer"}, - {htmlEscaper, "*strings.byteStringReplacer"}, -} +// TestGenericTrieBuilding verifies the structure of the generated trie. There +// is one node per line, and the key ending with the current line is in the +// trie if it ends with a "+". +func TestGenericTrieBuilding(t *testing.T) { + testCases := []struct{ in, out string }{ + {"abc;abdef;abdefgh;xx;xy;z", `- + a- + .b- + ..c+ + ..d- + ...ef+ + .....gh+ + x- + .x+ + .y+ + z+ + `}, + {"abracadabra;abracadabrakazam;abraham;abrasion", `- + a- + .bra- + ....c- + .....adabra+ + ...........kazam+ + ....h- + .....am+ + ....s- + .....ion+ + `}, + {"aaa;aa;a;i;longerst;longer;long;xx;x;X;Y", `- + X+ + Y+ + a+ + .a+ + ..a+ + i+ + l- + .ong+ + ....er+ + ......st+ + x+ + .x+ + `}, + {"foo;;foo;foo1", `+ + f- + .oo+ + ...1+ + `}, + } -func TestPickAlgorithm(t *testing.T) { - for i, tt := range pickAlgorithmTests { - got := fmt.Sprintf("%T", tt.r.Replacer()) - if got != tt.want { - t.Errorf("%d. 
algorithm = %s, want %s", i, got, tt.want) + for _, tc := range testCases { + keys := Split(tc.in, ";") + args := make([]string, len(keys)*2) + for i, key := range keys { + args[i*2] = key + } + + got := NewReplacer(args...).PrintTrie() + // Remove tabs from tc.out + wantbuf := make([]byte, 0, len(tc.out)) + for i := 0; i < len(tc.out); i++ { + if tc.out[i] != '\t' { + wantbuf = append(wantbuf, tc.out[i]) + } + } + want := string(wantbuf) + + if got != want { + t.Errorf("PrintTrie(%q)\ngot\n%swant\n%s", tc.in, got, want) } } } -func BenchmarkGenericMatch(b *testing.B) { +func BenchmarkGenericNoMatch(b *testing.B) { str := Repeat("A", 100) + Repeat("B", 100) generic := NewReplacer("a", "A", "b", "B", "12", "123") // varying lengths forces generic for i := 0; i < b.N; i++ { @@ -113,6 +398,42 @@ func BenchmarkGenericMatch(b *testing.B) { } } +func BenchmarkGenericMatch1(b *testing.B) { + str := Repeat("a", 100) + Repeat("b", 100) + generic := NewReplacer("a", "A", "b", "B", "12", "123") + for i := 0; i < b.N; i++ { + generic.Replace(str) + } +} + +func BenchmarkGenericMatch2(b *testing.B) { + str := Repeat("It's <b>HTML</b>!", 100) + for i := 0; i < b.N; i++ { + htmlUnescaper.Replace(str) + } +} + +func benchmarkSingleString(b *testing.B, pattern, text string) { + r := NewReplacer(pattern, "[match]") + b.SetBytes(int64(len(text))) + b.ResetTimer() + for i := 0; i < b.N; i++ { + r.Replace(text) + } +} + +func BenchmarkSingleMaxSkipping(b *testing.B) { + benchmarkSingleString(b, Repeat("b", 25), Repeat("a", 10000)) +} + +func BenchmarkSingleLongSuffixFail(b *testing.B) { + benchmarkSingleString(b, "b"+Repeat("a", 500), Repeat("a", 1002)) +} + +func BenchmarkSingleMatch(b *testing.B) { + benchmarkSingleString(b, "abcdef", Repeat("abcdefghijklmno", 1000)) +} + func BenchmarkByteByteNoMatch(b *testing.B) { str := Repeat("A", 100) + Repeat("B", 100) for i := 0; i < b.N; i++ { @@ -144,7 +465,7 @@ func BenchmarkHTMLEscapeNew(b *testing.B) { func BenchmarkHTMLEscapeOld(b *testing.B) { str := "I <3 to escape HTML & other text too." for i := 0; i < b.N; i++ { - oldhtmlEscape(str) + oldHTMLEscape(str) } } diff --git a/libgo/go/strings/search.go b/libgo/go/strings/search.go new file mode 100644 index 0000000..f77c879 --- /dev/null +++ b/libgo/go/strings/search.go @@ -0,0 +1,124 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings + +// stringFinder efficiently finds strings in a source text. It's implemented +// using the Boyer-Moore string search algorithm: +// http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm +// http://www.cs.utexas.edu/~moore/publications/fstrpos.pdf (note: this aged +// document uses 1-based indexing) +type stringFinder struct { + // pattern is the string that we are searching for in the text. + pattern string + + // badCharSkip[b] contains the distance between the last byte of pattern + // and the rightmost occurrence of b in pattern. If b is not in pattern, + // badCharSkip[b] is len(pattern). + // + // Whenever a mismatch is found with byte b in the text, we can safely + // shift the matching frame at least badCharSkip[b] until the next time + // the matching char could be in alignment. + badCharSkip [256]int + + // goodSuffixSkip[i] defines how far we can shift the matching frame given + // that the suffix pattern[i+1:] matches, but the byte pattern[i] does + // not. There are two cases to consider: + // + // 1. 
The matched suffix occurs elsewhere in pattern (with a different + // byte preceding it that we might possibly match). In this case, we can + // shift the matching frame to align with the next suffix chunk. For + // example, the pattern "mississi" has the suffix "issi" next occurring + // (in right-to-left order) at index 1, so goodSuffixSkip[3] == + // shift+len(suffix) == 3+4 == 7. + // + // 2. If the matched suffix does not occur elsewhere in pattern, then the + // matching frame may share part of its prefix with the end of the + // matching suffix. In this case, goodSuffixSkip[i] will contain how far + // to shift the frame to align this portion of the prefix to the + // suffix. For example, in the pattern "abcxxxabc", when the first + // mismatch from the back is found to be in position 3, the matching + // suffix "xxabc" is not found elsewhere in the pattern. However, its + // rightmost "abc" (at position 6) is a prefix of the whole pattern, so + // goodSuffixSkip[3] == shift+len(suffix) == 6+5 == 11. + goodSuffixSkip []int +} + +func makeStringFinder(pattern string) *stringFinder { + f := &stringFinder{ + pattern: pattern, + goodSuffixSkip: make([]int, len(pattern)), + } + // last is the index of the last character in the pattern. + last := len(pattern) - 1 + + // Build bad character table. + // Bytes not in the pattern can skip one pattern's length. + for i := range f.badCharSkip { + f.badCharSkip[i] = len(pattern) + } + // The loop condition is < instead of <= so that the last byte does not + // have a zero distance to itself. Finding this byte out of place implies + // that it is not in the last position. + for i := 0; i < last; i++ { + f.badCharSkip[pattern[i]] = last - i + } + + // Build good suffix table. + // First pass: set each value to the next index which starts a prefix of + // pattern. + lastPrefix := last + for i := last; i >= 0; i-- { + if HasPrefix(pattern, pattern[i+1:]) { + lastPrefix = i + 1 + } + // lastPrefix is the shift, and (last-i) is len(suffix). + f.goodSuffixSkip[i] = lastPrefix + last - i + } + // Second pass: find repeats of pattern's suffix starting from the front. + for i := 0; i < last; i++ { + lenSuffix := longestCommonSuffix(pattern, pattern[1:i+1]) + if pattern[i-lenSuffix] != pattern[last-lenSuffix] { + // (last-i) is the shift, and lenSuffix is len(suffix). + f.goodSuffixSkip[last-lenSuffix] = lenSuffix + last - i + } + } + + return f +} + +func longestCommonSuffix(a, b string) (i int) { + for ; i < len(a) && i < len(b); i++ { + if a[len(a)-1-i] != b[len(b)-1-i] { + break + } + } + return +} + +// next returns the index in text of the first occurrence of the pattern. If +// the pattern is not found, it returns -1. +func (f *stringFinder) next(text string) int { + i := len(f.pattern) - 1 + for i < len(text) { + // Compare backwards from the end until the first unmatching character. + j := len(f.pattern) - 1 + for j >= 0 && text[i] == f.pattern[j] { + i-- + j-- + } + if j < 0 { + return i + 1 // match + } + i += max(f.badCharSkip[text[i]], f.goodSuffixSkip[j]) + } + return -1 +} + +func max(a, b int) int { + if a > b { + return a + } + return b +} diff --git a/libgo/go/strings/search_test.go b/libgo/go/strings/search_test.go new file mode 100644 index 0000000..966c05e --- /dev/null +++ b/libgo/go/strings/search_test.go @@ -0,0 +1,90 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings_test + +import ( + "reflect" + . 
"strings" + "testing" +) + +func TestFinderNext(t *testing.T) { + testCases := []struct { + pat, text string + index int + }{ + {"", "", 0}, + {"", "abc", 0}, + {"abc", "", -1}, + {"abc", "abc", 0}, + {"d", "abcdefg", 3}, + {"nan", "banana", 2}, + {"pan", "anpanman", 2}, + {"nnaaman", "anpanmanam", -1}, + {"abcd", "abc", -1}, + {"abcd", "bcd", -1}, + {"bcd", "abcd", 1}, + {"abc", "acca", -1}, + {"aa", "aaa", 0}, + {"baa", "aaaaa", -1}, + {"at that", "which finally halts. at that point", 22}, + } + + for _, tc := range testCases { + got := StringFind(tc.pat, tc.text) + want := tc.index + if got != want { + t.Errorf("stringFind(%q, %q) got %d, want %d\n", tc.pat, tc.text, got, want) + } + } +} + +func TestFinderCreation(t *testing.T) { + testCases := []struct { + pattern string + bad [256]int + suf []int + }{ + { + "abc", + [256]int{'a': 2, 'b': 1, 'c': 3}, + []int{5, 4, 1}, + }, + { + "mississi", + [256]int{'i': 3, 'm': 7, 's': 1}, + []int{15, 14, 13, 7, 11, 10, 7, 1}, + }, + // From http://www.cs.utexas.edu/~moore/publications/fstrpos.pdf + { + "abcxxxabc", + [256]int{'a': 2, 'b': 1, 'c': 6, 'x': 3}, + []int{14, 13, 12, 11, 10, 9, 11, 10, 1}, + }, + { + "abyxcdeyx", + [256]int{'a': 8, 'b': 7, 'c': 4, 'd': 3, 'e': 2, 'y': 1, 'x': 5}, + []int{17, 16, 15, 14, 13, 12, 7, 10, 1}, + }, + } + + for _, tc := range testCases { + bad, good := DumpTables(tc.pattern) + + for i, got := range bad { + want := tc.bad[i] + if want == 0 { + want = len(tc.pattern) + } + if got != want { + t.Errorf("boyerMoore(%q) bad['%c']: got %d want %d", tc.pattern, i, got, want) + } + } + + if !reflect.DeepEqual(good, tc.suf) { + t.Errorf("boyerMoore(%q) got %v want %v", tc.pattern, good, tc.suf) + } + } +} diff --git a/libgo/go/strings/strings_test.go b/libgo/go/strings/strings_test.go index 54046d6..c271e48 100644 --- a/libgo/go/strings/strings_test.go +++ b/libgo/go/strings/strings_test.go @@ -7,6 +7,7 @@ package strings_test import ( "bytes" "io" + "math/rand" "reflect" . "strings" "testing" @@ -311,6 +312,13 @@ var FieldsFuncTests = []FieldsTest{ } func TestFieldsFunc(t *testing.T) { + for _, tt := range fieldstests { + a := FieldsFunc(tt.s, unicode.IsSpace) + if !eq(a, tt.a) { + t.Errorf("FieldsFunc(%q, unicode.IsSpace) = %v; want %v", tt.s, a, tt.a) + continue + } + } pred := func(c rune) bool { return c == 'X' } for _, tt := range FieldsFuncTests { a := FieldsFunc(tt.s, pred) @@ -984,3 +992,39 @@ func TestEqualFold(t *testing.T) { } } } + +var makeFieldsInput = func() string { + x := make([]byte, 1<<20) + // Input is ~10% space, ~10% 2-byte UTF-8, rest ASCII non-space. + for i := range x { + switch rand.Intn(10) { + case 0: + x[i] = ' ' + case 1: + if i > 0 && x[i-1] == 'x' { + copy(x[i-1:], "χ") + break + } + fallthrough + default: + x[i] = 'x' + } + } + return string(x) +} + +var fieldsInput = makeFieldsInput() + +func BenchmarkFields(b *testing.B) { + b.SetBytes(int64(len(fieldsInput))) + for i := 0; i < b.N; i++ { + Fields(fieldsInput) + } +} + +func BenchmarkFieldsFunc(b *testing.B) { + b.SetBytes(int64(len(fieldsInput))) + for i := 0; i < b.N; i++ { + FieldsFunc(fieldsInput, unicode.IsSpace) + } +} |