diff options
Diffstat (limited to 'libgo/go/strings')
-rw-r--r-- | libgo/go/strings/example_test.go | 6 | ||||
-rw-r--r-- | libgo/go/strings/replace.go | 2 | ||||
-rw-r--r-- | libgo/go/strings/strings.go | 147 | ||||
-rw-r--r-- | libgo/go/strings/strings_test.go | 145 |
4 files changed, 253 insertions, 47 deletions
diff --git a/libgo/go/strings/example_test.go b/libgo/go/strings/example_test.go index e31054a..4f3a1ce 100644 --- a/libgo/go/strings/example_test.go +++ b/libgo/go/strings/example_test.go @@ -47,12 +47,16 @@ func ExampleContains() { func ExampleContainsAny() { fmt.Println(strings.ContainsAny("team", "i")) - fmt.Println(strings.ContainsAny("failure", "u & i")) + fmt.Println(strings.ContainsAny("fail", "ui")) + fmt.Println(strings.ContainsAny("ure", "ui")) + fmt.Println(strings.ContainsAny("failure", "ui")) fmt.Println(strings.ContainsAny("foo", "")) fmt.Println(strings.ContainsAny("", "")) // Output: // false // true + // true + // true // false // false } diff --git a/libgo/go/strings/replace.go b/libgo/go/strings/replace.go index ace0b8d..ccab1fb 100644 --- a/libgo/go/strings/replace.go +++ b/libgo/go/strings/replace.go @@ -26,6 +26,8 @@ type replacer interface { // NewReplacer returns a new Replacer from a list of old, new string // pairs. Replacements are performed in the order they appear in the // target string, without overlapping matches. +// +// NewReplacer panics if given an odd number of arguments. func NewReplacer(oldnew ...string) *Replacer { if len(oldnew)%2 == 1 { panic("strings.NewReplacer: odd argument count") diff --git a/libgo/go/strings/strings.go b/libgo/go/strings/strings.go index a98f5d8..7337481 100644 --- a/libgo/go/strings/strings.go +++ b/libgo/go/strings/strings.go @@ -341,38 +341,38 @@ func Fields(s string) []string { wasSpace = isSpace } - if setBits < utf8.RuneSelf { // ASCII fast path - a := make([]string, n) - na := 0 - fieldStart := 0 - i := 0 - // Skip spaces in the front of the input. - for i < len(s) && asciiSpace[s[i]] != 0 { + if setBits >= utf8.RuneSelf { + // Some runes in the input string are not ASCII. + return FieldsFunc(s, unicode.IsSpace) + } + // ASCII fast path + a := make([]string, n) + na := 0 + fieldStart := 0 + i := 0 + // Skip spaces in the front of the input. + for i < len(s) && asciiSpace[s[i]] != 0 { + i++ + } + fieldStart = i + for i < len(s) { + if asciiSpace[s[i]] == 0 { i++ + continue } - fieldStart = i - for i < len(s) { - if asciiSpace[s[i]] == 0 { - i++ - continue - } - a[na] = s[fieldStart:i] - na++ + a[na] = s[fieldStart:i] + na++ + i++ + // Skip spaces in between fields. + for i < len(s) && asciiSpace[s[i]] != 0 { i++ - // Skip spaces in between fields. - for i < len(s) && asciiSpace[s[i]] != 0 { - i++ - } - fieldStart = i } - if fieldStart < len(s) { // Last field might end at EOF. - a[na] = s[fieldStart:] - } - return a + fieldStart = i } - - // Some runes in the input string are not ASCII. - return FieldsFunc(s, unicode.IsSpace) + if fieldStart < len(s) { // Last field might end at EOF. + a[na] = s[fieldStart:] + } + return a } // FieldsFunc splits the string s at each run of Unicode code points c satisfying f(c) @@ -550,7 +550,7 @@ func Repeat(s string, count int) string { return b.String() } -// ToUpper returns a copy of the string s with all Unicode letters mapped to their upper case. +// ToUpper returns s with all Unicode letters mapped to their upper case. func ToUpper(s string) string { isASCII, hasLower := true, false for i := 0; i < len(s); i++ { @@ -559,7 +559,7 @@ func ToUpper(s string) string { isASCII = false break } - hasLower = hasLower || (c >= 'a' && c <= 'z') + hasLower = hasLower || ('a' <= c && c <= 'z') } if isASCII { // optimize for ASCII-only strings. @@ -570,7 +570,7 @@ func ToUpper(s string) string { b.Grow(len(s)) for i := 0; i < len(s); i++ { c := s[i] - if c >= 'a' && c <= 'z' { + if 'a' <= c && c <= 'z' { c -= 'a' - 'A' } b.WriteByte(c) @@ -580,7 +580,7 @@ func ToUpper(s string) string { return Map(unicode.ToUpper, s) } -// ToLower returns a copy of the string s with all Unicode letters mapped to their lower case. +// ToLower returns s with all Unicode letters mapped to their lower case. func ToLower(s string) string { isASCII, hasUpper := true, false for i := 0; i < len(s); i++ { @@ -589,7 +589,7 @@ func ToLower(s string) string { isASCII = false break } - hasUpper = hasUpper || (c >= 'A' && c <= 'Z') + hasUpper = hasUpper || ('A' <= c && c <= 'Z') } if isASCII { // optimize for ASCII-only strings. @@ -600,7 +600,7 @@ func ToLower(s string) string { b.Grow(len(s)) for i := 0; i < len(s); i++ { c := s[i] - if c >= 'A' && c <= 'Z' { + if 'A' <= c && c <= 'Z' { c += 'a' - 'A' } b.WriteByte(c) @@ -631,6 +631,56 @@ func ToTitleSpecial(c unicode.SpecialCase, s string) string { return Map(c.ToTitle, s) } +// ToValidUTF8 returns a copy of the string s with each run of invalid UTF-8 byte sequences +// replaced by the replacement string, which may be empty. +func ToValidUTF8(s, replacement string) string { + var b Builder + + for i, c := range s { + if c != utf8.RuneError { + continue + } + + _, wid := utf8.DecodeRuneInString(s[i:]) + if wid == 1 { + b.Grow(len(s) + len(replacement)) + b.WriteString(s[:i]) + s = s[i:] + break + } + } + + // Fast path for unchanged input + if b.Cap() == 0 { // didn't call b.Grow above + return s + } + + invalid := false // previous byte was from an invalid UTF-8 sequence + for i := 0; i < len(s); { + c := s[i] + if c < utf8.RuneSelf { + i++ + invalid = false + b.WriteByte(c) + continue + } + _, wid := utf8.DecodeRuneInString(s[i:]) + if wid == 1 { + i++ + if !invalid { + invalid = true + b.WriteString(replacement) + } + continue + } + invalid = false + b.WriteString(s[i : i+wid]) + i += wid + } + + return b.String() +} + // isSeparator reports whether the rune could mark a word boundary. // TODO: update when package unicode captures more of the properties. func isSeparator(r rune) bool { @@ -818,7 +868,36 @@ func TrimRight(s string, cutset string) string { // TrimSpace returns a slice of the string s, with all leading // and trailing white space removed, as defined by Unicode. func TrimSpace(s string) string { - return TrimFunc(s, unicode.IsSpace) + // Fast path for ASCII: look for the first ASCII non-space byte + start := 0 + for ; start < len(s); start++ { + c := s[start] + if c >= utf8.RuneSelf { + // If we run into a non-ASCII byte, fall back to the + // slower unicode-aware method on the remaining bytes + return TrimFunc(s[start:], unicode.IsSpace) + } + if asciiSpace[c] == 0 { + break + } + } + + // Now look for the first ASCII non-space byte from the end + stop := len(s) + for ; stop > start; stop-- { + c := s[stop-1] + if c >= utf8.RuneSelf { + return TrimFunc(s[start:stop], unicode.IsSpace) + } + if asciiSpace[c] == 0 { + break + } + } + + // At this point s[start:stop] starts and ends with an ASCII + // non-space bytes, so we're done. Non-ASCII cases have already + // been handled above. + return s[start:stop] } // TrimPrefix returns s without the provided leading prefix string. diff --git a/libgo/go/strings/strings_test.go b/libgo/go/strings/strings_test.go index ae17eba..76d827b 100644 --- a/libgo/go/strings/strings_test.go +++ b/libgo/go/strings/strings_test.go @@ -200,6 +200,18 @@ func TestLastIndex(t *testing.T) { runIndexTests(t, LastIndex, "LastIndex", l func TestIndexAny(t *testing.T) { runIndexTests(t, IndexAny, "IndexAny", indexAnyTests) } func TestLastIndexAny(t *testing.T) { runIndexTests(t, LastIndexAny, "LastIndexAny", lastIndexAnyTests) } +func TestIndexByte(t *testing.T) { + for _, tt := range indexTests { + if len(tt.sep) != 1 { + continue + } + pos := IndexByte(tt.s, tt.sep[0]) + if pos != tt.out { + t.Errorf(`IndexByte(%q, %q) = %v; want %v`, tt.s, tt.sep[0], pos, tt.out) + } + } +} + func TestLastIndexByte(t *testing.T) { testCases := []IndexTest{ {"", "q", -1}, @@ -697,6 +709,36 @@ func TestToUpper(t *testing.T) { runStringTests(t, ToUpper, "ToUpper", upperTest func TestToLower(t *testing.T) { runStringTests(t, ToLower, "ToLower", lowerTests) } +var toValidUTF8Tests = []struct { + in string + repl string + out string +}{ + {"", "\uFFFD", ""}, + {"abc", "\uFFFD", "abc"}, + {"\uFDDD", "\uFFFD", "\uFDDD"}, + {"a\xffb", "\uFFFD", "a\uFFFDb"}, + {"a\xffb\uFFFD", "X", "aXb\uFFFD"}, + {"a☺\xffb☺\xC0\xAFc☺\xff", "", "a☺b☺c☺"}, + {"a☺\xffb☺\xC0\xAFc☺\xff", "日本語", "a☺日本語b☺日本語c☺日本語"}, + {"\xC0\xAF", "\uFFFD", "\uFFFD"}, + {"\xE0\x80\xAF", "\uFFFD", "\uFFFD"}, + {"\xed\xa0\x80", "abc", "abc"}, + {"\xed\xbf\xbf", "\uFFFD", "\uFFFD"}, + {"\xF0\x80\x80\xaf", "☺", "☺"}, + {"\xF8\x80\x80\x80\xAF", "\uFFFD", "\uFFFD"}, + {"\xFC\x80\x80\x80\x80\xAF", "\uFFFD", "\uFFFD"}, +} + +func TestToValidUTF8(t *testing.T) { + for _, tc := range toValidUTF8Tests { + got := ToValidUTF8(tc.in, tc.repl) + if got != tc.out { + t.Errorf("ToValidUTF8(%q, %q) = %q; want %q", tc.in, tc.repl, got, tc.out) + } + } +} + func BenchmarkToUpper(b *testing.B) { for _, tc := range upperTests { b.Run(tc.in, func(b *testing.B) { @@ -843,6 +885,26 @@ func BenchmarkTrim(b *testing.B) { } } +func BenchmarkToValidUTF8(b *testing.B) { + tests := []struct { + name string + input string + }{ + {"Valid", "typical"}, + {"InvalidASCII", "foo\xffbar"}, + {"InvalidNonASCII", "日本語\xff日本語"}, + } + replacement := "\uFFFD" + b.ResetTimer() + for _, test := range tests { + b.Run(test.name, func(b *testing.B) { + for i := 0; i < b.N; i++ { + ToValidUTF8(test.input, replacement) + } + }) + } +} + type predicate struct { f func(rune) bool name string @@ -868,23 +930,66 @@ func not(p predicate) predicate { } var trimFuncTests = []struct { - f predicate - in, out string + f predicate + in string + trimOut string + leftOut string + rightOut string }{ - {isSpace, space + " hello " + space, "hello"}, - {isDigit, "\u0e50\u0e5212hello34\u0e50\u0e51", "hello"}, - {isUpper, "\u2C6F\u2C6F\u2C6F\u2C6FABCDhelloEF\u2C6F\u2C6FGH\u2C6F\u2C6F", "hello"}, - {not(isSpace), "hello" + space + "hello", space}, - {not(isDigit), "hello\u0e50\u0e521234\u0e50\u0e51helo", "\u0e50\u0e521234\u0e50\u0e51"}, - {isValidRune, "ab\xc0a\xc0cd", "\xc0a\xc0"}, - {not(isValidRune), "\xc0a\xc0", "a"}, + {isSpace, space + " hello " + space, + "hello", + "hello " + space, + space + " hello"}, + {isDigit, "\u0e50\u0e5212hello34\u0e50\u0e51", + "hello", + "hello34\u0e50\u0e51", + "\u0e50\u0e5212hello"}, + {isUpper, "\u2C6F\u2C6F\u2C6F\u2C6FABCDhelloEF\u2C6F\u2C6FGH\u2C6F\u2C6F", + "hello", + "helloEF\u2C6F\u2C6FGH\u2C6F\u2C6F", + "\u2C6F\u2C6F\u2C6F\u2C6FABCDhello"}, + {not(isSpace), "hello" + space + "hello", + space, + space + "hello", + "hello" + space}, + {not(isDigit), "hello\u0e50\u0e521234\u0e50\u0e51helo", + "\u0e50\u0e521234\u0e50\u0e51", + "\u0e50\u0e521234\u0e50\u0e51helo", + "hello\u0e50\u0e521234\u0e50\u0e51"}, + {isValidRune, "ab\xc0a\xc0cd", + "\xc0a\xc0", + "\xc0a\xc0cd", + "ab\xc0a\xc0"}, + {not(isValidRune), "\xc0a\xc0", + "a", + "a\xc0", + "\xc0a"}, + {isSpace, "", + "", + "", + ""}, + {isSpace, " ", + "", + "", + ""}, } func TestTrimFunc(t *testing.T) { for _, tc := range trimFuncTests { - actual := TrimFunc(tc.in, tc.f.f) - if actual != tc.out { - t.Errorf("TrimFunc(%q, %q) = %q; want %q", tc.in, tc.f.name, actual, tc.out) + trimmers := []struct { + name string + trim func(s string, f func(r rune) bool) string + out string + }{ + {"TrimFunc", TrimFunc, tc.trimOut}, + {"TrimLeftFunc", TrimLeftFunc, tc.leftOut}, + {"TrimRightFunc", TrimRightFunc, tc.rightOut}, + } + for _, trimmer := range trimmers { + actual := trimmer.trim(tc.in, tc.f.f) + if actual != trimmer.out { + t.Errorf("%s(%q, %q) = %q; want %q", trimmer.name, tc.in, tc.f.name, actual, trimmer.out) + } } } } @@ -1735,3 +1840,19 @@ func BenchmarkJoin(b *testing.B) { }) } } + +func BenchmarkTrimSpace(b *testing.B) { + tests := []struct{ name, input string }{ + {"NoTrim", "typical"}, + {"ASCII", " foo bar "}, + {"SomeNonASCII", " \u2000\t\r\n x\t\t\r\r\ny\n \u3000 "}, + {"JustNonASCII", "\u2000\u2000\u2000☺☺☺☺\u3000\u3000\u3000"}, + } + for _, test := range tests { + b.Run(test.name, func(b *testing.B) { + for i := 0; i < b.N; i++ { + TrimSpace(test.input) + } + }) + } +} |