aboutsummaryrefslogtreecommitdiff
path: root/libgo/go/strings/strings.go
diff options
context:
space:
mode:
Diffstat (limited to 'libgo/go/strings/strings.go')
-rw-r--r--libgo/go/strings/strings.go147
1 files changed, 113 insertions, 34 deletions
diff --git a/libgo/go/strings/strings.go b/libgo/go/strings/strings.go
index a98f5d8..7337481 100644
--- a/libgo/go/strings/strings.go
+++ b/libgo/go/strings/strings.go
@@ -341,38 +341,38 @@ func Fields(s string) []string {
wasSpace = isSpace
}
- if setBits < utf8.RuneSelf { // ASCII fast path
- a := make([]string, n)
- na := 0
- fieldStart := 0
- i := 0
- // Skip spaces in the front of the input.
- for i < len(s) && asciiSpace[s[i]] != 0 {
+ if setBits >= utf8.RuneSelf {
+ // Some runes in the input string are not ASCII.
+ return FieldsFunc(s, unicode.IsSpace)
+ }
+ // ASCII fast path
+ a := make([]string, n)
+ na := 0
+ fieldStart := 0
+ i := 0
+ // Skip spaces in the front of the input.
+ for i < len(s) && asciiSpace[s[i]] != 0 {
+ i++
+ }
+ fieldStart = i
+ for i < len(s) {
+ if asciiSpace[s[i]] == 0 {
i++
+ continue
}
- fieldStart = i
- for i < len(s) {
- if asciiSpace[s[i]] == 0 {
- i++
- continue
- }
- a[na] = s[fieldStart:i]
- na++
+ a[na] = s[fieldStart:i]
+ na++
+ i++
+ // Skip spaces in between fields.
+ for i < len(s) && asciiSpace[s[i]] != 0 {
i++
- // Skip spaces in between fields.
- for i < len(s) && asciiSpace[s[i]] != 0 {
- i++
- }
- fieldStart = i
}
- if fieldStart < len(s) { // Last field might end at EOF.
- a[na] = s[fieldStart:]
- }
- return a
+ fieldStart = i
}
-
- // Some runes in the input string are not ASCII.
- return FieldsFunc(s, unicode.IsSpace)
+ if fieldStart < len(s) { // Last field might end at EOF.
+ a[na] = s[fieldStart:]
+ }
+ return a
}
// FieldsFunc splits the string s at each run of Unicode code points c satisfying f(c)
@@ -550,7 +550,7 @@ func Repeat(s string, count int) string {
return b.String()
}
-// ToUpper returns a copy of the string s with all Unicode letters mapped to their upper case.
+// ToUpper returns s with all Unicode letters mapped to their upper case.
func ToUpper(s string) string {
isASCII, hasLower := true, false
for i := 0; i < len(s); i++ {
@@ -559,7 +559,7 @@ func ToUpper(s string) string {
isASCII = false
break
}
- hasLower = hasLower || (c >= 'a' && c <= 'z')
+ hasLower = hasLower || ('a' <= c && c <= 'z')
}
if isASCII { // optimize for ASCII-only strings.
@@ -570,7 +570,7 @@ func ToUpper(s string) string {
b.Grow(len(s))
for i := 0; i < len(s); i++ {
c := s[i]
- if c >= 'a' && c <= 'z' {
+ if 'a' <= c && c <= 'z' {
c -= 'a' - 'A'
}
b.WriteByte(c)
@@ -580,7 +580,7 @@ func ToUpper(s string) string {
return Map(unicode.ToUpper, s)
}
-// ToLower returns a copy of the string s with all Unicode letters mapped to their lower case.
+// ToLower returns s with all Unicode letters mapped to their lower case.
func ToLower(s string) string {
isASCII, hasUpper := true, false
for i := 0; i < len(s); i++ {
@@ -589,7 +589,7 @@ func ToLower(s string) string {
isASCII = false
break
}
- hasUpper = hasUpper || (c >= 'A' && c <= 'Z')
+ hasUpper = hasUpper || ('A' <= c && c <= 'Z')
}
if isASCII { // optimize for ASCII-only strings.
@@ -600,7 +600,7 @@ func ToLower(s string) string {
b.Grow(len(s))
for i := 0; i < len(s); i++ {
c := s[i]
- if c >= 'A' && c <= 'Z' {
+ if 'A' <= c && c <= 'Z' {
c += 'a' - 'A'
}
b.WriteByte(c)
@@ -631,6 +631,56 @@ func ToTitleSpecial(c unicode.SpecialCase, s string) string {
return Map(c.ToTitle, s)
}
+// ToValidUTF8 returns a copy of the string s with each run of invalid UTF-8 byte sequences
+// replaced by the replacement string, which may be empty.
+func ToValidUTF8(s, replacement string) string {
+ var b Builder
+
+ for i, c := range s {
+ if c != utf8.RuneError {
+ continue
+ }
+
+ _, wid := utf8.DecodeRuneInString(s[i:])
+ if wid == 1 {
+ b.Grow(len(s) + len(replacement))
+ b.WriteString(s[:i])
+ s = s[i:]
+ break
+ }
+ }
+
+ // Fast path for unchanged input
+ if b.Cap() == 0 { // didn't call b.Grow above
+ return s
+ }
+
+ invalid := false // previous byte was from an invalid UTF-8 sequence
+ for i := 0; i < len(s); {
+ c := s[i]
+ if c < utf8.RuneSelf {
+ i++
+ invalid = false
+ b.WriteByte(c)
+ continue
+ }
+ _, wid := utf8.DecodeRuneInString(s[i:])
+ if wid == 1 {
+ i++
+ if !invalid {
+ invalid = true
+ b.WriteString(replacement)
+ }
+ continue
+ }
+ invalid = false
+ b.WriteString(s[i : i+wid])
+ i += wid
+ }
+
+ return b.String()
+}
+
// isSeparator reports whether the rune could mark a word boundary.
// TODO: update when package unicode captures more of the properties.
func isSeparator(r rune) bool {
@@ -818,7 +868,36 @@ func TrimRight(s string, cutset string) string {
// TrimSpace returns a slice of the string s, with all leading
// and trailing white space removed, as defined by Unicode.
func TrimSpace(s string) string {
- return TrimFunc(s, unicode.IsSpace)
+ // Fast path for ASCII: look for the first ASCII non-space byte
+ start := 0
+ for ; start < len(s); start++ {
+ c := s[start]
+ if c >= utf8.RuneSelf {
+ // If we run into a non-ASCII byte, fall back to the
+ // slower unicode-aware method on the remaining bytes
+ return TrimFunc(s[start:], unicode.IsSpace)
+ }
+ if asciiSpace[c] == 0 {
+ break
+ }
+ }
+
+ // Now look for the first ASCII non-space byte from the end
+ stop := len(s)
+ for ; stop > start; stop-- {
+ c := s[stop-1]
+ if c >= utf8.RuneSelf {
+ return TrimFunc(s[start:stop], unicode.IsSpace)
+ }
+ if asciiSpace[c] == 0 {
+ break
+ }
+ }
+
+ // At this point s[start:stop] starts and ends with an ASCII
+ // non-space bytes, so we're done. Non-ASCII cases have already
+ // been handled above.
+ return s[start:stop]
}
// TrimPrefix returns s without the provided leading prefix string.