aboutsummaryrefslogtreecommitdiff
path: root/libgo/go/unicode/utf8/utf8.go
diff options
context:
space:
mode:
Diffstat (limited to 'libgo/go/unicode/utf8/utf8.go')
-rw-r--r--libgo/go/unicode/utf8/utf8.go54
1 files changed, 50 insertions, 4 deletions
diff --git a/libgo/go/unicode/utf8/utf8.go b/libgo/go/unicode/utf8/utf8.go
index 57ea19e..ad23577 100644
--- a/libgo/go/unicode/utf8/utf8.go
+++ b/libgo/go/unicode/utf8/utf8.go
@@ -18,6 +18,12 @@ const (
UTFMax = 4 // maximum number of bytes of a UTF-8 encoded Unicode character.
)
+// Code points in the surrogate range are not valid for UTF-8.
+const (
+ surrogateMin = 0xD800
+ surrogateMax = 0xDFFF
+)
+
const (
t1 = 0x00 // 0000 0000
tx = 0x80 // 1000 0000
@@ -34,7 +40,6 @@ const (
rune1Max = 1<<7 - 1
rune2Max = 1<<11 - 1
rune3Max = 1<<16 - 1
- rune4Max = 1<<21 - 1
)
func decodeRuneInternal(p []byte) (r rune, size int, short bool) {
@@ -87,6 +92,9 @@ func decodeRuneInternal(p []byte) (r rune, size int, short bool) {
if r <= rune2Max {
return RuneError, 1, false
}
+ if surrogateMin <= r && r <= surrogateMax {
+ return RuneError, 1, false
+ }
return r, 3, false
}
@@ -102,7 +110,7 @@ func decodeRuneInternal(p []byte) (r rune, size int, short bool) {
// 4-byte, 21-bit sequence?
if c0 < t5 {
r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx)
- if r <= rune3Max {
+ if r <= rune3Max || MaxRune < r {
return RuneError, 1, false
}
return r, 4, false
@@ -162,6 +170,9 @@ func decodeRuneInStringInternal(s string) (r rune, size int, short bool) {
if r <= rune2Max {
return RuneError, 1, false
}
+ if surrogateMin <= r && r <= surrogateMax {
+ return RuneError, 1, false
+ }
return r, 3, false
}
@@ -177,7 +188,7 @@ func decodeRuneInStringInternal(s string) (r rune, size int, short bool) {
// 4-byte, 21-bit sequence?
if c0 < t5 {
r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx)
- if r <= rune3Max {
+ if r <= rune3Max || MaxRune < r {
return RuneError, 1, false
}
return r, 4, false
@@ -202,6 +213,9 @@ func FullRuneInString(s string) bool {
// DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and its width in bytes.
// If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8.
+// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
+// out of range, or is not the shortest possible UTF-8 encoding for the
+// value. No other validation is performed.
func DecodeRune(p []byte) (r rune, size int) {
r, size, _ = decodeRuneInternal(p)
return
@@ -209,6 +223,9 @@ func DecodeRune(p []byte) (r rune, size int) {
// DecodeRuneInString is like DecodeRune but its input is a string.
// If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8.
+// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
+// out of range, or is not the shortest possible UTF-8 encoding for the
+// value. No other validation is performed.
func DecodeRuneInString(s string) (r rune, size int) {
r, size, _ = decodeRuneInStringInternal(s)
return
@@ -216,6 +233,9 @@ func DecodeRuneInString(s string) (r rune, size int) {
// DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune and its width in bytes.
// If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8.
+// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
+// out of range, or is not the shortest possible UTF-8 encoding for the
+// value. No other validation is performed.
func DecodeLastRune(p []byte) (r rune, size int) {
end := len(p)
if end == 0 {
@@ -250,6 +270,9 @@ func DecodeLastRune(p []byte) (r rune, size int) {
// DecodeLastRuneInString is like DecodeLastRune but its input is a string.
// If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8.
+// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
+// out of range, or is not the shortest possible UTF-8 encoding for the
+// value. No other validation is performed.
func DecodeLastRuneInString(s string) (r rune, size int) {
end := len(s)
if end == 0 {
@@ -283,15 +306,20 @@ func DecodeLastRuneInString(s string) (r rune, size int) {
}
// RuneLen returns the number of bytes required to encode the rune.
+// It returns -1 if the rune is not a valid value to encode in UTF-8.
func RuneLen(r rune) int {
switch {
+ case r < 0:
+ return -1
case r <= rune1Max:
return 1
case r <= rune2Max:
return 2
+ case surrogateMin <= r && r <= surrogateMax:
+ return -1
case r <= rune3Max:
return 3
- case r <= rune4Max:
+ case r <= MaxRune:
return 4
}
return -1
@@ -316,6 +344,10 @@ func EncodeRune(p []byte, r rune) int {
r = RuneError
}
+ if surrogateMin <= r && r <= surrogateMax {
+ r = RuneError
+ }
+
if uint32(r) <= rune3Max {
p[0] = t3 | byte(r>>12)
p[1] = tx | byte(r>>6)&maskx
@@ -395,3 +427,17 @@ func ValidString(s string) bool {
}
return true
}
+
+// ValidRune reports whether r can be legally encoded as UTF-8.
+// Code points that are out of range or a surrogate half are illegal.
+func ValidRune(r rune) bool {
+ switch {
+ case r < 0:
+ return false
+ case surrogateMin <= r && r <= surrogateMax:
+ return false
+ case r > MaxRune:
+ return false
+ }
+ return true
+}