diff options
Diffstat (limited to 'libgo/go/unicode/utf8/utf8.go')
-rw-r--r-- | libgo/go/unicode/utf8/utf8.go | 54 |
1 file changed, 50 insertions, 4 deletions
diff --git a/libgo/go/unicode/utf8/utf8.go b/libgo/go/unicode/utf8/utf8.go index 57ea19e..ad23577 100644 --- a/libgo/go/unicode/utf8/utf8.go +++ b/libgo/go/unicode/utf8/utf8.go @@ -18,6 +18,12 @@ const ( UTFMax = 4 // maximum number of bytes of a UTF-8 encoded Unicode character. ) +// Code points in the surrogate range are not valid for UTF-8. +const ( + surrogateMin = 0xD800 + surrogateMax = 0xDFFF +) + const ( t1 = 0x00 // 0000 0000 tx = 0x80 // 1000 0000 @@ -34,7 +40,6 @@ const ( rune1Max = 1<<7 - 1 rune2Max = 1<<11 - 1 rune3Max = 1<<16 - 1 - rune4Max = 1<<21 - 1 ) func decodeRuneInternal(p []byte) (r rune, size int, short bool) { @@ -87,6 +92,9 @@ func decodeRuneInternal(p []byte) (r rune, size int, short bool) { if r <= rune2Max { return RuneError, 1, false } + if surrogateMin <= r && r <= surrogateMax { + return RuneError, 1, false + } return r, 3, false } @@ -102,7 +110,7 @@ func decodeRuneInternal(p []byte) (r rune, size int, short bool) { // 4-byte, 21-bit sequence? if c0 < t5 { r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx) - if r <= rune3Max { + if r <= rune3Max || MaxRune < r { return RuneError, 1, false } return r, 4, false @@ -162,6 +170,9 @@ func decodeRuneInStringInternal(s string) (r rune, size int, short bool) { if r <= rune2Max { return RuneError, 1, false } + if surrogateMin <= r && r <= surrogateMax { + return RuneError, 1, false + } return r, 3, false } @@ -177,7 +188,7 @@ func decodeRuneInStringInternal(s string) (r rune, size int, short bool) { // 4-byte, 21-bit sequence? if c0 < t5 { r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx) - if r <= rune3Max { + if r <= rune3Max || MaxRune < r { return RuneError, 1, false } return r, 4, false @@ -202,6 +213,9 @@ func FullRuneInString(s string) bool { // DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and its width in bytes. 
// If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8. +// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is +// out of range, or is not the shortest possible UTF-8 encoding for the +// value. No other validation is performed. func DecodeRune(p []byte) (r rune, size int) { r, size, _ = decodeRuneInternal(p) return @@ -209,6 +223,9 @@ func DecodeRune(p []byte) (r rune, size int) { // DecodeRuneInString is like DecodeRune but its input is a string. // If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8. +// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is +// out of range, or is not the shortest possible UTF-8 encoding for the +// value. No other validation is performed. func DecodeRuneInString(s string) (r rune, size int) { r, size, _ = decodeRuneInStringInternal(s) return @@ -216,6 +233,9 @@ func DecodeRuneInString(s string) (r rune, size int) { // DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune and its width in bytes. // If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8. +// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is +// out of range, or is not the shortest possible UTF-8 encoding for the +// value. No other validation is performed. func DecodeLastRune(p []byte) (r rune, size int) { end := len(p) if end == 0 { @@ -250,6 +270,9 @@ func DecodeLastRune(p []byte) (r rune, size int) { // DecodeLastRuneInString is like DecodeLastRune but its input is a string. // If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8. +// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is +// out of range, or is not the shortest possible UTF-8 encoding for the +// value. No other validation is performed. 
func DecodeLastRuneInString(s string) (r rune, size int) { end := len(s) if end == 0 { @@ -283,15 +306,20 @@ func DecodeLastRuneInString(s string) (r rune, size int) { } // RuneLen returns the number of bytes required to encode the rune. +// It returns -1 if the rune is not a valid value to encode in UTF-8. func RuneLen(r rune) int { switch { + case r < 0: + return -1 case r <= rune1Max: return 1 case r <= rune2Max: return 2 + case surrogateMin <= r && r <= surrogateMax: + return -1 case r <= rune3Max: return 3 - case r <= rune4Max: + case r <= MaxRune: return 4 } return -1 @@ -316,6 +344,10 @@ func EncodeRune(p []byte, r rune) int { r = RuneError } + if surrogateMin <= r && r <= surrogateMax { + r = RuneError + } + if uint32(r) <= rune3Max { p[0] = t3 | byte(r>>12) p[1] = tx | byte(r>>6)&maskx @@ -395,3 +427,17 @@ func ValidString(s string) bool { } return true } + +// ValidRune reports whether r can be legally encoded as UTF-8. +// Code points that are out of range or a surrogate half are illegal. +func ValidRune(r rune) bool { + switch { + case r < 0: + return false + case surrogateMin <= r && r <= surrogateMax: + return false + case r > MaxRune: + return false + } + return true +} |