diff options
author | Ian Lance Taylor <ian@gcc.gnu.org> | 2012-11-21 07:03:38 +0000 |
---|---|---|
committer | Ian Lance Taylor <ian@gcc.gnu.org> | 2012-11-21 07:03:38 +0000 |
commit | fabcaa8df3d6eb852b87821ef090d31d222870b7 (patch) | |
tree | 72455aea0286937aa08cc141e5efc800e4626577 /libgo/go/encoding/xml/xml.go | |
parent | a51fb17f48428e7cfc96a72a9f9f87901363bb6b (diff) | |
download | gcc-fabcaa8df3d6eb852b87821ef090d31d222870b7.zip gcc-fabcaa8df3d6eb852b87821ef090d31d222870b7.tar.gz gcc-fabcaa8df3d6eb852b87821ef090d31d222870b7.tar.bz2 |
libgo: Update to current version of master library.
From-SVN: r193688
Diffstat (limited to 'libgo/go/encoding/xml/xml.go')
-rw-r--r-- | libgo/go/encoding/xml/xml.go | 243 |
1 files changed, 139 insertions, 104 deletions
diff --git a/libgo/go/encoding/xml/xml.go b/libgo/go/encoding/xml/xml.go index fbd2208..decb2be 100644 --- a/libgo/go/encoding/xml/xml.go +++ b/libgo/go/encoding/xml/xml.go @@ -181,7 +181,6 @@ type Decoder struct { ns map[string]string err error line int - tmp [32]byte } // NewDecoder creates a new XML parser reading from r. @@ -877,94 +876,103 @@ Input: // XML in all its glory allows a document to define and use // its own character names with <!ENTITY ...> directives. // Parsers are required to recognize lt, gt, amp, apos, and quot - // even if they have not been declared. That's all we allow. - var i int - var semicolon bool - var valid bool - for i = 0; i < len(d.tmp); i++ { - var ok bool - d.tmp[i], ok = d.getc() - if !ok { - if d.err == io.EOF { - d.err = d.syntaxError("unexpected EOF") - } + // even if they have not been declared. + before := d.buf.Len() + d.buf.WriteByte('&') + var ok bool + var text string + var haveText bool + if b, ok = d.mustgetc(); !ok { + return nil + } + if b == '#' { + d.buf.WriteByte(b) + if b, ok = d.mustgetc(); !ok { return nil } - c := d.tmp[i] - if c == ';' { - semicolon = true - valid = i > 0 - break - } - if 'a' <= c && c <= 'z' || - 'A' <= c && c <= 'Z' || - '0' <= c && c <= '9' || - c == '_' || c == '#' { - continue - } - d.ungetc(c) - break - } - s := string(d.tmp[0:i]) - if !valid { - if !d.Strict { - b0, b1 = 0, 0 - d.buf.WriteByte('&') - d.buf.Write(d.tmp[0:i]) - if semicolon { - d.buf.WriteByte(';') + base := 10 + if b == 'x' { + base = 16 + d.buf.WriteByte(b) + if b, ok = d.mustgetc(); !ok { + return nil } - continue Input } - semi := ";" - if !semicolon { - semi = " (no semicolon)" - } - if i < len(d.tmp) { - d.err = d.syntaxError("invalid character entity &" + s + semi) - } else { - d.err = d.syntaxError("invalid character entity &" + s + "... too long") + start := d.buf.Len() + for '0' <= b && b <= '9' || + base == 16 && 'a' <= b && b <= 'f' || + base == 16 && 'A' <= b && b <= 'F' { + d.buf.WriteByte(b) + if b, ok = d.mustgetc(); !ok { + return nil + } } - return nil - } - var haveText bool - var text string - if i >= 2 && s[0] == '#' { - var n uint64 - var err error - if i >= 3 && s[1] == 'x' { - n, err = strconv.ParseUint(s[2:], 16, 64) + if b != ';' { + d.ungetc(b) } else { - n, err = strconv.ParseUint(s[1:], 10, 64) - } - if err == nil && n <= unicode.MaxRune { - text = string(n) - haveText = true + s := string(d.buf.Bytes()[start:]) + d.buf.WriteByte(';') + n, err := strconv.ParseUint(s, base, 64) + if err == nil && n <= unicode.MaxRune { + text = string(n) + haveText = true + } } } else { - if r, ok := entity[s]; ok { - text = string(r) - haveText = true - } else if d.Entity != nil { - text, haveText = d.Entity[s] + d.ungetc(b) + if !d.readName() { + if d.err != nil { + return nil + } + ok = false } - } - if !haveText { - if !d.Strict { - b0, b1 = 0, 0 - d.buf.WriteByte('&') - d.buf.Write(d.tmp[0:i]) + if b, ok = d.mustgetc(); !ok { + return nil + } + if b != ';' { + d.ungetc(b) + } else { + name := d.buf.Bytes()[before+1:] d.buf.WriteByte(';') - continue Input + if isName(name) { + s := string(name) + if r, ok := entity[s]; ok { + text = string(r) + haveText = true + } else if d.Entity != nil { + text, haveText = d.Entity[s] + } + } } - d.err = d.syntaxError("invalid character entity &" + s + ";") - return nil } - d.buf.Write([]byte(text)) - b0, b1 = 0, 0 - continue Input + + if haveText { + d.buf.Truncate(before) + d.buf.Write([]byte(text)) + b0, b1 = 0, 0 + continue Input + } + if !d.Strict { + b0, b1 = 0, 0 + continue Input + } + ent := string(d.buf.Bytes()[before]) + if ent[len(ent)-1] != ';' { + ent += " (no semicolon)" + } + d.err = d.syntaxError("invalid character entity " + ent) + return nil } - d.buf.WriteByte(b) + + // We must rewrite unescaped \r and \r\n into \n. + if b == '\r' { + d.buf.WriteByte('\n') + } else if b1 == '\r' && b == '\n' { + // Skip \r\n--we already wrote \n. + } else { + d.buf.WriteByte(b) + } + b0, b1 = b1, b } data := d.buf.Bytes() @@ -985,20 +993,7 @@ Input: } } - // Must rewrite \r and \r\n into \n. - w := 0 - for r := 0; r < len(data); r++ { - b := data[r] - if b == '\r' { - if r+1 < len(data) && data[r+1] == '\n' { - continue - } - b = '\n' - } - data[w] = b - w++ - } - return data[0:w] + return data } // Decide whether the given rune is in the XML Character Range, per @@ -1034,18 +1029,34 @@ func (d *Decoder) nsname() (name Name, ok bool) { // Do not set d.err if the name is missing (unless unexpected EOF is received): // let the caller provide better context. func (d *Decoder) name() (s string, ok bool) { + d.buf.Reset() + if !d.readName() { + return "", false + } + + // Now we check the characters. + s = d.buf.String() + if !isName([]byte(s)) { + d.err = d.syntaxError("invalid XML name: " + s) + return "", false + } + return s, true +} + +// Read a name and append its bytes to d.buf. +// The name is delimited by any single-byte character not valid in names. +// All multi-byte characters are accepted; the caller must check their validity. +func (d *Decoder) readName() (ok bool) { var b byte if b, ok = d.mustgetc(); !ok { return } - - // As a first approximation, we gather the bytes [A-Za-z_:.-\x80-\xFF]* if b < utf8.RuneSelf && !isNameByte(b) { d.ungetc(b) - return "", false + return false } - d.buf.Reset() d.buf.WriteByte(b) + for { if b, ok = d.mustgetc(); !ok { return @@ -1056,16 +1067,7 @@ func (d *Decoder) name() (s string, ok bool) { } d.buf.WriteByte(b) } - - // Then we check the characters. - s = d.buf.String() - for i, c := range s { - if !unicode.Is(first, c) && (i == 0 || !unicode.Is(second, c)) { - d.err = d.syntaxError("invalid XML name: " + s) - return "", false - } - } - return s, true + return true } func isNameByte(c byte) bool { @@ -1075,6 +1077,30 @@ func isNameByte(c byte) bool { c == '_' || c == ':' || c == '.' || c == '-' } +func isName(s []byte) bool { + if len(s) == 0 { + return false + } + c, n := utf8.DecodeRune(s) + if c == utf8.RuneError && n == 1 { + return false + } + if !unicode.Is(first, c) { + return false + } + for n < len(s) { + s = s[n:] + c, n = utf8.DecodeRune(s) + if c == utf8.RuneError && n == 1 { + return false + } + if !unicode.Is(first, c) && !unicode.Is(second, c) { + return false + } + } + return true +} + // These tables were generated by cut and paste from Appendix B of // the XML spec at http://www.xml.com/axml/testaxml.htm // and then reformatting. First corresponds to (Letter | '_' | ':') @@ -1689,6 +1715,9 @@ var ( esc_amp = []byte("&") esc_lt = []byte("<") esc_gt = []byte(">") + esc_tab = []byte("	") + esc_nl = []byte("
") + esc_cr = []byte("
") ) // Escape writes to w the properly escaped XML equivalent @@ -1708,6 +1737,12 @@ func Escape(w io.Writer, s []byte) { esc = esc_lt case '>': esc = esc_gt + case '\t': + esc = esc_tab + case '\n': + esc = esc_nl + case '\r': + esc = esc_cr default: continue } |