diff options
Diffstat (limited to 'libgo/go/encoding/xml')
-rw-r--r-- | libgo/go/encoding/xml/marshal_test.go | 21 | ||||
-rw-r--r-- | libgo/go/encoding/xml/xml.go | 243 | ||||
-rw-r--r-- | libgo/go/encoding/xml/xml_test.go | 35 |
3 files changed, 195 insertions, 104 deletions
diff --git a/libgo/go/encoding/xml/marshal_test.go b/libgo/go/encoding/xml/marshal_test.go index e729a24..668fea6 100644 --- a/libgo/go/encoding/xml/marshal_test.go +++ b/libgo/go/encoding/xml/marshal_test.go @@ -687,6 +687,27 @@ var marshalTests = []struct { Value: &IgnoreTest{}, UnmarshalOnly: true, }, + + // Test escaping. + { + ExpectXML: `<a><nested><value>dquote: "; squote: '; ampersand: &; less: <; greater: >;</value></nested></a>`, + Value: &AnyTest{ + Nested: `dquote: "; squote: '; ampersand: &; less: <; greater: >;`, + }, + }, + { + ExpectXML: `<a><nested><value>newline: 
; cr: 
; tab: 	;</value></nested></a>`, + Value: &AnyTest{ + Nested: "newline: \n; cr: \r; tab: \t;", + }, + }, + { + ExpectXML: "<a><nested><value>1\r2\r\n3\n\r4\n5</value></nested></a>", + Value: &AnyTest{ + Nested: "1\n2\n3\n\n4\n5", + }, + UnmarshalOnly: true, + }, } func TestMarshal(t *testing.T) { diff --git a/libgo/go/encoding/xml/xml.go b/libgo/go/encoding/xml/xml.go index fbd2208..decb2be 100644 --- a/libgo/go/encoding/xml/xml.go +++ b/libgo/go/encoding/xml/xml.go @@ -181,7 +181,6 @@ type Decoder struct { ns map[string]string err error line int - tmp [32]byte } // NewDecoder creates a new XML parser reading from r. @@ -877,94 +876,103 @@ Input: // XML in all its glory allows a document to define and use // its own character names with <!ENTITY ...> directives. // Parsers are required to recognize lt, gt, amp, apos, and quot - // even if they have not been declared. That's all we allow. - var i int - var semicolon bool - var valid bool - for i = 0; i < len(d.tmp); i++ { - var ok bool - d.tmp[i], ok = d.getc() - if !ok { - if d.err == io.EOF { - d.err = d.syntaxError("unexpected EOF") - } + // even if they have not been declared. + before := d.buf.Len() + d.buf.WriteByte('&') + var ok bool + var text string + var haveText bool + if b, ok = d.mustgetc(); !ok { + return nil + } + if b == '#' { + d.buf.WriteByte(b) + if b, ok = d.mustgetc(); !ok { return nil } - c := d.tmp[i] - if c == ';' { - semicolon = true - valid = i > 0 - break - } - if 'a' <= c && c <= 'z' || - 'A' <= c && c <= 'Z' || - '0' <= c && c <= '9' || - c == '_' || c == '#' { - continue - } - d.ungetc(c) - break - } - s := string(d.tmp[0:i]) - if !valid { - if !d.Strict { - b0, b1 = 0, 0 - d.buf.WriteByte('&') - d.buf.Write(d.tmp[0:i]) - if semicolon { - d.buf.WriteByte(';') + base := 10 + if b == 'x' { + base = 16 + d.buf.WriteByte(b) + if b, ok = d.mustgetc(); !ok { + return nil } - continue Input } - semi := ";" - if !semicolon { - semi = " (no semicolon)" - } - if i < len(d.tmp) { - d.err = d.syntaxError("invalid character entity &" + s + semi) - } else { - d.err = d.syntaxError("invalid character entity &" + s + "... too long") + start := d.buf.Len() + for '0' <= b && b <= '9' || + base == 16 && 'a' <= b && b <= 'f' || + base == 16 && 'A' <= b && b <= 'F' { + d.buf.WriteByte(b) + if b, ok = d.mustgetc(); !ok { + return nil + } } - return nil - } - var haveText bool - var text string - if i >= 2 && s[0] == '#' { - var n uint64 - var err error - if i >= 3 && s[1] == 'x' { - n, err = strconv.ParseUint(s[2:], 16, 64) + if b != ';' { + d.ungetc(b) } else { - n, err = strconv.ParseUint(s[1:], 10, 64) - } - if err == nil && n <= unicode.MaxRune { - text = string(n) - haveText = true + s := string(d.buf.Bytes()[start:]) + d.buf.WriteByte(';') + n, err := strconv.ParseUint(s, base, 64) + if err == nil && n <= unicode.MaxRune { + text = string(n) + haveText = true + } } } else { - if r, ok := entity[s]; ok { - text = string(r) - haveText = true - } else if d.Entity != nil { - text, haveText = d.Entity[s] + d.ungetc(b) + if !d.readName() { + if d.err != nil { + return nil + } + ok = false } - } - if !haveText { - if !d.Strict { - b0, b1 = 0, 0 - d.buf.WriteByte('&') - d.buf.Write(d.tmp[0:i]) + if b, ok = d.mustgetc(); !ok { + return nil + } + if b != ';' { + d.ungetc(b) + } else { + name := d.buf.Bytes()[before+1:] d.buf.WriteByte(';') - continue Input + if isName(name) { + s := string(name) + if r, ok := entity[s]; ok { + text = string(r) + haveText = true + } else if d.Entity != nil { + text, haveText = d.Entity[s] + } + } } - d.err = d.syntaxError("invalid character entity &" + s + ";") - return nil } - d.buf.Write([]byte(text)) - b0, b1 = 0, 0 - continue Input + + if haveText { + d.buf.Truncate(before) + d.buf.Write([]byte(text)) + b0, b1 = 0, 0 + continue Input + } + if !d.Strict { + b0, b1 = 0, 0 + continue Input + } + ent := string(d.buf.Bytes()[before]) + if ent[len(ent)-1] != ';' { + ent += " (no semicolon)" + } + d.err = d.syntaxError("invalid character entity " + ent) + return nil } - d.buf.WriteByte(b) + + // We must rewrite unescaped \r and \r\n into \n. + if b == '\r' { + d.buf.WriteByte('\n') + } else if b1 == '\r' && b == '\n' { + // Skip \r\n--we already wrote \n. + } else { + d.buf.WriteByte(b) + } + b0, b1 = b1, b } data := d.buf.Bytes() @@ -985,20 +993,7 @@ Input: } } - // Must rewrite \r and \r\n into \n. - w := 0 - for r := 0; r < len(data); r++ { - b := data[r] - if b == '\r' { - if r+1 < len(data) && data[r+1] == '\n' { - continue - } - b = '\n' - } - data[w] = b - w++ - } - return data[0:w] + return data } // Decide whether the given rune is in the XML Character Range, per @@ -1034,18 +1029,34 @@ func (d *Decoder) nsname() (name Name, ok bool) { // Do not set d.err if the name is missing (unless unexpected EOF is received): // let the caller provide better context. func (d *Decoder) name() (s string, ok bool) { + d.buf.Reset() + if !d.readName() { + return "", false + } + + // Now we check the characters. + s = d.buf.String() + if !isName([]byte(s)) { + d.err = d.syntaxError("invalid XML name: " + s) + return "", false + } + return s, true +} + +// Read a name and append its bytes to d.buf. +// The name is delimited by any single-byte character not valid in names. +// All multi-byte characters are accepted; the caller must check their validity. +func (d *Decoder) readName() (ok bool) { var b byte if b, ok = d.mustgetc(); !ok { return } - - // As a first approximation, we gather the bytes [A-Za-z_:.-\x80-\xFF]* if b < utf8.RuneSelf && !isNameByte(b) { d.ungetc(b) - return "", false + return false } - d.buf.Reset() d.buf.WriteByte(b) + for { if b, ok = d.mustgetc(); !ok { return @@ -1056,16 +1067,7 @@ func (d *Decoder) name() (s string, ok bool) { } d.buf.WriteByte(b) } - - // Then we check the characters. - s = d.buf.String() - for i, c := range s { - if !unicode.Is(first, c) && (i == 0 || !unicode.Is(second, c)) { - d.err = d.syntaxError("invalid XML name: " + s) - return "", false - } - } - return s, true + return true } func isNameByte(c byte) bool { @@ -1075,6 +1077,30 @@ func isNameByte(c byte) bool { c == '_' || c == ':' || c == '.' || c == '-' } +func isName(s []byte) bool { + if len(s) == 0 { + return false + } + c, n := utf8.DecodeRune(s) + if c == utf8.RuneError && n == 1 { + return false + } + if !unicode.Is(first, c) { + return false + } + for n < len(s) { + s = s[n:] + c, n = utf8.DecodeRune(s) + if c == utf8.RuneError && n == 1 { + return false + } + if !unicode.Is(first, c) && !unicode.Is(second, c) { + return false + } + } + return true +} + // These tables were generated by cut and paste from Appendix B of // the XML spec at http://www.xml.com/axml/testaxml.htm // and then reformatting. First corresponds to (Letter | '_' | ':') @@ -1689,6 +1715,9 @@ var ( esc_amp = []byte("&") esc_lt = []byte("<") esc_gt = []byte(">") + esc_tab = []byte("	") + esc_nl = []byte("
") + esc_cr = []byte("
") ) // Escape writes to w the properly escaped XML equivalent @@ -1708,6 +1737,12 @@ func Escape(w io.Writer, s []byte) { esc = esc_lt case '>': esc = esc_gt + case '\t': + esc = esc_tab + case '\n': + esc = esc_nl + case '\r': + esc = esc_cr default: continue } diff --git a/libgo/go/encoding/xml/xml_test.go b/libgo/go/encoding/xml/xml_test.go index 2ad4d4a..981d352 100644 --- a/libgo/go/encoding/xml/xml_test.go +++ b/libgo/go/encoding/xml/xml_test.go @@ -19,6 +19,7 @@ const testInput = ` <body xmlns:foo="ns1" xmlns="ns2" xmlns:tag="ns3" ` + "\r\n\t" + ` > <hello lang="en">World <>'" 白鵬翔</hello> + <query>&何; &is-it;</query> <goodbye /> <outer foo:attr="value" xmlns:tag="ns4"> <inner/> @@ -28,6 +29,8 @@ const testInput = ` </tag:name> </body><!-- missing final newline -->` +var testEntity = map[string]string{"何": "What", "is-it": "is it?"} + var rawTokens = []Token{ CharData("\n"), ProcInst{"xml", []byte(`version="1.0" encoding="UTF-8"`)}, @@ -41,6 +44,10 @@ var rawTokens = []Token{ CharData("World <>'\" 白鵬翔"), EndElement{Name{"", "hello"}}, CharData("\n "), + StartElement{Name{"", "query"}, []Attr{}}, + CharData("What is it?"), + EndElement{Name{"", "query"}}, + CharData("\n "), StartElement{Name{"", "goodbye"}, []Attr{}}, EndElement{Name{"", "goodbye"}}, CharData("\n "), @@ -74,6 +81,10 @@ var cookedTokens = []Token{ CharData("World <>'\" 白鵬翔"), EndElement{Name{"ns2", "hello"}}, CharData("\n "), + StartElement{Name{"ns2", "query"}, []Attr{}}, + CharData("What is it?"), + EndElement{Name{"ns2", "query"}}, + CharData("\n "), StartElement{Name{"ns2", "goodbye"}, []Attr{}}, EndElement{Name{"ns2", "goodbye"}}, CharData("\n "), @@ -156,6 +167,7 @@ var xmlInput = []string{ func TestRawToken(t *testing.T) { d := NewDecoder(strings.NewReader(testInput)) + d.Entity = testEntity testRawToken(t, d, rawTokens) } @@ -164,8 +176,14 @@ const nonStrictInput = ` <tag>&unknown;entity</tag> <tag>{</tag> <tag>&#zzz;</tag> +<tag>&なまえ3;</tag> +<tag><-gt;</tag> +<tag>&;</tag> +<tag>&0a;</tag> ` +var nonStringEntity = map[string]string{"": "oops!", "0a": "oops!"} + var nonStrictTokens = []Token{ CharData("\n"), StartElement{Name{"", "tag"}, []Attr{}}, @@ -184,6 +202,22 @@ var nonStrictTokens = []Token{ CharData("&#zzz;"), EndElement{Name{"", "tag"}}, CharData("\n"), + StartElement{Name{"", "tag"}, []Attr{}}, + CharData("&なまえ3;"), + EndElement{Name{"", "tag"}}, + CharData("\n"), + StartElement{Name{"", "tag"}, []Attr{}}, + CharData("<-gt;"), + EndElement{Name{"", "tag"}}, + CharData("\n"), + StartElement{Name{"", "tag"}, []Attr{}}, + CharData("&;"), + EndElement{Name{"", "tag"}}, + CharData("\n"), + StartElement{Name{"", "tag"}, []Attr{}}, + CharData("&0a;"), + EndElement{Name{"", "tag"}}, + CharData("\n"), } func TestNonStrictRawToken(t *testing.T) { @@ -317,6 +351,7 @@ func TestNestedDirectives(t *testing.T) { func TestToken(t *testing.T) { d := NewDecoder(strings.NewReader(testInput)) + d.Entity = testEntity for i, want := range cookedTokens { have, err := d.Token() |