aboutsummaryrefslogtreecommitdiff
path: root/libgo/go/encoding/xml
diff options
context:
space:
mode:
Diffstat (limited to 'libgo/go/encoding/xml')
-rw-r--r--libgo/go/encoding/xml/marshal_test.go21
-rw-r--r--libgo/go/encoding/xml/xml.go243
-rw-r--r--libgo/go/encoding/xml/xml_test.go35
3 files changed, 195 insertions, 104 deletions
diff --git a/libgo/go/encoding/xml/marshal_test.go b/libgo/go/encoding/xml/marshal_test.go
index e729a24..668fea6 100644
--- a/libgo/go/encoding/xml/marshal_test.go
+++ b/libgo/go/encoding/xml/marshal_test.go
@@ -687,6 +687,27 @@ var marshalTests = []struct {
Value: &IgnoreTest{},
UnmarshalOnly: true,
},
+
+ // Test escaping.
+ {
+ ExpectXML: `<a><nested><value>dquote: &#34;; squote: &#39;; ampersand: &amp;; less: &lt;; greater: &gt;;</value></nested></a>`,
+ Value: &AnyTest{
+ Nested: `dquote: "; squote: '; ampersand: &; less: <; greater: >;`,
+ },
+ },
+ {
+ ExpectXML: `<a><nested><value>newline: &#xA;; cr: &#xD;; tab: &#x9;;</value></nested></a>`,
+ Value: &AnyTest{
+ Nested: "newline: \n; cr: \r; tab: \t;",
+ },
+ },
+ {
+ ExpectXML: "<a><nested><value>1\r2\r\n3\n\r4\n5</value></nested></a>",
+ Value: &AnyTest{
+ Nested: "1\n2\n3\n\n4\n5",
+ },
+ UnmarshalOnly: true,
+ },
}
func TestMarshal(t *testing.T) {
diff --git a/libgo/go/encoding/xml/xml.go b/libgo/go/encoding/xml/xml.go
index fbd2208..decb2be 100644
--- a/libgo/go/encoding/xml/xml.go
+++ b/libgo/go/encoding/xml/xml.go
@@ -181,7 +181,6 @@ type Decoder struct {
ns map[string]string
err error
line int
- tmp [32]byte
}
// NewDecoder creates a new XML parser reading from r.
@@ -877,94 +876,103 @@ Input:
// XML in all its glory allows a document to define and use
// its own character names with <!ENTITY ...> directives.
// Parsers are required to recognize lt, gt, amp, apos, and quot
- // even if they have not been declared. That's all we allow.
- var i int
- var semicolon bool
- var valid bool
- for i = 0; i < len(d.tmp); i++ {
- var ok bool
- d.tmp[i], ok = d.getc()
- if !ok {
- if d.err == io.EOF {
- d.err = d.syntaxError("unexpected EOF")
- }
+ // even if they have not been declared.
+ before := d.buf.Len()
+ d.buf.WriteByte('&')
+ var ok bool
+ var text string
+ var haveText bool
+ if b, ok = d.mustgetc(); !ok {
+ return nil
+ }
+ if b == '#' {
+ d.buf.WriteByte(b)
+ if b, ok = d.mustgetc(); !ok {
return nil
}
- c := d.tmp[i]
- if c == ';' {
- semicolon = true
- valid = i > 0
- break
- }
- if 'a' <= c && c <= 'z' ||
- 'A' <= c && c <= 'Z' ||
- '0' <= c && c <= '9' ||
- c == '_' || c == '#' {
- continue
- }
- d.ungetc(c)
- break
- }
- s := string(d.tmp[0:i])
- if !valid {
- if !d.Strict {
- b0, b1 = 0, 0
- d.buf.WriteByte('&')
- d.buf.Write(d.tmp[0:i])
- if semicolon {
- d.buf.WriteByte(';')
+ base := 10
+ if b == 'x' {
+ base = 16
+ d.buf.WriteByte(b)
+ if b, ok = d.mustgetc(); !ok {
+ return nil
}
- continue Input
}
- semi := ";"
- if !semicolon {
- semi = " (no semicolon)"
- }
- if i < len(d.tmp) {
- d.err = d.syntaxError("invalid character entity &" + s + semi)
- } else {
- d.err = d.syntaxError("invalid character entity &" + s + "... too long")
+ start := d.buf.Len()
+ for '0' <= b && b <= '9' ||
+ base == 16 && 'a' <= b && b <= 'f' ||
+ base == 16 && 'A' <= b && b <= 'F' {
+ d.buf.WriteByte(b)
+ if b, ok = d.mustgetc(); !ok {
+ return nil
+ }
}
- return nil
- }
- var haveText bool
- var text string
- if i >= 2 && s[0] == '#' {
- var n uint64
- var err error
- if i >= 3 && s[1] == 'x' {
- n, err = strconv.ParseUint(s[2:], 16, 64)
+ if b != ';' {
+ d.ungetc(b)
} else {
- n, err = strconv.ParseUint(s[1:], 10, 64)
- }
- if err == nil && n <= unicode.MaxRune {
- text = string(n)
- haveText = true
+ s := string(d.buf.Bytes()[start:])
+ d.buf.WriteByte(';')
+ n, err := strconv.ParseUint(s, base, 64)
+ if err == nil && n <= unicode.MaxRune {
+ text = string(rune(n)) // n is a code point: convert through rune, not integer-to-string
+ haveText = true
+ }
}
} else {
- if r, ok := entity[s]; ok {
- text = string(r)
- haveText = true
- } else if d.Entity != nil {
- text, haveText = d.Entity[s]
+ d.ungetc(b)
+ if !d.readName() {
+ if d.err != nil {
+ return nil
+ }
+ ok = false
}
- }
- if !haveText {
- if !d.Strict {
- b0, b1 = 0, 0
- d.buf.WriteByte('&')
- d.buf.Write(d.tmp[0:i])
+ if b, ok = d.mustgetc(); !ok {
+ return nil
+ }
+ if b != ';' {
+ d.ungetc(b)
+ } else {
+ name := d.buf.Bytes()[before+1:]
d.buf.WriteByte(';')
- continue Input
+ if isName(name) {
+ s := string(name)
+ if r, ok := entity[s]; ok {
+ text = string(r)
+ haveText = true
+ } else if d.Entity != nil {
+ text, haveText = d.Entity[s]
+ }
+ }
}
- d.err = d.syntaxError("invalid character entity &" + s + ";")
- return nil
}
- d.buf.Write([]byte(text))
- b0, b1 = 0, 0
- continue Input
+
+ if haveText {
+ d.buf.Truncate(before)
+ d.buf.Write([]byte(text))
+ b0, b1 = 0, 0
+ continue Input
+ }
+ if !d.Strict {
+ b0, b1 = 0, 0
+ continue Input
+ }
+ ent := string(d.buf.Bytes()[before:]) // whole entity text from '&' onward, not just its first byte
+ if ent[len(ent)-1] != ';' {
+ ent += " (no semicolon)"
+ }
+ d.err = d.syntaxError("invalid character entity " + ent)
+ return nil
}
- d.buf.WriteByte(b)
+
+ // We must rewrite unescaped \r and \r\n into \n.
+ if b == '\r' {
+ d.buf.WriteByte('\n')
+ } else if b1 == '\r' && b == '\n' {
+ // Skip \r\n--we already wrote \n.
+ } else {
+ d.buf.WriteByte(b)
+ }
+
b0, b1 = b1, b
}
data := d.buf.Bytes()
@@ -985,20 +993,7 @@ Input:
}
}
- // Must rewrite \r and \r\n into \n.
- w := 0
- for r := 0; r < len(data); r++ {
- b := data[r]
- if b == '\r' {
- if r+1 < len(data) && data[r+1] == '\n' {
- continue
- }
- b = '\n'
- }
- data[w] = b
- w++
- }
- return data[0:w]
+ return data
}
// Decide whether the given rune is in the XML Character Range, per
@@ -1034,18 +1029,34 @@ func (d *Decoder) nsname() (name Name, ok bool) {
// Do not set d.err if the name is missing (unless unexpected EOF is received):
// let the caller provide better context.
func (d *Decoder) name() (s string, ok bool) {
+ d.buf.Reset()
+ if !d.readName() {
+ return "", false
+ }
+
+ // Now we check the characters.
+ s = d.buf.String()
+ if !isName([]byte(s)) {
+ d.err = d.syntaxError("invalid XML name: " + s)
+ return "", false
+ }
+ return s, true
+}
+
+// Read a name and append its bytes to d.buf.
+// The name is delimited by any single-byte character not valid in names.
+// All multi-byte characters are accepted; the caller must check their validity.
+func (d *Decoder) readName() (ok bool) {
var b byte
if b, ok = d.mustgetc(); !ok {
return
}
-
- // As a first approximation, we gather the bytes [A-Za-z_:.-\x80-\xFF]*
if b < utf8.RuneSelf && !isNameByte(b) {
d.ungetc(b)
- return "", false
+ return false
}
- d.buf.Reset()
d.buf.WriteByte(b)
+
for {
if b, ok = d.mustgetc(); !ok {
return
@@ -1056,16 +1067,7 @@ func (d *Decoder) name() (s string, ok bool) {
}
d.buf.WriteByte(b)
}
-
- // Then we check the characters.
- s = d.buf.String()
- for i, c := range s {
- if !unicode.Is(first, c) && (i == 0 || !unicode.Is(second, c)) {
- d.err = d.syntaxError("invalid XML name: " + s)
- return "", false
- }
- }
- return s, true
+ return true
}
func isNameByte(c byte) bool {
@@ -1075,6 +1077,30 @@ func isNameByte(c byte) bool {
c == '_' || c == ':' || c == '.' || c == '-'
}
+func isName(s []byte) bool {
+ if len(s) == 0 {
+ return false
+ }
+ c, n := utf8.DecodeRune(s)
+ if c == utf8.RuneError && n == 1 {
+ return false
+ }
+ if !unicode.Is(first, c) {
+ return false
+ }
+ for n < len(s) {
+ s = s[n:]
+ c, n = utf8.DecodeRune(s)
+ if c == utf8.RuneError && n == 1 {
+ return false
+ }
+ if !unicode.Is(first, c) && !unicode.Is(second, c) {
+ return false
+ }
+ }
+ return true
+}
+
// These tables were generated by cut and paste from Appendix B of
// the XML spec at http://www.xml.com/axml/testaxml.htm
// and then reformatting. First corresponds to (Letter | '_' | ':')
@@ -1689,6 +1715,9 @@ var (
esc_amp = []byte("&amp;")
esc_lt = []byte("&lt;")
esc_gt = []byte("&gt;")
+ esc_tab = []byte("&#x9;")
+ esc_nl = []byte("&#xA;")
+ esc_cr = []byte("&#xD;")
)
// Escape writes to w the properly escaped XML equivalent
@@ -1708,6 +1737,12 @@ func Escape(w io.Writer, s []byte) {
esc = esc_lt
case '>':
esc = esc_gt
+ case '\t':
+ esc = esc_tab
+ case '\n':
+ esc = esc_nl
+ case '\r':
+ esc = esc_cr
default:
continue
}
diff --git a/libgo/go/encoding/xml/xml_test.go b/libgo/go/encoding/xml/xml_test.go
index 2ad4d4a..981d352 100644
--- a/libgo/go/encoding/xml/xml_test.go
+++ b/libgo/go/encoding/xml/xml_test.go
@@ -19,6 +19,7 @@ const testInput = `
<body xmlns:foo="ns1" xmlns="ns2" xmlns:tag="ns3" ` +
"\r\n\t" + ` >
<hello lang="en">World &lt;&gt;&apos;&quot; &#x767d;&#40300;翔</hello>
+ <query>&何; &is-it;</query>
<goodbye />
<outer foo:attr="value" xmlns:tag="ns4">
<inner/>
@@ -28,6 +29,8 @@ const testInput = `
</tag:name>
</body><!-- missing final newline -->`
+var testEntity = map[string]string{"何": "What", "is-it": "is it?"}
+
var rawTokens = []Token{
CharData("\n"),
ProcInst{"xml", []byte(`version="1.0" encoding="UTF-8"`)},
@@ -41,6 +44,10 @@ var rawTokens = []Token{
CharData("World <>'\" 白鵬翔"),
EndElement{Name{"", "hello"}},
CharData("\n "),
+ StartElement{Name{"", "query"}, []Attr{}},
+ CharData("What is it?"),
+ EndElement{Name{"", "query"}},
+ CharData("\n "),
StartElement{Name{"", "goodbye"}, []Attr{}},
EndElement{Name{"", "goodbye"}},
CharData("\n "),
@@ -74,6 +81,10 @@ var cookedTokens = []Token{
CharData("World <>'\" 白鵬翔"),
EndElement{Name{"ns2", "hello"}},
CharData("\n "),
+ StartElement{Name{"ns2", "query"}, []Attr{}},
+ CharData("What is it?"),
+ EndElement{Name{"ns2", "query"}},
+ CharData("\n "),
StartElement{Name{"ns2", "goodbye"}, []Attr{}},
EndElement{Name{"ns2", "goodbye"}},
CharData("\n "),
@@ -156,6 +167,7 @@ var xmlInput = []string{
func TestRawToken(t *testing.T) {
d := NewDecoder(strings.NewReader(testInput))
+ d.Entity = testEntity
testRawToken(t, d, rawTokens)
}
@@ -164,8 +176,14 @@ const nonStrictInput = `
<tag>&unknown;entity</tag>
<tag>&#123</tag>
<tag>&#zzz;</tag>
+<tag>&なまえ3;</tag>
+<tag>&lt-gt;</tag>
+<tag>&;</tag>
+<tag>&0a;</tag>
`
+var nonStringEntity = map[string]string{"": "oops!", "0a": "oops!"}
+
var nonStrictTokens = []Token{
CharData("\n"),
StartElement{Name{"", "tag"}, []Attr{}},
@@ -184,6 +202,22 @@ var nonStrictTokens = []Token{
CharData("&#zzz;"),
EndElement{Name{"", "tag"}},
CharData("\n"),
+ StartElement{Name{"", "tag"}, []Attr{}},
+ CharData("&なまえ3;"),
+ EndElement{Name{"", "tag"}},
+ CharData("\n"),
+ StartElement{Name{"", "tag"}, []Attr{}},
+ CharData("&lt-gt;"),
+ EndElement{Name{"", "tag"}},
+ CharData("\n"),
+ StartElement{Name{"", "tag"}, []Attr{}},
+ CharData("&;"),
+ EndElement{Name{"", "tag"}},
+ CharData("\n"),
+ StartElement{Name{"", "tag"}, []Attr{}},
+ CharData("&0a;"),
+ EndElement{Name{"", "tag"}},
+ CharData("\n"),
}
func TestNonStrictRawToken(t *testing.T) {
@@ -317,6 +351,7 @@ func TestNestedDirectives(t *testing.T) {
func TestToken(t *testing.T) {
d := NewDecoder(strings.NewReader(testInput))
+ d.Entity = testEntity
for i, want := range cookedTokens {
have, err := d.Token()