aboutsummaryrefslogtreecommitdiff
path: root/libgo/go/exp/html/token_test.go
diff options
context:
space:
mode:
Diffstat (limited to 'libgo/go/exp/html/token_test.go')
-rw-r--r-- libgo/go/exp/html/token_test.go 133
1 files changed, 114 insertions, 19 deletions
diff --git a/libgo/go/exp/html/token_test.go b/libgo/go/exp/html/token_test.go
index 61d7400..63a8bfc 100644
--- a/libgo/go/exp/html/token_test.go
+++ b/libgo/go/exp/html/token_test.go
@@ -7,6 +7,8 @@ package html
import (
"bytes"
"io"
+ "io/ioutil"
+ "runtime"
"strings"
"testing"
)
@@ -126,7 +128,7 @@ var tokenTests = []tokenTest{
{
"tag name eof #4",
`<a x`,
- `<a x="">`,
+ ``,
},
// Some malformed tags that are missing a '>'.
{
@@ -142,12 +144,12 @@ var tokenTests = []tokenTest{
{
"malformed tag #2",
`<p id`,
- `<p id="">`,
+ ``,
},
{
"malformed tag #3",
`<p id=`,
- `<p id="">`,
+ ``,
},
{
"malformed tag #4",
@@ -157,7 +159,7 @@ var tokenTests = []tokenTest{
{
"malformed tag #5",
`<p id=0`,
- `<p id="0">`,
+ ``,
},
{
"malformed tag #6",
@@ -167,13 +169,18 @@ var tokenTests = []tokenTest{
{
"malformed tag #7",
`<p id="0</p>`,
- `<p id="0&lt;/p&gt;">`,
+ ``,
},
{
"malformed tag #8",
`<p id="0"</p>`,
`<p id="0" <="" p="">`,
},
+ {
+ "malformed tag #9",
+ `<p></p id`,
+ `<p>`,
+ },
// Raw text and RCDATA.
{
"basic raw text",
@@ -203,7 +210,7 @@ var tokenTests = []tokenTest{
{
"' ' completes script end tag",
"<SCRIPT>a</SCRipt ",
- "<script>$a$</script>",
+ "<script>$a",
},
{
"'>' completes script end tag",
@@ -359,7 +366,7 @@ var tokenTests = []tokenTest{
{
"tricky",
"<p \t\n iD=\"a&quot;B\" foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
- `<p id="a&quot;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
+ `<p id="a&#34;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
},
// A nonexistent entity. Tokenizing and converting back to a string should
// escape the "&" to become "&amp;".
@@ -368,14 +375,11 @@ var tokenTests = []tokenTest{
`<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
`<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
},
- /*
- // TODO: re-enable this test when it works. This input/output matches html5lib's behavior.
- {
- "entity without semicolon",
- `&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
- `¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
- },
- */
+ {
+ "entity without semicolon",
+ `&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
+ `¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
+ },
{
"entity with digits",
"&frac12;",
@@ -421,7 +425,7 @@ var tokenTests = []tokenTest{
{
"Double-quoted attribute value",
`<input value="I'm an attribute" FOO="BAR">`,
- `<input value="I&apos;m an attribute" foo="BAR">`,
+ `<input value="I&#39;m an attribute" foo="BAR">`,
},
{
"Attribute name characters",
@@ -436,7 +440,7 @@ var tokenTests = []tokenTest{
{
"Attributes with a solitary single quote",
`<p id=can't><p id=won't>`,
- `<p id="can&apos;t">$<p id="won&apos;t">`,
+ `<p id="can&#39;t">$<p id="won&#39;t">`,
},
}
@@ -545,10 +549,11 @@ func TestUnescapeEscape(t *testing.T) {
`"<&>"`,
`&quot;&lt;&amp;&gt;&quot;`,
`3&5==1 && 0<1, "0&lt;1", a+acute=&aacute;`,
+ `The special characters are: <, >, &, ' and "`,
}
for _, s := range ss {
- if s != UnescapeString(EscapeString(s)) {
- t.Errorf("s != UnescapeString(EscapeString(s)), s=%q", s)
+ if got := UnescapeString(EscapeString(s)); got != s {
+ t.Errorf("got %q want %q", got, s)
}
}
}
@@ -588,3 +593,93 @@ loop:
t.Errorf("TestBufAPI: want %q got %q", u, v)
}
}
+
+func TestConvertNewlines(t *testing.T) {
+ testCases := map[string]string{
+ "Mac\rDOS\r\nUnix\n": "Mac\nDOS\nUnix\n",
+ "Unix\nMac\rDOS\r\n": "Unix\nMac\nDOS\n",
+ "DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n",
+ "": "",
+ "\n": "\n",
+ "\n\r": "\n\n",
+ "\r": "\n",
+ "\r\n": "\n",
+ "\r\n\n": "\n\n",
+ "\r\n\r": "\n\n",
+ "\r\n\r\n": "\n\n",
+ "\r\r": "\n\n",
+ "\r\r\n": "\n\n",
+ "\r\r\n\n": "\n\n\n",
+ "\r\r\r\n": "\n\n\n",
+ "\r \n": "\n \n",
+ "xyz": "xyz",
+ }
+ for in, want := range testCases {
+ if got := string(convertNewlines([]byte(in))); got != want {
+ t.Errorf("input %q: got %q, want %q", in, got, want)
+ }
+ }
+}
+
+const (
+ rawLevel = iota
+ lowLevel
+ highLevel
+)
+
+func benchmarkTokenizer(b *testing.B, level int) {
+ buf, err := ioutil.ReadFile("testdata/go1.html")
+ if err != nil {
+ b.Fatalf("could not read testdata/go1.html: %v", err)
+ }
+ b.SetBytes(int64(len(buf)))
+ runtime.GC()
+ var ms runtime.MemStats
+ runtime.ReadMemStats(&ms)
+ mallocs := ms.Mallocs
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ z := NewTokenizer(bytes.NewBuffer(buf))
+ for {
+ tt := z.Next()
+ if tt == ErrorToken {
+ if err := z.Err(); err != nil && err != io.EOF {
+ b.Fatalf("tokenizer error: %v", err)
+ }
+ break
+ }
+ switch level {
+ case rawLevel:
+ // Calling z.Raw just returns the raw bytes of the token. It does
+ // not unescape &lt; to <, or lower-case tag names and attribute keys.
+ z.Raw()
+ case lowLevel:
+ // Calling z.Text, z.TagName and z.TagAttr returns []byte values
+ // whose contents may change on the next call to z.Next.
+ switch tt {
+ case TextToken, CommentToken, DoctypeToken:
+ z.Text()
+ case StartTagToken, SelfClosingTagToken:
+ _, more := z.TagName()
+ for more {
+ _, _, more = z.TagAttr()
+ }
+ case EndTagToken:
+ z.TagName()
+ }
+ case highLevel:
+ // Calling z.Token converts []byte values to strings whose validity
+ // extends beyond the next call to z.Next.
+ z.Token()
+ }
+ }
+ }
+ b.StopTimer()
+ runtime.ReadMemStats(&ms)
+ mallocs = ms.Mallocs - mallocs
+ b.Logf("%d iterations, %d mallocs per iteration\n", b.N, int(mallocs)/b.N)
+}
+
+func BenchmarkRawLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, rawLevel) }
+func BenchmarkLowLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, lowLevel) }
+func BenchmarkHighLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, highLevel) }