Diffstat (limited to 'libgo/go/exp/html/token_test.go')
-rw-r--r--  libgo/go/exp/html/token_test.go | 133
1 file changed, 114 insertions, 19 deletions
diff --git a/libgo/go/exp/html/token_test.go b/libgo/go/exp/html/token_test.go
index 61d7400..63a8bfc 100644
--- a/libgo/go/exp/html/token_test.go
+++ b/libgo/go/exp/html/token_test.go
@@ -7,6 +7,8 @@ package html
 import (
 	"bytes"
 	"io"
+	"io/ioutil"
+	"runtime"
 	"strings"
 	"testing"
 )
@@ -126,7 +128,7 @@ var tokenTests = []tokenTest{
 	{
 		"tag name eof #4",
 		`<a x`,
-		`<a x="">`,
+		``,
 	},
 	// Some malformed tags that are missing a '>'.
 	{
@@ -142,12 +144,12 @@ var tokenTests = []tokenTest{
 	{
 		"malformed tag #2",
 		`<p id`,
-		`<p id="">`,
+		``,
 	},
 	{
 		"malformed tag #3",
 		`<p id=`,
-		`<p id="">`,
+		``,
 	},
 	{
 		"malformed tag #4",
@@ -157,7 +159,7 @@ var tokenTests = []tokenTest{
 	{
 		"malformed tag #5",
 		`<p id=0`,
-		`<p id="0">`,
+		``,
 	},
 	{
 		"malformed tag #6",
@@ -167,13 +169,18 @@ var tokenTests = []tokenTest{
 	{
 		"malformed tag #7",
 		`<p id="0</p>`,
-		`<p id="0&lt;/p&gt;">`,
+		``,
 	},
 	{
 		"malformed tag #8",
 		`<p id="0"</p>`,
 		`<p id="0" <="" p="">`,
 	},
+	{
+		"malformed tag #9",
+		`<p></p id`,
+		`<p>`,
+	},
 	// Raw text and RCDATA.
 	{
 		"basic raw text",
@@ -203,7 +210,7 @@ var tokenTests = []tokenTest{
 	{
 		"' ' completes script end tag",
 		"<SCRIPT>a</SCRipt ",
-		"<script>$a$</script>",
+		"<script>$a",
 	},
 	{
 		"'>' completes script end tag",
@@ -359,7 +366,7 @@ var tokenTests = []tokenTest{
 	{
 		"tricky",
 		"<p \t\n iD=\"a&quot;B\" foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
-		`<p id="a&quot;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
+		`<p id="a&#34;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
 	},
 	// A nonexistent entity. Tokenizing and converting back to a string should
 	// escape the "&" to become "&amp;".
@@ -368,14 +375,11 @@ var tokenTests = []tokenTest{
 	{
 		"nonexistent entity",
 		`<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
 		`<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
 	},
-	/*
-		// TODO: re-enable this test when it works. This input/output matches html5lib's behavior.
-		{
-			"entity without semicolon",
-			`&notit;&notin;<a b="q=z&amp=5&notice=hello&not=world">`,
-			`&notit;&notin;$<a b="q=z&amp;amp=5&amp;notice=hello&not=world">`,
-		},
-	*/
+	{
+		"entity without semicolon",
+		`&notit;&notin;<a b="q=z&amp=5&notice=hello&not=world">`,
+		`&notit;&notin;$<a b="q=z&amp;amp=5&amp;notice=hello&not=world">`,
+	},
 	{
 		"entity with digits",
 		"&frac12;",
@@ -421,7 +425,7 @@ var tokenTests = []tokenTest{
 	{
 		"Double-quoted attribute value",
 		`<input value="I'm an attribute" FOO="BAR">`,
-		`<input value="I&apos;m an attribute" foo="BAR">`,
+		`<input value="I&#39;m an attribute" foo="BAR">`,
 	},
 	{
 		"Attribute name characters",
@@ -436,7 +440,7 @@ var tokenTests = []tokenTest{
 	{
 		"Attributes with a solitary single quote",
 		`<p id=can't><p id=won't>`,
-		`<p id="can&apos;t">$<p id="won&apos;t">`,
+		`<p id="can&#39;t">$<p id="won&#39;t">`,
 	},
 }
 
@@ -545,10 +549,11 @@ func TestUnescapeEscape(t *testing.T) {
 		`"<&>"`,
 		`&quot;&lt;&amp;&gt;&quot;`,
 		`3&5==1 && 0<1, "0<1", a+acute=&aacute;`,
+		`The special characters are: <, >, &, ' and "`,
 	}
 	for _, s := range ss {
-		if s != UnescapeString(EscapeString(s)) {
-			t.Errorf("s != UnescapeString(EscapeString(s)), s=%q", s)
+		if got := UnescapeString(EscapeString(s)); got != s {
+			t.Errorf("got %q want %q", got, s)
 		}
 	}
 }
@@ -588,3 +593,93 @@ loop:
 		t.Errorf("TestBufAPI: want %q got %q", u, v)
 	}
 }
+
+func TestConvertNewlines(t *testing.T) {
+	testCases := map[string]string{
+		"Mac\rDOS\r\nUnix\n":    "Mac\nDOS\nUnix\n",
+		"Unix\nMac\rDOS\r\n":    "Unix\nMac\nDOS\n",
+		"DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n",
+		"":         "",
+		"\n":       "\n",
+		"\n\r":     "\n\n",
+		"\r":       "\n",
+		"\r\n":     "\n",
+		"\r\n\n":   "\n\n",
+		"\r\n\r":   "\n\n",
+		"\r\n\r\n": "\n\n",
+		"\r\r":     "\n\n",
+		"\r\r\n":   "\n\n",
+		"\r\r\n\n": "\n\n\n",
+		"\r\r\r\n": "\n\n\n",
+		"\r \n":    "\n \n",
+		"xyz":      "xyz",
+	}
+	for in, want := range testCases {
+		if got := string(convertNewlines([]byte(in))); got != want {
+			t.Errorf("input %q: got %q, want %q", in, got, want)
+		}
+	}
+}
+
+const (
+	rawLevel = iota
+	lowLevel
+	highLevel
+)
+
+func benchmarkTokenizer(b *testing.B, level int) {
+	buf, err := ioutil.ReadFile("testdata/go1.html")
+	if err != nil {
+		b.Fatalf("could not read testdata/go1.html: %v", err)
+	}
+	b.SetBytes(int64(len(buf)))
+	runtime.GC()
+	var ms runtime.MemStats
+	runtime.ReadMemStats(&ms)
+	mallocs := ms.Mallocs
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		z := NewTokenizer(bytes.NewBuffer(buf))
+		for {
+			tt := z.Next()
+			if tt == ErrorToken {
+				if err := z.Err(); err != nil && err != io.EOF {
+					b.Fatalf("tokenizer error: %v", err)
+				}
+				break
+			}
+			switch level {
+			case rawLevel:
+				// Calling z.Raw just returns the raw bytes of the token. It does
+				// not unescape &lt; to <, or lower-case tag names and attribute keys.
+				z.Raw()
+			case lowLevel:
+				// Calling z.Text, z.TagName and z.TagAttr returns []byte values
+				// whose contents may change on the next call to z.Next.
+				switch tt {
+				case TextToken, CommentToken, DoctypeToken:
+					z.Text()
+				case StartTagToken, SelfClosingTagToken:
+					_, more := z.TagName()
+					for more {
+						_, _, more = z.TagAttr()
+					}
+				case EndTagToken:
+					z.TagName()
+				}
+			case highLevel:
+				// Calling z.Token converts []byte values to strings whose validity
+				// extends beyond the next call to z.Next.
+				z.Token()
+			}
+		}
+	}
+	b.StopTimer()
+	runtime.ReadMemStats(&ms)
+	mallocs = ms.Mallocs - mallocs
+	b.Logf("%d iterations, %d mallocs per iteration\n", b.N, int(mallocs)/b.N)
+}
+
+func BenchmarkRawLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, rawLevel) }
+func BenchmarkLowLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, lowLevel) }
+func BenchmarkHighLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, highLevel) }
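
The rewritten expected outputs in the attribute tests above (&quot; becomes &#34;, &apos; becomes &#39;) track a change in the package's escaping: EscapeString now emits numeric character references for quotes, presumably because &apos; is not defined in HTML 4 and is unsupported by older browsers. A round-trip sketch of the behavior the new TestUnescapeEscape case pins down, written against golang.org/x/net/html, the modern descendant of this exp/html tree (the import path is an assumption; exp/html itself no longer exists):

    package main

    import (
    	"fmt"

    	"golang.org/x/net/html" // successor of the exp/html package in this diff
    )

    func main() {
    	s := `The special characters are: <, >, &, ' and "`
    	e := html.EscapeString(s)
    	// Quotes come back as numeric references:
    	// The special characters are: &lt;, &gt;, &amp;, &#39; and &#34;
    	fmt.Println(e)
    	// Escaping then unescaping is the identity, which is exactly
    	// what TestUnescapeEscape asserts for each sample string.
    	fmt.Println(html.UnescapeString(e) == s) // true
    }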
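TestConvertNewlines pins down the tokenizer's newline normalization: every bare "\r" and every "\r\n" pair becomes a single "\n". convertNewlines itself is unexported, so as a reading aid here is a behaviorally equivalent two-pass sketch; the package's actual implementation rewrites the slice in place rather than allocating:

    package main

    import (
    	"bytes"
    	"fmt"
    )

    // convertNewlines maps "\r\n" and bare "\r" to "\n", matching the
    // test table above. Replacing "\r\n" first guarantees that a CR-LF
    // pair collapses to one newline rather than two.
    func convertNewlines(s []byte) []byte {
    	s = bytes.Replace(s, []byte("\r\n"), []byte("\n"), -1)
    	return bytes.Replace(s, []byte("\r"), []byte("\n"), -1)
    }

    func main() {
    	fmt.Printf("%q\n", convertNewlines([]byte("Mac\rDOS\r\nUnix\n"))) // "Mac\nDOS\nUnix\n"
    }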
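The three benchmarks run the same tokenization loop at increasing levels of work, as the comments in the diff explain: z.Raw is a zero-copy view of the token, the z.Text/z.TagName/z.TagAttr calls return []byte values that are invalidated by the next z.Next, and z.Token copies everything into freshly allocated strings. A minimal sketch contrasting the raw and high levels, again written against golang.org/x/net/html rather than the exp/html path used here:

    package main

    import (
    	"fmt"
    	"io"
    	"strings"

    	"golang.org/x/net/html" // successor of exp/html
    )

    func main() {
    	z := html.NewTokenizer(strings.NewReader(`<p id=A>Hello</p>`))
    	for {
    		tt := z.Next()
    		if tt == html.ErrorToken {
    			if err := z.Err(); err != io.EOF {
    				fmt.Println("tokenize error:", err)
    			}
    			return
    		}
    		// Raw level: the token's bytes exactly as they appeared in the
    		// input, valid only until the next call to z.Next.
    		fmt.Printf("raw:   %q\n", z.Raw())
    		// High level: z.Token allocates strings that stay valid after
    		// z.Next; e.g. `<p id=A>` is canonicalized to `<p id="A">`.
    		fmt.Printf("token: %v\n", z.Token())
    	}
    }

With a current toolchain the benchmarks themselves run with go test -bench=Tokenizer, and the mallocs-per-iteration figure logged by benchmarkTokenizer is what separates the three levels.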