Diffstat (limited to 'libgo/go/exp/html/token_test.go')
-rw-r--r--  libgo/go/exp/html/token_test.go | 133
1 file changed, 114 insertions, 19 deletions
diff --git a/libgo/go/exp/html/token_test.go b/libgo/go/exp/html/token_test.go
index 61d7400..63a8bfc 100644
--- a/libgo/go/exp/html/token_test.go
+++ b/libgo/go/exp/html/token_test.go
@@ -7,6 +7,8 @@ package html
 import (
 	"bytes"
 	"io"
+	"io/ioutil"
+	"runtime"
 	"strings"
 	"testing"
 )
@@ -126,7 +128,7 @@ var tokenTests = []tokenTest{
 	{
 		"tag name eof #4",
 		`<a x`,
-		`<a x="">`,
+		``,
 	},
 	// Some malformed tags that are missing a '>'.
 	{
@@ -142,12 +144,12 @@ var tokenTests = []tokenTest{
 	{
 		"malformed tag #2",
 		`<p id`,
-		`<p id="">`,
+		``,
 	},
 	{
 		"malformed tag #3",
 		`<p id=`,
-		`<p id="">`,
+		``,
 	},
 	{
 		"malformed tag #4",
@@ -157,7 +159,7 @@ var tokenTests = []tokenTest{
 	{
 		"malformed tag #5",
 		`<p id=0`,
-		`<p id="0">`,
+		``,
 	},
 	{
 		"malformed tag #6",
@@ -167,13 +169,18 @@ var tokenTests = []tokenTest{
 	{
 		"malformed tag #7",
 		`<p id="0</p>`,
-		`<p id="0&lt;/p&gt;">`,
+		``,
 	},
 	{
 		"malformed tag #8",
 		`<p id="0"</p>`,
 		`<p id="0" <="" p="">`,
 	},
+	{
+		"malformed tag #9",
+		`<p></p id`,
+		`<p>`,
+	},
 	// Raw text and RCDATA.
 	{
 		"basic raw text",
@@ -203,7 +210,7 @@ var tokenTests = []tokenTest{
 	{
 		"' ' completes script end tag",
 		"<SCRIPT>a</SCRipt ",
-		"<script>$a$</script>",
+		"<script>$a",
 	},
 	{
 		"'>' completes script end tag",
@@ -359,7 +366,7 @@ var tokenTests = []tokenTest{
 	{
 		"tricky",
 		"<p \t\n iD=\"a&quot;B\" foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
-		`<p id="a&quot;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
+		`<p id="a&#34;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
 	},
 	// A nonexistent entity. Tokenizing and converting back to a string should
 	// escape the "&" to become "&amp;".
@@ -368,14 +375,11 @@ var tokenTests = []tokenTest{
 	{
 		"nonexistent entity",
 		`<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
 		`<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
 	},
-	/*
-		// TODO: re-enable this test when it works. This input/output matches html5lib's behavior.
-		{
-			"entity without semicolon",
-			`&notit;&notin;<a b="q=z&amp=5&notice=hello&not=world">`,
-			`&notit;&notin;$<a b="q=z&amp;amp=5&amp;notice=hello&not=world">`,
-		},
-	*/
+	{
+		"entity without semicolon",
+		`&notit;&notin;<a b="q=z&amp=5&notice=hello&not=world">`,
+		`&notit;&notin;$<a b="q=z&amp;amp=5&amp;notice=hello&not=world">`,
+	},
 	{
 		"entity with digits",
 		"&frac12;",
@@ -421,7 +425,7 @@ var tokenTests = []tokenTest{
 	{
 		"Double-quoted attribute value",
 		`<input value="I'm an attribute" FOO="BAR">`,
-		`<input value="I&apos;m an attribute" foo="BAR">`,
+		`<input value="I&#39;m an attribute" foo="BAR">`,
 	},
 	{
 		"Attribute name characters",
@@ -436,7 +440,7 @@ var tokenTests = []tokenTest{
 	{
 		"Attributes with a solitary single quote",
 		`<p id=can't><p id=won't>`,
-		`<p id="can&apos;t">$<p id="won&apos;t">`,
+		`<p id="can&#39;t">$<p id="won&#39;t">`,
 	},
 }
 
@@ -545,10 +549,11 @@ func TestUnescapeEscape(t *testing.T) {
 		`"<&>"`,
 		`&quot;&lt;&amp;&gt;&quot;`,
 		`3&5==1 && 0<1, "0<1", a+acute=&aacute;`,
+		`The special characters are: <, >, &, ' and "`,
 	}
 	for _, s := range ss {
-		if s != UnescapeString(EscapeString(s)) {
-			t.Errorf("s != UnescapeString(EscapeString(s)), s=%q", s)
+		if got := UnescapeString(EscapeString(s)); got != s {
+			t.Errorf("got %q want %q", got, s)
 		}
 	}
 }
@@ -588,3 +593,93 @@ loop:
 		t.Errorf("TestBufAPI: want %q got %q", u, v)
 	}
 }
+
+func TestConvertNewlines(t *testing.T) {
+	testCases := map[string]string{
+		"Mac\rDOS\r\nUnix\n":    "Mac\nDOS\nUnix\n",
+		"Unix\nMac\rDOS\r\n":    "Unix\nMac\nDOS\n",
+		"DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n",
+		"":         "",
+		"\n":       "\n",
+		"\n\r":     "\n\n",
+		"\r":       "\n",
+		"\r\n":     "\n",
+		"\r\n\n":   "\n\n",
+		"\r\n\r":   "\n\n",
+		"\r\n\r\n": "\n\n",
+		"\r\r":     "\n\n",
+		"\r\r\n":   "\n\n",
+		"\r\r\n\n": "\n\n\n",
+		"\r\r\r\n": "\n\n\n",
+		"\r \n":    "\n \n",
+		"xyz":      "xyz",
+	}
+	for in, want := range testCases {
+		if got := string(convertNewlines([]byte(in))); got != want {
+			t.Errorf("input %q: got %q, want %q", in, got, want)
+		}
+	}
+}
+
+const (
+	rawLevel = iota
+	lowLevel
+	highLevel
+)
+
+func benchmarkTokenizer(b *testing.B, level int) {
+	buf, err := ioutil.ReadFile("testdata/go1.html")
+	if err != nil {
+		b.Fatalf("could not read testdata/go1.html: %v", err)
+	}
+	b.SetBytes(int64(len(buf)))
+	runtime.GC()
+	var ms runtime.MemStats
+	runtime.ReadMemStats(&ms)
+	mallocs := ms.Mallocs
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		z := NewTokenizer(bytes.NewBuffer(buf))
+		for {
+			tt := z.Next()
+			if tt == ErrorToken {
+				if err := z.Err(); err != nil && err != io.EOF {
+					b.Fatalf("tokenizer error: %v", err)
+				}
+				break
+			}
+			switch level {
+			case rawLevel:
+				// Calling z.Raw just returns the raw bytes of the token. It does
+				// not unescape &lt; to <, or lower-case tag names and attribute keys.
+				z.Raw()
+			case lowLevel:
+				// Calling z.Text, z.TagName and z.TagAttr returns []byte values
+				// whose contents may change on the next call to z.Next.
+				switch tt {
+				case TextToken, CommentToken, DoctypeToken:
+					z.Text()
+				case StartTagToken, SelfClosingTagToken:
+					_, more := z.TagName()
+					for more {
+						_, _, more = z.TagAttr()
+					}
+				case EndTagToken:
+					z.TagName()
+				}
+			case highLevel:
+				// Calling z.Token converts []byte values to strings whose validity
+				// extends beyond the next call to z.Next.
+				z.Token()
+			}
+		}
+	}
+	b.StopTimer()
+	runtime.ReadMemStats(&ms)
+	mallocs = ms.Mallocs - mallocs
+	b.Logf("%d iterations, %d mallocs per iteration\n", b.N, int(mallocs)/b.N)
+}
+
+func BenchmarkRawLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, rawLevel) }
+func BenchmarkLowLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, lowLevel) }
+func BenchmarkHighLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, highLevel) }
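
The rewritten expected outputs in the attribute tests above (&quot; becomes &#34;, &apos; becomes &#39;) track a change in the package's escaping: EscapeString now emits numeric character references for quotes, presumably because &apos; is not defined in HTML 4 and is unsupported by older browsers. A round-trip sketch of the behavior the new TestUnescapeEscape case pins down, written against golang.org/x/net/html, the modern descendant of this exp/html tree (the import path is an assumption; exp/html itself no longer exists):

    package main

    import (
    	"fmt"

    	"golang.org/x/net/html" // successor of the exp/html package in this diff
    )

    func main() {
    	s := `The special characters are: <, >, &, ' and "`
    	e := html.EscapeString(s)
    	// Quotes come back as numeric references:
    	// The special characters are: &lt;, &gt;, &amp;, &#39; and &#34;
    	fmt.Println(e)
    	// Escaping then unescaping is the identity, which is exactly
    	// what TestUnescapeEscape asserts for each sample string.
    	fmt.Println(html.UnescapeString(e) == s) // true
    }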
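TestConvertNewlines pins down the tokenizer's newline normalization: every bare "\r" and every "\r\n" pair becomes a single "\n". convertNewlines itself is unexported, so as a reading aid here is a behaviorally equivalent two-pass sketch; the package's actual implementation rewrites the slice in place rather than allocating:

    package main

    import (
    	"bytes"
    	"fmt"
    )

    // convertNewlines maps "\r\n" and bare "\r" to "\n", matching the
    // test table above. Replacing "\r\n" first guarantees that a CR-LF
    // pair collapses to one newline rather than two.
    func convertNewlines(s []byte) []byte {
    	s = bytes.Replace(s, []byte("\r\n"), []byte("\n"), -1)
    	return bytes.Replace(s, []byte("\r"), []byte("\n"), -1)
    }

    func main() {
    	fmt.Printf("%q\n", convertNewlines([]byte("Mac\rDOS\r\nUnix\n"))) // "Mac\nDOS\nUnix\n"
    }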
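The three benchmarks run the same tokenization loop at increasing levels of work, as the comments in the diff explain: z.Raw is a zero-copy view of the token, the z.Text/z.TagName/z.TagAttr calls return []byte values that are invalidated by the next z.Next, and z.Token copies everything into freshly allocated strings. A minimal sketch contrasting the raw and high levels, again written against golang.org/x/net/html rather than the exp/html path used here:

    package main

    import (
    	"fmt"
    	"io"
    	"strings"

    	"golang.org/x/net/html" // successor of exp/html
    )

    func main() {
    	z := html.NewTokenizer(strings.NewReader(`<p id=A>Hello</p>`))
    	for {
    		tt := z.Next()
    		if tt == html.ErrorToken {
    			if err := z.Err(); err != io.EOF {
    				fmt.Println("tokenize error:", err)
    			}
    			return
    		}
    		// Raw level: the token's bytes exactly as they appeared in the
    		// input, valid only until the next call to z.Next.
    		fmt.Printf("raw:   %q\n", z.Raw())
    		// High level: z.Token allocates strings that stay valid after
    		// z.Next; e.g. `<p id=A>` is canonicalized to `<p id="A">`.
    		fmt.Printf("token: %v\n", z.Token())
    	}
    }

With a current toolchain the benchmarks themselves run with go test -bench=Tokenizer, and the mallocs-per-iteration figure logged by benchmarkTokenizer is what separates the three levels.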