diff options
Diffstat (limited to 'libgo/go/html')
25 files changed, 1888 insertions, 446 deletions
diff --git a/libgo/go/html/const.go b/libgo/go/html/const.go new file mode 100644 index 0000000..9078d26 --- /dev/null +++ b/libgo/go/html/const.go @@ -0,0 +1,90 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package html + +// Section 11.2.3.2 of the HTML5 specification says "The following elements +// have varying levels of special parsing rules". +// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#the-stack-of-open-elements +var isSpecialElement = map[string]bool{ + "address": true, + "applet": true, + "area": true, + "article": true, + "aside": true, + "base": true, + "basefont": true, + "bgsound": true, + "blockquote": true, + "body": true, + "br": true, + "button": true, + "caption": true, + "center": true, + "col": true, + "colgroup": true, + "command": true, + "dd": true, + "details": true, + "dir": true, + "div": true, + "dl": true, + "dt": true, + "embed": true, + "fieldset": true, + "figcaption": true, + "figure": true, + "footer": true, + "form": true, + "frame": true, + "frameset": true, + "h1": true, + "h2": true, + "h3": true, + "h4": true, + "h5": true, + "h6": true, + "head": true, + "header": true, + "hgroup": true, + "hr": true, + "html": true, + "iframe": true, + "img": true, + "input": true, + "isindex": true, + "li": true, + "link": true, + "listing": true, + "marquee": true, + "menu": true, + "meta": true, + "nav": true, + "noembed": true, + "noframes": true, + "noscript": true, + "object": true, + "ol": true, + "p": true, + "param": true, + "plaintext": true, + "pre": true, + "script": true, + "section": true, + "select": true, + "style": true, + "summary": true, + "table": true, + "tbody": true, + "td": true, + "textarea": true, + "tfoot": true, + "th": true, + "thead": true, + "title": true, + "tr": true, + "ul": true, + "wbr": true, + "xmp": true, +} diff --git a/libgo/go/html/doc.go 
b/libgo/go/html/doc.go index 55135c3..5bc0630 100644 --- a/libgo/go/html/doc.go +++ b/libgo/go/html/doc.go @@ -4,6 +4,7 @@ /* Package html implements an HTML5-compliant tokenizer and parser. +INCOMPLETE. Tokenization is done by creating a Tokenizer for an io.Reader r. It is the caller's responsibility to ensure that r provides UTF-8 encoded HTML. diff --git a/libgo/go/html/entity.go b/libgo/go/html/entity.go index 1530290..21263e2 100644 --- a/libgo/go/html/entity.go +++ b/libgo/go/html/entity.go @@ -4,6 +4,9 @@ package html +// All entities that do not end with ';' are 6 or fewer bytes long. +const longestEntityWithoutSemicolon = 6 + // entity is a map from HTML entity names to their values. The semicolon matters: // http://www.whatwg.org/specs/web-apps/current-work/multipage/named-character-references.html // lists both "amp" and "amp;" as two separate entries. diff --git a/libgo/go/html/entity_test.go b/libgo/go/html/entity_test.go index a1eb4d4..2cf49d6 100644 --- a/libgo/go/html/entity_test.go +++ b/libgo/go/html/entity_test.go @@ -17,6 +17,9 @@ func TestEntityLength(t *testing.T) { if 1+len(k) < utf8.RuneLen(v) { t.Error("escaped entity &" + k + " is shorter than its UTF-8 encoding " + string(v)) } + if len(k) > longestEntityWithoutSemicolon && k[len(k)-1] != ';' { + t.Errorf("entity name %s is %d characters, but longestEntityWithoutSemicolon=%d", k, len(k), longestEntityWithoutSemicolon) + } } for k, v := range entity2 { if 1+len(k) < utf8.RuneLen(v[0])+utf8.RuneLen(v[1]) { diff --git a/libgo/go/html/escape.go b/libgo/go/html/escape.go index 2799f69..0de97c5 100644 --- a/libgo/go/html/escape.go +++ b/libgo/go/html/escape.go @@ -53,7 +53,8 @@ var replacementTable = [...]int{ // unescapeEntity reads an entity like "&lt;" from b[src:] and writes the // corresponding "<" to b[dst:], returning the incremented dst and src cursors. // Precondition: b[src] == '&' && dst <= src. 
-func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) { +// attribute should be true if parsing an attribute value. +func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) { // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference // i starts at 1 because we already know that s[0] == '&'. @@ -121,12 +122,11 @@ func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) { // Consume the maximum number of characters possible, with the // consumed characters matching one of the named references. - // TODO(nigeltao): unescape("&notit;") should be "¬it;" for i < len(s) { c := s[i] i++ // Lower-cased characters are more common in entities, so we check for them first. - if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' { + if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' { continue } if c != ';' { @@ -136,11 +136,25 @@ func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) { } entityName := string(s[1:i]) - if x := entity[entityName]; x != 0 { + if entityName == "" { + // No-op. + } else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' { + // No-op. + } else if x := entity[entityName]; x != 0 { return dst + utf8.EncodeRune(b[dst:], x), src + i - } else if x := entity2[entityName]; x[0] != 0 { // Check if it's a two-character entity. 
+ } else if x := entity2[entityName]; x[0] != 0 { dst1 := dst + utf8.EncodeRune(b[dst:], x[0]) return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i + } else if !attribute { + maxLen := len(entityName) - 1 + if maxLen > longestEntityWithoutSemicolon { + maxLen = longestEntityWithoutSemicolon + } + for j := maxLen; j > 1; j-- { + if x := entity[entityName[:j]]; x != 0 { + return dst + utf8.EncodeRune(b[dst:], x), src + j + 1 + } + } } dst1, src1 = dst+i, src+i @@ -152,11 +166,11 @@ func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) { func unescape(b []byte) []byte { for i, c := range b { if c == '&' { - dst, src := unescapeEntity(b, i, i) + dst, src := unescapeEntity(b, i, i, false) for src < len(b) { c := b[src] if c == '&' { - dst, src = unescapeEntity(b, dst, src) + dst, src = unescapeEntity(b, dst, src, false) } else { b[dst] = c dst, src = dst+1, src+1 diff --git a/libgo/go/html/node.go b/libgo/go/html/node.go new file mode 100644 index 0000000..4ecfd6c --- /dev/null +++ b/libgo/go/html/node.go @@ -0,0 +1,147 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package html + +// A NodeType is the type of a Node. +type NodeType int + +const ( + ErrorNode NodeType = iota + TextNode + DocumentNode + ElementNode + CommentNode + DoctypeNode + scopeMarkerNode +) + +// Section 11.2.3.3 says "scope markers are inserted when entering applet +// elements, buttons, object elements, marquees, table cells, and table +// captions, and are used to prevent formatting from 'leaking'". +var scopeMarker = Node{Type: scopeMarkerNode} + +// A Node consists of a NodeType and some Data (tag name for element nodes, +// content for text) and are part of a tree of Nodes. Element nodes may also +// contain a slice of Attributes. Data is unescaped, so that it looks like +// "a<b" rather than "a&lt;b". 
+type Node struct { + Parent *Node + Child []*Node + Type NodeType + Data string + Attr []Attribute +} + +// Add adds a node as a child of n. +// It will panic if the child's parent is not nil. +func (n *Node) Add(child *Node) { + if child.Parent != nil { + panic("html: Node.Add called for a child Node that already has a parent") + } + child.Parent = n + n.Child = append(n.Child, child) +} + +// Remove removes a node as a child of n. +// It will panic if the child's parent is not n. +func (n *Node) Remove(child *Node) { + if child.Parent == n { + child.Parent = nil + for i, m := range n.Child { + if m == child { + copy(n.Child[i:], n.Child[i+1:]) + j := len(n.Child) - 1 + n.Child[j] = nil + n.Child = n.Child[:j] + return + } + } + } + panic("html: Node.Remove called for a non-child Node") +} + +// reparentChildren reparents all of src's child nodes to dst. +func reparentChildren(dst, src *Node) { + for _, n := range src.Child { + if n.Parent != src { + panic("html: nodes have an inconsistent parent/child relationship") + } + n.Parent = dst + } + dst.Child = append(dst.Child, src.Child...) + src.Child = nil +} + +// clone returns a new node with the same type, data and attributes. +// The clone has no parent and no children. +func (n *Node) clone() *Node { + m := &Node{ + Type: n.Type, + Data: n.Data, + Attr: make([]Attribute, len(n.Attr)), + } + copy(m.Attr, n.Attr) + return m +} + +// nodeStack is a stack of nodes. +type nodeStack []*Node + +// pop pops the stack. It will panic if s is empty. +func (s *nodeStack) pop() *Node { + i := len(*s) + n := (*s)[i-1] + *s = (*s)[:i-1] + return n +} + +// top returns the most recently pushed node, or nil if s is empty. +func (s *nodeStack) top() *Node { + if i := len(*s); i > 0 { + return (*s)[i-1] + } + return nil +} + +// index returns the index of the top-most occurrence of n in the stack, or -1 +// if n is not present. 
+func (s *nodeStack) index(n *Node) int { + for i := len(*s) - 1; i >= 0; i-- { + if (*s)[i] == n { + return i + } + } + return -1 +} + +// insert inserts a node at the given index. +func (s *nodeStack) insert(i int, n *Node) { + (*s) = append(*s, nil) + copy((*s)[i+1:], (*s)[i:]) + (*s)[i] = n +} + +// remove removes a node from the stack. It is a no-op if n is not present. +func (s *nodeStack) remove(n *Node) { + i := s.index(n) + if i == -1 { + return + } + copy((*s)[i:], (*s)[i+1:]) + j := len(*s) - 1 + (*s)[j] = nil + *s = (*s)[:j] +} + +// forTag returns the top-most element node with the given tag. +func (s *nodeStack) forTag(tag string) *Node { + for i := len(*s) - 1; i >= 0; i-- { + n := (*s)[i] + if n.Type == ElementNode && n.Data == tag { + return n + } + } + return nil +} diff --git a/libgo/go/html/parse.go b/libgo/go/html/parse.go index 2ef90a8..519ebe5 100644 --- a/libgo/go/html/parse.go +++ b/libgo/go/html/parse.go @@ -9,29 +9,6 @@ import ( "os" ) -// A NodeType is the type of a Node. -type NodeType int - -const ( - ErrorNode NodeType = iota - TextNode - DocumentNode - ElementNode - CommentNode -) - -// A Node consists of a NodeType and some Data (tag name for element nodes, -// content for text) and are part of a tree of Nodes. Element nodes may also -// contain a slice of Attributes. Data is unescaped, so that it looks like -// "a<b" rather than "a&lt;b". -type Node struct { - Parent *Node - Child []*Node - Type NodeType - Data string - Attr []Attribute -} - // A parser implements the HTML5 parsing algorithm: // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#tree-construction type parser struct { @@ -45,38 +22,23 @@ type parser struct { hasSelfClosingToken bool // doc is the document root element. doc *Node - // The stack of open elements (section 10.2.3.2). - stack []*Node - // Element pointers (section 10.2.3.4). + // The stack of open elements (section 11.2.3.2) and active formatting + // elements (section 11.2.3.3). 
+ oe, afe nodeStack + // Element pointers (section 11.2.3.4). head, form *Node - // Other parsing state flags (section 10.2.3.5). + // Other parsing state flags (section 11.2.3.5). scripting, framesetOK bool } -// push pushes onto the stack of open elements. -func (p *parser) push(n *Node) { - p.stack = append(p.stack, n) -} - -// top returns the top of the stack of open elements. -// This is also known as the current node. func (p *parser) top() *Node { - if n := len(p.stack); n > 0 { - return p.stack[n-1] + if n := p.oe.top(); n != nil { + return n } return p.doc } -// pop pops the top of the stack of open elements. -// It will panic if the stack is empty. -func (p *parser) pop() *Node { - n := len(p.stack) - ret := p.stack[n-1] - p.stack = p.stack[:n-1] - return ret -} - -// stopTags for use in popUntil. These come from section 10.2.3.2. +// stopTags for use in popUntil. These come from section 11.2.3.2. var ( defaultScopeStopTags = []string{"applet", "caption", "html", "table", "td", "th", "marquee", "object"} listItemScopeStopTags = []string{"applet", "caption", "html", "table", "td", "th", "marquee", "object", "ol", "ul"} @@ -102,11 +64,11 @@ var ( // popUntil([]string{"html, "table"}, "table") would return true and leave: // ["html", "body", "font"] func (p *parser) popUntil(stopTags []string, matchTags ...string) bool { - for i := len(p.stack) - 1; i >= 0; i-- { - tag := p.stack[i].Data + for i := len(p.oe) - 1; i >= 0; i-- { + tag := p.oe[i].Data for _, t := range matchTags { if t == tag { - p.stack = p.stack[:i] + p.oe = p.oe[:i] return true } } @@ -119,20 +81,24 @@ func (p *parser) popUntil(stopTags []string, matchTags ...string) bool { return false } -// addChild adds a child node n to the top element, and pushes n if it is an -// element node (text nodes are not part of the stack of open elements). +// addChild adds a child node n to the top element, and pushes n onto the stack +// of open elements if it is an element node. 
func (p *parser) addChild(n *Node) { - m := p.top() - m.Child = append(m.Child, n) + p.top().Add(n) if n.Type == ElementNode { - p.push(n) + p.oe = append(p.oe, n) } } -// addText calls addChild with a text node. +// addText adds text to the preceding node if it is a text node, or else it +// calls addChild with a new text node. func (p *parser) addText(text string) { - // TODO: merge s with previous text, if the preceding node is a text node. // TODO: distinguish whitespace text from others. + t := p.top() + if i := len(t.Child); i > 0 && t.Child[i-1].Type == TextNode { + t.Child[i-1].Data += text + return + } p.addChild(&Node{ Type: TextNode, Data: text, @@ -148,15 +114,50 @@ func (p *parser) addElement(tag string, attr []Attribute) { }) } -// Section 10.2.3.3. +// Section 11.2.3.3. func (p *parser) addFormattingElement(tag string, attr []Attribute) { p.addElement(tag, attr) + p.afe = append(p.afe, p.top()) // TODO. } -// Section 10.2.3.3. +// Section 11.2.3.3. +func (p *parser) clearActiveFormattingElements() { + for { + n := p.afe.pop() + if len(p.afe) == 0 || n.Type == scopeMarkerNode { + return + } + } +} + +// Section 11.2.3.3. func (p *parser) reconstructActiveFormattingElements() { - // TODO. + n := p.afe.top() + if n == nil { + return + } + if n.Type == scopeMarkerNode || p.oe.index(n) != -1 { + return + } + i := len(p.afe) - 1 + for n.Type != scopeMarkerNode && p.oe.index(n) == -1 { + if i == 0 { + i = -1 + break + } + i-- + n = p.afe[i] + } + for { + i++ + n = p.afe[i] + p.addChild(n.clone()) + p.afe[i] = n + if i == len(p.afe)-1 { + break + } + } } // read reads the next token. This is usually from the tokenizer, but it may @@ -180,12 +181,12 @@ func (p *parser) read() os.Error { return nil } -// Section 10.2.4. +// Section 11.2.4. 
func (p *parser) acknowledgeSelfClosingTag() { p.hasSelfClosingToken = false } -// An insertion mode (section 10.2.3.1) is the state transition function from +// An insertion mode (section 11.2.3.1) is the state transition function from // a particular state in the HTML5 parser's state machine. It updates the // parser's fields depending on parser.token (where ErrorToken means EOF). In // addition to returning the next insertionMode state, it also returns whether @@ -194,7 +195,7 @@ type insertionMode func(*parser) (insertionMode, bool) // useTheRulesFor runs the delegate insertionMode over p, returning the actual // insertionMode unless the delegate caused a state transition. -// Section 10.2.3.1, "using the rules for". +// Section 11.2.3.1, "using the rules for". func useTheRulesFor(p *parser, actual, delegate insertionMode) (insertionMode, bool) { im, consumed := delegate(p) if im != delegate { @@ -203,13 +204,21 @@ func useTheRulesFor(p *parser, actual, delegate insertionMode) (insertionMode, b return actual, consumed } -// Section 10.2.5.4. +// Section 11.2.5.4.1. func initialIM(p *parser) (insertionMode, bool) { - // TODO: check p.tok for DOCTYPE. + if p.tok.Type == DoctypeToken { + p.addChild(&Node{ + Type: DoctypeNode, + Data: p.tok.Data, + }) + return beforeHTMLIM, true + } + // TODO: set "quirks mode"? It's defined in the DOM spec instead of HTML5 proper, + // and so switching on "quirks mode" might belong in a different package. return beforeHTMLIM, false } -// Section 10.2.5.5. +// Section 11.2.5.4.2. func beforeHTMLIM(p *parser) (insertionMode, bool) { var ( add bool @@ -243,7 +252,7 @@ func beforeHTMLIM(p *parser) (insertionMode, bool) { return beforeHeadIM, !implied } -// Section 10.2.5.6. +// Section 11.2.5.4.3. func beforeHeadIM(p *parser) (insertionMode, bool) { var ( add bool @@ -280,7 +289,7 @@ func beforeHeadIM(p *parser) (insertionMode, bool) { return inHeadIM, !implied } -// Section 10.2.5.7. +// Section 11.2.5.4.4. 
func inHeadIM(p *parser) (insertionMode, bool) { var ( pop bool @@ -305,7 +314,7 @@ func inHeadIM(p *parser) (insertionMode, bool) { // TODO. } if pop || implied { - n := p.pop() + n := p.oe.pop() if n.Data != "head" { panic("html: bad parser state") } @@ -314,7 +323,7 @@ func inHeadIM(p *parser) (insertionMode, bool) { return inHeadIM, !implied } -// Section 10.2.5.9. +// Section 11.2.5.4.6. func afterHeadIM(p *parser) (insertionMode, bool) { var ( add bool @@ -354,17 +363,18 @@ func afterHeadIM(p *parser) (insertionMode, bool) { return inBodyIM, !implied } -// Section 10.2.5.10. +// Section 11.2.5.4.7. func inBodyIM(p *parser) (insertionMode, bool) { var endP bool switch p.tok.Type { case TextToken: + p.reconstructActiveFormattingElements() p.addText(p.tok.Data) p.framesetOK = false case StartTagToken: switch p.tok.Data { case "address", "article", "aside", "blockquote", "center", "details", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu", "nav", "ol", "p", "section", "summary", "ul": - // TODO: Do the proper "does the stack of open elements has a p element in button scope" algorithm in section 10.2.3.2. + // TODO: Do the proper "does the stack of open elements has a p element in button scope" algorithm in section 11.2.3.2. n := p.top() if n.Type == ElementNode && n.Data == "p" { endP = true @@ -375,16 +385,24 @@ func inBodyIM(p *parser) (insertionMode, bool) { // TODO: auto-insert </p> if necessary. 
switch n := p.top(); n.Data { case "h1", "h2", "h3", "h4", "h5", "h6": - p.pop() + p.oe.pop() } p.addElement(p.tok.Data, p.tok.Attr) + case "a": + if n := p.afe.forTag("a"); n != nil { + p.inBodyEndTagFormatting("a") + p.oe.remove(n) + p.afe.remove(n) + } + p.reconstructActiveFormattingElements() + p.addFormattingElement(p.tok.Data, p.tok.Attr) case "b", "big", "code", "em", "font", "i", "s", "small", "strike", "strong", "tt", "u": p.reconstructActiveFormattingElements() p.addFormattingElement(p.tok.Data, p.tok.Attr) case "area", "br", "embed", "img", "input", "keygen", "wbr": p.reconstructActiveFormattingElements() p.addElement(p.tok.Data, p.tok.Attr) - p.pop() + p.oe.pop() p.acknowledgeSelfClosingTag() p.framesetOK = false case "table": @@ -395,11 +413,12 @@ func inBodyIM(p *parser) (insertionMode, bool) { case "hr": // TODO: auto-insert </p> if necessary. p.addElement(p.tok.Data, p.tok.Attr) - p.pop() + p.oe.pop() p.acknowledgeSelfClosingTag() p.framesetOK = false default: // TODO. + p.addElement(p.tok.Data, p.tok.Attr) } case EndTagToken: switch p.tok.Data { @@ -407,18 +426,17 @@ func inBodyIM(p *parser) (insertionMode, bool) { // TODO: autoclose the stack of open elements. return afterBodyIM, true case "a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u": - // TODO: implement the "adoption agency" algorithm: - // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#adoptionAgency + p.inBodyEndTagFormatting(p.tok.Data) + default: + // TODO: any other end tag if p.tok.Data == p.top().Data { - p.pop() + p.oe.pop() } - default: - // TODO. } } if endP { // TODO: do the proper algorithm. - n := p.pop() + n := p.oe.pop() if n.Type != ElementNode || n.Data != "p" { panic("unreachable") } @@ -426,7 +444,123 @@ func inBodyIM(p *parser) (insertionMode, bool) { return inBodyIM, !endP } -// Section 10.2.5.12. 
+func (p *parser) inBodyEndTagFormatting(tag string) { + // This is the "adoption agency" algorithm, described at + // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#adoptionAgency + + // TODO: this is a fairly literal line-by-line translation of that algorithm. + // Once the code successfully parses the comprehensive test suite, we should + // refactor this code to be more idiomatic. + + // Steps 1-3. The outer loop. + for i := 0; i < 8; i++ { + // Step 4. Find the formatting element. + var formattingElement *Node + for j := len(p.afe) - 1; j >= 0; j-- { + if p.afe[j].Type == scopeMarkerNode { + break + } + if p.afe[j].Data == tag { + formattingElement = p.afe[j] + break + } + } + if formattingElement == nil { + return + } + feIndex := p.oe.index(formattingElement) + if feIndex == -1 { + p.afe.remove(formattingElement) + return + } + + // Steps 5-6. Find the furthest block. + var furthestBlock *Node + for _, e := range p.oe[feIndex:] { + if isSpecialElement[e.Data] { + furthestBlock = e + break + } + } + if furthestBlock == nil { + e := p.oe.pop() + for e != formattingElement { + e = p.oe.pop() + } + p.afe.remove(e) + return + } + + // Steps 7-8. Find the common ancestor and bookmark node. + commonAncestor := p.oe[feIndex-1] + bookmark := p.afe.index(formattingElement) + + // Step 9. The inner loop. Find the lastNode to reparent. + lastNode := furthestBlock + node := furthestBlock + x := p.oe.index(node) + // Steps 9.1-9.3. + for j := 0; j < 3; j++ { + // Step 9.4. + x-- + node = p.oe[x] + // Step 9.5. + if p.afe.index(node) == -1 { + p.oe.remove(node) + continue + } + // Step 9.6. + if node == formattingElement { + break + } + // Step 9.7. + clone := node.clone() + p.afe[p.afe.index(node)] = clone + p.oe[p.oe.index(node)] = clone + node = clone + // Step 9.8. + if lastNode == furthestBlock { + bookmark = p.afe.index(node) + 1 + } + // Step 9.9. 
+ if lastNode.Parent != nil { + lastNode.Parent.Remove(lastNode) + } + node.Add(lastNode) + // Step 9.10. + lastNode = node + } + + // Step 10. Reparent lastNode to the common ancestor, + // or for misnested table nodes, to the foster parent. + if lastNode.Parent != nil { + lastNode.Parent.Remove(lastNode) + } + switch commonAncestor.Data { + case "table", "tbody", "tfoot", "thead", "tr": + // TODO: fix up misnested table nodes; find the foster parent. + fallthrough + default: + commonAncestor.Add(lastNode) + } + + // Steps 11-13. Reparent nodes from the furthest block's children + // to a clone of the formatting element. + clone := formattingElement.clone() + reparentChildren(clone, furthestBlock) + furthestBlock.Add(clone) + + // Step 14. Fix up the list of active formatting elements. + p.afe.remove(formattingElement) + p.afe.insert(bookmark, clone) + + // Step 15. Fix up the stack of open elements. + p.oe.remove(formattingElement) + p.oe.insert(p.oe.index(furthestBlock)+1, clone) + } +} + +// Section 11.2.5.4.9. func inTableIM(p *parser) (insertionMode, bool) { var ( add bool @@ -457,7 +591,7 @@ func inTableIM(p *parser) (insertionMode, bool) { switch p.tok.Data { case "table": if p.popUntil(tableScopeStopTags, "table") { - // TODO: "reset the insertion mode appropriately" as per 10.2.3.1. + // TODO: "reset the insertion mode appropriately" as per 11.2.3.1. return inBodyIM, false } // Ignore the token. @@ -476,7 +610,7 @@ func inTableIM(p *parser) (insertionMode, bool) { return inTableIM, true } -// Section 10.2.5.16. +// Section 11.2.5.4.13. func inTableBodyIM(p *parser) (insertionMode, bool) { var ( add bool @@ -524,7 +658,7 @@ func inTableBodyIM(p *parser) (insertionMode, bool) { return useTheRulesFor(p, inTableBodyIM, inTableIM) } -// Section 10.2.5.17. +// Section 11.2.5.4.14. 
func inRowIM(p *parser) (insertionMode, bool) { switch p.tok.Type { case ErrorToken: @@ -536,7 +670,7 @@ func inRowIM(p *parser) (insertionMode, bool) { case "td", "th": // TODO: clear the stack back to a table row context. p.addElement(p.tok.Data, p.tok.Attr) - // TODO: insert a marker at the end of the list of active formatting elements. + p.afe = append(p.afe, &scopeMarker) return inCellIM, true default: // TODO. @@ -563,7 +697,7 @@ func inRowIM(p *parser) (insertionMode, bool) { return useTheRulesFor(p, inRowIM, inTableIM) } -// Section 10.2.5.18. +// Section 11.2.5.4.15. func inCellIM(p *parser) (insertionMode, bool) { var ( closeTheCellAndReprocess bool @@ -588,14 +722,14 @@ func inCellIM(p *parser) (insertionMode, bool) { } if closeTheCellAndReprocess { if p.popUntil(tableScopeStopTags, "td") || p.popUntil(tableScopeStopTags, "th") { - // TODO: clear the list of active formatting elements up to the last marker. + p.clearActiveFormattingElements() return inRowIM, false } } return useTheRulesFor(p, inCellIM, inBodyIM) } -// Section 10.2.5.22. +// Section 11.2.5.4.18. func afterBodyIM(p *parser) (insertionMode, bool) { switch p.tok.Type { case ErrorToken: @@ -616,7 +750,7 @@ func afterBodyIM(p *parser) (insertionMode, bool) { return afterBodyIM, true } -// Section 10.2.5.25. +// Section 11.2.5.4.21. 
func afterAfterBodyIM(p *parser) (insertionMode, bool) { switch p.tok.Type { case ErrorToken: diff --git a/libgo/go/html/parse_test.go b/libgo/go/html/parse_test.go index 3fa35d5..7d918d2 100644 --- a/libgo/go/html/parse_test.go +++ b/libgo/go/html/parse_test.go @@ -85,6 +85,10 @@ func dumpLevel(w io.Writer, n *Node, level int) os.Error { fmt.Fprintf(w, "%q", EscapeString(n.Data)) case CommentNode: return os.NewError("COMMENT") + case DoctypeNode: + fmt.Fprintf(w, "<!DOCTYPE %s>", EscapeString(n.Data)) + case scopeMarkerNode: + return os.NewError("unexpected scopeMarkerNode") default: return os.NewError("unknown node type") } @@ -119,7 +123,7 @@ func TestParser(t *testing.T) { rc := make(chan io.Reader) go readDat(filename, rc) // TODO(nigeltao): Process all test cases, not just a subset. - for i := 0; i < 22; i++ { + for i := 0; i < 25; i++ { // Parse the #data section. b, err := ioutil.ReadAll(<-rc) if err != nil { diff --git a/libgo/go/html/testdata/webkit/comments01.dat b/libgo/go/html/testdata/webkit/comments01.dat index 388d952..44f1876 100644 --- a/libgo/go/html/testdata/webkit/comments01.dat +++ b/libgo/go/html/testdata/webkit/comments01.dat @@ -28,8 +28,7 @@ FOO<!-- BAR -- >BAZ | <head> | <body> | "FOO" -| <!-- BAR -- --> -| "BAZ" +| <!-- BAR -- >BAZ --> #data FOO<!-- BAR -- <QUX> -- MUX -->BAZ @@ -61,8 +60,7 @@ FOO<!-- BAR -- <QUX> -- MUX -- >BAZ | <head> | <body> | "FOO" -| <!-- BAR -- <QUX> -- MUX -- --> -| "BAZ" +| <!-- BAR -- <QUX> -- MUX -- >BAZ --> #data FOO<!---->BAZ @@ -124,3 +122,14 @@ FOO<!-->BAZ | <html> | <head> | <body> + +#data +FOO<!----->BAZ +#errors +#document +| <html> +| <head> +| <body> +| "FOO" +| <!-- - --> +| "BAZ" diff --git a/libgo/go/html/testdata/webkit/doctype01.dat b/libgo/go/html/testdata/webkit/doctype01.dat index 575129c..ae45732 100644 --- a/libgo/go/html/testdata/webkit/doctype01.dat +++ b/libgo/go/html/testdata/webkit/doctype01.dat @@ -132,7 +132,7 @@ <!DOCTYPE potato SYSTEM 'taco"'>Hello #errors #document -| <!DOCTYPE 
potato> +| <!DOCTYPE potato "" "taco""> | <html> | <head> | <body> @@ -142,7 +142,7 @@ <!DOCTYPE potato SYSTEM "taco">Hello #errors #document -| <!DOCTYPE potato> +| <!DOCTYPE potato "" "taco"> | <html> | <head> | <body> @@ -152,7 +152,7 @@ <!DOCTYPE potato SYSTEM "tai'co">Hello #errors #document -| <!DOCTYPE potato> +| <!DOCTYPE potato "" "tai'co"> | <html> | <head> | <body> @@ -222,7 +222,7 @@ <!DOCTYPE potato PUBLIC "go'of">Hello #errors #document -| <!DOCTYPE potato> +| <!DOCTYPE potato "go'of" ""> | <html> | <head> | <body> @@ -232,7 +232,7 @@ <!DOCTYPE potato PUBLIC 'go'of'>Hello #errors #document -| <!DOCTYPE potato> +| <!DOCTYPE potato "go" ""> | <html> | <head> | <body> @@ -242,7 +242,7 @@ <!DOCTYPE potato PUBLIC 'go:hh of' >Hello #errors #document -| <!DOCTYPE potato> +| <!DOCTYPE potato "go:hh of" ""> | <html> | <head> | <body> @@ -252,7 +252,7 @@ <!DOCTYPE potato PUBLIC "W3C-//dfdf" SYSTEM ggg>Hello #errors #document -| <!DOCTYPE potato> +| <!DOCTYPE potato "W3C-//dfdf" ""> | <html> | <head> | <body> @@ -263,7 +263,7 @@ "http://www.w3.org/TR/html4/strict.dtd">Hello #errors #document -| <!DOCTYPE html> +| <!DOCTYPE html "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> | <html> | <head> | <body> @@ -284,7 +284,7 @@ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> #errors #document -| <!DOCTYPE html> +| <!DOCTYPE html "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> | <html> | <head> | <body> @@ -294,7 +294,7 @@ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"> #errors #document -| <!DOCTYPE html> +| <!DOCTYPE html "-//W3C//DTD XHTML 1.0 Frameset//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"> | <html> | <head> | <body> @@ -309,8 +309,7 @@ | <html> | <head> | <body> -| " -]>" +| "]>" #data <!DOCTYPE html PUBLIC @@ -318,7 +317,7 @@ "http://www.wapforum.org/DTD/xhtml-mobile10.dtd"> #errors #document -| <!DOCTYPE html> +| <!DOCTYPE html "-//WAPFORUM//DTD XHTML 
Mobile 1.0//EN" "http://www.wapforum.org/DTD/xhtml-mobile10.dtd"> | <html> | <head> | <body> @@ -327,9 +326,45 @@ <!DOCTYPE HTML SYSTEM "http://www.w3.org/DTD/HTML4-strict.dtd"><body><b>Mine!</b></body> #errors #document -| <!DOCTYPE html> +| <!DOCTYPE html "" "http://www.w3.org/DTD/HTML4-strict.dtd"> | <html> | <head> | <body> | <b> | "Mine!" + +#data +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN""http://www.w3.org/TR/html4/strict.dtd"> +#errors +#document +| <!DOCTYPE html "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> +| <html> +| <head> +| <body> + +#data +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'http://www.w3.org/TR/html4/strict.dtd'> +#errors +#document +| <!DOCTYPE html "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> +| <html> +| <head> +| <body> + +#data +<!DOCTYPE HTML PUBLIC"-//W3C//DTD HTML 4.01//EN"'http://www.w3.org/TR/html4/strict.dtd'> +#errors +#document +| <!DOCTYPE html "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> +| <html> +| <head> +| <body> + +#data +<!DOCTYPE HTML PUBLIC'-//W3C//DTD HTML 4.01//EN''http://www.w3.org/TR/html4/strict.dtd'> +#errors +#document +| <!DOCTYPE html "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> +| <html> +| <head> +| <body> diff --git a/libgo/go/html/testdata/webkit/dom2string.js b/libgo/go/html/testdata/webkit/dom2string.js deleted file mode 100644 index 45897fd..0000000 --- a/libgo/go/html/testdata/webkit/dom2string.js +++ /dev/null @@ -1,135 +0,0 @@ -String.prototype.toAsciiLowerCase = function () { - var output = ""; - for (var i = 0, len = this.length; i < len; ++i) { - if (this.charCodeAt(i) >= 0x41 && this.charCodeAt(i) <= 0x5A) { - output += String.fromCharCode(this.charCodeAt(i) + 0x20) - } else { - output += this.charAt(i); - } - } - return output; -} - -function indent(ancestors) { - var str = ""; - if (ancestors > 0) { - while (ancestors--) - str += " "; - } - return str; -} - -function dom2string(node, 
ancestors) { - var str = ""; - if (typeof ancestors == "undefined") - var ancestors = 0; - if (!node.firstChild) - return "| "; - var parent = node; - var current = node.firstChild; - var next = null; - var misnested = null; - for (;;) { - str += "\n| " + indent(ancestors); - switch (current.nodeType) { - case 10: - str += '<!DOCTYPE ' + current.nodeName + '>'; - break; - case 8: - try { - str += '<!-- ' + current.nodeValue + ' -->'; - } catch (e) { - str += '<!-- -->'; - } - if (parent != current.parentNode) { - return str += ' (misnested... aborting)'; - } - break; - case 7: - str += '<?' + current.nodeName + current.nodeValue + '>'; - break; - case 4: - str += '<![CDATA[ ' + current.nodeValue + ' ]]>'; - break; - case 3: - str += '"' + current.nodeValue + '"'; - if (parent != current.parentNode) { - return str += ' (misnested... aborting)'; - } - break; - case 1: - str += "<"; - switch (current.namespaceURI) { - case "http://www.w3.org/2000/svg": - str += "svg "; - break; - case "http://www.w3.org/1998/Math/MathML": - str += "math "; - break; - } - if (current.localName && current.namespaceURI && current.namespaceURI != null) { - str += current.localName; - } else { - str += current.nodeName.toAsciiLowerCase(); - } - str += '>'; - if (parent != current.parentNode) { - return str += ' (misnested... 
aborting)'; - } else { - if (current.attributes) { - var attrNames = []; - var attrPos = {}; - for (var j = 0; j < current.attributes.length; j += 1) { - if (current.attributes[j].specified) { - var name = ""; - switch (current.attributes[j].namespaceURI) { - case "http://www.w3.org/XML/1998/namespace": - name += "xml "; - break; - case "http://www.w3.org/2000/xmlns/": - name += "xmlns "; - break; - case "http://www.w3.org/1999/xlink": - name += "xlink "; - break; - } - if (current.attributes[j].localName) { - name += current.attributes[j].localName; - } else { - name += current.attributes[j].nodeName; - } - attrNames.push(name); - attrPos[name] = j; - } - } - if (attrNames.length > 0) { - attrNames.sort(); - for (var j = 0; j < attrNames.length; j += 1) { - str += "\n| " + indent(1 + ancestors) + attrNames[j]; - str += '="' + current.attributes[attrPos[attrNames[j]]].nodeValue + '"'; - } - } - } - if (next = current.firstChild) { - parent = current; - current = next; - ancestors++; - continue; - } - } - break; - } - for (;;) { - if (next = current.nextSibling) { - current = next; - break; - } - current = current.parentNode; - parent = parent.parentNode; - ancestors--; - if (current == node) { - return str.substring(1); - } - } - } -} diff --git a/libgo/go/html/testdata/webkit/entities01.dat b/libgo/go/html/testdata/webkit/entities01.dat index 926642e..c8073b7 100644 --- a/libgo/go/html/testdata/webkit/entities01.dat +++ b/libgo/go/html/testdata/webkit/entities01.dat @@ -189,15 +189,6 @@ FOO�ZOO | "FOO�ZOO" #data -FOO
ZOO -#errors -#document -| <html> -| <head> -| <body> -| "FOO
ZOO" - -#data FOOxZOO #errors #document diff --git a/libgo/go/html/testdata/webkit/entities02.dat b/libgo/go/html/testdata/webkit/entities02.dat index 0b4dd66..e2fb42a 100644 --- a/libgo/go/html/testdata/webkit/entities02.dat +++ b/libgo/go/html/testdata/webkit/entities02.dat @@ -127,3 +127,123 @@ | <body> | <div> | bar="ZZ>" + +#data +<div bar="ZZ£_id=23"></div> +#errors +#document +| <html> +| <head> +| <body> +| <div> +| bar="ZZ£_id=23" + +#data +<div bar="ZZ&prod_id=23"></div> +#errors +#document +| <html> +| <head> +| <body> +| <div> +| bar="ZZ&prod_id=23" + +#data +<div bar="ZZ£_id=23"></div> +#errors +#document +| <html> +| <head> +| <body> +| <div> +| bar="ZZ£_id=23" + +#data +<div bar="ZZ∏_id=23"></div> +#errors +#document +| <html> +| <head> +| <body> +| <div> +| bar="ZZ∏_id=23" + +#data +<div bar="ZZ£=23"></div> +#errors +#document +| <html> +| <head> +| <body> +| <div> +| bar="ZZ£=23" + +#data +<div bar="ZZ&prod=23"></div> +#errors +#document +| <html> +| <head> +| <body> +| <div> +| bar="ZZ&prod=23" + +#data +<div>ZZ£_id=23</div> +#errors +#document +| <html> +| <head> +| <body> +| <div> +| "ZZ£_id=23" + +#data +<div>ZZ&prod_id=23</div> +#errors +#document +| <html> +| <head> +| <body> +| <div> +| "ZZ&prod_id=23" + +#data +<div>ZZ£_id=23</div> +#errors +#document +| <html> +| <head> +| <body> +| <div> +| "ZZ£_id=23" + +#data +<div>ZZ∏_id=23</div> +#errors +#document +| <html> +| <head> +| <body> +| <div> +| "ZZ∏_id=23" + +#data +<div>ZZ£=23</div> +#errors +#document +| <html> +| <head> +| <body> +| <div> +| "ZZ£=23" + +#data +<div>ZZ&prod=23</div> +#errors +#document +| <html> +| <head> +| <body> +| <div> +| "ZZ&prod=23" diff --git a/libgo/go/html/testdata/webkit/tests1.dat b/libgo/go/html/testdata/webkit/tests1.dat index ad58d31..cbf8bdd 100644 --- a/libgo/go/html/testdata/webkit/tests1.dat +++ b/libgo/go/html/testdata/webkit/tests1.dat @@ -259,7 +259,7 @@ Line: 1 Col: 24 End tag (a) violates step 1, paragraph 1 of the adoption agency | "Z" #data 
-<b><button></b></button></b> +<b><button>foo</b>bar #errors Line: 1 Col: 3 Unexpected start tag (b). Expected DOCTYPE. Line: 1 Col: 15 End tag (b) violates step 1, paragraph 1 of the adoption agency algorithm. @@ -268,7 +268,23 @@ Line: 1 Col: 15 End tag (b) violates step 1, paragraph 1 of the adoption agency | <head> | <body> | <b> +| <button> +| <b> +| "foo" +| "bar" + +#data +<!DOCTYPE html><span><button>foo</span>bar +#errors +39: End tag “span” seen but there were unclosed elements. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <span> | <button> +| "foobar" #data <p><b><div><marquee></p></b></div>X @@ -818,32 +834,6 @@ Line: 1 Col: 22 Expected closing tag. Unexpected end of file. | "D" #data -<cite><b><cite><i><cite><i><cite><i><div>X</b>TEST -#errors -Line: 1 Col: 6 Unexpected start tag (cite). Expected DOCTYPE. -Line: 1 Col: 46 End tag (b) violates step 1, paragraph 3 of the adoption agency algorithm. -Line: 1 Col: 50 Expected closing tag. Unexpected end of file. -#document -| <html> -| <head> -| <body> -| <cite> -| <b> -| <cite> -| <i> -| <cite> -| <i> -| <cite> -| <i> -| <i> -| <i> -| <i> -| <div> -| <b> -| "X" -| "TEST" - -#data #errors Line: 1 Col: 0 Unexpected End of file. Expected DOCTYPE. @@ -1246,6 +1236,18 @@ Line: 1 Col: 49 Unexpected end tag (code). Ignored. | <strike> #data +<!DOCTYPE html><spacer>foo +#errors +26: End of file seen and there were open elements. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <spacer> +| "foo" + +#data <title><meta></title><link><title><meta></title> #errors Line: 1 Col: 7 Unexpected start tag (title). Expected DOCTYPE. 
@@ -1474,7 +1476,8 @@ Line: 1 Col: 15 End tag (b) violates step 1, paragraph 1 of the adoption agency | <head> | <body> | <b> -| <button> +| <button> +| <b> #data <p><b><div><marquee></p></b></div> diff --git a/libgo/go/html/testdata/webkit/tests10.dat b/libgo/go/html/testdata/webkit/tests10.dat index 877c9a3..4f8df86 100644 --- a/libgo/go/html/testdata/webkit/tests10.dat +++ b/libgo/go/html/testdata/webkit/tests10.dat @@ -9,6 +9,18 @@ | <svg svg> #data +<!DOCTYPE html><svg></svg><![CDATA[a]]> +#errors +29: Bogus comment +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <svg svg> +| <!-- [CDATA[a]] --> + +#data <!DOCTYPE html><body><svg></svg> #errors #document @@ -428,3 +440,360 @@ | xlink href="foo" | xml lang="en" | "bar" + +#data +<svg></path> +#errors +#document +| <html> +| <head> +| <body> +| <svg svg> + +#data +<div><svg></div>a +#errors +#document +| <html> +| <head> +| <body> +| <div> +| <svg svg> +| "a" + +#data +<div><svg><path></div>a +#errors +#document +| <html> +| <head> +| <body> +| <div> +| <svg svg> +| <svg path> +| "a" + +#data +<div><svg><path></svg><path> +#errors +#document +| <html> +| <head> +| <body> +| <div> +| <svg svg> +| <svg path> +| <path> + +#data +<div><svg><path><foreignObject><math></div>a +#errors +#document +| <html> +| <head> +| <body> +| <div> +| <svg svg> +| <svg path> +| <svg foreignObject> +| <math math> +| "a" + +#data +<div><svg><path><foreignObject><p></div>a +#errors +#document +| <html> +| <head> +| <body> +| <div> +| <svg svg> +| <svg path> +| <svg foreignObject> +| <p> +| "a" + +#data +<!DOCTYPE html><svg><desc><div><svg><ul>a +#errors +40: HTML start tag “ul” in a foreign namespace context. +41: End of file in a foreign namespace context. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <svg svg> +| <svg desc> +| <div> +| <svg svg> +| <ul> +| "a" + +#data +<!DOCTYPE html><svg><desc><svg><ul>a +#errors +35: HTML start tag “ul” in a foreign namespace context. 
+36: End of file in a foreign namespace context. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <svg svg> +| <svg desc> +| <svg svg> +| <ul> +| "a" + +#data +<!DOCTYPE html><p><svg><desc><p> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <svg svg> +| <svg desc> +| <p> + +#data +<!DOCTYPE html><p><svg><title><p> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <svg svg> +| <svg title> +| <p> + +#data +<div><svg><path><foreignObject><p></foreignObject><p> +#errors +#document +| <html> +| <head> +| <body> +| <div> +| <svg svg> +| <svg path> +| <svg foreignObject> +| <p> +| <p> + +#data +<math><mi><div><object><div><span></span></div></object></div></mi><mi> +#errors +#document +| <html> +| <head> +| <body> +| <math math> +| <math mi> +| <div> +| <object> +| <div> +| <span> +| <math mi> + +#data +<math><mi><svg><foreignObject><div><div></div></div></foreignObject></svg></mi><mi> +#errors +#document +| <html> +| <head> +| <body> +| <math math> +| <math mi> +| <svg svg> +| <svg foreignObject> +| <div> +| <div> +| <math mi> + +#data +<svg><script></script><path> +#errors +#document +| <html> +| <head> +| <body> +| <svg svg> +| <svg script> +| <svg path> + +#data +<table><svg></svg><tr> +#errors +#document +| <html> +| <head> +| <body> +| <svg svg> +| <table> +| <tbody> +| <tr> + +#data +<math><mi><mglyph> +#errors +#document +| <html> +| <head> +| <body> +| <math math> +| <math mi> +| <math mglyph> + +#data +<math><mi><malignmark> +#errors +#document +| <html> +| <head> +| <body> +| <math math> +| <math mi> +| <math malignmark> + +#data +<math><mo><mglyph> +#errors +#document +| <html> +| <head> +| <body> +| <math math> +| <math mo> +| <math mglyph> + +#data +<math><mo><malignmark> +#errors +#document +| <html> +| <head> +| <body> +| <math math> +| <math mo> +| <math malignmark> + +#data +<math><mn><mglyph> +#errors +#document +| <html> +| <head> +| <body> +| <math math> +| <math mn> +| <math 
mglyph> + +#data +<math><mn><malignmark> +#errors +#document +| <html> +| <head> +| <body> +| <math math> +| <math mn> +| <math malignmark> + +#data +<math><ms><mglyph> +#errors +#document +| <html> +| <head> +| <body> +| <math math> +| <math ms> +| <math mglyph> + +#data +<math><ms><malignmark> +#errors +#document +| <html> +| <head> +| <body> +| <math math> +| <math ms> +| <math malignmark> + +#data +<math><mtext><mglyph> +#errors +#document +| <html> +| <head> +| <body> +| <math math> +| <math mtext> +| <math mglyph> + +#data +<math><mtext><malignmark> +#errors +#document +| <html> +| <head> +| <body> +| <math math> +| <math mtext> +| <math malignmark> + +#data +<math><annotation-xml><svg></svg></annotation-xml><mi> +#errors +#document +| <html> +| <head> +| <body> +| <math math> +| <math annotation-xml> +| <svg svg> +| <math mi> + +#data +<math><annotation-xml><svg><foreignObject><div><math><mi></mi></math><span></span></div></foreignObject><path></path></svg></annotation-xml><mi> +#errors +#document +| <html> +| <head> +| <body> +| <math math> +| <math annotation-xml> +| <svg svg> +| <svg foreignObject> +| <div> +| <math math> +| <math mi> +| <span> +| <svg path> +| <math mi> + +#data +<math><annotation-xml><svg><foreignObject><math><mi><svg></svg></mi><mo></mo></math><span></span></foreignObject><path></path></svg></annotation-xml><mi> +#errors +#document +| <html> +| <head> +| <body> +| <math math> +| <math annotation-xml> +| <svg svg> +| <svg foreignObject> +| <math math> +| <math mi> +| <svg svg> +| <math mo> +| <span> +| <svg path> +| <math mi> diff --git a/libgo/go/html/testdata/webkit/tests13.dat b/libgo/go/html/testdata/webkit/tests13.dat deleted file mode 100644 index d180e8e..0000000 --- a/libgo/go/html/testdata/webkit/tests13.dat +++ /dev/null @@ -1,9 +0,0 @@ -<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN"> -<html><head> -<title>404 Not Found</title> -</head><body> -<h1>Not Found</h1> -<p>The requested URL /html5lib-tests/data/tests13.dat was not 
found on this server.</p> -<p>Additionally, a 404 Not Found -error was encountered while trying to use an ErrorDocument to handle the request.</p> -</body></html> diff --git a/libgo/go/html/testdata/webkit/tests14.dat b/libgo/go/html/testdata/webkit/tests14.dat index 72f8015..b8713f8 100644 --- a/libgo/go/html/testdata/webkit/tests14.dat +++ b/libgo/go/html/testdata/webkit/tests14.dat @@ -71,4 +71,4 @@ | <html> | <head> | <body> -| 789="012"
\ No newline at end of file +| 789="012" diff --git a/libgo/go/html/testdata/webkit/tests15.dat b/libgo/go/html/testdata/webkit/tests15.dat index 7f016ca..6ce1c0d 100644 --- a/libgo/go/html/testdata/webkit/tests15.dat +++ b/libgo/go/html/testdata/webkit/tests15.dat @@ -205,4 +205,4 @@ XXX: These errors are wrong, please fix me! | <html> | <head> | <body> -| <object>
\ No newline at end of file +| <object> diff --git a/libgo/go/html/testdata/webkit/tests2.dat b/libgo/go/html/testdata/webkit/tests2.dat index d33996e..60d8592 100644 --- a/libgo/go/html/testdata/webkit/tests2.dat +++ b/libgo/go/html/testdata/webkit/tests2.dat @@ -461,6 +461,19 @@ Line: 1 Col: 51 Expected closing tag. Unexpected end of file. | <optgroup> #data +<!DOCTYPE html><datalist><option>foo</datalist>bar +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <datalist> +| <option> +| "foo" +| "bar" + +#data <!DOCTYPE html><font><input><input></font> #errors #document @@ -515,7 +528,7 @@ Line: 1 Col: 23 Unexpected start tag isindex. Don't use it! | <form> | <hr> | <label> -| "This is a searchable index. Insert your search keywords here: " +| "This is a searchable index. Enter search keywords: " | <input> | name="isindex" | test="x" @@ -736,3 +749,15 @@ Line: 1 Col: 35 Unexpected character in comment found. | ">" | <!-- <!--x --> | "-->" + +#data +<!doctype html><div><form></form><div></div></div> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <div> +| <form> +| <div> diff --git a/libgo/go/html/testdata/webkit/tests3.dat b/libgo/go/html/testdata/webkit/tests3.dat index b0781a8..38dc501 100644 --- a/libgo/go/html/testdata/webkit/tests3.dat +++ b/libgo/go/html/testdata/webkit/tests3.dat @@ -144,6 +144,18 @@ Line: 2 Col: 7 End tag (pre) seen too early. Expected other end tag. y" #data +<!DOCTYPE html><pre>

A</pre> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <pre> +| " +A" + +#data <!DOCTYPE html><HTML><META><HEAD></HEAD></HTML> #errors Line: 1 Col: 33 Unexpected start tag head in existing head. Ignored. diff --git a/libgo/go/html/testdata/webkit/tests6.dat b/libgo/go/html/testdata/webkit/tests6.dat index 2fb7996..f28ece4 100644 --- a/libgo/go/html/testdata/webkit/tests6.dat +++ b/libgo/go/html/testdata/webkit/tests6.dat @@ -631,6 +631,16 @@ Line: 1 Col: 17 Unexpected start tag (frameset). | <frameset> #data +<track><frameset></frameset> +#errors +Line: 1 Col: 7 Unexpected start tag (track). Expected DOCTYPE. +Line: 1 Col: 17 Unexpected start tag (frameset). +#document +| <html> +| <head> +| <frameset> + +#data </html><frameset></frameset> #errors 7: End tag seen without seeing a doctype first. Expected “<!DOCTYPE html>”. diff --git a/libgo/go/html/testdata/webkit/tests9.dat b/libgo/go/html/testdata/webkit/tests9.dat index 2b715f8..554e27a 100644 --- a/libgo/go/html/testdata/webkit/tests9.dat +++ b/libgo/go/html/testdata/webkit/tests9.dat @@ -19,6 +19,33 @@ | <math math> #data +<!DOCTYPE html><math><mi> +#errors +25: End of file in a foreign namespace context. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <math math> +| <math mi> + +#data +<!DOCTYPE html><math><annotation-xml><svg><u> +#errors +45: HTML start tag “u” in a foreign namespace context. +45: End of file seen and there were open elements. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <math math> +| <math annotation-xml> +| <svg svg> +| <u> + +#data <!DOCTYPE html><body><select><math></math></select> #errors Line: 1 Col: 35 Unexpected start tag token (math) in the select phase. Ignored. 
diff --git a/libgo/go/html/testdata/webkit/webkit01.dat b/libgo/go/html/testdata/webkit/webkit01.dat index 544da9e..4101b21 100644 --- a/libgo/go/html/testdata/webkit/webkit01.dat +++ b/libgo/go/html/testdata/webkit/webkit01.dat @@ -129,35 +129,6 @@ console.log("FOO<span>BAR</span>BAZ"); | <potato> #data -1<script>document.write("2")</script>3 -#errors -#document -| <html> -| <head> -| <body> -| "1" -| <script> -| "document.write("2")" -| "23" - -#data -1<script>document.write("<script>document.write('2')</scr"+ "ipt><script>document.write('3')</scr" + "ipt>")</script>4 -#errors -#document -| <html> -| <head> -| <body> -| "1" -| <script> -| "document.write("<script>document.write('2')</scr"+ "ipt><script>document.write('3')</scr" + "ipt>")" -| <script> -| "document.write('2')" -| "2" -| <script> -| "document.write('3')" -| "34" - -#data </ tttt> #errors #document @@ -186,8 +157,7 @@ console.log("FOO<span>BAR</span>BAZ"); | <head> | <body> | <p> -| "Test" -| "Test2" +| "TestTest2" #data <rdar://problem/6869687> @@ -209,3 +179,431 @@ console.log("FOO<span>BAR</span>BAZ"); | <body> | <a> | "test< /A>" + +#data +< +#errors +#document +| <html> +| <head> +| <body> +| "<" + +#data +<body foo='bar'><body foo='baz' yo='mama'> +#errors +#document +| <html> +| <head> +| <body> +| foo="bar" +| yo="mama" + +#data +<body></br foo="bar"></body> +#errors +#document +| <html> +| <head> +| <body> +| <br> + +#data +<bdy><br foo="bar"></body> +#errors +#document +| <html> +| <head> +| <body> +| <bdy> +| <br> +| foo="bar" + +#data +<body></body></br foo="bar"> +#errors +#document +| <html> +| <head> +| <body> +| <br> + +#data +<bdy></body><br foo="bar"> +#errors +#document +| <html> +| <head> +| <body> +| <bdy> +| <br> +| foo="bar" + +#data +<html><body></body></html><!-- Hi there --> +#errors +#document +| <html> +| <head> +| <body> +| <!-- Hi there --> + +#data +<html><body></body></html>x<!-- Hi there --> +#errors +#document +| <html> +| <head> +| <body> +| "x" +| <!-- Hi there --> 
+ +#data +<html><body></body></html>x<!-- Hi there --></html><!-- Again --> +#errors +#document +| <html> +| <head> +| <body> +| "x" +| <!-- Hi there --> +| <!-- Again --> + +#data +<html><body></body></html>x<!-- Hi there --></body></html><!-- Again --> +#errors +#document +| <html> +| <head> +| <body> +| "x" +| <!-- Hi there --> +| <!-- Again --> + +#data +<html><body><ruby><div><rp>xx</rp></div></ruby></body></html> +#errors +#document +| <html> +| <head> +| <body> +| <ruby> +| <div> +| <rp> +| "xx" + +#data +<html><body><ruby><div><rt>xx</rt></div></ruby></body></html> +#errors +#document +| <html> +| <head> +| <body> +| <ruby> +| <div> +| <rt> +| "xx" + +#data +<html><frameset><!--1--><noframes>A</noframes><!--2--></frameset><!--3--><noframes>B</noframes><!--4--></html><!--5--><noframes>C</noframes><!--6--> +#errors +#document +| <html> +| <head> +| <frameset> +| <!-- 1 --> +| <noframes> +| "A" +| <!-- 2 --> +| <!-- 3 --> +| <noframes> +| "B" +| <!-- 4 --> +| <noframes> +| "C" +| <!-- 5 --> +| <!-- 6 --> + +#data +<select><option>A<select><option>B<select><option>C<select><option>D<select><option>E<select><option>F<select><option>G<select> +#errors +#document +| <html> +| <head> +| <body> +| <select> +| <option> +| "A" +| <option> +| "B" +| <select> +| <option> +| "C" +| <option> +| "D" +| <select> +| <option> +| "E" +| <option> +| "F" +| <select> +| <option> +| "G" + +#data +<dd><dd><dt><dt><dd><li><li> +#errors +#document +| <html> +| <head> +| <body> +| <dd> +| <dd> +| <dt> +| <dt> +| <dd> +| <li> +| <li> + +#data +<div><b></div><div><nobr>a<nobr> +#errors +#document +| <html> +| <head> +| <body> +| <div> +| <b> +| <div> +| <b> +| <nobr> +| "a" +| <nobr> + +#data +<head></head> +<body></body> +#errors +#document +| <html> +| <head> +| " +" +| <body> + +#data +<head></head> <style></style>ddd +#errors +#document +| <html> +| <head> +| <style> +| " " +| <body> +| "ddd" + +#data +<kbd><table></kbd><col><select><tr> +#errors +#document +| <html> +| <head> +| 
<body> +| <kbd> +| <select> +| <table> +| <colgroup> +| <col> +| <tbody> +| <tr> + +#data +<kbd><table></kbd><col><select><tr></table><div> +#errors +#document +| <html> +| <head> +| <body> +| <kbd> +| <select> +| <table> +| <colgroup> +| <col> +| <tbody> +| <tr> +| <div> + +#data +<a><li><style></style><title></title></a> +#errors +#document +| <html> +| <head> +| <body> +| <a> +| <li> +| <a> +| <style> +| <title> + +#data +<font></p><p><meta><title></title></font> +#errors +#document +| <html> +| <head> +| <body> +| <font> +| <p> +| <p> +| <font> +| <meta> +| <title> + +#data +<a><center><title></title><a> +#errors +#document +| <html> +| <head> +| <body> +| <a> +| <center> +| <a> +| <title> +| <a> + +#data +<svg><title><div> +#errors +#document +| <html> +| <head> +| <body> +| <svg svg> +| <svg title> +| <div> + +#data +<svg><title><rect><div> +#errors +#document +| <html> +| <head> +| <body> +| <svg svg> +| <svg title> +| <rect> +| <div> + +#data +<svg><title><svg><div> +#errors +#document +| <html> +| <head> +| <body> +| <svg svg> +| <svg title> +| <svg svg> +| <div> + +#data +<img <="" FAIL> +#errors +#document +| <html> +| <head> +| <body> +| <img> +| <="" +| fail="" + +#data +<ul><li><div id='foo'/>A</li><li>B<div>C</div></li></ul> +#errors +#document +| <html> +| <head> +| <body> +| <ul> +| <li> +| <div> +| id="foo" +| "A" +| <li> +| "B" +| <div> +| "C" + +#data +<svg><em><desc></em> +#errors +#document +| <html> +| <head> +| <body> +| <svg svg> +| <em> +| <desc> + +#data +<table><tr><td><svg><desc><td></desc><circle> +#errors +#document +| <html> +| <head> +| <body> +| <table> +| <tbody> +| <tr> +| <td> +| <svg svg> +| <svg desc> +| <svg circle> + +#data +<svg><tfoot></mi><td> +#errors +#document +| <html> +| <head> +| <body> +| <svg svg> +| <svg tfoot> +| <svg td> + +#data +<math><mrow><mrow><mn>1</mn></mrow><mi>a</mi></mrow></math> +#errors +#document +| <html> +| <head> +| <body> +| <math math> +| <math mrow> +| <math mrow> +| <math mn> +| "1" +| <math 
mi> +| "a" + +#data +<!doctype html><input type="hidden"><frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <frameset> + +#data +<!doctype html><input type="button"><frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <input> +| type="button" diff --git a/libgo/go/html/token.go b/libgo/go/html/token.go index ad03241..d266b3a 100644 --- a/libgo/go/html/token.go +++ b/libgo/go/html/token.go @@ -27,6 +27,8 @@ const ( SelfClosingTagToken // A CommentToken looks like <!--x-->. CommentToken + // A DoctypeToken looks like <!DOCTYPE x> + DoctypeToken ) // String returns a string representation of the TokenType. @@ -44,6 +46,8 @@ func (t TokenType) String() string { return "SelfClosingTag" case CommentToken: return "Comment" + case DoctypeToken: + return "Doctype" } return "Invalid(" + strconv.Itoa(int(t)) + ")" } @@ -56,9 +60,9 @@ type Attribute struct { } // A Token consists of a TokenType and some Data (tag name for start and end -// tags, content for text and comments). A tag Token may also contain a slice -// of Attributes. Data is unescaped for all Tokens (it looks like "a<b" rather -// than "a<b"). +// tags, content for text, comments and doctypes). A tag Token may also contain +// a slice of Attributes. Data is unescaped for all Tokens (it looks like "a<b" +// rather than "a<b"). type Token struct { Type TokenType Data string @@ -97,6 +101,8 @@ func (t Token) String() string { return "<" + t.tagString() + "/>" case CommentToken: return "<!--" + EscapeString(t.Data) + "-->" + case DoctypeToken: + return "<!DOCTYPE " + EscapeString(t.Data) + ">" } return "Invalid(" + strconv.Itoa(int(t.Type)) + ")" } @@ -109,9 +115,15 @@ type Tokenizer struct { // r is the source of the HTML text. r io.Reader - // tt is the TokenType of the most recently read token. If tt == Error - // then err is the error associated with trying to read that token. - tt TokenType + // tt is the TokenType of the most recently read token. 
+ tt TokenType + // err is the first error encountered during tokenization. It is possible + // for tt != Error && err != nil to hold: this means that Next returned a + // valid token but the subsequent Next call will return an error token. + // For example, if the HTML text input was just "plain", then the first + // Next call would set z.err to os.EOF but return a TextToken, and all + // subsequent Next calls would return an ErrorToken. + // err is never reset. Once it becomes non-nil, it stays non-nil. err os.Error // buf[p0:p1] holds the raw data of the most recent token. // buf[p1:] is buffered input that will yield future tokens. @@ -137,7 +149,9 @@ func (z *Tokenizer) Raw() []byte { // readByte returns the next byte from the input stream, doing a buffered read // from z.r into z.buf if necessary. z.buf[z.p0:z.p1] remains a contiguous byte // slice that holds all the bytes read so far for the current token. -func (z *Tokenizer) readByte() (byte, os.Error) { +// It sets z.err if the underlying reader returns an error. +// Pre-condition: z.err == nil. +func (z *Tokenizer) readByte() byte { if z.p1 >= len(z.buf) { // Our buffer is exhausted and we have to read from z.r. // We copy z.buf[z.p0:z.p1] to the beginning of z.buf. If the length @@ -149,139 +163,168 @@ func (z *Tokenizer) readByte() (byte, os.Error) { if 2*d > c { buf1 = make([]byte, d, 2*c) } else { - buf1 = z.buf[0:d] + buf1 = z.buf[:d] } copy(buf1, z.buf[z.p0:z.p1]) - z.p0, z.p1, z.buf = 0, d, buf1[0:d] + z.p0, z.p1, z.buf = 0, d, buf1[:d] // Now that we have copied the live bytes to the start of the buffer, // we read from z.r into the remainder. n, err := z.r.Read(buf1[d:cap(buf1)]) if err != nil { - return 0, err + z.err = err + return 0 } - z.buf = buf1[0 : d+n] + z.buf = buf1[:d+n] } x := z.buf[z.p1] z.p1++ - return x, nil + return x } -// readTo keeps reading bytes until x is found. 
-func (z *Tokenizer) readTo(x uint8) os.Error { +// readTo keeps reading bytes until x is found or a read error occurs. If an +// error does occur, z.err is set to that error. +// Pre-condition: z.err == nil. +func (z *Tokenizer) readTo(x uint8) { for { - c, err := z.readByte() - if err != nil { - return err + c := z.readByte() + if z.err != nil { + return } switch c { case x: - return nil + return case '\\': - _, err = z.readByte() - if err != nil { - return err + z.readByte() + if z.err != nil { + return } } } - panic("unreachable") } -// nextMarkupDeclaration returns the next TokenType starting with "<!". -func (z *Tokenizer) nextMarkupDeclaration() (TokenType, os.Error) { - // TODO: check for <!DOCTYPE ... >, don't just assume that it's a comment. - for i := 0; i < 2; i++ { - c, err := z.readByte() - if err != nil { - return TextToken, err - } - if c != '-' { - return z.nextText(), nil - } - } +// nextComment reads the next token starting with "<!--". +// The opening "<!--" has already been consumed. +// Pre-condition: z.tt == TextToken && z.err == nil && z.p0 + 4 <= z.p1. +func (z *Tokenizer) nextComment() { // <!--> is a valid comment. for dashCount := 2; ; { - c, err := z.readByte() - if err != nil { - return TextToken, err + c := z.readByte() + if z.err != nil { + return } switch c { case '-': dashCount++ case '>': if dashCount >= 2 { - return CommentToken, nil + z.tt = CommentToken + return } - fallthrough + dashCount = 0 default: dashCount = 0 } } - panic("unreachable") } -// nextTag returns the next TokenType starting from the tag open state. -func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) { - c, err := z.readByte() - if err != nil { - return ErrorToken, err +// nextMarkupDeclaration reads the next token starting with "<!". +// It might be a "<!--comment-->", a "<!DOCTYPE foo>", or "<!malformed text". +// The opening "<!" has already been consumed. +// Pre-condition: z.tt == TextToken && z.err == nil && z.p0 + 2 <= z.p1. 
+func (z *Tokenizer) nextMarkupDeclaration() { + var c [2]byte + for i := 0; i < 2; i++ { + c[i] = z.readByte() + if z.err != nil { + return + } + } + if c[0] == '-' && c[1] == '-' { + z.nextComment() + return + } + z.p1 -= 2 + const s = "DOCTYPE " + for i := 0; ; i++ { + c := z.readByte() + if z.err != nil { + return + } + // Capitalize c. + if 'a' <= c && c <= 'z' { + c = 'A' + (c - 'a') + } + if i < len(s) && c != s[i] { + z.nextText() + return + } + if c == '>' { + if i >= len(s) { + z.tt = DoctypeToken + } + return + } + } +} + +// nextTag reads the next token starting with "<". It might be a "<startTag>", +// an "</endTag>", a "<!markup declaration>", or "<malformed text". +// The opening "<" has already been consumed. +// Pre-condition: z.tt == TextToken && z.err == nil && z.p0 + 1 <= z.p1. +func (z *Tokenizer) nextTag() { + c := z.readByte() + if z.err != nil { + return } switch { case c == '/': - tt = EndTagToken + z.tt = EndTagToken // Lower-cased characters are more common in tag names, so we check for them first. 
case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z': - tt = StartTagToken + z.tt = StartTagToken case c == '!': - return z.nextMarkupDeclaration() + z.nextMarkupDeclaration() + return case c == '?': - return ErrorToken, os.NewError("html: TODO(nigeltao): implement XML processing instructions") + z.tt, z.err = ErrorToken, os.NewError("html: TODO: implement XML processing instructions") + return default: - return ErrorToken, os.NewError("html: TODO(nigeltao): handle malformed tags") + z.tt, z.err = ErrorToken, os.NewError("html: TODO: handle malformed tags") + return } for { - c, err := z.readByte() - if err != nil { - return TextToken, err + c := z.readByte() + if z.err != nil { + return } switch c { - case '"': - err = z.readTo('"') - if err != nil { - return TextToken, err - } - case '\'': - err = z.readTo('\'') - if err != nil { - return TextToken, err + case '"', '\'': + z.readTo(c) + if z.err != nil { + return } case '>': - if z.buf[z.p1-2] == '/' && tt == StartTagToken { - return SelfClosingTagToken, nil + if z.buf[z.p1-2] == '/' && z.tt == StartTagToken { + z.tt = SelfClosingTagToken } - return tt, nil + return } } - panic("unreachable") } // nextText reads all text up until an '<'. -func (z *Tokenizer) nextText() TokenType { +// Pre-condition: z.tt == TextToken && z.err == nil && z.p0 + 1 <= z.p1. +func (z *Tokenizer) nextText() { for { - c, err := z.readByte() - if err != nil { - z.tt, z.err = ErrorToken, err - if err == os.EOF { - z.tt = TextToken - } - return z.tt + c := z.readByte() + if z.err != nil { + return } if c == '<' { z.p1-- - z.tt = TextToken - return z.tt + return } } - panic("unreachable") } // Next scans the next token and returns its type. 
@@ -292,19 +335,22 @@ func (z *Tokenizer) Next() TokenType { return z.tt } z.p0 = z.p1 - c, err := z.readByte() - if err != nil { - z.tt, z.err = ErrorToken, err + c := z.readByte() + if z.err != nil { + z.tt = ErrorToken return z.tt } - if c == '<' { - z.tt, z.err = z.nextTag() + // We assume that the next token is text unless proven otherwise. + z.tt = TextToken + if c != '<' { + z.nextText() + } else { + z.nextTag() if z.tt == CommentToken && !z.ReturnComments { continue } - return z.tt } - return z.nextText() + return z.tt } panic("unreachable") } @@ -331,20 +377,65 @@ func (z *Tokenizer) trim(i int) int { return k } -// lower finds the largest alphabetic [0-9A-Za-z]* word at the start of z.buf[i:] -// and returns that word lower-cased, as well as the trimmed cursor location -// after that word. -func (z *Tokenizer) lower(i int) ([]byte, int) { +// tagName finds the tag name at the start of z.buf[i:] and returns that name +// lower-cased, as well as the trimmed cursor location afterwards. +func (z *Tokenizer) tagName(i int) ([]byte, int) { + i0 := i +loop: + for ; i < z.p1; i++ { + c := z.buf[i] + switch c { + case ' ', '\n', '\t', '\f', '/', '>': + break loop + } + if 'A' <= c && c <= 'Z' { + z.buf[i] = c + 'a' - 'A' + } + } + return z.buf[i0:i], z.trim(i) +} + +// unquotedAttrVal finds the unquoted attribute value at the start of z.buf[i:] +// and returns that value, as well as the trimmed cursor location afterwards. +func (z *Tokenizer) unquotedAttrVal(i int) ([]byte, int) { + i0 := i +loop: + for ; i < z.p1; i++ { + switch z.buf[i] { + case ' ', '\n', '\t', '\f', '>': + break loop + case '&': + // TODO: unescape the entity. + } + } + return z.buf[i0:i], z.trim(i) +} + +// attrName finds the largest attribute name at the start +// of z.buf[i:] and returns it lower-cased, as well +// as the trimmed cursor location after that name. 
+// +// http://dev.w3.org/html5/spec/Overview.html#syntax-attribute-name +// TODO: unicode characters +func (z *Tokenizer) attrName(i int) ([]byte, int) { + for z.buf[i] == '/' { + i++ + if z.buf[i] == '>' { + return nil, z.trim(i) + } + } i0 := i loop: for ; i < z.p1; i++ { c := z.buf[i] + switch c { + case '>', '/', '=': + break loop + } switch { - case '0' <= c && c <= '9': - // No-op. case 'A' <= c && c <= 'Z': z.buf[i] = c + 'a' - 'A' - case 'a' <= c && c <= 'z': + case c > ' ' && c < 0x7f: // No-op. default: break loop @@ -353,25 +444,29 @@ loop: return z.buf[i0:i], z.trim(i) } -// Text returns the unescaped text of a TextToken or a CommentToken. -// The contents of the returned slice may change on the next call to Next. +// Text returns the unescaped text of a text, comment or doctype token. The +// contents of the returned slice may change on the next call to Next. func (z *Tokenizer) Text() []byte { + var i0, i1 int switch z.tt { case TextToken: - s := unescape(z.Raw()) - z.p0 = z.p1 - return s + i0 = z.p0 + i1 = z.p1 case CommentToken: - // We trim the "<!--" from the left and the "-->" from the right. + // Trim the "<!--" from the left and the "-->" from the right. // "<!-->" is a valid comment, so the adjusted endpoints might overlap. - i0 := z.p0 + 4 - i1 := z.p1 - 3 - z.p0 = z.p1 - var s []byte - if i0 < i1 { - s = unescape(z.buf[i0:i1]) - } - return s + i0 = z.p0 + 4 + i1 = z.p1 - 3 + case DoctypeToken: + // Trim the "<!DOCTYPE " from the left and the ">" from the right. 
+ i0 = z.p0 + 10 + i1 = z.p1 - 1 + default: + return nil + } + z.p0 = z.p1 + if i0 < i1 { + return unescape(z.buf[i0:i1]) } return nil } @@ -388,7 +483,7 @@ func (z *Tokenizer) TagName() (name []byte, hasAttr bool) { if z.buf[i] == '/' { i++ } - name, z.p0 = z.lower(i) + name, z.p0 = z.tagName(i) hasAttr = z.p0 != z.p1 return } @@ -397,27 +492,40 @@ func (z *Tokenizer) TagName() (name []byte, hasAttr bool) { // attribute for the current tag token and whether there are more attributes. // The contents of the returned slices may change on the next call to Next. func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) { - key, i := z.lower(z.p0) - // Get past the "=\"". - if i == z.p1 || z.buf[i] != '=' { + key, i := z.attrName(z.p0) + // Check for an empty attribute value. + if i == z.p1 { + z.p0 = i + return + } + // Get past the equals and quote characters. + if z.buf[i] != '=' { + z.p0, moreAttr = i, true return } i = z.trim(i + 1) - if i == z.p1 || z.buf[i] != '"' { + if i == z.p1 { + z.p0 = i + return + } + closeQuote := z.buf[i] + if closeQuote != '\'' && closeQuote != '"' { + val, z.p0 = z.unquotedAttrVal(i) + moreAttr = z.p0 != z.p1 return } i = z.trim(i + 1) - // Copy and unescape everything up to the closing '"'. + // Copy and unescape everything up to the closing quote. 
dst, src := i, i loop: for src < z.p1 { c := z.buf[src] switch c { - case '"': + case closeQuote: src++ break loop case '&': - dst, src = unescapeEntity(z.buf, dst, src) + dst, src = unescapeEntity(z.buf, dst, src, true) case '\\': if src == z.p1 { z.buf[dst] = '\\' @@ -441,7 +549,7 @@ loop: func (z *Tokenizer) Token() Token { t := Token{Type: z.tt} switch z.tt { - case TextToken, CommentToken: + case TextToken, CommentToken, DoctypeToken: t.Data = string(z.Text()) case StartTagToken, EndTagToken, SelfClosingTagToken: var attr []Attribute diff --git a/libgo/go/html/token_test.go b/libgo/go/html/token_test.go index 5cf1f6d..0a0beb2 100644 --- a/libgo/go/html/token_test.go +++ b/libgo/go/html/token_test.go @@ -41,6 +41,32 @@ var tokenTests = []tokenTest{ "<a>b<c/>d</e>", "<a>$b$<c/>$d$</e>", }, + // Some malformed tags that are missing a '>'. + { + "malformed tag #0", + `<p</p>`, + `<p< p="">`, + }, + { + "malformed tag #1", + `<p </p>`, + `<p <="" p="">`, + }, + { + "malformed tag #2", + `<p id=0</p>`, + `<p id="0</p">`, + }, + { + "malformed tag #3", + `<p id="0</p>`, + `<p id="0</p>">`, + }, + { + "malformed tag #4", + `<p id="0"</p>`, + `<p id="0" <="" p="">`, + }, // Comments. { "comment0", @@ -100,20 +126,77 @@ var tokenTests = []tokenTest{ "<p \t\n iD=\"a"B\" foo=\"bar\"><EM>te<&;xt</em></p>", `<p id="a"B" foo="bar">$<em>$te<&;xt$</em>$</p>`, }, - // A non-existant entity. Tokenizing and converting back to a string should + // A nonexistent entity. Tokenizing and converting back to a string should // escape the "&" to become "&". 
{ "noSuchEntity", `<a b="c&noSuchEntity;d"><&alsoDoesntExist;&`, `<a b="c&noSuchEntity;d">$<&alsoDoesntExist;&`, }, + { + "entity without semicolon", + `¬it;∉<a b="q=z&=5¬ice=hello¬=world">`, + `¬it;∉$<a b="q=z&amp=5&notice=hello¬=world">`, + }, + { + "entity with digits", + "½", + "½", + }, + // Attribute tests: + // http://dev.w3.org/html5/spec/Overview.html#attributes-0 + { + "Empty attribute", + `<input disabled FOO>`, + `<input disabled="" foo="">`, + }, + { + "Empty attribute, whitespace", + `<input disabled FOO >`, + `<input disabled="" foo="">`, + }, + { + "Unquoted attribute value", + `<input value=yes FOO=BAR>`, + `<input value="yes" foo="BAR">`, + }, + { + "Unquoted attribute value, spaces", + `<input value = yes FOO = BAR>`, + `<input value="yes" foo="BAR">`, + }, + { + "Unquoted attribute value, trailing space", + `<input value=yes FOO=BAR >`, + `<input value="yes" foo="BAR">`, + }, + { + "Single-quoted attribute value", + `<input value='yes' FOO='BAR'>`, + `<input value="yes" foo="BAR">`, + }, + { + "Single-quoted attribute value, trailing space", + `<input value='yes' FOO='BAR' >`, + `<input value="yes" foo="BAR">`, + }, + { + "Double-quoted attribute value", + `<input value="I'm an attribute" FOO="BAR">`, + `<input value="I'm an attribute" foo="BAR">`, + }, + { + "Attribute name characters", + `<meta http-equiv="content-type">`, + `<meta http-equiv="content-type">`, + }, } func TestTokenizer(t *testing.T) { loop: for _, tt := range tokenTests { z := NewTokenizer(bytes.NewBuffer([]byte(tt.html))) - for i, s := range strings.Split(tt.golden, "$", -1) { + for i, s := range strings.Split(tt.golden, "$") { if z.Next() == ErrorToken { t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error()) continue loop |