Diffstat (limited to 'libgo/go/html/token.go')
-rw-r--r-- | libgo/go/html/token.go | 336
1 files changed, 222 insertions, 114 deletions
diff --git a/libgo/go/html/token.go b/libgo/go/html/token.go
index ad03241..d266b3a 100644
--- a/libgo/go/html/token.go
+++ b/libgo/go/html/token.go
@@ -27,6 +27,8 @@ const (
 	SelfClosingTagToken
 	// A CommentToken looks like <!--x-->.
 	CommentToken
+	// A DoctypeToken looks like <!DOCTYPE x>.
+	DoctypeToken
 )
 
 // String returns a string representation of the TokenType.
@@ -44,6 +46,8 @@ func (t TokenType) String() string {
 		return "SelfClosingTag"
 	case CommentToken:
 		return "Comment"
+	case DoctypeToken:
+		return "Doctype"
 	}
 	return "Invalid(" + strconv.Itoa(int(t)) + ")"
 }
@@ -56,9 +60,9 @@ type Attribute struct {
 }
 
 // A Token consists of a TokenType and some Data (tag name for start and end
-// tags, content for text and comments). A tag Token may also contain a slice
-// of Attributes. Data is unescaped for all Tokens (it looks like "a<b" rather
-// than "a&lt;b").
+// tags, content for text, comments and doctypes). A tag Token may also contain
+// a slice of Attributes. Data is unescaped for all Tokens (it looks like "a<b"
+// rather than "a&lt;b").
 type Token struct {
 	Type TokenType
 	Data string
@@ -97,6 +101,8 @@ func (t Token) String() string {
 		return "<" + t.tagString() + "/>"
 	case CommentToken:
 		return "<!--" + EscapeString(t.Data) + "-->"
+	case DoctypeToken:
+		return "<!DOCTYPE " + EscapeString(t.Data) + ">"
 	}
 	return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
 }
@@ -109,9 +115,15 @@ type Tokenizer struct {
 	// r is the source of the HTML text.
 	r io.Reader
-	// tt is the TokenType of the most recently read token. If tt == Error
-	// then err is the error associated with trying to read that token.
-	tt TokenType
+	// tt is the TokenType of the most recently read token.
+	tt TokenType
+	// err is the first error encountered during tokenization. It is possible
+	// for tt != Error && err != nil to hold: this means that Next returned a
+	// valid token but the subsequent Next call will return an error token.
+	// For example, if the HTML text input was just "plain", then the first
+	// Next call would set z.err to os.EOF but return a TextToken, and all
+	// subsequent Next calls would return an ErrorToken.
+	// err is never reset. Once it becomes non-nil, it stays non-nil.
 	err os.Error
 	// buf[p0:p1] holds the raw data of the most recent token.
 	// buf[p1:] is buffered input that will yield future tokens.
@@ -137,7 +149,9 @@ func (z *Tokenizer) Raw() []byte {
 // readByte returns the next byte from the input stream, doing a buffered read
 // from z.r into z.buf if necessary. z.buf[z.p0:z.p1] remains a contiguous byte
 // slice that holds all the bytes read so far for the current token.
-func (z *Tokenizer) readByte() (byte, os.Error) {
+// It sets z.err if the underlying reader returns an error.
+// Pre-condition: z.err == nil.
+func (z *Tokenizer) readByte() byte {
 	if z.p1 >= len(z.buf) {
 		// Our buffer is exhausted and we have to read from z.r.
 		// We copy z.buf[z.p0:z.p1] to the beginning of z.buf. If the length
@@ -149,139 +163,168 @@ func (z *Tokenizer) readByte() (byte, os.Error) {
 		if 2*d > c {
 			buf1 = make([]byte, d, 2*c)
 		} else {
-			buf1 = z.buf[0:d]
+			buf1 = z.buf[:d]
 		}
 		copy(buf1, z.buf[z.p0:z.p1])
-		z.p0, z.p1, z.buf = 0, d, buf1[0:d]
+		z.p0, z.p1, z.buf = 0, d, buf1[:d]
 		// Now that we have copied the live bytes to the start of the buffer,
 		// we read from z.r into the remainder.
 		n, err := z.r.Read(buf1[d:cap(buf1)])
 		if err != nil {
-			return 0, err
+			z.err = err
+			return 0
 		}
-		z.buf = buf1[0 : d+n]
+		z.buf = buf1[:d+n]
 	}
 	x := z.buf[z.p1]
 	z.p1++
-	return x, nil
+	return x
 }
 
-// readTo keeps reading bytes until x is found.
-func (z *Tokenizer) readTo(x uint8) os.Error {
+// readTo keeps reading bytes until x is found or a read error occurs. If an
+// error does occur, z.err is set to that error.
+// Pre-condition: z.err == nil.
+func (z *Tokenizer) readTo(x uint8) {
 	for {
-		c, err := z.readByte()
-		if err != nil {
-			return err
+		c := z.readByte()
+		if z.err != nil {
+			return
 		}
 		switch c {
 		case x:
-			return nil
+			return
 		case '\\':
-			_, err = z.readByte()
-			if err != nil {
-				return err
+			z.readByte()
+			if z.err != nil {
+				return
 			}
 		}
 	}
-	panic("unreachable")
 }
 
-// nextMarkupDeclaration returns the next TokenType starting with "<!".
-func (z *Tokenizer) nextMarkupDeclaration() (TokenType, os.Error) {
-	// TODO: check for <!DOCTYPE ... >, don't just assume that it's a comment.
-	for i := 0; i < 2; i++ {
-		c, err := z.readByte()
-		if err != nil {
-			return TextToken, err
-		}
-		if c != '-' {
-			return z.nextText(), nil
-		}
-	}
+// nextComment reads the next token starting with "<!--".
+// The opening "<!--" has already been consumed.
+// Pre-condition: z.tt == TextToken && z.err == nil && z.p0 + 4 <= z.p1.
+func (z *Tokenizer) nextComment() {
 	// <!--> is a valid comment.
 	for dashCount := 2; ; {
-		c, err := z.readByte()
-		if err != nil {
-			return TextToken, err
+		c := z.readByte()
+		if z.err != nil {
+			return
 		}
 		switch c {
 		case '-':
 			dashCount++
 		case '>':
 			if dashCount >= 2 {
-				return CommentToken, nil
+				z.tt = CommentToken
+				return
 			}
-			fallthrough
+			dashCount = 0
 		default:
 			dashCount = 0
 		}
 	}
-	panic("unreachable")
 }
 
-// nextTag returns the next TokenType starting from the tag open state.
-func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) {
-	c, err := z.readByte()
-	if err != nil {
-		return ErrorToken, err
+// nextMarkupDeclaration reads the next token starting with "<!".
+// It might be a "<!--comment-->", a "<!DOCTYPE foo>", or "<!malformed text".
+// The opening "<!" has already been consumed.
+// Pre-condition: z.tt == TextToken && z.err == nil && z.p0 + 2 <= z.p1.
+func (z *Tokenizer) nextMarkupDeclaration() {
+	var c [2]byte
+	for i := 0; i < 2; i++ {
+		c[i] = z.readByte()
+		if z.err != nil {
+			return
+		}
+	}
+	if c[0] == '-' && c[1] == '-' {
+		z.nextComment()
+		return
+	}
+	z.p1 -= 2
+	const s = "DOCTYPE "
+	for i := 0; ; i++ {
+		c := z.readByte()
+		if z.err != nil {
+			return
+		}
+		// Capitalize c.
+		if 'a' <= c && c <= 'z' {
+			c = 'A' + (c - 'a')
+		}
+		if i < len(s) && c != s[i] {
+			z.nextText()
+			return
+		}
+		if c == '>' {
+			if i >= len(s) {
+				z.tt = DoctypeToken
+			}
+			return
+		}
+	}
+}
+
+// nextTag reads the next token starting with "<". It might be a "<startTag>",
+// an "</endTag>", a "<!markup declaration>", or "<malformed text".
+// The opening "<" has already been consumed.
+// Pre-condition: z.tt == TextToken && z.err == nil && z.p0 + 1 <= z.p1.
+func (z *Tokenizer) nextTag() {
+	c := z.readByte()
+	if z.err != nil {
+		return
 	}
 	switch {
 	case c == '/':
-		tt = EndTagToken
+		z.tt = EndTagToken
 	// Lower-cased characters are more common in tag names, so we check for them first.
 	case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
-		tt = StartTagToken
+		z.tt = StartTagToken
 	case c == '!':
-		return z.nextMarkupDeclaration()
+		z.nextMarkupDeclaration()
+		return
 	case c == '?':
-		return ErrorToken, os.NewError("html: TODO(nigeltao): implement XML processing instructions")
+		z.tt, z.err = ErrorToken, os.NewError("html: TODO: implement XML processing instructions")
+		return
 	default:
-		return ErrorToken, os.NewError("html: TODO(nigeltao): handle malformed tags")
+		z.tt, z.err = ErrorToken, os.NewError("html: TODO: handle malformed tags")
+		return
 	}
 	for {
-		c, err := z.readByte()
-		if err != nil {
-			return TextToken, err
+		c := z.readByte()
+		if z.err != nil {
+			return
 		}
 		switch c {
-		case '"':
-			err = z.readTo('"')
-			if err != nil {
-				return TextToken, err
-			}
-		case '\'':
-			err = z.readTo('\'')
-			if err != nil {
-				return TextToken, err
+		case '"', '\'':
+			z.readTo(c)
+			if z.err != nil {
+				return
 			}
 		case '>':
-			if z.buf[z.p1-2] == '/' && tt == StartTagToken {
-				return SelfClosingTagToken, nil
+			if z.buf[z.p1-2] == '/' && z.tt == StartTagToken {
+				z.tt = SelfClosingTagToken
 			}
-			return tt, nil
+			return
 		}
 	}
-	panic("unreachable")
 }
 
 // nextText reads all text up until an '<'.
-func (z *Tokenizer) nextText() TokenType {
+// Pre-condition: z.tt == TextToken && z.err == nil && z.p0 + 1 <= z.p1.
+func (z *Tokenizer) nextText() {
 	for {
-		c, err := z.readByte()
-		if err != nil {
-			z.tt, z.err = ErrorToken, err
-			if err == os.EOF {
-				z.tt = TextToken
-			}
-			return z.tt
+		c := z.readByte()
+		if z.err != nil {
+			return
 		}
 		if c == '<' {
 			z.p1--
-			z.tt = TextToken
-			return z.tt
+			return
 		}
 	}
-	panic("unreachable")
 }
 
 // Next scans the next token and returns its type.
@@ -292,19 +335,22 @@ func (z *Tokenizer) Next() TokenType {
 			return z.tt
 		}
 		z.p0 = z.p1
-		c, err := z.readByte()
-		if err != nil {
-			z.tt, z.err = ErrorToken, err
+		c := z.readByte()
+		if z.err != nil {
+			z.tt = ErrorToken
 			return z.tt
 		}
-		if c == '<' {
-			z.tt, z.err = z.nextTag()
+		// We assume that the next token is text unless proven otherwise.
+		z.tt = TextToken
+		if c != '<' {
+			z.nextText()
+		} else {
+			z.nextTag()
 			if z.tt == CommentToken && !z.ReturnComments {
 				continue
 			}
-			return z.tt
 		}
-		return z.nextText()
+		return z.tt
 	}
 	panic("unreachable")
 }
@@ -331,20 +377,65 @@ func (z *Tokenizer) trim(i int) int {
 	return k
 }
 
-// lower finds the largest alphabetic [0-9A-Za-z]* word at the start of z.buf[i:]
-// and returns that word lower-cased, as well as the trimmed cursor location
-// after that word.
-func (z *Tokenizer) lower(i int) ([]byte, int) {
+// tagName finds the tag name at the start of z.buf[i:] and returns that name
+// lower-cased, as well as the trimmed cursor location afterwards.
+func (z *Tokenizer) tagName(i int) ([]byte, int) {
+	i0 := i
+loop:
+	for ; i < z.p1; i++ {
+		c := z.buf[i]
+		switch c {
+		case ' ', '\n', '\t', '\f', '/', '>':
+			break loop
+		}
+		if 'A' <= c && c <= 'Z' {
+			z.buf[i] = c + 'a' - 'A'
+		}
+	}
+	return z.buf[i0:i], z.trim(i)
+}
+
+// unquotedAttrVal finds the unquoted attribute value at the start of z.buf[i:]
+// and returns that value, as well as the trimmed cursor location afterwards.
+func (z *Tokenizer) unquotedAttrVal(i int) ([]byte, int) {
+	i0 := i
+loop:
+	for ; i < z.p1; i++ {
+		switch z.buf[i] {
+		case ' ', '\n', '\t', '\f', '>':
+			break loop
+		case '&':
+			// TODO: unescape the entity.
+		}
+	}
+	return z.buf[i0:i], z.trim(i)
+}
+
+// attrName finds the largest attribute name at the start
+// of z.buf[i:] and returns it lower-cased, as well
+// as the trimmed cursor location after that name.
+//
+// http://dev.w3.org/html5/spec/Overview.html#syntax-attribute-name
+// TODO: unicode characters
+func (z *Tokenizer) attrName(i int) ([]byte, int) {
+	for z.buf[i] == '/' {
+		i++
+		if z.buf[i] == '>' {
+			return nil, z.trim(i)
+		}
+	}
 	i0 := i
 loop:
 	for ; i < z.p1; i++ {
 		c := z.buf[i]
+		switch c {
+		case '>', '/', '=':
+			break loop
+		}
 		switch {
-		case '0' <= c && c <= '9':
-			// No-op.
 		case 'A' <= c && c <= 'Z':
 			z.buf[i] = c + 'a' - 'A'
-		case 'a' <= c && c <= 'z':
+		case c > ' ' && c < 0x7f:
 			// No-op.
 		default:
 			break loop
@@ -353,25 +444,29 @@ loop:
 	return z.buf[i0:i], z.trim(i)
 }
 
-// Text returns the unescaped text of a TextToken or a CommentToken.
-// The contents of the returned slice may change on the next call to Next.
+// Text returns the unescaped text of a text, comment or doctype token. The
+// contents of the returned slice may change on the next call to Next.
 func (z *Tokenizer) Text() []byte {
+	var i0, i1 int
 	switch z.tt {
 	case TextToken:
-		s := unescape(z.Raw())
-		z.p0 = z.p1
-		return s
+		i0 = z.p0
+		i1 = z.p1
 	case CommentToken:
-		// We trim the "<!--" from the left and the "-->" from the right.
+		// Trim the "<!--" from the left and the "-->" from the right.
 		// "<!-->" is a valid comment, so the adjusted endpoints might overlap.
-		i0 := z.p0 + 4
-		i1 := z.p1 - 3
-		z.p0 = z.p1
-		var s []byte
-		if i0 < i1 {
-			s = unescape(z.buf[i0:i1])
-		}
-		return s
+		i0 = z.p0 + 4
+		i1 = z.p1 - 3
+	case DoctypeToken:
+		// Trim the "<!DOCTYPE " from the left and the ">" from the right.
+		i0 = z.p0 + 10
+		i1 = z.p1 - 1
+	default:
+		return nil
+	}
+	z.p0 = z.p1
+	if i0 < i1 {
+		return unescape(z.buf[i0:i1])
 	}
 	return nil
 }
@@ -388,7 +483,7 @@ func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
 	if z.buf[i] == '/' {
 		i++
 	}
-	name, z.p0 = z.lower(i)
+	name, z.p0 = z.tagName(i)
 	hasAttr = z.p0 != z.p1
 	return
 }
@@ -397,27 +492,40 @@
 // attribute for the current tag token and whether there are more attributes.
 // The contents of the returned slices may change on the next call to Next.
 func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
-	key, i := z.lower(z.p0)
-	// Get past the "=\"".
-	if i == z.p1 || z.buf[i] != '=' {
+	key, i := z.attrName(z.p0)
+	// Check for an empty attribute value.
+	if i == z.p1 {
+		z.p0 = i
+		return
+	}
+	// Get past the equals and quote characters.
+	if z.buf[i] != '=' {
+		z.p0, moreAttr = i, true
 		return
 	}
 	i = z.trim(i + 1)
-	if i == z.p1 || z.buf[i] != '"' {
+	if i == z.p1 {
+		z.p0 = i
+		return
+	}
+	closeQuote := z.buf[i]
+	if closeQuote != '\'' && closeQuote != '"' {
+		val, z.p0 = z.unquotedAttrVal(i)
+		moreAttr = z.p0 != z.p1
 		return
 	}
 	i = z.trim(i + 1)
-	// Copy and unescape everything up to the closing '"'.
+	// Copy and unescape everything up to the closing quote.
 	dst, src := i, i
loop:
 	for src < z.p1 {
 		c := z.buf[src]
 		switch c {
-		case '"':
+		case closeQuote:
 			src++
 			break loop
 		case '&':
-			dst, src = unescapeEntity(z.buf, dst, src)
+			dst, src = unescapeEntity(z.buf, dst, src, true)
 		case '\\':
 			if src == z.p1 {
 				z.buf[dst] = '\\'
@@ -441,7 +549,7 @@ loop:
 func (z *Tokenizer) Token() Token {
 	t := Token{Type: z.tt}
 	switch z.tt {
-	case TextToken, CommentToken:
+	case TextToken, CommentToken, DoctypeToken:
 		t.Data = string(z.Text())
 	case StartTagToken, EndTagToken, SelfClosingTagToken:
 		var attr []Attribute
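
The two examples below are editorial sketches, not part of the commit. First, a minimal driver for the reworked API, written in the same pre-Go 1 dialect as the diff (the os.Error era; it will not build against the modern golang.org/x/net/html package). It assumes the package's NewTokenizer constructor, which is not shown in this diff, and illustrates both the new DoctypeToken and the tt/err contract documented on the Tokenizer struct: for input ending in plain text, Next returns a final TextToken even though z.err is already os.EOF, and only the following call reports ErrorToken.

package main

import (
	"fmt"
	"html" // the pre-Go 1 package this diff patches
	"strings"
)

func main() {
	// NewTokenizer is assumed from the rest of the package; it does not
	// appear in this diff.
	z := html.NewTokenizer(strings.NewReader("<!DOCTYPE html><p>plain"))
	for {
		tt := z.Next()
		if tt == html.ErrorToken {
			// The TextToken for "plain" was returned by the previous
			// call, even though z.err was already set to os.EOF.
			break
		}
		// Token() unescapes Data. Expected output:
		//   Doctype <!DOCTYPE html>
		//   StartTag <p>
		//   Text plain
		fmt.Println(tt, z.Token())
	}
}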
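
Second, a sketch of the attribute accessors touched above. The input is hypothetical, chosen to exercise the new attrName path (lower-casing HREF), unquotedAttrVal (the bare /index.html value) and the closeQuote logic (a single-quoted value); note that the tail of TagAttr, which computes the moreAttr result the loop relies on, lies outside this diff's context lines.

package main

import (
	"fmt"
	"html"
	"strings"
)

func main() {
	z := html.NewTokenizer(strings.NewReader(`<a HREF=/index.html title='x&amp;y'>`))
	if z.Next() == html.StartTagToken {
		name, hasAttr := z.TagName() // tagName lower-cases the name to "a"
		fmt.Printf("<%s>\n", name)
		for hasAttr {
			var key, val []byte
			key, val, hasAttr = z.TagAttr()
			// Expected: href="/index.html" (unquoted), then title="x&y"
			// (single-quoted, with &amp; decoded via unescapeEntity).
			fmt.Printf("  %s=%q\n", key, string(val))
		}
	}
}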