diff options
Diffstat (limited to 'libjava/gnu/xml/aelfred2')
-rw-r--r-- | libjava/gnu/xml/aelfred2/ContentHandler2.java | 65 | ||||
-rw-r--r-- | libjava/gnu/xml/aelfred2/JAXPFactory.java | 196 | ||||
-rw-r--r-- | libjava/gnu/xml/aelfred2/SAXDriver.java | 1385 | ||||
-rw-r--r-- | libjava/gnu/xml/aelfred2/XmlParser.java | 5113 | ||||
-rw-r--r-- | libjava/gnu/xml/aelfred2/XmlReader.java | 315 | ||||
-rw-r--r-- | libjava/gnu/xml/aelfred2/package.html | 506 |
6 files changed, 7580 insertions, 0 deletions
diff --git a/libjava/gnu/xml/aelfred2/ContentHandler2.java b/libjava/gnu/xml/aelfred2/ContentHandler2.java new file mode 100644 index 0000000..7bb1e7d --- /dev/null +++ b/libjava/gnu/xml/aelfred2/ContentHandler2.java @@ -0,0 +1,65 @@ +/* ContentHandler2.java -- + Copyright (C) 2004 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA +02111-1307 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. + +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. 
*/ + +package gnu.xml.aelfred2; + +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * Extension to the SAX ContentHandler interface to report parsing events + * and parameters required by DOM Level 3 but not supported by SAX. + * + * @author <a href='mailto:dog@gnu.org'>Chris Burdess</a> + */ +public interface ContentHandler2 + extends ContentHandler +{ + + /** + * Reports the XML declaration. + * @param version the value of the version attribute in the XML + * declaration + * @param encoding the encoding specified in the XML declaration, if any + * @param standalone the standalone attribute from the XML declaration + * @param inputEncoding the encoding of the XML input + */ + void xmlDecl(String version, String encoding, boolean standalone, + String inputEncoding) + throws SAXException; + +} diff --git a/libjava/gnu/xml/aelfred2/JAXPFactory.java b/libjava/gnu/xml/aelfred2/JAXPFactory.java new file mode 100644 index 0000000..006dc13 --- /dev/null +++ b/libjava/gnu/xml/aelfred2/JAXPFactory.java @@ -0,0 +1,196 @@ +/* JAXPFactory.java -- + Copyright (C) 2001 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA +02111-1307 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. 
Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. + +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. */ + +package gnu.xml.aelfred2; + +import java.util.Enumeration; +import java.util.Hashtable; + +import org.xml.sax.Parser; +import org.xml.sax.XMLReader; +import org.xml.sax.SAXException; +import org.xml.sax.SAXNotRecognizedException; +import org.xml.sax.SAXNotSupportedException; +import org.xml.sax.helpers.XMLReaderAdapter; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + + +/** + * Configurable factory to create an Ælfred2 JAXP parser; required + * to bootstrap using JAXP. You should use SAX2 directly where possible, + * rather than through JAXP, since that gives you better control. + * This class would normally be configured as a platform default factory. + * + * @author David Brownell + */ +public final class JAXPFactory extends SAXParserFactory +{ + private Hashtable flags = new Hashtable (); + + /** + * Constructs a factory which normally returns a non-validating + * parser. 
+ */ + public JAXPFactory () { } + + public SAXParser newSAXParser () + throws ParserConfigurationException, SAXException + { + JaxpParser jaxp = new JaxpParser (); + Enumeration e = flags.keys (); + XMLReader parser = jaxp.getXMLReader (); + + parser.setFeature ( + SAXDriver.FEATURE + "namespaces", + isNamespaceAware ()); + parser.setFeature ( + SAXDriver.FEATURE + "validation", + isValidating ()); + // that makes SAX2 feature flags trump JAXP + + while (e.hasMoreElements ()) { + String uri = (String) e.nextElement (); + Boolean value = (Boolean) flags.get (uri); + parser.setFeature (uri, value.booleanValue ()); + } + + return jaxp; + } + + // yes, this "feature transfer" mechanism doesn't play well + + public void setFeature (String name, boolean value) + throws + ParserConfigurationException, + SAXNotRecognizedException, + SAXNotSupportedException + { + try { + // force "early" detection of errors where possible + // (flags can't necessarily be set before parsing) + new JaxpParser ().getXMLReader ().setFeature (name, value); + + flags.put (name, new Boolean (value)); + } catch (SAXNotRecognizedException e) { + throw new SAXNotRecognizedException (name); + } catch (SAXNotSupportedException e) { + throw new SAXNotSupportedException (name); + } catch (Exception e) { + throw new ParserConfigurationException ( + e.getClass ().getName () + + ": " + + e.getMessage ()); + } + } + + public boolean getFeature (String name) + throws + ParserConfigurationException, + SAXNotRecognizedException, + SAXNotSupportedException + { + Boolean value = (Boolean) flags.get (name); + + if (value != null) + return value.booleanValue (); + else + try { + return new JaxpParser ().getXMLReader ().getFeature (name); + } catch (SAXNotRecognizedException e) { + throw new SAXNotRecognizedException (name); + } catch (SAXNotSupportedException e) { + throw new SAXNotSupportedException (name); + } catch (SAXException e) { + throw new ParserConfigurationException ( + e.getClass ().getName () + + ": 
" + + e.getMessage ()); + } + } + + private static class JaxpParser extends SAXParser + { + private XmlReader ae2 = new XmlReader (); + private XMLReaderAdapter parser = null; + + JaxpParser () { } + + public void setProperty (String id, Object value) + throws SAXNotRecognizedException, SAXNotSupportedException + { ae2.setProperty (id, value); } + + public Object getProperty (String id) + throws SAXNotRecognizedException, SAXNotSupportedException + { return ae2.getProperty (id); } + + public Parser getParser () + throws SAXException + { + if (parser == null) + parser = new XMLReaderAdapter (ae2); + return parser; + } + + public XMLReader getXMLReader () + throws SAXException + { return ae2; } + + public boolean isNamespaceAware () + { + try { + return ae2.getFeature (SAXDriver.FEATURE + "namespaces"); + } catch (Exception e) { + throw new Error (); + } + } + + public boolean isValidating () + { + try { + return ae2.getFeature (SAXDriver.FEATURE + "validation"); + } catch (Exception e) { + throw new Error (); + } + } + + // TODO isXIncludeAware() + + } +} diff --git a/libjava/gnu/xml/aelfred2/SAXDriver.java b/libjava/gnu/xml/aelfred2/SAXDriver.java new file mode 100644 index 0000000..4912f27 --- /dev/null +++ b/libjava/gnu/xml/aelfred2/SAXDriver.java @@ -0,0 +1,1385 @@ +/* SAXDriver.java -- + Copyright (C) 1999,2000,2001,2004 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. 
+ +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA +02111-1307 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. + +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. + +Portions derived from code which carried the following notice: + + Copyright (c) 1997, 1998 by Microstar Software Ltd. + + AElfred is free for both commercial and non-commercial use and + redistribution, provided that Microstar's copyright and disclaimer are + retained intact. You are free to modify AElfred for your own use and + to redistribute AElfred with your modifications, provided that the + modifications are clearly documented. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + merchantability or fitness for a particular purpose. Please use it AT + YOUR OWN RISK. 
+*/ + +package gnu.xml.aelfred2; + +import java.io.*; + +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Locale; +import java.util.Stack; + +// maintaining 1.1 compatibility for now ... more portable, PJava, etc +// Iterator, Hashmap and ArrayList ought to be faster +import java.util.ArrayList; +import java.util.Collections; +import java.util.Enumeration; +import java.util.Hashtable; +import java.util.Iterator; +import java.util.List; +import java.util.Vector; + +import org.xml.sax.*; +import org.xml.sax.ext.*; +import org.xml.sax.helpers.NamespaceSupport; + + +/** + * An enhanced SAX2 version of Microstar's Ælfred XML parser. + * The enhancements primarily relate to significant improvements in + * conformance to the XML specification, and SAX2 support. Performance + * has been improved. See the package level documentation for more + * information. + * + * <table border="1" width='100%' cellpadding='3' cellspacing='0'> + * <tr bgcolor='#ccccff'> + * <th><font size='+1'>Name</font></th> + * <th><font size='+1'>Notes</font></th></tr> + * + * <tr><td colspan=2><center><em>Features ... 
URL prefix is + * <b>http://xml.org/sax/features/</b></em></center></td></tr> + * + * <tr><td>(URL)/external-general-entities</td> + * <td>Value defaults to <em>true</em></td></tr> + * <tr><td>(URL)/external-parameter-entities</td> + * <td>Value defaults to <em>true</em></td></tr> + * <tr><td>(URL)/is-standalone</td> + * <td>(PRELIMINARY) Returns true iff the document's parsing + * has started (some non-error event after <em>startDocument()</em> + * was reported) and the document's standalone flag is set.</td></tr> + * <tr><td>(URL)/namespace-prefixes</td> + * <td>Value defaults to <em>false</em> (but XML 1.0 names are + * always reported)</td></tr> + * <tr><td>(URL)/lexical-handler/parameter-entities</td> + * <td>Value is fixed at <em>true</em></td></tr> + * <tr><td>(URL)/namespaces</td> + * <td>Value defaults to <em>true</em></td></tr> + * <tr><td>(URL)/resolve-dtd-uris</td> + * <td>(PRELIMINARY) Value defaults to <em>true</em></td></tr> + * <tr><td>(URL)/string-interning</td> + * <td>Value is fixed at <em>true</em></td></tr> + * <tr><td>(URL)/use-attributes2</td> + * <td>(PRELIMINARY) Value is fixed at <em>true</em></td></tr> + * <tr><td>(URL)/use-entity-resolver2</td> + * <td>(PRELIMINARY) Value defaults to <em>true</em></td></tr> + * <tr><td>(URL)/validation</td> + * <td>Value is fixed at <em>false</em></td></tr> + * + * <tr><td colspan=2><center><em>Handler Properties ... URL prefix is + * <b>http://xml.org/sax/properties/</b></em></center></td></tr> + * + * <tr><td>(URL)/declaration-handler</td> + * <td>A declaration handler may be provided. </td></tr> + * <tr><td>(URL)/lexical-handler</td> + * <td>A lexical handler may be provided. </td></tr> + * </table> + * + * <p>This parser currently implements the SAX1 Parser API, but + * it may not continue to do so in the future. 
+ * + * @author Written by David Megginson (version 1.2a from Microstar) + * @author Updated by David Brownell <dbrownell@users.sourceforge.net> + * @see org.xml.sax.Parser + */ +final public class SAXDriver + implements Locator, Attributes2, XMLReader, Parser, AttributeList +{ + private final DefaultHandler2 base = new DefaultHandler2 (); + private XmlParser parser; + + private EntityResolver entityResolver = base; + private EntityResolver2 resolver2 = null; + private ContentHandler contentHandler = base; + private DTDHandler dtdHandler = base; + private ErrorHandler errorHandler = base; + private DeclHandler declHandler = base; + private LexicalHandler lexicalHandler = base; + + private String elementName; + private Stack entityStack; + + // one vector (of object/struct): faster, smaller + private List attributesList = Collections.synchronizedList(new ArrayList()); + + private boolean attributeSpecified [] = new boolean[10]; + private boolean attributeDeclared [] = new boolean[10]; + + private boolean namespaces = true; + private boolean xmlNames = false; + private boolean extGE = true; + private boolean extPE = true; + private boolean resolveAll = true; + private boolean useResolver2 = true; + private boolean stringInterning = true; + + private int attributeCount; + private boolean attributes; + private String nsTemp []; + private NamespaceSupport prefixStack; + + // + // Constructor. + // + + /** Constructs a SAX Parser. */ + public SAXDriver () + { + reset (); + } + + private void reset () + { + elementName = null; + entityStack = new Stack (); + attributesList = Collections.synchronizedList(new ArrayList()); + attributeSpecified = new boolean[10]; + attributeDeclared = new boolean[10]; + attributeCount = 0; + attributes = false; + nsTemp = new String[3]; + prefixStack = null; + } + + + // + // Implementation of org.xml.sax.Parser. 
+ // + + /** + * <b>SAX1</b>: Sets the locale used for diagnostics; currently, + * only locales using the English language are supported. + * @param locale The locale for which diagnostics will be generated + */ + public void setLocale (Locale locale) + throws SAXException + { + if ("en".equals (locale.getLanguage ())) + return ; + + throw new SAXException ("AElfred2 only supports English locales."); + } + + + /** + * <b>SAX2</b>: Returns the object used when resolving external + * entities during parsing (both general and parameter entities). + */ + public EntityResolver getEntityResolver () + { + return (entityResolver == base) ? null : entityResolver; + } + + /** + * <b>SAX1, SAX2</b>: Set the entity resolver for this parser. + * @param resolver The object to receive entity events. + */ + public void setEntityResolver (EntityResolver resolver) + { + if (resolver instanceof EntityResolver2) + resolver2 = (EntityResolver2) resolver; + else + resolver2 = null; + if (resolver == null) + resolver = base; + entityResolver = resolver; + } + + + /** + * <b>SAX2</b>: Returns the object used to process declarations related + * to notations and unparsed entities. + */ + public DTDHandler getDTDHandler () + { + return (dtdHandler == base) ? null : dtdHandler; + } + + /** + * <b>SAX1, SAX2</b>: Set the DTD handler for this parser. + * @param handler The object to receive DTD events. + */ + public void setDTDHandler (DTDHandler handler) + { + if (handler == null) + handler = base; + this.dtdHandler = handler; + } + + + /** + * <b>SAX1</b>: Set the document handler for this parser. If a + * content handler was set, this document handler will supplant it. + * The parser is set to report all XML 1.0 names rather than to + * filter out "xmlns" attributes (the "namespace-prefixes" feature + * is set to true). + * + * @deprecated SAX2 programs should use the XMLReader interface + * and a ContentHandler. + * + * @param handler The object to receive document events. 
+ */ + public void setDocumentHandler (DocumentHandler handler) + { + contentHandler = new Adapter (handler); + xmlNames = true; + } + + /** + * <b>SAX2</b>: Returns the object used to report the logical + * content of an XML document. + */ + public ContentHandler getContentHandler () + { + return contentHandler == base ? null : contentHandler; + } + + /** + * <b>SAX2</b>: Assigns the object used to report the logical + * content of an XML document. If a document handler was set, + * this content handler will supplant it (but XML 1.0 style name + * reporting may remain enabled). + */ + public void setContentHandler (ContentHandler handler) + { + if (handler == null) + handler = base; + contentHandler = handler; + } + + /** + * <b>SAX1, SAX2</b>: Set the error handler for this parser. + * @param handler The object to receive error events. + */ + public void setErrorHandler (ErrorHandler handler) + { + if (handler == null) + handler = base; + this.errorHandler = handler; + } + + /** + * <b>SAX2</b>: Returns the object used to receive callbacks for XML + * errors of all levels (fatal, nonfatal, warning), or null if none was set. + */ + public ErrorHandler getErrorHandler () + { return errorHandler == base ? null : errorHandler; } + + + /** + * <b>SAX1, SAX2</b>: Auxiliary API to parse an XML document, used mostly + * when no URI is available. + * If you want anything useful to happen, you should set + * at least one type of handler. + * @param source The XML input source. Don't set 'encoding' unless + * you know for a fact that it's correct. + * @see #setEntityResolver + * @see #setDTDHandler + * @see #setContentHandler + * @see #setErrorHandler + * @exception SAXException The handlers may throw any SAXException, + * and the parser normally throws SAXParseException objects. + * @exception IOException IOExceptions are normally thrown through + * the parser if there are problems reading the source document. 
+ */ + public void parse (InputSource source) + throws SAXException, IOException + { + synchronized (base) { + parser = new XmlParser (); + if (namespaces) + prefixStack = new NamespaceSupport (); + else if (!xmlNames) + throw new IllegalStateException (); + parser.setHandler (this); + + try { + + Reader r = source.getCharacterStream(); + InputStream in = source.getByteStream(); + + + parser.doParse (source.getSystemId (), + source.getPublicId (), + r, + in, + source.getEncoding ()); + } catch (SAXException e) { + throw e; + } catch (IOException e) { + throw e; + } catch (RuntimeException e) { + throw e; + } catch (Exception e) { + throw new SAXParseException (e.getMessage (), this, e); + } finally { + contentHandler.endDocument (); + reset(); + } + } + } + + + /** + * <b>SAX1, SAX2</b>: Preferred API to parse an XML document, using a + * system identifier (URI). + */ + public void parse (String systemId) + throws SAXException, IOException + { + parse (new InputSource (systemId)); + } + + // + // Implementation of SAX2 "XMLReader" interface + // + static final String FEATURE = "http://xml.org/sax/features/"; + static final String PROPERTY = "http://xml.org/sax/properties/"; + + /** + * <b>SAX2</b>: Tells the value of the specified feature flag. + * + * @exception SAXNotRecognizedException thrown if the feature flag + * is neither built in, nor yet assigned. + */ + public boolean getFeature (String featureId) + throws SAXNotRecognizedException, SAXNotSupportedException + { + if ((FEATURE + "validation").equals (featureId)) + return false; + + // external entities (both types) are optionally included + if ((FEATURE + "external-general-entities").equals (featureId)) + return extGE; + if ((FEATURE + "external-parameter-entities") .equals (featureId)) + return extPE; + + // element/attribute names are as written in document; no mangling + if ((FEATURE + "namespace-prefixes").equals (featureId)) + return xmlNames; + + // report element/attribute namespaces? 
+ if ((FEATURE + "namespaces").equals (featureId)) + return namespaces; + + // all PEs and GEs are reported + if ((FEATURE + "lexical-handler/parameter-entities").equals (featureId)) + return true; + + // default is true + if ((FEATURE + "string-interning").equals (featureId)) + return stringInterning; + + // EXTENSIONS 1.1 + + // always returns isSpecified info + if ((FEATURE + "use-attributes2").equals (featureId)) + return true; + + // meaningful between startDocument/endDocument + if ((FEATURE + "is-standalone").equals (featureId)) { + if (parser == null) + throw new SAXNotSupportedException (featureId); + return parser.isStandalone (); + } + + // optionally don't absolutize URIs in declarations + if ((FEATURE + "resolve-dtd-uris").equals (featureId)) + return resolveAll; + + // optionally use resolver2 interface methods, if possible + if ((FEATURE + "use-entity-resolver2").equals (featureId)) + return useResolver2; + + throw new SAXNotRecognizedException (featureId); + } + + // package private + DeclHandler getDeclHandler () { return declHandler; } + + // package private + boolean resolveURIs () { return resolveAll; } + + /** + * <b>SAX2</b>: Returns the specified property. + * + * @exception SAXNotRecognizedException thrown if the property value + * is neither built in, nor yet stored. + */ + public Object getProperty (String propertyId) + throws SAXNotRecognizedException + { + if ((PROPERTY + "declaration-handler").equals (propertyId)) + return declHandler == base ? null : declHandler; + + if ((PROPERTY + "lexical-handler").equals (propertyId)) + return lexicalHandler == base ? null : lexicalHandler; + + // unknown properties + throw new SAXNotRecognizedException (propertyId); + } + + /** + * <b>SAX2</b>: Sets the state of feature flags in this parser. Some + * built-in feature flags are mutable. 
+ */ + public void setFeature (String featureId, boolean value) + throws SAXNotRecognizedException, SAXNotSupportedException + { + boolean state; + + // Features with a defined value, we just change it if we can. + state = getFeature (featureId); + + if (state == value) + return; + if (parser != null) + throw new SAXNotSupportedException ("not while parsing"); + + if ((FEATURE + "namespace-prefixes").equals (featureId)) { + // in this implementation, this only affects xmlns reporting + xmlNames = value; + // forcibly prevent illegal parser state + if (!xmlNames) + namespaces = true; + return; + } + + if ((FEATURE + "namespaces").equals (featureId)) { + namespaces = value; + // forcibly prevent illegal parser state + if (!namespaces) + xmlNames = true; + return; + } + + if ((FEATURE + "external-general-entities").equals (featureId)) { + extGE = value; + return; + } + if ((FEATURE + "external-parameter-entities") .equals (featureId)) { + extPE = value; + return; + } + if ((FEATURE + "resolve-dtd-uris").equals (featureId)) { + resolveAll = value; + return; + } + + if ((FEATURE + "use-entity-resolver2").equals (featureId)) { + useResolver2 = value; + return; + } + + throw new SAXNotRecognizedException (featureId); + } + + /** + * <b>SAX2</b>: Assigns the specified property. Like SAX1 handlers, + * these may be changed at any time. + */ + public void setProperty (String propertyId, Object value) + throws SAXNotRecognizedException, SAXNotSupportedException + { + // see if the property is recognized + getProperty (propertyId); + + // Properties with a defined value, we just change it if we can. + + if ((PROPERTY + "declaration-handler").equals (propertyId)) { + if (value == null) + declHandler = base; + else if (! (value instanceof DeclHandler)) + throw new SAXNotSupportedException (propertyId); + else + declHandler = (DeclHandler) value; + return ; + } + + if ((PROPERTY + "lexical-handler").equals (propertyId)) { + if (value == null) + lexicalHandler = base; + else if (! 
(value instanceof LexicalHandler)) + throw new SAXNotSupportedException (propertyId); + else + lexicalHandler = (LexicalHandler) value; + return ; + } + + throw new SAXNotSupportedException (propertyId); + } + + + // + // This is where the driver receives XmlParser callbacks and translates + // them into SAX callbacks. Some more callbacks have been added for + // SAX2 support. + // + + void startDocument () + throws SAXException + { + contentHandler.setDocumentLocator (this); + contentHandler.startDocument (); + attributesList.clear (); + } + + void xmlDecl(String version, + String encoding, + boolean standalone, + String inputEncoding) + throws SAXException + { + if (contentHandler instanceof ContentHandler2) + { + ((ContentHandler2) contentHandler).xmlDecl(version, + encoding, + standalone, + inputEncoding); + } + } + + void skippedEntity (String name) + throws SAXException + { contentHandler.skippedEntity (name); } + + InputSource getExternalSubset (String name, String baseURI) + throws SAXException, IOException + { + if (resolver2 == null || !useResolver2 || !extPE) + return null; + return resolver2.getExternalSubset (name, baseURI); + } + + InputSource resolveEntity (boolean isPE, String name, + InputSource in, String baseURI) + throws SAXException, IOException + { + InputSource source; + + // external entities might be skipped + if (isPE && !extPE) + return null; + if (!isPE && !extGE) + return null; + + // ... 
or not + lexicalHandler.startEntity (name); + if (resolver2 != null && useResolver2) { + source = resolver2.resolveEntity (name, in.getPublicId (), + baseURI, in.getSystemId ()); + if (source == null) { + in.setSystemId (absolutize (baseURI, + in.getSystemId (), false)); + source = in; + } + } else { + in.setSystemId (absolutize (baseURI, in.getSystemId (), false)); + source = entityResolver.resolveEntity (in.getPublicId (), + in.getSystemId ()); + if (source == null) + source = in; + } + startExternalEntity (name, source.getSystemId (), true); + return source; + } + + // absolutize a system ID relative to the specified base URI + // (temporarily) package-visible for external entity decls + String absolutize (String baseURI, String systemId, boolean nice) + throws MalformedURLException, SAXException + { + // FIXME normalize system IDs -- when? + // - Convert to UTF-8 + // - Map reserved and non-ASCII characters to %HH + + try { + if (baseURI == null) { + warn ("No base URI; hope this SYSTEM id is absolute: " + + systemId); + return new URL (systemId).toString (); + } else + return new URL (new URL (baseURI), systemId).toString (); + + } catch (MalformedURLException e) { + + // Let unknown URI schemes pass through unless we need + // the JVM to map them to i/o streams for us... + if (!nice) + throw e; + + // sometimes sysids for notations or unparsed entities + // aren't really URIs... + warn ("Can't absolutize SYSTEM id: " + e.getMessage ()); + return systemId; + } + } + + void startExternalEntity (String name, String systemId, + boolean stackOnly) + throws SAXException + { + // The following warning was deleted because the application has the + // option of not setting systemId. Sun's JAXP or Xerces seems to + // ignore this case. 
+ /* + if (systemId == null) + warn ("URI was not reported to parser for entity " + name); + */ + if (!stackOnly) // spliced [dtd] needs startEntity + lexicalHandler.startEntity (name); + entityStack.push (systemId); + } + + void endExternalEntity (String name) + throws SAXException + { + if (!"[document]".equals (name)) + lexicalHandler.endEntity (name); + entityStack.pop (); + } + + void startInternalEntity (String name) + throws SAXException + { + lexicalHandler.startEntity (name); + } + + void endInternalEntity (String name) + throws SAXException + { + lexicalHandler.endEntity (name); + } + + void doctypeDecl (String name, String publicId, String systemId) + throws SAXException + { + lexicalHandler.startDTD (name, publicId, systemId); + + // ... the "name" is a declaration and should be given + // to the DeclHandler (but sax2 doesn't). + + // the IDs for the external subset are lexical details, + // as are the contents of the internal subset; but sax2 + // doesn't provide the internal subset "pre-parse" + } + + void notationDecl (String name, String ids []) + throws SAXException + { + try { + dtdHandler.notationDecl (name, ids [0], + (resolveAll && ids [1] != null) + ? absolutize (ids [2], ids [1], true) + : ids [1]); + } catch (IOException e) { + // "can't happen" + throw new SAXParseException (e.getMessage (), this, e); + } + } + + void unparsedEntityDecl (String name, String ids [], String notation) + throws SAXException + { + try { + dtdHandler.unparsedEntityDecl (name, ids [0], + resolveAll + ? 
absolutize (ids [2], ids [1], true) + : ids [1], + notation); + } catch (IOException e) { + // "can't happen" + throw new SAXParseException (e.getMessage (), this, e); + } + } + + void endDoctype () + throws SAXException + { + lexicalHandler.endDTD (); + } + + private void declarePrefix (String prefix, String uri) + throws SAXException + { + int index = uri.indexOf (':'); + + // many versions of nwalsh docbook stylesheets + // have bogus URLs; so this can't be an error... + if (index < 1 && uri.length () != 0) + warn ("relative URI for namespace: " + uri); + + // FIXME: char [0] must be ascii alpha; chars [1..index] + // must be ascii alphanumeric or in "+-." [RFC 2396] + + //Namespace Constraints + //name for xml prefix must be http://www.w3.org/XML/1998/namespace + boolean prefixEquality = prefix.equals("xml"); + boolean uriEquality = uri.equals("http://www.w3.org/XML/1998/namespace"); + if ((prefixEquality || uriEquality) && !(prefixEquality && uriEquality)) + fatal ("xml is by definition bound to the namespace name " + + "http://www.w3.org/XML/1998/namespace"); + + //xmlns prefix declaration is illegal but xml prefix declaration is legal... 
+ if (prefixEquality && uriEquality) + return; + + //name for xmlns prefix must be http://www.w3.org/2000/xmlns/ + prefixEquality = prefix.equals("xmlns"); + uriEquality = uri.equals("http://www.w3.org/2000/xmlns/"); + if ((prefixEquality || uriEquality) && !(prefixEquality && uriEquality)) + fatal("http://www.w3.org/2000/xmlns/ is by definition bound" + + " to prefix xmlns"); + + //even if the uri is http://www.w3.org/2000/xmlns/ it is illegal to declare it + if (prefixEquality && uriEquality) + fatal ("declaring the xmlns prefix is illegal"); + + uri = uri.intern (); + prefixStack.declarePrefix (prefix, uri); + contentHandler.startPrefixMapping (prefix, uri); + } + + void attribute (String qname, String value, boolean isSpecified) + throws SAXException + { + if (!attributes) { + attributes = true; + if (namespaces) + prefixStack.pushContext (); + } + + // process namespace decls immediately; + // then maybe forget this as an attribute + if (namespaces) { + int index; + + // default NS declaration? + if (getFeature (FEATURE + "string-interning")) { + if ("xmlns" == qname) { + declarePrefix ("", value); + if (!xmlNames) + return; + } + // NS prefix declaration? + else if ((index = qname.indexOf (':')) == 5 + && qname.startsWith ("xmlns")) { + String prefix = qname.substring (6); + + if (prefix.equals("")) + fatal ("missing prefix in namespace declaration attribute"); + if (value.length () == 0) { + verror ("missing URI in namespace declaration attribute: " + + qname); + } else + declarePrefix (prefix, value); + if (!xmlNames) + return; + } + } else { + if ("xmlns".equals(qname)) { + declarePrefix ("", value); + if (!xmlNames) + return; + } + // NS prefix declaration? 
+ else if ((index = qname.indexOf (':')) == 5 + && qname.startsWith ("xmlns")) { + String prefix = qname.substring (6); + + if (value.length () == 0) { + verror ("missing URI in namespace decl attribute: " + + qname); + } else + declarePrefix (prefix, value); + if (!xmlNames) + return; + } + } + } + // remember this attribute ... + + if (attributeCount == attributeSpecified.length) { // grow array? + boolean temp [] = new boolean [attributeSpecified.length + 5]; + System.arraycopy (attributeSpecified, 0, temp, 0, attributeCount); + attributeSpecified = temp; + } + attributeSpecified [attributeCount] = isSpecified; + + attributeCount++; + + // attribute type comes from querying parser's DTD records + attributesList.add(new Attribute(qname, value)); + + } + + void startElement (String elname) + throws SAXException + { + ContentHandler handler = contentHandler; + + // + // NOTE: this implementation of namespace support adds something + // like six percent to parsing CPU time, in a large (~50 MB) + // document that doesn't use namespaces at all. (Measured by PC + // sampling, with a bug where endElement processing was omitted.) + // [Measurement referred to older implementation, older JVM ...] + // + // It ought to become notably faster in such cases. Most + // costs are the prefix stack calling Hashtable.get() (2%), + // String.hashCode() (1.5%) and about 1.3% each for pushing + // the context, and two chunks of name processing. + // + + if (!attributes) { + if (namespaces) + prefixStack.pushContext (); + } else if (namespaces) { + + // now we can patch up namespace refs; we saw all the + // declarations, so now we'll do the Right Thing + Iterator itt = attributesList.iterator (); + while(itt.hasNext()) + { + Attribute attribute = (Attribute) itt.next(); + String qname = attribute.name; + int index; + + // default NS declaration? 
+ if (getFeature (FEATURE + "string-interning")) { + if ("xmlns" == qname) + continue; + } else { + if ("xmlns".equals(qname)) + continue; + } + //Illegal in the new Namespaces Draft + //should it be only in 1.1 docs?? + if (qname.equals (":")) + fatal ("namespace names consisting of a single colon " + + "character are invalid"); + index = qname.indexOf (':'); + + // NS prefix declaration? + if (index == 5 && qname.startsWith ("xmlns")) + continue; + + // it's not a NS decl; patch namespace info items + if (prefixStack.processName (qname, nsTemp, true) == null) + fatal ("undeclared attribute prefix in: " + qname); + else { + attribute.nameSpace = nsTemp[0]; + attribute.localName = nsTemp[1]; + } + } + } + + // save element name so attribute callbacks work + elementName = elname; + if (namespaces) { + if (prefixStack.processName (elname, nsTemp, false) == null) { + fatal ("undeclared element prefix in: " + elname); + nsTemp [0] = nsTemp [1] = ""; + } + handler.startElement (nsTemp [0], nsTemp [1], elname, this); + } else + handler.startElement ("", "", elname, this); + // elementName = null; + + // elements with no attributes are pretty common! 
+ if (attributes) { + attributesList.clear(); + attributeCount = 0; + attributes = false; + } + } + + void endElement (String elname) + throws SAXException + { + ContentHandler handler = contentHandler; + + if (!namespaces) { + handler.endElement ("", "", elname); + return; + } + prefixStack.processName (elname, nsTemp, false); + handler.endElement (nsTemp [0], nsTemp [1], elname); + + Enumeration prefixes = prefixStack.getDeclaredPrefixes (); + + while (prefixes.hasMoreElements ()) + handler.endPrefixMapping ((String) prefixes.nextElement ()); + prefixStack.popContext (); + } + + void startCDATA () + throws SAXException + { + lexicalHandler.startCDATA (); + } + + void charData (char ch[], int start, int length) + throws SAXException + { + contentHandler.characters (ch, start, length); + } + + void endCDATA () + throws SAXException + { + lexicalHandler.endCDATA (); + } + + void ignorableWhitespace (char ch[], int start, int length) + throws SAXException + { + contentHandler.ignorableWhitespace (ch, start, length); + } + + void processingInstruction (String target, String data) + throws SAXException + { + contentHandler.processingInstruction (target, data); + } + + void comment (char ch[], int start, int length) + throws SAXException + { + if (lexicalHandler != base) + lexicalHandler.comment (ch, start, length); + } + + void fatal (String message) + throws SAXException + { + SAXParseException fatal; + + fatal = new SAXParseException (message, this); + errorHandler.fatalError (fatal); + + // Even if the application can continue ... we can't! 
+ throw fatal; + } + + // We can safely report a few validity errors that + // make layered SAX2 DTD validation more conformant + void verror (String message) + throws SAXException + { + SAXParseException err; + + err = new SAXParseException (message, this); + errorHandler.error (err); + } + + void warn (String message) + throws SAXException + { + SAXParseException err; + + err = new SAXParseException (message, this); + errorHandler.warning (err); + } + + + // + // Implementation of org.xml.sax.Attributes. + // + + /** + * <b>SAX1 AttributeList, SAX2 Attributes</b> method + * (don't invoke on parser); + */ + public int getLength () + { + return attributesList.size (); + } + + /** + * <b>SAX2 Attributes</b> method (don't invoke on parser); + */ + public String getURI (int index) + { + return ((Attribute) attributesList.get (index)).nameSpace; + } + + /** + * <b>SAX2 Attributes</b> method (don't invoke on parser); + */ + public String getLocalName (int index) + { + Attribute attr = (Attribute) attributesList.get (index); + // FIXME attr.localName is sometimes null, why? + if (namespaces && attr.localName == null) + { + // XXX fix this here for now + int ci = attr.name.indexOf(':'); + attr.localName = (ci == -1) ? attr.name : + attr.name.substring(ci + 1); + } + return attr.localName; + } + + /** + * <b>SAX2 Attributes</b> method (don't invoke on parser); + */ + public String getQName (int i) + { + return ((Attribute) attributesList.get (i)).name; + } + + /** + * <b>SAX1 AttributeList</b> method (don't invoke on parser); + */ + public String getName (int i) + { + return ((Attribute) attributesList.get (i)).name; + } + + /** + * <b>SAX1 AttributeList, SAX2 Attributes</b> method + * (don't invoke on parser); + */ + public String getType (int i) + { + String type = parser.getAttributeType (elementName, getQName (i)); + if (type == null) + return "CDATA"; + // ... 
use DeclHandler.attributeDecl to see enumerations + if (type == "ENUMERATION") + return "NMTOKEN"; + return type; + } + + + /** + * <b>SAX1 AttributeList, SAX2 Attributes</b> method + * (don't invoke on parser); + */ + public String getValue (int i) + { + return ((Attribute) attributesList.get (i)).value; + } + + + /** + * <b>SAX2 Attributes</b> method (don't invoke on parser); + */ + public int getIndex (String uri, String local) + { + int length = getLength (); + + for (int i = 0; i < length; i++) { + if (!getURI (i).equals (uri)) + continue; + if (getLocalName (i).equals (local)) + return i; + } + return -1; + } + + + /** + * <b>SAX2 Attributes</b> method (don't invoke on parser); + */ + public int getIndex (String xmlName) + { + int length = getLength (); + + for (int i = 0; i < length; i++) { + if (getQName (i).equals (xmlName)) + return i; + } + return -1; + } + + + /** + * <b>SAX2 Attributes</b> method (don't invoke on parser); + */ + public String getType (String uri, String local) + { + int index = getIndex (uri, local); + + if (index < 0) + return null; + return getType (index); + } + + + /** + * <b>SAX1 AttributeList, SAX2 Attributes</b> method + * (don't invoke on parser); + */ + public String getType (String xmlName) + { + int index = getIndex (xmlName); + + if (index < 0) + return null; + return getType (index); + } + + + /** + * <b>SAX Attributes</b> method (don't invoke on parser); + */ + public String getValue (String uri, String local) + { + int index = getIndex (uri, local); + + if (index < 0) + return null; + return getValue (index); + } + + + /** + * <b>SAX1 AttributeList, SAX2 Attributes</b> method + * (don't invoke on parser); + */ + public String getValue (String xmlName) + { + int index = getIndex (xmlName); + + if (index < 0) + return null; + return getValue (index); + } + + + // + // Implementation of org.xml.sax.ext.Attributes2 + // + + + /** @return false unless the attribute was declared in the DTD. 
+ * @throws java.lang.ArrayIndexOutOfBoundsException + * When the supplied index does not identify an attribute. + */ + public boolean isDeclared (int index) + { + if (index < 0 || index >= attributeCount) + throw new ArrayIndexOutOfBoundsException (); + return attributeDeclared [index]; + } + + /** @return false unless the attribute was declared in the DTD. + * @throws java.lang.IllegalArgumentException + * When the supplied names do not identify an attribute. + */ + public boolean isDeclared (java.lang.String qName) + { + int index = getIndex (qName); + if (index < 0) + throw new IllegalArgumentException (); + return attributeDeclared [index]; + } + + /** @return false unless the attribute was declared in the DTD. + * @throws java.lang.IllegalArgumentException + * When the supplied names do not identify an attribute. + */ + public boolean isDeclared (java.lang.String uri, java.lang.String localName) + { + int index = getIndex (uri, localName); + if (index < 0) + throw new IllegalArgumentException (); + return attributeDeclared [index]; + } + + + /** + * <b>SAX-ext Attributes2</b> method (don't invoke on parser); + */ + public boolean isSpecified (int index) + { + if (index < 0 || index >= attributeCount) + throw new ArrayIndexOutOfBoundsException (); + return attributeSpecified [index]; + } + + /** + * <b>SAX-ext Attributes2</b> method (don't invoke on parser); + */ + public boolean isSpecified (String uri, String local) + { + int index = getIndex (uri, local); + + if (index < 0) + throw new IllegalArgumentException (); + return attributeSpecified [index]; + } + + /** + * <b>SAX-ext Attributes2</b> method (don't invoke on parser); + */ + public boolean isSpecified (String xmlName) + { + int index = getIndex (xmlName); + + if (index < 0) + throw new IllegalArgumentException (); + return attributeSpecified [index]; + } + + + // + // Implementation of org.xml.sax.Locator. 
+ // + + /** + * <b>SAX Locator</b> method (don't invoke on parser); + */ + public String getPublicId () + { + return null; // FIXME track public IDs too + } + + /** + * <b>SAX Locator</b> method (don't invoke on parser); + */ + public String getSystemId () + { + if (entityStack.empty ()) + return null; + else + return (String) entityStack.peek (); + } + + /** + * <b>SAX Locator</b> method (don't invoke on parser); + */ + public int getLineNumber () + { + return parser.getLineNumber (); + } + + /** + * <b>SAX Locator</b> method (don't invoke on parser); + */ + public int getColumnNumber () + { + return parser.getColumnNumber (); + } + + // adapter between SAX2 content handler and SAX1 document handler callbacks + private static class Adapter implements ContentHandler + { + private DocumentHandler docHandler; + + Adapter (DocumentHandler dh) + { docHandler = dh; } + + + public void setDocumentLocator (Locator l) + { docHandler.setDocumentLocator (l); } + + public void startDocument () throws SAXException + { docHandler.startDocument (); } + + public void processingInstruction (String target, String data) + throws SAXException + { docHandler.processingInstruction (target, data); } + + public void startPrefixMapping (String prefix, String uri) + { /* ignored */ } + + public void startElement ( + String namespace, + String local, + String name, + Attributes attrs + ) throws SAXException + { docHandler.startElement (name, (AttributeList) attrs); } + + public void characters (char buf [], int offset, int len) + throws SAXException + { docHandler.characters (buf, offset, len); } + + public void ignorableWhitespace (char buf [], int offset, int len) + throws SAXException + { docHandler.ignorableWhitespace (buf, offset, len); } + + public void skippedEntity (String name) + { /* ignored */ } + + public void endElement (String u, String l, String name) + throws SAXException + { docHandler.endElement (name); } + + public void endPrefixMapping (String prefix) + { /* ignored */ 
} + + public void endDocument () throws SAXException + { docHandler.endDocument (); } + } +} + +class Attribute +{ + + String name; + String value; + String nameSpace; + String localName; + + Attribute(String name, String value) + { + this.name = name; + this.value = value; + this.nameSpace = ""; + } +} + diff --git a/libjava/gnu/xml/aelfred2/XmlParser.java b/libjava/gnu/xml/aelfred2/XmlParser.java new file mode 100644 index 0000000..f4abf22 --- /dev/null +++ b/libjava/gnu/xml/aelfred2/XmlParser.java @@ -0,0 +1,5113 @@ +/* XmlParser.java -- + Copyright (C) 1999,2000,2001 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA +02111-1307 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. + +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. 
An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. + +Partly derived from code which carried the following notice: + + Copyright (c) 1997, 1998 by Microstar Software Ltd. + + AElfred is free for both commercial and non-commercial use and + redistribution, provided that Microstar's copyright and disclaimer are + retained intact. You are free to modify AElfred for your own use and + to redistribute AElfred with your modifications, provided that the + modifications are clearly documented. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + merchantability or fitness for a particular purpose. Please use it AT + YOUR OWN RISK. +*/ + +package gnu.xml.aelfred2; + +import java.io.BufferedInputStream; +import java.io.CharConversionException; +import java.io.EOFException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.IOException; +import java.io.Reader; +import java.io.UnsupportedEncodingException; +import java.net.URL; +import java.net.URLConnection; + +// maintaining 1.1 compatibility for now ... +// Iterator and Hashmap ought to be faster +import java.util.Enumeration; +import java.util.Hashtable; +import java.util.Stack; + +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + + +/** + * Parse XML documents and return parse events through call-backs. + * Use the <code>SAXDriver</code> class as your entry point, as all + * internal parser interfaces are subject to change. 
+ * + * @author Written by David Megginson <dmeggins@microstar.com> + * (version 1.2a with bugfixes) + * @author Updated by David Brownell <dbrownell@users.sourceforge.net> + * @see SAXDriver + */ +final class XmlParser +{ + // avoid slow per-character readCh() + private final static boolean USE_CHEATS = true; + + + ////////////////////////////////////////////////////////////////////// + // Constructors. + //////////////////////////////////////////////////////////////////////// + + + /** + * Construct a new parser with no associated handler. + * @see #setHandler + * @see #parse + */ + // package private + XmlParser () + { + } + + + /** + * Set the handler that will receive parsing events. + * @param handler The handler to receive callback events. + * @see #parse + */ + // package private + void setHandler (SAXDriver handler) + { + this.handler = handler; + } + + + /** + * Parse an XML document from the character stream, byte stream, or URI + * that you provide (in that order of preference). Any URI that you + * supply will become the base URI for resolving relative URI, and may + * be used to acquire a reader or byte stream. + * + * <p> Only one thread at a time may use this parser; since it is + * private to this package, post-parse cleanup is done by the caller, + * which MUST NOT REUSE the parser (just null it). + * + * @param systemId Absolute URI of the document; should never be null, + * but may be so iff a reader <em>or</em> a stream is provided. + * @param publicId The public identifier of the document, or null. + * @param reader A character stream; must be null if stream isn't. + * @param stream A byte input stream; must be null if reader isn't. + * @param encoding The suggested encoding, or null if unknown. 
+ * @exception java.lang.Exception Basically SAXException or IOException + */ + // package private + void doParse ( + String systemId, + String publicId, + Reader reader, + InputStream stream, + String encoding + ) throws Exception + { + if (handler == null) + throw new IllegalStateException ("no callback handler"); + + initializeVariables (); + + // predeclare the built-in entities here (replacement texts) + // we don't need to intern(), since we're guaranteed literals + // are always (globally) interned. + setInternalEntity ("amp", "&"); + setInternalEntity ("lt", "<"); + setInternalEntity ("gt", ">"); + setInternalEntity ("apos", "'"); + setInternalEntity ("quot", """); + + try { + // pushURL first to ensure locator is correct in startDocument + // ... it might report an IO or encoding exception. + handler.startDocument (); + pushURL (false, "[document]", + // default baseURI: null + new String [] { publicId, systemId, null}, + reader, stream, encoding, false); + + parseDocument (); + } catch (EOFException e){ + //empty input + error("empty document, with no root element."); + }finally { + if (reader != null) + try { reader.close (); + } catch (IOException e) { /* ignore */ } + if (stream != null) + try { stream.close (); + } catch (IOException e) { /* ignore */ } + if (is != null) + try { is.close (); + } catch (IOException e) { /* ignore */ } + if (reader != null) + try { + reader.close (); + } catch (IOException e) { /* ignore */ + } + scratch = null; + } + } + + + //////////////////////////////////////////////////////////////////////// + // Constants. + //////////////////////////////////////////////////////////////////////// + + // + // Constants for element content type. + // + + /** + * Constant: an element has not been declared. + * @see #getElementContentType + */ + public final static int CONTENT_UNDECLARED = 0; + + /** + * Constant: the element has a content model of ANY. 
+ * @see #getElementContentType + */ + public final static int CONTENT_ANY = 1; + + /** + * Constant: the element has declared content of EMPTY. + * @see #getElementContentType + */ + public final static int CONTENT_EMPTY = 2; + + /** + * Constant: the element has mixed content. + * @see #getElementContentType + */ + public final static int CONTENT_MIXED = 3; + + /** + * Constant: the element has element content. + * @see #getElementContentType + */ + public final static int CONTENT_ELEMENTS = 4; + + + // + // Constants for the entity type. + // + + /** + * Constant: the entity has not been declared. + * @see #getEntityType + */ + public final static int ENTITY_UNDECLARED = 0; + + /** + * Constant: the entity is internal. + * @see #getEntityType + */ + public final static int ENTITY_INTERNAL = 1; + + /** + * Constant: the entity is external, non-parsable data. + * @see #getEntityType + */ + public final static int ENTITY_NDATA = 2; + + /** + * Constant: the entity is external XML data. + * @see #getEntityType + */ + public final static int ENTITY_TEXT = 3; + + + // + // Attribute type constants are interned literal strings. + // + + // + // Constants for supported encodings. "external" is just a flag. + // + private final static int ENCODING_EXTERNAL = 0; + private final static int ENCODING_UTF_8 = 1; + private final static int ENCODING_ISO_8859_1 = 2; + private final static int ENCODING_UCS_2_12 = 3; + private final static int ENCODING_UCS_2_21 = 4; + private final static int ENCODING_UCS_4_1234 = 5; + private final static int ENCODING_UCS_4_4321 = 6; + private final static int ENCODING_UCS_4_2143 = 7; + private final static int ENCODING_UCS_4_3412 = 8; + private final static int ENCODING_ASCII = 9; + + + // + // Constants for attribute default value. + // + + /** + * Constant: the attribute is not declared. 
+ * @see #getAttributeDefaultValueType + */ + public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 30; + + /** + * Constant: the attribute has a literal default value specified. + * @see #getAttributeDefaultValueType + * @see #getAttributeDefaultValue + */ + public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 31; + + /** + * Constant: the attribute was declared #IMPLIED. + * @see #getAttributeDefaultValueType + */ + public final static int ATTRIBUTE_DEFAULT_IMPLIED = 32; + + /** + * Constant: the attribute was declared #REQUIRED. + * @see #getAttributeDefaultValueType + */ + public final static int ATTRIBUTE_DEFAULT_REQUIRED = 33; + + /** + * Constant: the attribute was declared #FIXED. + * @see #getAttributeDefaultValueType + * @see #getAttributeDefaultValue + */ + public final static int ATTRIBUTE_DEFAULT_FIXED = 34; + + + // + // Constants for input. + // + private final static int INPUT_NONE = 0; + private final static int INPUT_INTERNAL = 1; + private final static int INPUT_STREAM = 3; + private final static int INPUT_READER = 5; + + + // + // Flags for reading literals. + // + // expand general entity refs (attribute values in dtd and content) + private final static int LIT_ENTITY_REF = 2; + // normalize this value (space chars) (attributes, public ids) + private final static int LIT_NORMALIZE = 4; + // literal is an attribute value + private final static int LIT_ATTRIBUTE = 8; + // don't expand parameter entities + private final static int LIT_DISABLE_PE = 16; + // don't expand [or parse] character refs + private final static int LIT_DISABLE_CREF = 32; + // don't parse general entity refs + private final static int LIT_DISABLE_EREF = 64; + // literal is a public ID value + private final static int LIT_PUBID = 256; + + + // + // Flags affecting PE handling in DTDs (if expandPE is true). + // PEs expand with space padding, except inside literals. 
+ // + private final static int CONTEXT_NORMAL = 0; + private final static int CONTEXT_LITERAL = 1; + + + ////////////////////////////////////////////////////////////////////// + // Error reporting. + ////////////////////////////////////////////////////////////////////// + + + /** + * Report an error. + * @param message The error message. + * @param textFound The text that caused the error (or null). + * @see SAXDriver#error + * @see #line + */ + private void error (String message, String textFound, String textExpected) + throws SAXException + { + if (textFound != null) { + message = message + " (found \"" + textFound + "\")"; + } + if (textExpected != null) { + message = message + " (expected \"" + textExpected + "\")"; + } + handler.fatal (message); + + // "can't happen" + throw new SAXException (message); + } + + + /** + * Report a serious error. + * @param message The error message. + * @param textFound The text that caused the error (or null). + */ + private void error (String message, char textFound, String textExpected) + throws SAXException + { + error (message, new Character (textFound).toString (), textExpected); + } + + /** Report typical case fatal errors. */ + private void error (String message) + throws SAXException + { + handler.fatal (message); + } + + + ////////////////////////////////////////////////////////////////////// + // Major syntactic productions. + ////////////////////////////////////////////////////////////////////// + + + /** + * Parse an XML document. + * <pre> + * [1] document ::= prolog element Misc* + * </pre> + * <p>This is the top-level parsing function for a single XML + * document. As a minimum, a well-formed document must have + * a document element, and a valid document must have a prolog + * (one with doctype) as well. 
+ */ + private void parseDocument () + throws Exception + { + try { // added by MHK + boolean sawDTD = parseProlog (); + require ('<'); + parseElement (!sawDTD); + } catch (EOFException ee) { // added by MHK + error("premature end of file", "[EOF]", null); + } + + try { + parseMisc (); //skip all white, PIs, and comments + char c = readCh (); //if this doesn't throw an exception... + error ("unexpected characters after document end", c, null); + } catch (EOFException e) { + return; + } + } + + static final char startDelimComment [] = { '<', '!', '-', '-' }; + static final char endDelimComment [] = { '-', '-' }; + + /** + * Skip a comment. + * <pre> + * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* "-->" + * </pre> + * <p> (The <code><!--</code> has already been read.) + */ + private void parseComment () + throws Exception + { + char c; + boolean saved = expandPE; + + expandPE = false; + parseUntil (endDelimComment); + require ('>'); + expandPE = saved; + handler.comment (dataBuffer, 0, dataBufferPos); + dataBufferPos = 0; + } + + static final char startDelimPI [] = { '<', '?' }; + static final char endDelimPI [] = { '?', '>' }; + + /** + * Parse a processing instruction and do a call-back. + * <pre> + * [16] PI ::= '<?' PITarget + * (S (Char* - (Char* '?>' Char*)))? + * '?>' + * [17] PITarget ::= Name - ( ('X'|'x') ('M'|m') ('L'|l') ) + * </pre> + * <p> (The <code><?</code> has already been read.) 
     */
    private void parsePI ()
    throws SAXException, IOException
    {
        String name;
        boolean	saved = expandPE;

        // no parameter entity expansion while reading the PI
        expandPE = false;
        name = readNmtoken (true);
        //NE08
        if (name.indexOf(':') >= 0)
            error ("Illegal character(':') in processing instruction name ", name, null);
        // any case variant of "xml" is rejected as a PI target
        if ("xml".equalsIgnoreCase (name))
            error ("Illegal processing instruction target", name, null);
        // an immediate "?>" means the PI carries no data
        if (!tryRead (endDelimPI)) {
            requireWhitespace ();
            parseUntil (endDelimPI);
        }
        expandPE = saved;
        handler.processingInstruction (name, dataBufferToString ());
    }


    static final char	endDelimCDATA [] = { ']', ']', '>' };

    private boolean isDirtyCurrentElement;

    /**
     * Parse a CDATA section.
     * <pre>
     * [18] CDSect ::= CDStart CData CDEnd
     * [19] CDStart ::= '&lt;![CDATA['
     * [20] CData ::= (Char* - (Char* ']]&gt;' Char*))
     * [21] CDEnd ::= ']]&gt;'
     * </pre>
     * <p> (The '&lt;![CDATA[' has already been read.)
     */
    private void parseCDSect ()
    throws Exception
    {
        parseUntil (endDelimCDATA);
        dataBufferFlush ();
    }


    /**
     * Parse the prolog of an XML document.
     * <pre>
     * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
     * </pre>
     * <p>We do not look for the XML declaration here, because it was
     * handled by pushURL ().
     * @see #pushURL
     * @return true if a DTD was read.
     */
    private boolean parseProlog ()
    throws Exception
    {
        parseMisc ();

        if (tryRead ("<!DOCTYPE")) {
            parseDoctypedecl ();
            parseMisc ();
            return true;
        }
        return false;
    }

    /**
     * Checks that a version string only uses the characters allowed by
     * production [26] VersionNum: ASCII letters, digits, and
     * <code>_ . : -</code>.  Any other character is reported through
     * error ().
     * @param version the literal read from the version pseudo-attribute
     */
    private void checkLegalVersion (String version)
    throws SAXException
    {
        int len = version.length ();
        for (int i = 0; i < len; i++) {
            char c = version.charAt (i);
            if ('0' <= c && c <= '9')
                continue;
            if (c == '_' || c == '.' || c == ':' || c == '-')
                continue;
            if ('a' <= c && c <= 'z')
                continue;
            if ('A' <= c && c <= 'Z')
                continue;
            error ("illegal character in version", version, "1.0");
        }
    }


    /**
     * Parse the XML declaration.
+ * <pre> + * [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' + * [24] VersionInfo ::= S 'version' Eq + * ("'" VersionNum "'" | '"' VersionNum '"' ) + * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')* + * [32] SDDecl ::= S 'standalone' Eq + * ( "'"" ('yes' | 'no') "'"" | '"' ("yes" | "no") '"' ) + * [80] EncodingDecl ::= S 'encoding' Eq + * ( "'" EncName "'" | "'" EncName "'" ) + * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* + * </pre> + * <p> (The <code><?xml</code> and whitespace have already been read.) + * @return the encoding in the declaration, uppercased; or null + * @see #parseTextDecl + * @see #setupDecoding + */ + private String parseXMLDecl (boolean ignoreEncoding) + throws SAXException, IOException + { + String version; + String encodingName = null; + String standalone = null; + int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; + String inputEncoding = null; + + switch (this.encoding) + { + case ENCODING_EXTERNAL: + case ENCODING_UTF_8: + inputEncoding = "UTF-8"; + break; + case ENCODING_ISO_8859_1: + inputEncoding = "ISO-8859-1"; + break; + case ENCODING_UCS_2_12: + inputEncoding = "UTF-16BE"; + break; + case ENCODING_UCS_2_21: + inputEncoding = "UTF-16LE"; + break; + } + + // Read the version. + require ("version"); + parseEq (); + checkLegalVersion (version = readLiteral (flags)); + if (!version.equals ("1.0")){ + if(version.equals ("1.1")){ + handler.warn ("expected XML version 1.0, not: " + version); + xmlVersion = XML_11; + }else { + error("illegal XML version", version, "1.0 or 1.1"); + } + } + else + xmlVersion = XML_10; + // Try reading an encoding declaration. 
+ boolean white = tryWhitespace (); + + if (tryRead ("encoding")) { + if (!white) + error ("whitespace required before 'encoding='"); + parseEq (); + encodingName = readLiteral (flags); + if (!ignoreEncoding) + setupDecoding (encodingName); + } + + // Try reading a standalone declaration + if (encodingName != null) + white = tryWhitespace (); + if (tryRead ("standalone")) { + if (!white) + error ("whitespace required before 'standalone='"); + parseEq (); + standalone = readLiteral (flags); + if ("yes".equals (standalone)) + docIsStandalone = true; + else if (!"no".equals (standalone)) + error ("standalone flag must be 'yes' or 'no'"); + } + + skipWhitespace (); + require ("?>"); + + if (inputEncoding == null) + { + inputEncoding = encodingName; + } + handler.xmlDecl(version, encodingName, "yes".equals(standalone), + inputEncoding); + + return encodingName; + } + + + /** + * Parse a text declaration. + * <pre> + * [79] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>' + * [80] EncodingDecl ::= S 'encoding' Eq + * ( '"' EncName '"' | "'" EncName "'" ) + * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* + * </pre> + * <p> (The <code><?xml</code>' and whitespace have already been read.) + * @return the encoding in the declaration, uppercased; or null + * @see #parseXMLDecl + * @see #setupDecoding + */ + private String parseTextDecl (boolean ignoreEncoding) + throws SAXException, IOException + { + String encodingName = null; + int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; + + // Read an optional version. 
+ if (tryRead ("version")) { + String version; + parseEq (); + checkLegalVersion (version = readLiteral (flags)); + + if (version.equals ("1.1")){ + if (xmlVersion == XML_10){ + error ("external subset has later version number.", "1.0", version); + } + handler.warn ("expected XML version 1.0, not: " + version); + xmlVersion = XML_11; + }else if(!version.equals ("1.0")) { + error("illegal XML version", version, "1.0 or 1.1"); + } + requireWhitespace (); + } + + + // Read the encoding. + require ("encoding"); + parseEq (); + encodingName = readLiteral (flags); + if (!ignoreEncoding) + setupDecoding (encodingName); + + skipWhitespace (); + require ("?>"); + + return encodingName; + } + + + /** + * Sets up internal state so that we can decode an entity using the + * specified encoding. This is used when we start to read an entity + * and we have been given knowledge of its encoding before we start to + * read any data (e.g. from a SAX input source or from a MIME type). + * + * <p> It is also used after autodetection, at which point only very + * limited adjustments to the encoding may be used (switching between + * related builtin decoders). + * + * @param encodingName The name of the encoding specified by the user. + * @exception IOException if the encoding isn't supported either + * internally to this parser, or by the hosting JVM. + * @see #parseXMLDecl + * @see #parseTextDecl + */ + private void setupDecoding (String encodingName) + throws SAXException, IOException + { + encodingName = encodingName.toUpperCase (); + + // ENCODING_EXTERNAL indicates an encoding that wasn't + // autodetected ... we can use builtin decoders, or + // ones from the JVM (InputStreamReader). + + // Otherwise we can only tweak what was autodetected, and + // only for single byte (ASCII derived) builtin encodings. 
+ + // ASCII-derived encodings + if (encoding == ENCODING_UTF_8 || encoding == ENCODING_EXTERNAL) { + if (encodingName.equals ("ISO-8859-1") + || encodingName.equals ("8859_1") + || encodingName.equals ("ISO8859_1") + ) { + encoding = ENCODING_ISO_8859_1; + return; + } else if (encodingName.equals ("US-ASCII") + || encodingName.equals ("ASCII")) { + encoding = ENCODING_ASCII; + return; + } else if (encodingName.equals ("UTF-8") + || encodingName.equals ("UTF8")) { + encoding = ENCODING_UTF_8; + return; + } else if (encoding != ENCODING_EXTERNAL) { + // used to start with a new reader ... + throw new UnsupportedEncodingException (encodingName); + } + // else fallthrough ... + // it's ASCII-ish and something other than a builtin + } + + // Unicode and such + if (encoding == ENCODING_UCS_2_12 || encoding == ENCODING_UCS_2_21) { + if (!(encodingName.equals ("ISO-10646-UCS-2") + || encodingName.equals ("UTF-16") + || encodingName.equals ("UTF-16BE") + || encodingName.equals ("UTF-16LE"))) + error ("unsupported Unicode encoding", + encodingName, + "UTF-16"); + return; + } + + // four byte encodings + if (encoding == ENCODING_UCS_4_1234 + || encoding == ENCODING_UCS_4_4321 + || encoding == ENCODING_UCS_4_2143 + || encoding == ENCODING_UCS_4_3412) { + // Strictly: "UCS-4" == "UTF-32BE"; also, "UTF-32LE" exists + if (!encodingName.equals ("ISO-10646-UCS-4")) + error ("unsupported 32-bit encoding", + encodingName, + "ISO-10646-UCS-4"); + return; + } + + // assert encoding == ENCODING_EXTERNAL + // if (encoding != ENCODING_EXTERNAL) + // throw new RuntimeException ("encoding = " + encoding); + + if (encodingName.equals ("UTF-16BE")) { + encoding = ENCODING_UCS_2_12; + return; + } + if (encodingName.equals ("UTF-16LE")) { + encoding = ENCODING_UCS_2_21; + return; + } + + // We couldn't use the builtin decoders at all. But we can try to + // create a reader, since we haven't messed up buffering. Tweak + // the encoding name if necessary. 
+ + if (encodingName.equals ("UTF-16") + || encodingName.equals ("ISO-10646-UCS-2")) + encodingName = "Unicode"; + // Ignoring all the EBCDIC aliases here + + reader = new InputStreamReader (is, encodingName); + sourceType = INPUT_READER; + } + + + /** + * Parse miscellaneous markup outside the document element and DOCTYPE + * declaration. + * <pre> + * [27] Misc ::= Comment | PI | S + * </pre> + */ + private void parseMisc () + throws Exception + { + while (true) { + skipWhitespace (); + if (tryRead (startDelimPI)) { + parsePI (); + } else if (tryRead (startDelimComment)) { + parseComment (); + } else { + return; + } + } + } + + + /** + * Parse a document type declaration. + * <pre> + * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? + * ('[' (markupdecl | PEReference | S)* ']' S?)? '>' + * </pre> + * <p> (The <code><!DOCTYPE</code> has already been read.) + */ + private void parseDoctypedecl () + throws Exception + { + String rootName, ids[]; + + // Read the document type name. 
+ requireWhitespace (); + rootName = readNmtoken (true); + + // Read the External subset's IDs + skipWhitespace (); + ids = readExternalIds (false, true); + + // report (a) declaration of name, (b) lexical info (ids) + handler.doctypeDecl (rootName, ids [0], ids [1]); + + // Internal subset is parsed first, if present + skipWhitespace (); + if (tryRead ('[')) { + + // loop until the subset ends + while (true) { + doReport = expandPE = true; + skipWhitespace (); + doReport = expandPE = false; + if (tryRead (']')) { + break; // end of subset + } else { + // WFC, PEs in internal subset (only between decls) + peIsError = expandPE = true; + parseMarkupdecl (); + peIsError = expandPE = false; + } + } + } + skipWhitespace (); + require ('>'); + + // Read the external subset, if any + InputSource subset; + + if (ids [1] == null) + subset = handler.getExternalSubset (rootName, + handler.getSystemId ()); + else + subset = null; + if (ids [1] != null || subset != null) { + pushString (null, ">"); + + // NOTE: [dtd] is so we say what SAX2 expects, + // though it's misleading (subset, not entire dtd) + if (ids [1] != null) + pushURL (true, "[dtd]", ids, null, null, null, true); + else { + handler.warn ("modifying document by adding external subset"); + pushURL (true, "[dtd]", + new String [] { subset.getPublicId (), + subset.getSystemId (), null }, + subset.getCharacterStream (), + subset.getByteStream (), + subset.getEncoding (), + false); + } + + // Loop until we end up back at '>' + while (true) { + doReport = expandPE = true; + skipWhitespace (); + doReport = expandPE = false; + if (tryRead ('>')) { + break; + } else { + expandPE = true; + parseMarkupdecl (); + expandPE = false; + } + } + + // the ">" string isn't popped yet + if (inputStack.size () != 1) + error ("external subset has unmatched '>'"); + } + + // done dtd + handler.endDoctype (); + expandPE = false; + doReport = true; + } + + + /** + * Parse a markup declaration in the internal or external DTD subset. 
+ * <pre> + * [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl + * | NotationDecl | PI | Comment + * [30] extSubsetDecl ::= (markupdecl | conditionalSect + * | PEReference | S) * + * </pre> + * <p> Reading toplevel PE references is handled as a lexical issue + * by the caller, as is whitespace. + */ + private void parseMarkupdecl () + throws Exception + { + char saved [] = null; + boolean savedPE = expandPE; + + // prevent "<%foo;" and ensures saved entity is right + require ('<'); + unread ('<'); + expandPE = false; + + if (tryRead ("<!ELEMENT")) { + saved = readBuffer; + expandPE = savedPE; + parseElementDecl (); + } else if (tryRead ("<!ATTLIST")) { + saved = readBuffer; + expandPE = savedPE; + parseAttlistDecl (); + } else if (tryRead ("<!ENTITY")) { + saved = readBuffer; + expandPE = savedPE; + parseEntityDecl (); + } else if (tryRead ("<!NOTATION")) { + saved = readBuffer; + expandPE = savedPE; + parseNotationDecl (); + } else if (tryRead (startDelimPI)) { + saved = readBuffer; + expandPE = savedPE; + parsePI (); + } else if (tryRead (startDelimComment)) { + saved = readBuffer; + expandPE = savedPE; + parseComment (); + } else if (tryRead ("<![")) { + saved = readBuffer; + expandPE = savedPE; + if (inputStack.size () > 0) + parseConditionalSect (saved); + else + error ("conditional sections illegal in internal subset"); + } else { + error ("expected markup declaration"); + } + + // VC: Proper Decl/PE Nesting + if (readBuffer != saved) + handler.verror ("Illegal Declaration/PE nesting"); + } + + + /** + * Parse an element, with its tags. + * <pre> + * [39] element ::= EmptyElementTag | STag content ETag + * [40] STag ::= '<' Name (S Attribute)* S? '>' + * [44] EmptyElementTag ::= '<' Name (S Attribute)* S? '/>' + * </pre> + * <p> (The '<' has already been read.) + * <p>NOTE: this method actually chains onto parseContent (), if necessary, + * and parseContent () will take care of calling parseETag (). 
+ */ + private void parseElement (boolean maybeGetSubset) + throws Exception + { + String gi; + char c; + int oldElementContent = currentElementContent; + String oldElement = currentElement; + Object element []; + + // This is the (global) counter for the + // array of specified attributes. + tagAttributePos = 0; + + // Read the element type name. + gi = readNmtoken (true); + + // If we saw no DTD, and this is the document root element, + // let the application modify the input stream by providing one. + if (maybeGetSubset) { + InputSource subset = handler.getExternalSubset (gi, + handler.getSystemId ()); + if (subset != null) { + String publicId = subset.getPublicId (); + String systemId = subset.getSystemId (); + + handler.warn ("modifying document by adding DTD"); + handler.doctypeDecl (gi, publicId, systemId); + pushString (null, ">"); + + // NOTE: [dtd] is so we say what SAX2 expects, + // though it's misleading (subset, not entire dtd) + pushURL (true, "[dtd]", + new String [] { publicId, systemId, null }, + subset.getCharacterStream (), + subset.getByteStream (), + subset.getEncoding (), + false); + + // Loop until we end up back at '>' + while (true) { + doReport = expandPE = true; + skipWhitespace (); + doReport = expandPE = false; + if (tryRead ('>')) { + break; + } else { + expandPE = true; + parseMarkupdecl (); + expandPE = false; + } + } + + // the ">" string isn't popped yet + if (inputStack.size () != 1) + error ("external subset has unmatched '>'"); + + handler.endDoctype (); + } + } + + // Determine the current content type. + currentElement = gi; + element = (Object []) elementInfo.get (gi); + currentElementContent = getContentType (element, CONTENT_ANY); + + // Read the attributes, if any. + // After this loop, "c" is the closing delimiter. 
+ boolean white = tryWhitespace (); + c = readCh (); + while (c != '/' && c != '>') { + unread (c); + if (!white) + error ("need whitespace between attributes"); + parseAttribute (gi); + white = tryWhitespace (); + c = readCh (); + } + + // Supply any defaulted attributes. + Enumeration atts = declaredAttributes (element); + if (atts != null) { + String aname; +loop: + while (atts.hasMoreElements ()) { + aname = (String) atts.nextElement (); + // See if it was specified. + for (int i = 0; i < tagAttributePos; i++) { + if (tagAttributes [i] == aname) { + continue loop; + } + } + // ... or has a default + String value = getAttributeDefaultValue (gi, aname); + + if (value == null) + continue; + handler.attribute (aname, value, false); + } + } + + // Figure out if this is a start tag + // or an empty element, and dispatch an + // event accordingly. + switch (c) { + case '>': + handler.startElement (gi); + parseContent (); + break; + case '/': + require ('>'); + handler.startElement (gi); + handler.endElement (gi); + break; + } + + // Restore the previous state. + currentElement = oldElement; + currentElementContent = oldElementContent; + } + + + /** + * Parse an attribute assignment. + * <pre> + * [41] Attribute ::= Name Eq AttValue + * </pre> + * @param name The name of the attribute's element. + * @see SAXDriver#attribute + */ + private void parseAttribute (String name) + throws Exception + { + String aname; + String type; + String value; + int flags = LIT_ATTRIBUTE | LIT_ENTITY_REF; + + // Read the attribute name. + aname = readNmtoken (true); + type = getAttributeType (name, aname); + + // Parse '=' + parseEq (); + + // Read the value, normalizing whitespace + // unless it is CDATA. 
+ if (handler.getFeature (SAXDriver.FEATURE + "string-interning")) { + if (type == "CDATA" || type == null) { + value = readLiteral (flags); + } else { + value = readLiteral (flags | LIT_NORMALIZE); + } + } else { + if (type.equals("CDATA") || type == null) { + value = readLiteral (flags); + } else { + value = readLiteral (flags | LIT_NORMALIZE); + } + } + + // WFC: no duplicate attributes + for (int i = 0; i < tagAttributePos; i++) + if (aname.equals (tagAttributes [i])) + error ("duplicate attribute", aname, null); + + // Inform the handler about the + // attribute. + handler.attribute (aname, value, true); + dataBufferPos = 0; + + // Note that the attribute has been + // specified. + if (tagAttributePos == tagAttributes.length) { + String newAttrib[] = new String [tagAttributes.length * 2]; + System.arraycopy (tagAttributes, 0, newAttrib, 0, tagAttributePos); + tagAttributes = newAttrib; + } + tagAttributes [tagAttributePos++] = aname; + } + + + /** + * Parse an equals sign surrounded by optional whitespace. + * <pre> + * [25] Eq ::= S? '=' S? + * </pre> + */ + private void parseEq () + throws SAXException, IOException + { + skipWhitespace (); + require ('='); + skipWhitespace (); + } + + + /** + * Parse an end tag. + * <pre> + * [42] ETag ::= '</' Name S? '>' + * </pre> + * <p>NOTE: parseContent () chains to here, we already read the + * "</". + */ + private void parseETag () + throws Exception + { + require (currentElement); + skipWhitespace (); + require ('>'); + handler.endElement (currentElement); + // not re-reporting any SAXException re bogus end tags, + // even though that diagnostic might be clearer ... + } + + + /** + * Parse the content of an element. + * <pre> + * [43] content ::= (element | CharData | Reference + * | CDSect | PI | Comment)* + * [67] Reference ::= EntityRef | CharRef + * </pre> + * <p> NOTE: consumes ETtag. 
+ */ + private void parseContent () + throws Exception + { + char c; + + while (true) { + // consume characters (or ignorable whitspace) until delimiter + parseCharData (); + + // Handle delimiters + c = readCh (); + switch (c) { + + case '&': // Found "&" + c = readCh (); + if (c == '#') { + parseCharRef (); + } else { + unread (c); + parseEntityRef (true); + } + isDirtyCurrentElement = true; + break; + + case '<': // Found "<" + dataBufferFlush (); + c = readCh (); + switch (c) { + case '!': // Found "<!" + c = readCh (); + switch (c) { + case '-': // Found "<!-" + require ('-'); + isDirtyCurrentElement = false; + parseComment (); + break; + case '[': // Found "<![" + isDirtyCurrentElement = false; + require ("CDATA["); + handler.startCDATA (); + inCDATA = true; + parseCDSect (); + inCDATA = false; + handler.endCDATA (); + break; + default: + error ("expected comment or CDATA section", c, null); + break; + } + break; + + case '?': // Found "<?" + isDirtyCurrentElement = false; + parsePI (); + break; + + case '/': // Found "</" + isDirtyCurrentElement = false; + parseETag (); + return; + + default: // Found "<" followed by something else + isDirtyCurrentElement = false; + unread (c); + parseElement (false); + break; + } + } + } + + } + + + /** + * Parse an element type declaration. + * <pre> + * [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>' + * </pre> + * <p> NOTE: the '<!ELEMENT' has already been read. + */ + private void parseElementDecl () + throws Exception + { + String name; + + requireWhitespace (); + // Read the element type name. + name = readNmtoken (true); + + requireWhitespace (); + // Read the content model. + parseContentspec (name); + + skipWhitespace (); + require ('>'); + } + + + /** + * Content specification. + * <pre> + * [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements + * </pre> + */ + private void parseContentspec (String name) + throws Exception + { +// FIXME: move elementDecl() into setElement(), pass EMTPY/ANY ... 
+ if (tryRead ("EMPTY")) { + setElement (name, CONTENT_EMPTY, null, null); + if (!skippedPE) + handler.getDeclHandler ().elementDecl (name, "EMPTY"); + return; + } else if (tryRead ("ANY")) { + setElement (name, CONTENT_ANY, null, null); + if (!skippedPE) + handler.getDeclHandler ().elementDecl (name, "ANY"); + return; + } else { + String model; + char saved []; + + require ('('); + saved = readBuffer; + dataBufferAppend ('('); + skipWhitespace (); + if (tryRead ("#PCDATA")) { + dataBufferAppend ("#PCDATA"); + parseMixed (saved); + model = dataBufferToString (); + setElement (name, CONTENT_MIXED, model, null); + } else { + parseElements (saved); + model = dataBufferToString (); + setElement (name, CONTENT_ELEMENTS, model, null); + } + if (!skippedPE) + handler.getDeclHandler ().elementDecl (name, model); + } + } + + /** + * Parse an element-content model. + * <pre> + * [47] elements ::= (choice | seq) ('?' | '*' | '+')? + * [49] choice ::= '(' S? cp (S? '|' S? cp)+ S? ')' + * [50] seq ::= '(' S? cp (S? ',' S? cp)* S? ')' + * </pre> + * + * <p> NOTE: the opening '(' and S have already been read. + * + * @param saved Buffer for entity that should have the terminal ')' + */ + private void parseElements (char saved []) + throws Exception + { + char c; + char sep; + + // Parse the first content particle + skipWhitespace (); + parseCp (); + + // Check for end or for a separator. + skipWhitespace (); + c = readCh (); + switch (c) { + case ')': + // VC: Proper Group/PE Nesting + if (readBuffer != saved) + handler.verror ("Illegal Group/PE nesting"); + + dataBufferAppend (')'); + c = readCh (); + switch (c) { + case '*': + case '+': + case '?': + dataBufferAppend (c); + break; + default: + unread (c); + } + return; + case ',': // Register the separator. + case '|': + sep = c; + dataBufferAppend (c); + break; + default: + error ("bad separator in content model", c, null); + return; + } + + // Parse the rest of the content model. 
+ while (true) { + skipWhitespace (); + parseCp (); + skipWhitespace (); + c = readCh (); + if (c == ')') { + // VC: Proper Group/PE Nesting + if (readBuffer != saved) + handler.verror ("Illegal Group/PE nesting"); + + dataBufferAppend (')'); + break; + } else if (c != sep) { + error ("bad separator in content model", c, null); + return; + } else { + dataBufferAppend (c); + } + } + + // Check for the occurrence indicator. + c = readCh (); + switch (c) { + case '?': + case '*': + case '+': + dataBufferAppend (c); + return; + default: + unread (c); + return; + } + } + + + /** + * Parse a content particle. + * <pre> + * [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')? + * </pre> + */ + private void parseCp () + throws Exception + { + if (tryRead ('(')) { + dataBufferAppend ('('); + parseElements (readBuffer); + } else { + dataBufferAppend (readNmtoken (true)); + char c = readCh (); + switch (c) { + case '?': + case '*': + case '+': + dataBufferAppend (c); + break; + default: + unread (c); + break; + } + } + } + + + /** + * Parse mixed content. + * <pre> + * [51] Mixed ::= '(' S? ( '#PCDATA' (S? '|' S? Name)*) S? ')*' + * | '(' S? ('#PCDATA') S? ')' + * </pre> + * + * @param saved Buffer for entity that should have the terminal ')' + */ + private void parseMixed (char saved []) + throws Exception + { + // Check for PCDATA alone. + skipWhitespace (); + if (tryRead (')')) { + // VC: Proper Group/PE Nesting + if (readBuffer != saved) + handler.verror ("Illegal Group/PE nesting"); + + dataBufferAppend (")*"); + tryRead ('*'); + return; + } + + // Parse mixed content. + skipWhitespace (); + while (!tryRead (")")) { + require ('|'); + dataBufferAppend ('|'); + skipWhitespace (); + dataBufferAppend (readNmtoken (true)); + skipWhitespace (); + } + + // VC: Proper Group/PE Nesting + if (readBuffer != saved) + handler.verror ("Illegal Group/PE nesting"); + + require ('*'); + dataBufferAppend (")*"); + } + + + /** + * Parse an attribute list declaration. 
+ * <pre> + * [52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>' + * </pre> + * <p>NOTE: the '<!ATTLIST' has already been read. + */ + private void parseAttlistDecl () + throws Exception + { + String elementName; + + requireWhitespace (); + elementName = readNmtoken (true); + boolean white = tryWhitespace (); + while (!tryRead ('>')) { + if (!white) + error ("whitespace required before attribute definition"); + parseAttDef (elementName); + white = tryWhitespace (); + } + } + + + /** + * Parse a single attribute definition. + * <pre> + * [53] AttDef ::= S Name S AttType S DefaultDecl + * </pre> + */ + private void parseAttDef (String elementName) + throws Exception + { + String name; + String type; + String enumer = null; + + // Read the attribute name. + name = readNmtoken (true); + + // Read the attribute type. + requireWhitespace (); + type = readAttType (); + + // Get the string of enumerated values if necessary. + if (handler.getFeature (SAXDriver.FEATURE + "string-interning")) { + if ("ENUMERATION" == type || "NOTATION" == type) + enumer = dataBufferToString (); + } else { + if ("ENUMERATION".equals(type) || "NOTATION".equals(type)) + enumer = dataBufferToString (); + } + + // Read the default value. + requireWhitespace (); + parseDefault (elementName, name, type, enumer); + } + + + /** + * Parse the attribute type. 
+ * <pre> + * [54] AttType ::= StringType | TokenizedType | EnumeratedType + * [55] StringType ::= 'CDATA' + * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' + * | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS' + * [57] EnumeratedType ::= NotationType | Enumeration + * </pre> + */ + private String readAttType () + throws Exception + { + if (tryRead ('(')) { + parseEnumeration (false); + return "ENUMERATION"; + } else { + String typeString = readNmtoken (true); + if (handler.getFeature (SAXDriver.FEATURE + "string-interning")) { + if ("NOTATION" == typeString) { + parseNotationType (); + return typeString; + } else if ("CDATA" == typeString + || "ID" == typeString + || "IDREF" == typeString + || "IDREFS" == typeString + || "ENTITY" == typeString + || "ENTITIES" == typeString + || "NMTOKEN" == typeString + || "NMTOKENS" == typeString) + return typeString; + } else { + if ("NOTATION".equals(typeString)) { + parseNotationType (); + return typeString; + } else if ("CDATA".equals(typeString) + || "ID".equals(typeString) + || "IDREF".equals(typeString) + || "IDREFS".equals(typeString) + || "ENTITY".equals(typeString) + || "ENTITIES".equals(typeString) + || "NMTOKEN".equals(typeString) + || "NMTOKENS".equals(typeString)) + return typeString; + } + error ("illegal attribute type", typeString, null); + return null; + } + } + + + /** + * Parse an enumeration. + * <pre> + * [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' + * </pre> + * <p>NOTE: the '(' has already been read. + */ + private void parseEnumeration (boolean isNames) + throws Exception + { + dataBufferAppend ('('); + + // Read the first token. + skipWhitespace (); + dataBufferAppend (readNmtoken (isNames)); + // Read the remaining tokens. 
+ skipWhitespace (); + while (!tryRead (')')) { + require ('|'); + dataBufferAppend ('|'); + skipWhitespace (); + dataBufferAppend (readNmtoken (isNames)); + skipWhitespace (); + } + dataBufferAppend (')'); + } + + + /** + * Parse a notation type for an attribute. + * <pre> + * [58] NotationType ::= 'NOTATION' S '(' S? NameNtoks + * (S? '|' S? name)* S? ')' + * </pre> + * <p>NOTE: the 'NOTATION' has already been read + */ + private void parseNotationType () + throws Exception + { + requireWhitespace (); + require ('('); + + parseEnumeration (true); + } + + + /** + * Parse the default value for an attribute. + * <pre> + * [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' + * | (('#FIXED' S)? AttValue) + * </pre> + */ + private void parseDefault ( + String elementName, + String name, + String type, + String enumer + ) throws Exception + { + int valueType = ATTRIBUTE_DEFAULT_SPECIFIED; + String value = null; + int flags = LIT_ATTRIBUTE; + boolean saved = expandPE; + String defaultType = null; + + // LIT_ATTRIBUTE forces '<' checks now (ASAP) and turns whitespace + // chars to spaces (doesn't matter when that's done if it doesn't + // interfere with char refs expanding to whitespace). 
+ + if (!skippedPE) { + flags |= LIT_ENTITY_REF; + if (handler.getFeature (SAXDriver.FEATURE + "string-interning")) { + if ("CDATA" != type) + flags |= LIT_NORMALIZE; + } else { + if (!"CDATA".equals(type)) + flags |= LIT_NORMALIZE; + } + } + + expandPE = false; + if (tryRead ('#')) { + if (tryRead ("FIXED")) { + defaultType = "#FIXED"; + valueType = ATTRIBUTE_DEFAULT_FIXED; + requireWhitespace (); + value = readLiteral (flags); + } else if (tryRead ("REQUIRED")) { + defaultType = "#REQUIRED"; + valueType = ATTRIBUTE_DEFAULT_REQUIRED; + } else if (tryRead ("IMPLIED")) { + defaultType = "#IMPLIED"; + valueType = ATTRIBUTE_DEFAULT_IMPLIED; + } else { + error ("illegal keyword for attribute default value"); + } + } else + value = readLiteral (flags); + expandPE = saved; + setAttribute (elementName, name, type, enumer, value, valueType); + if (handler.getFeature (SAXDriver.FEATURE + "string-interning")) { + if ("ENUMERATION" == type) + type = enumer; + else if ("NOTATION" == type) + type = "NOTATION " + enumer; + } else { + if ("ENUMERATION".equals(type)) + type = enumer; + else if ("NOTATION".equals(type)) + type = "NOTATION " + enumer; + } + if (!skippedPE) handler.getDeclHandler () + .attributeDecl (elementName, name, type, defaultType, value); + } + + + /** + * Parse a conditional section. + * <pre> + * [61] conditionalSect ::= includeSect || ignoreSect + * [62] includeSect ::= '<![' S? 'INCLUDE' S? '[' + * extSubsetDecl ']]>' + * [63] ignoreSect ::= '<![' S? 'IGNORE' S? '[' + * ignoreSectContents* ']]>' + * [64] ignoreSectContents ::= Ignore + * ('<![' ignoreSectContents* ']]>' Ignore )* + * [65] Ignore ::= Char* - (Char* ( '<![' | ']]>') Char* ) + * </pre> + * <p> NOTE: the '>![' has already been read. 
+ */ + private void parseConditionalSect (char saved []) + throws Exception + { + skipWhitespace (); + if (tryRead ("INCLUDE")) { + skipWhitespace (); + require ('['); + // VC: Proper Conditional Section/PE Nesting + if (readBuffer != saved) + handler.verror ("Illegal Conditional Section/PE nesting"); + skipWhitespace (); + while (!tryRead ("]]>")) { + parseMarkupdecl (); + skipWhitespace (); + } + } else if (tryRead ("IGNORE")) { + skipWhitespace (); + require ('['); + // VC: Proper Conditional Section/PE Nesting + if (readBuffer != saved) + handler.verror ("Illegal Conditional Section/PE nesting"); + int nesting = 1; + char c; + expandPE = false; + for (int nest = 1; nest > 0;) { + c = readCh (); + switch (c) { + case '<': + if (tryRead ("![")) { + nest++; + } + case ']': + if (tryRead ("]>")) { + nest--; + } + } + } + expandPE = true; + } else { + error ("conditional section must begin with INCLUDE or IGNORE"); + } + } + + private void parseCharRef () + throws SAXException, IOException + { + parseCharRef (true /* do flushDataBuffer by default */); + } + + /** + * Try to read a character reference without consuming data from buffer. + * <pre> + * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' + * </pre> + * <p>NOTE: the '&#' has already been read. 
+ */ + private void tryReadCharRef () + throws SAXException, IOException + { + int value = 0; + char c; + + if (tryRead ('x')) { +loop1: + while (true) { + c = readCh (); + int n; + switch (c) { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + n = c - '0'; + break; + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + n = (c - 'a') + 10; + break; + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + n = (c - 'A') + 10; + break; + case ';': + break loop1; + default: + error ("illegal character in character reference", c, null); + break loop1; + } + value *= 16; + value += n; + } + } else { +loop2: + while (true) { + c = readCh (); + switch (c) { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + value *= 10; + value += c - '0'; + break; + case ';': + break loop2; + default: + error ("illegal character in character reference", c, null); + break loop2; + } + } + } + + // check for character refs being legal XML + if ((value < 0x0020 + && ! (value == '\n' || value == '\t' || value == '\r')) + || (value >= 0xD800 && value <= 0xDFFF) + || value == 0xFFFE || value == 0xFFFF + || value > 0x0010ffff) + error ("illegal XML character reference U+" + + Integer.toHexString (value)); + + // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz + // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz: + if (value > 0x0010ffff) { + // too big for surrogate + error ("character reference " + value + " is too large for UTF-16", + new Integer (value).toString (), null); + } + + } + + /** + * Read and interpret a character reference. + * <pre> + * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' + * </pre> + * <p>NOTE: the '&#' has already been read. 
+ */ + private void parseCharRef (boolean doFlush) + throws SAXException, IOException + { + int value = 0; + char c; + + if (tryRead ('x')) { +loop1: + while (true) { + c = readCh (); + int n; + switch (c) { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + n = c - '0'; + break; + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + n = (c - 'a') + 10; + break; + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + n = (c - 'A') + 10; + break; + case ';': + break loop1; + default: + error ("illegal character in character reference", c, null); + break loop1; + } + value *= 16; + value += n; + } + } else { +loop2: + while (true) { + c = readCh (); + switch (c) { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + value *= 10; + value += c - '0'; + break; + case ';': + break loop2; + default: + error ("illegal character in character reference", c, null); + break loop2; + } + } + } + + // check for character refs being legal XML + if ((value < 0x0020 + && ! (value == '\n' || value == '\t' || value == '\r')) + || (value >= 0xD800 && value <= 0xDFFF) + || value == 0xFFFE || value == 0xFFFF + || value > 0x0010ffff) + error ("illegal XML character reference U+" + + Integer.toHexString (value)); + + // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz + // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz: + if (value <= 0x0000ffff) { + // no surrogates needed + dataBufferAppend ((char) value); + } else if (value <= 0x0010ffff) { + value -= 0x10000; + // > 16 bits, surrogate needed + dataBufferAppend ((char) (0xd800 | (value >> 10))); + dataBufferAppend ((char) (0xdc00 | (value & 0x0003ff))); + } else { + // too big for surrogate + error ("character reference " + value + " is too large for UTF-16", + new Integer (value).toString (), null); + } + if (doFlush) dataBufferFlush (); + } + + + /** + * Parse and expand an entity reference. 
/**
 * Parse and expand an entity reference.
 * <pre>
 * [68] EntityRef ::= '&' Name ';'
 * </pre>
 * <p>NOTE: the '&' has already been read.
 *
 * <p>The entity name is read with <code>readNmtoken</code>, a closing
 * ';' is required, and the action then depends on the value returned
 * by <code>getEntityType</code> for that name.
 *
 * @param externalAllowed External entities are allowed here.
 */
private void parseEntityRef (boolean externalAllowed)
throws SAXException, IOException
{
    String name;

    name = readNmtoken (true);
    require (';');
    switch (getEntityType (name)) {
    case ENTITY_UNDECLARED:
        // NOTE:  XML REC describes amazingly convoluted handling for
        // this case.  Nothing as meaningful as being a WFness error
        // unless the processor might _legitimately_ not have seen a
        // declaration ... which is what this implements.
        String message;

        message = "reference to undeclared general entity " + name;
        if (skippedPE && !docIsStandalone) {
            handler.verror (message);
            // we don't know this entity, and it might be external...
            if (externalAllowed)
                handler.skippedEntity (name);
        } else
            error (message);
        break;
    case ENTITY_INTERNAL:
        pushString (name, getEntityValue (name));

        //workaround for possible input pop before marking
        //the buffer reading position
        char t = readCh ();
        unread (t);
        // remember where the pushed replacement text starts, so the
        // read position can be restored after the pre-scan below
        int bufferPosMark = readBufferPos;

        // Pre-scan the replacement text: every '&' must start either a
        // character reference or something that reads as "Name ;".
        int end = readBufferPos + getEntityValue (name).length();
        for(int k = readBufferPos; k < end; k++){
            t = readCh ();
            if (t == '&'){
                t = readCh ();
                if (t == '#'){
                    //try to match a character ref
                    tryReadCharRef ();

                    //everything has been read
                    if (readBufferPos >= end)
                        break;
                    k = readBufferPos;
                    continue;
                }
                else if (Character.isLetter(t)){
                    //looks like an entity ref
                    unread (t);
                    readNmtoken (true);
                    require (';');

                    //everything has been read
                    if (readBufferPos >= end)
                        break;
                    k = readBufferPos;
                    continue;
                }
                error(" malformed entity reference");
            }

        }
        // rewind: the scan above was only a check, the text is still
        // to be consumed by the caller
        readBufferPos = bufferPosMark;
        break;
    case ENTITY_TEXT:
        if (externalAllowed) {
            pushURL (false, name, getEntityIds (name),
                    null, null, null, true);
        } else {
            error ("reference to external entity in attribute value.",
                    name, null);
        }
        break;
    case ENTITY_NDATA:
        if (externalAllowed) {
            error ("unparsed entity reference in content", name, null);
        } else {
            error ("reference to external entity in attribute value.",
                    name, null);
        }
        break;
    default:
        throw new RuntimeException ();
    }
}


/**
 * Parse and expand a parameter entity reference.
 * <pre>
 * [69] PEReference ::= '%' Name ';'
 * </pre>
 * <p>NOTE: the '%' has already been read.
 *
 * <p>Parameter entities are looked up under their name prefixed with
 * '%'.  Outside a literal (<code>inLiteral</code> false) the expansion
 * is padded with one space on each side.
 */
private void parsePEReference ()
throws SAXException, IOException
{
    String name;

    name = "%" + readNmtoken (true);
    require (';');
    switch (getEntityType (name)) {
    case ENTITY_UNDECLARED:
        // VC: Entity Declared
        handler.verror ("reference to undeclared parameter entity " + name);

        // we should disable handling of all subsequent declarations
        // unless this is a standalone document (info discarded)
        break;
    case ENTITY_INTERNAL:
        if (inLiteral)
            pushString (name, getEntityValue (name));
        else
            pushString (name, ' ' + getEntityValue (name) + ' ');
        break;
    case ENTITY_TEXT:
        // the leading and trailing space are pushed as separate inputs
        // around the external entity
        if (!inLiteral)
            pushString (null, " ");
        pushURL (true, name, getEntityIds (name), null, null, null, true);
        if (!inLiteral)
            pushString (null, " ");
        break;
    }
}

/**
 * Parse an entity declaration.
 * <pre>
 * [70] EntityDecl ::= GEDecl | PEDecl
 * [71] GEDecl ::= '&lt;!ENTITY' S Name S EntityDef S? '&gt;'
 * [72] PEDecl ::= '&lt;!ENTITY' S '%' S Name S PEDef S? '&gt;'
 * [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
 * [74] PEDef ::= EntityValue | ExternalID
 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
 *		   | 'PUBLIC' S PubidLiteral S SystemLiteral
 * [76] NDataDecl ::= S 'NDATA' S Name
 * </pre>
 * <p>NOTE: the '&lt;!ENTITY' has already been read.
 */
private void parseEntityDecl ()
throws Exception
{
    boolean peFlag = false;
    // passed to readLiteral unchanged; never modified in this method
    int flags = 0;

    // Check for a parameter entity.
    // (PE expansion is switched off while looking at the '%' marker.)
    expandPE = false;
    requireWhitespace ();
    if (tryRead ('%')) {
        peFlag = true;
        requireWhitespace ();
    }
    expandPE = true;

    // Read the entity name, and prepend
    // '%' if necessary.
    String name = readNmtoken (true);
    //NE08
    if (name.indexOf(':') >= 0)
        error ("Illegal character(':') in entity name ", name, null);
    if (peFlag) {
        name = "%" + name;
    }

    // Read the entity value.
    requireWhitespace ();
    // peek at the next character without consuming it
    char c = readCh ();
    unread (c);
    if (c == '"' || c == '\'') {
        // Internal entity ... replacement text has expanded refs
        // to characters and PEs, but not to general entities
        String value = readLiteral (flags);
        setInternalEntity (name, value);
    } else {
        // Read the external IDs
        String ids [] = readExternalIds (false, false);

        // Check for NDATA declaration.
        boolean white = tryWhitespace ();
        if (!peFlag && tryRead ("NDATA")) {
            if (!white)
                error ("whitespace required before NDATA");
            requireWhitespace ();
            String notationName = readNmtoken (true);
            if (!skippedPE) {
                setExternalEntity (name, ENTITY_NDATA, ids, notationName);
                handler.unparsedEntityDecl (name, ids, notationName);
            }
        } else if (!skippedPE) {
            setExternalEntity (name, ENTITY_TEXT, ids, null);
            handler.getDeclHandler ()
                .externalEntityDecl (name, ids [0],
                    handler.resolveURIs ()
                            // FIXME: ASSUMES not skipped
                            // "false" forces error on bad URI
                        ? handler.absolutize (ids [2], ids [1], false)
                        : ids [1]);
        }
    }

    // Finish the declaration.
    skipWhitespace ();
    require ('>');
}
/**
 * Parse a notation declaration.
 * <pre>
 * [82] NotationDecl ::= '&lt;!NOTATION' S Name S
 *		(ExternalID | PublicID) S? '&gt;'
 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
 * </pre>
 * <P>NOTE: the '&lt;!NOTATION' has already been read.
 */
private void parseNotationDecl ()
throws Exception
{
    String nname, ids[];


    requireWhitespace ();
    nname = readNmtoken (true);
    //NE08
    if (nname.indexOf(':') >= 0)
        error ("Illegal character(':') in notation name ", nname, null);
    requireWhitespace ();

    // Read the external identifiers.
    // (first argument true: parsing a notation declaration)
    ids = readExternalIds (true, false);

    // Register the notation.
    setNotation (nname, ids);

    skipWhitespace ();
    require ('>');
}


/**
 * Parse character data.
 * <pre>
 * [14] CharData ::= [^&lt;&amp;]* - ([^&lt;&amp;]* ']]&gt;' [^&lt;&amp;]*)
 * </pre>
 *
 * <p>Text is reported to the handler straight out of the read buffer,
 * one buffer-full at a time, as ignorable whitespace or as character
 * data.  Scanning stops at '&amp;' or '&lt;' (normal end) or at
 * ']]&gt;' (reported as an error).
 */
private void parseCharData ()
throws Exception
{
    char c;
    // 0 = still scanning, 1 = stopped at '&' or '<', 2 = found "]]>"
    int state = 0;
    boolean pureWhite = false;

    // assert (dataBufferPos == 0);

    // are we expecting pure whitespace?  it might be dirty...
    if ((currentElementContent == CONTENT_ELEMENTS) && !isDirtyCurrentElement)
        pureWhite = true;

    // always report right out of readBuffer
    // to minimize (pointless) buffer copies
    while (true) {
        int lineAugment = 0;
        int columnAugment = 0;
        int i;

        loop:
        for (i = readBufferPos; i < readBufferLength; i++) {
            switch (c = readBuffer [i]) {
            case '\n':
                lineAugment++;
                columnAugment = 0;
                // pureWhite unmodified
                break;
            case '\r':	// should not happen!!
            case '\t':
            case ' ':
                // pureWhite unmodified
                columnAugment++;
                break;
            case '&':
            case '<':
                columnAugment++;
                // pureWhite unmodified
                // CLEAN end of text sequence
                state = 1;
                break loop;
            case ']':
                // that's not a whitespace char, and
                // can not terminate pure whitespace either
                pureWhite = false;
                if ((i + 2) < readBufferLength) {
                    if (readBuffer [i + 1] == ']'
                            && readBuffer [i + 2] == '>') {
                        // ERROR end of text sequence
                        state = 2;
                        break loop;
                    }
                } else {
                    // FIXME missing two end-of-buffer cases
                }
                columnAugment++;
                break;
            default:
                if ((c < 0x0020 || c > 0xFFFD)
                        || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085)
                            && xmlVersion == XML_11))
                    error ("illegal XML character U+"
                            + Integer.toHexString (c));
                // that's not a whitespace char
                pureWhite = false;
                columnAugment++;
            }
        }

        // report text thus far
        if (lineAugment > 0) {
            line += lineAugment;
            column = columnAugment;
        } else {
            column += columnAugment;
        }

        // report characters/whitespace
        int length = i - readBufferPos;

        if (length != 0) {
            if (pureWhite)
                handler.ignorableWhitespace (readBuffer,
                        readBufferPos, length);
            else
                handler.charData (readBuffer, readBufferPos, length);
            readBufferPos = i;
        }

        if (state != 0)
            break;

        // fill next buffer from this entity, or
        // pop stack and continue with previous entity
        unread (readCh ());
    }
    if (!pureWhite)
        isDirtyCurrentElement = true;
    // finish, maybe with error
    // (the loop only exits with state 1 or 2; anything but 1 means the
    // scan stopped on "]]>")
    if (state != 1)
        error ("character data may not contain ']]>'");
}


//////////////////////////////////////////////////////////////////////
// High-level reading and scanning methods.
//////////////////////////////////////////////////////////////////////
/**
 * Require whitespace characters.
 * Reads one character; if it is whitespace the rest of the run is
 * skipped, otherwise an error is reported.
 */
private void requireWhitespace ()
throws SAXException, IOException
{
    char c = readCh ();
    if (isWhitespace (c)) {
        skipWhitespace ();
    } else {
        error ("whitespace required", c, null);
    }
}


/**
 * Skip whitespace characters.
 * <pre>
 * [3] S ::= (#x20 | #x9 | #xd | #xa)+
 * </pre>
 */
private void skipWhitespace ()
throws SAXException, IOException
{
    // Start with a little cheat.  Most of
    // the time, the white space will fall
    // within the current read buffer; if
    // not, then fall through.
    if (USE_CHEATS) {
        int lineAugment = 0;
        int columnAugment = 0;

        loop:
        for (int i = readBufferPos; i < readBufferLength; i++) {
            switch (readBuffer [i]) {
            case ' ':
            case '\t':
            case '\r':
                columnAugment++;
                break;
            case '\n':
                lineAugment++;
                columnAugment = 0;
                break;
            case '%':
                // a possible PE reference: leave it to the slow path
                if (expandPE)
                    break loop;
                // else fall through...
            default:
                // first non-whitespace character: commit the position
                // and the line/column bookkeeping, then stop
                readBufferPos = i;
                if (lineAugment > 0) {
                    line += lineAugment;
                    column = columnAugment;
                } else {
                    column += columnAugment;
                }
                return;
            }
        }
        // Reaching here means the fast scan did not finish inside the
        // buffer; readBufferPos was not advanced, so the slow path
        // below starts over from the same position.
    }

    // OK, do it the slow way.
    char c = readCh ();
    while (isWhitespace (c)) {
        c = readCh ();
    }
    unread (c);
}


/**
 * Read a name or (when parsing an enumeration) name token.
 * <pre>
 * [5] Name ::= (Letter | '_' | ':') (NameChar)*
 * [7] Nmtoken ::= (NameChar)+
 * </pre>
 *
 * @param isName true to apply the name-start check to the first
 *	character (a Name), false to treat every character as a
 *	name character (an Nmtoken).
 * @return the interned name or name token.
 */
private String readNmtoken (boolean isName)
throws SAXException, IOException
{
    char c;

    // Fast path: scan directly in the read buffer and intern the token
    // without copying it through nameBuffer.
    if (USE_CHEATS) {
        loop:
        for (int i = readBufferPos; i < readBufferLength; i++) {
            c = readBuffer [i];
            switch (c) {
            case '%':
                if (expandPE)
                    break loop;
                // else fall through...

                // What may legitimately come AFTER a name/nmtoken?
            case '<': case '>': case '&':
            case ',': case '|': case '*': case '+': case '?':
            case ')':
            case '=':
            case '\'': case '"':
            case '[':
            case ' ': case '\t': case '\r': case '\n':
            case ';':
            case '/':
                int start = readBufferPos;
                if (i == start)
                    error ("name expected", readBuffer [i], null);
                readBufferPos = i;
                return intern (readBuffer, start, i - start);

            default:
// FIXME ... per IBM's OASIS test submission, these:
//   ?		U+06dd
//   Combining	U+309B
                //these switches are kind of ugly but at least we won't
                //have to go over the whole list for each char
                // Dispatch on the high byte, then on the high nibble of
                // the low byte (c2), to reach the few code points that
                // are rejected explicitly as name start characters.
                if (isName && i == readBufferPos){
                    char c2 = (char) (c & 0x00f0);
                    switch (c & 0xff00){
                    //starting with 01
                    case 0x0100:
                        switch (c2){
                        case 0x0030:
                            if (c == 0x0132 || c == 0x0133 || c == 0x013f)
                                error ("Not a name start character, U+"
                                        + Integer.toHexString (c));
                            break;
                        case 0x0040:
                            if (c == 0x0140 || c == 0x0149)
                                error ("Not a name start character, U+"
                                        + Integer.toHexString (c));
                            break;
                        case 0x00c0:
                            if (c == 0x01c4 || c == 0x01cc)
                                error ("Not a name start character, U+"
                                        + Integer.toHexString (c));
                            break;
                        case 0x00f0:
                            if (c == 0x01f1 || c == 0x01f3)
                                error ("Not a name start character, U+"
                                        + Integer.toHexString (c));
                            break;
                        case 0x00b0:
                            // NOTE(review): with c2 == 0x00b0 neither
                            // comparison below can be true (both values
                            // have high nibble 0xf); this looks like a
                            // copy of the 0x00f0 case -- confirm which
                            // code points were intended.
                            if (c == 0x01f1 || c == 0x01f3)
                                error ("Not a name start character, U+"
                                        + Integer.toHexString (c));
                            break;
                        default:
                            if (c == 0x017f)
                                error ("Not a name start character, U+"
                                        + Integer.toHexString (c));
                        }

                        break;
                    //starting with 11
                    case 0x1100:
                        switch (c2){
                        case 0x0000:
                            if (c == 0x1104 || c == 0x1108 ||
                                    c == 0x110a || c == 0x110d)
                                error ("Not a name start character, U+"
                                        + Integer.toHexString (c));
                            break;
                        case 0x0030:
                            if (c == 0x113b || c == 0x113f)
                                error ("Not a name start character, U+"
                                        + Integer.toHexString (c));
                            break;
                        case 0x0040:
                            if (c == 0x1141 || c == 0x114d
                                    || c == 0x114f )
                                error ("Not a name start character, U+"
                                        + Integer.toHexString (c));
                            break;
                        case 0x0050:
                            if (c == 0x1151 || c == 0x1156)
                                error ("Not a name start character, U+"
                                        + Integer.toHexString (c));
                            break;
                        case 0x0060:
                            if (c == 0x1162 || c == 0x1164
                                    || c == 0x1166 || c == 0x116b
                                    || c == 0x116f)
                                error ("Not a name start character, U+"
                                        + Integer.toHexString (c));
                            break;
                        case 0x00b0:
                            // NOTE(review): c == 0x116f cannot match
                            // here (its high nibble is 0x6, handled in
                            // the case above) -- confirm the intended
                            // value.
                            if (c == 0x11b6 || c == 0x11b9
                                    || c == 0x11bb || c == 0x116f)
                                error ("Not a name start character, U+"
                                        + Integer.toHexString (c));
                            break;
                        default:
                            if (c == 0x1174 || c == 0x119f
                                    || c == 0x11ac || c == 0x11c3
                                    || c == 0x11f1)
                                error ("Not a name start character, U+"
                                        + Integer.toHexString (c));
                        }
                        break;
                    default:
                        if (c == 0x0e46 || c == 0x1011
                                || c == 0x212f || c == 0x0587
                                || c == 0x0230 )
                            error ("Not a name start character, U+"
                                    + Integer.toHexString (c));
                    }
                }
                // punt on exact tests from Appendix A; approximate
                // them using the Unicode ID start/part rules
                if (i == readBufferPos && isName) {
                    if (!Character.isUnicodeIdentifierStart (c)
                            && c != ':' && c != '_')
                        error ("Not a name start character, U+"
                                + Integer.toHexString (c));
                } else if (!Character.isUnicodeIdentifierPart (c)
                        && c != '-' && c != ':' && c != '_' && c != '.'
                        && !isExtender (c))
                    error ("Not a name character, U+"
                            + Integer.toHexString (c));
            }
        }
    }

    // Slow path: accumulate the token in nameBuffer one character at a
    // time via readCh().
    nameBufferPos = 0;

    // Read the first character.
    loop:
    while (true) {
        c = readCh ();
        switch (c) {
        case '%':
        case '<': case '>': case '&':
        case ',': case '|': case '*': case '+': case '?':
        case ')':
        case '=':
        case '\'': case '"':
        case '[':
        case ' ': case '\t': case '\n': case '\r':
        case ';':
        case '/':
            // terminator: push it back and hand out what was collected
            unread (c);
            if (nameBufferPos == 0) {
                error ("name expected");
            }
            // punt on exact tests from Appendix A, but approximate them
            if (isName
                    && !Character.isUnicodeIdentifierStart (
                            nameBuffer [0])
                    && ":_".indexOf (nameBuffer [0]) == -1)
                error ("Not a name start character, U+"
                        + Integer.toHexString (nameBuffer [0]));
            String s = intern (nameBuffer, 0, nameBufferPos);
            nameBufferPos = 0;
            return s;
        default:
            // punt on exact tests from Appendix A, but approximate them

            if ((nameBufferPos != 0 || !isName)
                    && !Character.isUnicodeIdentifierPart (c)
                    && ":-_.".indexOf (c) == -1
                    && !isExtender (c))
                error ("Not a name character, U+"
                        + Integer.toHexString (c));
            if (nameBufferPos >= nameBuffer.length)
                nameBuffer =
                    (char[]) extendArray (nameBuffer,
                            nameBuffer.length, nameBufferPos);
            nameBuffer [nameBufferPos++] = c;
        }
    }
}

/**
 * Test whether a character is one of the code points this parser
 * accepts as an XML "Extender" (production [88]).
 *
 * @param c The character to test.
 * @return true if the character is in the list below.
 */
private static boolean isExtender (char c)
{
    // [88] Extender ::= ...
    return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
            || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005
            || (c >= 0x3031 && c <= 0x3035)
            || (c >= 0x309d && c <= 0x309e)
            || (c >= 0x30fc && c <= 0x30fe);
}
/**
 * Read a literal.  With matching single or double quotes as
 * delimiters (and not embedded!) this is used to parse:
 * <pre>
 *	[9] EntityValue ::= ... ([^%&amp;] | PEReference | Reference)* ...
 *	[10] AttValue ::= ... ([^&lt;&amp;] | Reference)* ...
 *	[11] SystemLiteral ::= ... (URLchar - "'")* ...
 *	[12] PubidLiteral ::= ... (PubidChar - "'")* ...
 * </pre>
 * as well as the quoted strings in XML and text declarations
 * (for version, encoding, and standalone) which have their
 * own constraints.
 *
 * @param flags a bit mask of the LIT_* constants tested below
 *	(LIT_DISABLE_PE, LIT_ATTRIBUTE, LIT_PUBID, LIT_DISABLE_CREF,
 *	LIT_ENTITY_REF, LIT_DISABLE_EREF, LIT_NORMALIZE).
 * @return the literal's text taken from the data buffer, or null when
 *	the first character read is not a quote and error() returns.
 */
private String readLiteral (int flags)
throws SAXException, IOException
{
    char delim, c;
    int startLine = line;
    // parser state that is changed while inside the literal and
    // restored before returning
    boolean saved = expandPE;
    boolean savedReport = doReport;

    // Find the first delimiter.
    delim = readCh ();
    if (delim != '"' && delim != '\'') {
        error ("expected '\"' or \"'\"", delim, null);
        return null;
    }
    inLiteral = true;
    if ((flags & LIT_DISABLE_PE) != 0)
        expandPE = false;
    doReport = false;

    // Each level of input source has its own buffer; remember
    // ours, so we won't read the ending delimiter from any
    // other input source, regardless of entity processing.
    char ourBuf [] = readBuffer;

    // Read the literal.
    try {
        c = readCh ();
        // NOTE(review): ampRead is assigned below but never read in
        // this method.
        boolean ampRead = false;
        loop:
        while (! (c == delim && readBuffer == ourBuf)) {
            switch (c) {
            // attributes and public ids are normalized
            // in almost the same ways
            case '\n':
            case '\r':
                if ((flags & (LIT_ATTRIBUTE | LIT_PUBID)) != 0)
                    c = ' ';
                break;
            case '\t':
                if ((flags & LIT_ATTRIBUTE) != 0)
                    c = ' ';
                break;
            case '&':
                c = readCh ();
                // Char refs are expanded immediately, except for
                // all the cases where it's deferred.
                if (c == '#') {
                    if ((flags & LIT_DISABLE_CREF) != 0) {
                        // keep "&#" literally: '&' is appended here,
                        // '#' by the common append after the switch
                        dataBufferAppend ('&');
                        break;
                    }
                    parseCharRef (false /* Do not do flushDataBuffer */);

                    // exotic WFness risk: this is an entity literal,
                    // dataBuffer [dataBufferPos - 1] == '&', and
                    // following chars are a _partial_ entity/char ref

                // It looks like an entity ref ...
                } else {
                    unread (c);
                    // Expand it?
                    if ((flags & LIT_ENTITY_REF) > 0) {
                        parseEntityRef (false);
                        if (String.valueOf (readBuffer).equals("&"))
                            ampRead = true;
                    //Is it just data?
                    } else if ((flags & LIT_DISABLE_EREF) != 0) {
                        dataBufferAppend ('&');

                    // OK, it will be an entity ref -- expanded later.
                    } else {
                        String name = readNmtoken (true);
                        require (';');
                        dataBufferAppend ('&');
                        dataBufferAppend (name);
                        dataBufferAppend (';');
                    }
                }
                // the reference has been dealt with completely; fetch
                // the next character and skip the common append
                c = readCh ();
                continue loop;

            case '<':
                // and why?  Perhaps so "&foo;" expands the same
                // inside and outside an attribute?
                if ((flags & LIT_ATTRIBUTE) != 0)
                    error ("attribute values may not contain '<'");
                break;

            // We don't worry about case '%' and PE refs, readCh does.

            default:
                break;
            }
            dataBufferAppend (c);
            c = readCh ();
        }
    } catch (EOFException e) {
        error ("end of input while looking for delimiter (started on line "
                + startLine + ')', null, new Character (delim).toString ());
    }
    // NOTE(review): these restores are skipped when any exception other
    // than EOFException propagates out of the loop above.
    inLiteral = false;
    expandPE = saved;
    doReport = savedReport;

    // Normalise whitespace if necessary.
    if ((flags & LIT_NORMALIZE) > 0) {
        dataBufferNormalize ();
    }

    // Return the value.
    return dataBufferToString ();
}
+ */ + private String[] readExternalIds (boolean inNotation, boolean isSubset) + throws Exception + { + char c; + String ids[] = new String [3]; + int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; + + if (tryRead ("PUBLIC")) { + requireWhitespace (); + ids [0] = readLiteral (LIT_NORMALIZE | LIT_PUBID | flags); + if (inNotation) { + skipWhitespace (); + c = readCh (); + unread (c); + if (c == '"' || c == '\'') { + ids [1] = readLiteral (flags); + } + } else { + requireWhitespace (); + ids [1] = readLiteral (flags); + } + + for (int i = 0; i < ids [0].length (); i++) { + c = ids [0].charAt (i); + if (c >= 'a' && c <= 'z') + continue; + if (c >= 'A' && c <= 'Z') + continue; + if (" \r\n0123456789-' ()+,./:=?;!*#@$_%".indexOf (c) != -1) + continue; + error ("illegal PUBLIC id character U+" + + Integer.toHexString (c)); + } + } else if (tryRead ("SYSTEM")) { + requireWhitespace (); + ids [1] = readLiteral (flags); + } else if (!isSubset) + error ("missing SYSTEM or PUBLIC keyword"); + + if (ids [1] != null) { + if (ids [1].indexOf ('#') != -1) + handler.verror ("SYSTEM id has a URI fragment: " + ids [1]); + ids [2] = handler.getSystemId (); + if (ids [2] == null) + handler.warn ("No base URI; hope URI is absolute: " + + ids [1]); + } + + return ids; + } + + + /** + * Test if a character is whitespace. + * <pre> + * [3] S ::= (#x20 | #x9 | #xd | #xa)+ + * </pre> + * @param c The character to test. + * @return true if the character is whitespace. + */ + private final boolean isWhitespace (char c) + { + if (c > 0x20) + return false; + if (c == 0x20 || c == 0x0a || c == 0x09 || c == 0x0d) + return true; + return false; // illegal ... + } + + + ////////////////////////////////////////////////////////////////////// + // Utility routines. + ////////////////////////////////////////////////////////////////////// + + + /** + * Add a character to the data buffer. + */ + private void dataBufferAppend (char c) + { + // Expand buffer if necessary. 
+ if (dataBufferPos >= dataBuffer.length) + dataBuffer = + (char[]) extendArray (dataBuffer, + dataBuffer.length, dataBufferPos); + dataBuffer [dataBufferPos++] = c; + } + + + /** + * Add a string to the data buffer. + */ + private void dataBufferAppend (String s) + { + dataBufferAppend (s.toCharArray (), 0, s.length ()); + } + + + /** + * Append (part of) a character array to the data buffer. + */ + private void dataBufferAppend (char ch[], int start, int length) + { + dataBuffer = (char[]) + extendArray (dataBuffer, dataBuffer.length, + dataBufferPos + length); + + System.arraycopy (ch, start, dataBuffer, dataBufferPos, length); + dataBufferPos += length; + } + + + /** + * Normalise space characters in the data buffer. + */ + private void dataBufferNormalize () + { + int i = 0; + int j = 0; + int end = dataBufferPos; + + // Skip spaces at the start. + while (j < end && dataBuffer [j] == ' ') { + j++; + } + + // Skip whitespace at the end. + while (end > j && dataBuffer [end - 1] == ' ') { + end --; + } + + // Start copying to the left. + while (j < end) { + + char c = dataBuffer [j++]; + + // Normalise all other spaces to + // a single space. + if (c == ' ') { + while (j < end && dataBuffer [j++] == ' ') + continue; + dataBuffer [i++] = ' '; + dataBuffer [i++] = dataBuffer [j - 1]; + } else { + dataBuffer [i++] = c; + } + } + + // The new length is <= the old one. + dataBufferPos = i; + } + + + /** + * Convert the data buffer to a string. + */ + private String dataBufferToString () + { + String s = new String (dataBuffer, 0, dataBufferPos); + dataBufferPos = 0; + return s; + } + + + /** + * Flush the contents of the data buffer to the handler, as + * appropriate, and reset the buffer for new input. 
+ */ + private void dataBufferFlush () + throws SAXException + { + if (currentElementContent == CONTENT_ELEMENTS + && dataBufferPos > 0 + && !inCDATA + ) { + // We can't just trust the buffer to be whitespace, there + // are (error) cases when it isn't + for (int i = 0; i < dataBufferPos; i++) { + if (!isWhitespace (dataBuffer [i])) { + handler.charData (dataBuffer, 0, dataBufferPos); + dataBufferPos = 0; + } + } + if (dataBufferPos > 0) { + handler.ignorableWhitespace (dataBuffer, 0, dataBufferPos); + dataBufferPos = 0; + } + } else if (dataBufferPos > 0) { + handler.charData (dataBuffer, 0, dataBufferPos); + dataBufferPos = 0; + } + } + + + /** + * Require a string to appear, or throw an exception. + * <p><em>Precondition:</em> Entity expansion is not required. + * <p><em>Precondition:</em> data buffer has no characters that + * will get sent to the application. + */ + private void require (String delim) + throws SAXException, IOException + { + int length = delim.length (); + char ch []; + + if (length < dataBuffer.length) { + ch = dataBuffer; + delim.getChars (0, length, ch, 0); + } else + ch = delim.toCharArray (); + + if (USE_CHEATS + && length <= (readBufferLength - readBufferPos)) { + int offset = readBufferPos; + + for (int i = 0; i < length; i++, offset++) + if (ch [i] != readBuffer [offset]) + error ("required string", null, delim); + readBufferPos = offset; + + } else { + for (int i = 0; i < length; i++) + require (ch [i]); + } + } + + + /** + * Require a character to appear, or throw an exception. + */ + private void require (char delim) + throws SAXException, IOException + { + char c = readCh (); + + if (c != delim) { + error ("required character", c, new Character (delim).toString ()); + } + } + + + /** + * Create an interned string from a character array. + * Ælfred uses this method to create an interned version + * of all names and name tokens, so that it can test equality + * with <code>==</code> instead of <code>String.equals ()</code>. 
+ * + * <p>This is much more efficient than constructing a non-interned + * string first, and then interning it. + * + * @param ch an array of characters for building the string. + * @param start the starting position in the array. + * @param length the number of characters to place in the string. + * @return an interned string. + * @see #intern (String) + * @see java.lang.String#intern + */ + public String intern (char ch[], int start, int length) + { + int index = 0; + int hash = 0; + Object bucket []; + + // Generate a hash code. This is a widely used string hash, + // often attributed to Brian Kernighan. + for (int i = start; i < start + length; i++) + hash = 31 * hash + ch [i]; + hash = (hash & 0x7fffffff) % SYMBOL_TABLE_LENGTH; + + // Get the bucket -- consists of {array,String} pairs + if ((bucket = symbolTable [hash]) == null) { + // first string in this bucket + bucket = new Object [8]; + + // Search for a matching tuple, and + // return the string if we find one. + } else { + while (index < bucket.length) { + char chFound [] = (char []) bucket [index]; + + // Stop when we hit an empty entry. + if (chFound == null) + break; + + // If they're the same length, check for a match. + if (chFound.length == length) { + for (int i = 0; i < chFound.length; i++) { + // continue search on failure + if (ch [start + i] != chFound [i]) { + break; + } else if (i == length - 1) { + // That's it, we have a match! + return (String) bucket [index + 1]; + } + } + } + index += 2; + } + // Not found -- we'll have to add it. + + // Do we have to grow the bucket? + bucket = (Object []) extendArray (bucket, bucket.length, index); + } + symbolTable [hash] = bucket; + + // OK, add it to the end of the bucket -- "local" interning. + // Intern "globally" to let applications share interning benefits. + // That is, "!=" and "==" work on our strings, not just equals(). 
+ String s = new String (ch, start, length).intern (); + bucket [index] = s.toCharArray (); + bucket [index + 1] = s; + return s; + } + + /** + * Ensure the capacity of an array, allocating a new one if + * necessary. Usually extends only for name hash collisions. + */ + private Object extendArray (Object array, int currentSize, int requiredSize) + { + if (requiredSize < currentSize) { + return array; + } else { + Object newArray = null; + int newSize = currentSize * 2; + + if (newSize <= requiredSize) + newSize = requiredSize + 1; + + if (array instanceof char[]) + newArray = new char [newSize]; + else if (array instanceof Object[]) + newArray = new Object [newSize]; + else + throw new RuntimeException (); + + System.arraycopy (array, 0, newArray, 0, currentSize); + return newArray; + } + } + + + ////////////////////////////////////////////////////////////////////// + // XML query routines. + ////////////////////////////////////////////////////////////////////// + + + boolean isStandalone () { return docIsStandalone; } + + + // + // Elements + // + + private int getContentType (Object element [], int defaultType) + { + int retval; + + if (element == null) + return defaultType; + retval = ((Integer) element [0]).intValue (); + if (retval == CONTENT_UNDECLARED) + retval = defaultType; + return retval; + } + + + /** + * Look up the content type of an element. + * @param name The element type name. + * @return An integer constant representing the content type. + * @see #CONTENT_UNDECLARED + * @see #CONTENT_ANY + * @see #CONTENT_EMPTY + * @see #CONTENT_MIXED + * @see #CONTENT_ELEMENTS + */ + public int getElementContentType (String name) + { + Object element [] = (Object []) elementInfo.get (name); + return getContentType (element, CONTENT_UNDECLARED); + } + + + /** + * Register an element. 
+ * Array format: + * [0] element type name + * [1] content model (mixed, elements only) + * [2] attribute hash table + */ + private void setElement ( + String name, + int contentType, + String contentModel, + Hashtable attributes + ) throws SAXException + { + if (skippedPE) + return; + + Object element [] = (Object []) elementInfo.get (name); + + // first <!ELEMENT ...> or <!ATTLIST ...> for this type? + if (element == null) { + element = new Object [3]; + element [0] = new Integer (contentType); + element [1] = contentModel; + element [2] = attributes; + elementInfo.put (name, element); + return; + } + + // <!ELEMENT ...> declaration? + if (contentType != CONTENT_UNDECLARED) { + // ... following an associated <!ATTLIST ...> + if (((Integer) element [0]).intValue () == CONTENT_UNDECLARED) { + element [0] = new Integer (contentType); + element [1] = contentModel; + } else + // VC: Unique Element Type Declaration + handler.verror ("multiple declarations for element type: " + + name); + } + + // first <!ATTLIST ...>, before <!ELEMENT ...> ? + else if (attributes != null) + element [2] = attributes; + } + + + /** + * Look up the attribute hash table for an element. + * The hash table is the second item in the element array. + */ + private Hashtable getElementAttributes (String name) + { + Object element[] = (Object[]) elementInfo.get (name); + if (element == null) + return null; + else + return (Hashtable) element [2]; + } + + + + // + // Attributes + // + + /** + * Get the declared attributes for an element type. + * @param elname The name of the element type. + * @return An Enumeration of all the attributes declared for + * a specific element type. The results will be valid only + * after the DTD (if any) has been parsed. 
+ * @see #getAttributeType + * @see #getAttributeEnumeration + * @see #getAttributeDefaultValueType + * @see #getAttributeDefaultValue + * @see #getAttributeExpandedValue + */ + private Enumeration declaredAttributes (Object element []) + { + Hashtable attlist; + + if (element == null) + return null; + if ((attlist = (Hashtable) element [2]) == null) + return null; + return attlist.keys (); + } + + /** + * Get the declared attributes for an element type. + * @param elname The name of the element type. + * @return An Enumeration of all the attributes declared for + * a specific element type. The results will be valid only + * after the DTD (if any) has been parsed. + * @see #getAttributeType + * @see #getAttributeEnumeration + * @see #getAttributeDefaultValueType + * @see #getAttributeDefaultValue + * @see #getAttributeExpandedValue + */ + public Enumeration declaredAttributes (String elname) + { + return declaredAttributes ((Object []) elementInfo.get (elname)); + } + + + /** + * Retrieve the declared type of an attribute. + * @param name The name of the associated element. + * @param aname The name of the attribute. + * @return An interend string denoting the type, or null + * indicating an undeclared attribute. + */ + public String getAttributeType (String name, String aname) + { + Object attribute[] = getAttribute (name, aname); + if (attribute == null) { + return null; + } else { + return (String) attribute [0]; + } + } + + + /** + * Retrieve the allowed values for an enumerated attribute type. + * @param name The name of the associated element. + * @param aname The name of the attribute. + * @return A string containing the token list. 
+ */ + public String getAttributeEnumeration (String name, String aname) + { + Object attribute[] = getAttribute (name, aname); + if (attribute == null) { + return null; + } else { + // assert: attribute [0] is "ENUMERATION" or "NOTATION" + return (String) attribute [3]; + } + } + + + /** + * Retrieve the default value of a declared attribute. + * @param name The name of the associated element. + * @param aname The name of the attribute. + * @return The default value, or null if the attribute was + * #IMPLIED or simply undeclared and unspecified. + * @see #getAttributeExpandedValue + */ + public String getAttributeDefaultValue (String name, String aname) + { + Object attribute[] = getAttribute (name, aname); + if (attribute == null) { + return null; + } else { + return (String) attribute [1]; + } + } + + /* + +// FIXME: Leaving this in, until W3C finally resolves the confusion +// between parts of the XML 2nd REC about when entity declararations +// are guaranteed to be known. Current code matches what section 5.1 +// (conformance) describes, but some readings of the self-contradicting +// text in 4.1 (the "Entity Declared" WFC and VC) seem to expect that +// attribute expansion/normalization must be deferred in some cases +// (just TRY to identify them!). + + * Retrieve the expanded value of a declared attribute. + * <p>General entities (and char refs) will be expanded (once). + * @param name The name of the associated element. + * @param aname The name of the attribute. 
+ * @return The expanded default value, or null if the attribute was + * #IMPLIED or simply undeclared + * @see #getAttributeDefaultValue + public String getAttributeExpandedValue (String name, String aname) + throws Exception + { + Object attribute[] = getAttribute (name, aname); + + if (attribute == null) { + return null; + } else if (attribute [4] == null && attribute [1] != null) { + // we MUST use the same buf for both quotes else the literal + // can't be properly terminated + char buf [] = new char [1]; + int flags = LIT_ENTITY_REF | LIT_ATTRIBUTE; + String type = getAttributeType (name, aname); + + if (type != "CDATA" && type != null) + flags |= LIT_NORMALIZE; + buf [0] = '"'; + pushCharArray (null, buf, 0, 1); + pushString (null, (String) attribute [1]); + pushCharArray (null, buf, 0, 1); + attribute [4] = readLiteral (flags); + } + return (String) attribute [4]; + } + */ + + /** + * Retrieve the default value mode of a declared attribute. + * @see #ATTRIBUTE_DEFAULT_SPECIFIED + * @see #ATTRIBUTE_DEFAULT_IMPLIED + * @see #ATTRIBUTE_DEFAULT_REQUIRED + * @see #ATTRIBUTE_DEFAULT_FIXED + */ + public int getAttributeDefaultValueType (String name, String aname) + { + Object attribute[] = getAttribute (name, aname); + if (attribute == null) { + return ATTRIBUTE_DEFAULT_UNDECLARED; + } else { + return ((Integer) attribute [2]).intValue (); + } + } + + + /** + * Register an attribute declaration for later retrieval. + * Format: + * - String type + * - String default value + * - int value type + * - enumeration + * - processed default value + */ + private void setAttribute (String elName, String name, String type, + String enumeration, + String value, int valueType) + throws Exception + { + Hashtable attlist; + + if (skippedPE) + return; + + // Create a new hashtable if necessary. + attlist = getElementAttributes (elName); + if (attlist == null) + attlist = new Hashtable (); + + // ignore multiple attribute declarations! 
+ if (attlist.get (name) != null) { + // warn ... + return; + } else { + Object attribute [] = new Object [5]; + attribute [0] = type; + attribute [1] = value; + attribute [2] = new Integer (valueType); + attribute [3] = enumeration; + attribute [4] = null; + attlist.put (name, attribute); + + // save; but don't overwrite any existing <!ELEMENT ...> + setElement (elName, CONTENT_UNDECLARED, null, attlist); + } + } + + + /** + * Retrieve the array representing an attribute declaration. + */ + private Object[] getAttribute (String elName, String name) + { + Hashtable attlist; + + attlist = getElementAttributes (elName); + if (attlist == null) + return null; + return (Object[]) attlist.get (name); + } + + + // + // Entities + // + + /** + * Find the type of an entity. + * @returns An integer constant representing the entity type. + * @see #ENTITY_UNDECLARED + * @see #ENTITY_INTERNAL + * @see #ENTITY_NDATA + * @see #ENTITY_TEXT + */ + public int getEntityType (String ename) + { + Object entity[] = (Object[]) entityInfo.get (ename); + if (entity == null) { + return ENTITY_UNDECLARED; + } else { + return ((Integer) entity [0]).intValue (); + } + } + + + /** + * Return an external entity's identifier array. + * @param ename The name of the external entity. + * @return Three element array containing (in order) the entity's + * public identifier, system identifier, and base URI. Null if + * the entity was not declared as an external entity. + * @see #getEntityType + */ + public String [] getEntityIds (String ename) + { + Object entity[] = (Object[]) entityInfo.get (ename); + if (entity == null) { + return null; + } else { + return (String []) entity [1]; + } + } + + + /** + * Return an internal entity's replacement text. + * @param ename The name of the internal entity. + * @return The entity's replacement text, or null if + * the entity was not declared as an internal entity. 
+ * @see #getEntityType + */ + public String getEntityValue (String ename) + { + Object entity[] = (Object[]) entityInfo.get (ename); + if (entity == null) { + return null; + } else { + return (String) entity [3]; + } + } + + + /** + * Register an entity declaration for later retrieval. + */ + private void setInternalEntity (String eName, String value) + throws SAXException + { + if (skippedPE) + return; + + if (entityInfo.get (eName) == null) { + Object entity[] = new Object [5]; + entity [0] = new Integer (ENTITY_INTERNAL); +// FIXME: shrink!! [2] useless + entity [3] = value; + entityInfo.put (eName, entity); + } + if (handler.getFeature (SAXDriver.FEATURE + "string-interning")) { + if ("lt" == eName || "gt" == eName || "quot" == eName + || "apos" == eName || "amp" == eName) + return; + } else { + if ("lt".equals(eName) || "gt".equals(eName) || "quot".equals(eName) + || "apos".equals(eName) || "amp".equals(eName)) + return; + } + handler.getDeclHandler () + .internalEntityDecl (eName, value); + } + + + /** + * Register an external entity declaration for later retrieval. + */ + private void setExternalEntity (String eName, int eClass, + String ids [], String nName) + { + if (entityInfo.get (eName) == null) { + Object entity[] = new Object [5]; + entity [0] = new Integer (eClass); + entity [1] = ids; +// FIXME: shrink!! [2] no longer used, [4] irrelevant given [0] + entity [4] = nName; + entityInfo.put (eName, entity); + } + } + + + // + // Notations. + // + + /** + * Report a notation declaration, checking for duplicates. + */ + private void setNotation (String nname, String ids []) + throws SAXException + { + if (skippedPE) + return; + + handler.notationDecl (nname, ids); + if (notationInfo.get (nname) == null) + notationInfo.put (nname, nname); + else + // VC: Unique Notation Name + handler.verror ("Duplicate notation name decl: " + nname); + } + + + // + // Location. + // + + + /** + * Return the current line number. 
 */
public int getLineNumber ()
{
    return line;
}


/**
 * Return the current column number.
 */
public int getColumnNumber ()
{
    return column;
}


//////////////////////////////////////////////////////////////////////
// High-level I/O.
//////////////////////////////////////////////////////////////////////


/**
 * Read a single character from the readBuffer.
 * <p>The readDataChunk () method maintains the buffer.
 * <p>If we hit the end of an entity, try to pop the stack and
 * keep going.
 * <p> (This approach doesn't really enforce XML's rules about
 * entity boundaries, but this is not currently a validating
 * parser).
 * <p>This routine also attempts to keep track of the current
 * position in external entities, but it's not entirely accurate.
 * <p>When <code>expandPE</code> is set, a '%' is not returned:
 * the parameter entity reference is parsed and the next character
 * is read instead.
 * @return The next character from the current input source.
 * @see #unread (char)
 * @see #readDataChunk
 * @see #readBuffer
 * @see #line
 */
private char readCh ()
throws SAXException, IOException
{
    // As long as there's nothing in the
    // read buffer, try reading more data
    // (for an external entity) or popping
    // the entity stack (for either).
    while (readBufferPos >= readBufferLength) {
        switch (sourceType) {
        case INPUT_READER:
        case INPUT_STREAM:
            readDataChunk ();
            // Still empty: this source is exhausted, so pop it and
            // refill from whatever source is restored.
            while (readBufferLength < 1) {
                popInput ();
                if (readBufferLength < 1) {
                    readDataChunk ();
                }
            }
            break;

        default:

            popInput ();
            break;
        }
    }

    char c = readBuffer [readBufferPos++];

    if (c == '\n') {
        line++;
        column = 0;
    } else {
        if (c == '<') {
            /* the most common return to parseContent () ... NOP */
        } else if (((c < 0x0020 && (c != '\t') && (c != '\r')) || c > 0xFFFD)
                || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085)
                   && xmlVersion == XML_11))
            error ("illegal XML character U+"
                    + Integer.toHexString (c));

        // If we're in the DTD and in a context where PEs get expanded,
        // do so ... 1/14/2000 errata identify those contexts.  There
        // are also spots in the internal subset where PE refs are fatal
        // errors, hence yet another flag.
        else if (c == '%' && expandPE) {
            if (peIsError)
                error ("PE reference within decl in internal subset.");
            parsePEReference ();
            // The '%' itself is consumed; hand back whatever follows.
            return readCh ();
        }
        column++;
    }

    return c;
}


/**
 * Push a single character back onto the current input stream.
 * <p>This method usually pushes the character back onto
 * the readBuffer.
 * <p>I don't think that this would ever be called with
 * readBufferPos = 0, because the methods always reads a character
 * before unreading it, but just in case, I've added a boundary
 * condition.
 * @param c The character to push back.
 * @see #readCh
 * @see #unread (char[])
 * @see #readBuffer
 */
private void unread (char c)
throws SAXException
{
    // Normal condition.
    if (c == '\n') {
        line--;
        column = -1;
    }
    if (readBufferPos > 0) {
        readBuffer [--readBufferPos] = c;
    } else {
        // Nothing to step back over: push the character as a new
        // internal input source instead.
        pushString (null, new Character (c).toString ());
    }
}


/**
 * Push a char array back onto the current input stream.
 * <p>NOTE: you must <em>never</em> push back characters that you
 * haven't actually read: use pushString () instead.
 * @param ch The characters to push back.
 * @param length The number of characters of <code>ch</code> to
 *	push back, counted from index 0.
 * @see #readCh
 * @see #unread (char)
 * @see #readBuffer
 * @see #pushString
 */
private void unread (char ch[], int length)
throws SAXException
{
    for (int i = 0; i < length; i++) {
        if (ch [i] == '\n') {
            line--;
            column = -1;
        }
    }
    if (length < readBufferPos) {
        // The characters are still in the buffer; just rewind.
        readBufferPos -= length;
    } else {
        pushCharArray (null, ch, 0, length);
    }
}


/**
 * Push, or skip, a new external input source.
 * The source will be some kind of parsed entity, such as a PE
 * (including the external DTD subset) or content for the body.
 *
 * @param url The java.net.URL object for the entity.
+ * @see SAXDriver#resolveEntity + * @see #pushString + * @see #sourceType + * @see #pushInput + * @see #detectEncoding + * @see #sourceType + * @see #readBuffer + */ + private void pushURL ( + boolean isPE, + String ename, + String ids [], // public, system, baseURI + Reader reader, + InputStream stream, + String encoding, + boolean doResolve + ) throws SAXException, IOException + { + boolean ignoreEncoding; + String systemId; + InputSource source; + + if (!isPE) + dataBufferFlush (); + + scratch.setPublicId (ids [0]); + scratch.setSystemId (ids [1]); + + // See if we should skip or substitute the entity. + // If we're not skipping, resolving reports startEntity() + // and updates the (handler's) stack of URIs. + if (doResolve) { + // assert (stream == null && reader == null && encoding == null) + source = handler.resolveEntity (isPE, ename, scratch, ids [2]); + if (source == null) { + handler.warn ("skipping entity: " + ename); + handler.skippedEntity (ename); + if (isPE) + skippedPE = true; + return; + } + + // we might be using alternate IDs/encoding + systemId = source.getSystemId (); + // The following warning and setting systemId was deleted bcause + // the application has the option of not setting systemId + // provided that it has set the characte/byte stream. 
+ /* + if (systemId == null) { + handler.warn ("missing system ID, using " + ids [1]); + systemId = ids [1]; + } + */ + } else { + // "[document]", or "[dtd]" via getExternalSubset() + scratch.setCharacterStream (reader); + scratch.setByteStream (stream); + scratch.setEncoding (encoding); + source = scratch; + systemId = ids [1]; + if (handler.getFeature (SAXDriver.FEATURE + "string-interning")) { + handler.startExternalEntity (ename, systemId, + "[document]" == ename); + } else { + handler.startExternalEntity (ename, systemId, + "[document]".equals(ename)); + } + } + + // we may have been given I/O streams directly + if (source.getCharacterStream () != null) { + if (source.getByteStream () != null) + error ("InputSource has two streams!"); + reader = source.getCharacterStream (); + } else if (source.getByteStream () != null) { + encoding = source.getEncoding (); + if (encoding == null) + stream = source.getByteStream (); + else try { + reader = new InputStreamReader ( + source.getByteStream (), + encoding); + } catch (IOException e) { + stream = source.getByteStream (); + } + } else if (systemId == null) + error ("InputSource has no URI!"); + scratch.setCharacterStream (null); + scratch.setByteStream (null); + scratch.setEncoding (null); + + // Push the existing status. + pushInput (ename); + + // Create a new read buffer. + // (Note the four-character margin) + readBuffer = new char [READ_BUFFER_MAX + 4]; + readBufferPos = 0; + readBufferLength = 0; + readBufferOverflow = -1; + is = null; + line = 1; + column = 0; + currentByteCount = 0; + + // If there's an explicit character stream, just + // ignore encoding declarations. + if (reader != null) { + sourceType = INPUT_READER; + this.reader = reader; + tryEncodingDecl (true); + return; + } + + // Else we handle the conversion, and need to ensure + // it's done right. + sourceType = INPUT_STREAM; + if (stream != null) { + is = stream; + } else { + // We have to open our own stream to the URL. 
+ URL url = new URL (systemId); + + externalEntity = url.openConnection (); + externalEntity.connect (); + is = externalEntity.getInputStream (); + } + + // If we get to here, there must be + // an InputStream available. + if (!is.markSupported ()) { + is = new BufferedInputStream (is); + } + + // Get any external encoding label. + if (encoding == null && externalEntity != null) { + // External labels can be untrustworthy; filesystems in + // particular often have the wrong default for content + // that wasn't locally originated. Those we autodetect. + if (!"file".equals (externalEntity.getURL ().getProtocol ())) { + int temp; + + // application/xml;charset=something;otherAttr=... + // ... with many variants on 'something' + encoding = externalEntity.getContentType (); + + // MHK code (fix for Saxon 5.5.1/007): + // protect against encoding==null + if (encoding==null) { + temp = -1; + } else { + temp = encoding.indexOf ("charset"); + } + + // RFC 2376 sez MIME text defaults to ASCII, but since the + // JDK will create a MIME type out of thin air, we always + // autodetect when there's no explicit charset attribute. + if (temp < 0) + encoding = null; // autodetect + else { + // only this one attribute + if ((temp = encoding.indexOf (';')) > 0) + encoding = encoding.substring (0, temp); + + if ((temp = encoding.indexOf ('=', temp + 7)) > 0) { + encoding = encoding.substring (temp + 1); + + // attributes can have comment fields (RFC 822) + if ((temp = encoding.indexOf ('(')) > 0) + encoding = encoding.substring (0, temp); + // ... and values may be quoted + if ((temp = encoding.indexOf ('"')) > 0) + encoding = encoding.substring (temp + 1, + encoding.indexOf ('"', temp + 2)); + encoding.trim (); + } else { + handler.warn ("ignoring illegal MIME attribute: " + + encoding); + encoding = null; + } + } + } + } + + // if we got an external encoding label, use it ... 
+ if (encoding != null) { + this.encoding = ENCODING_EXTERNAL; + setupDecoding (encoding); + ignoreEncoding = true; + + // ... else autodetect from first bytes. + } else { + detectEncoding (); + ignoreEncoding = false; + } + + // Read any XML or text declaration. + // If we autodetected, it may tell us the "real" encoding. + try { + tryEncodingDecl (ignoreEncoding); + } catch (UnsupportedEncodingException x) { + encoding = x.getMessage (); + + // if we don't handle the declared encoding, + // try letting a JVM InputStreamReader do it + try { + if (sourceType != INPUT_STREAM) + throw x; + + is.reset (); + readBufferPos = 0; + readBufferLength = 0; + readBufferOverflow = -1; + line = 1; + currentByteCount = column = 0; + + sourceType = INPUT_READER; + this.reader = new InputStreamReader (is, encoding); + is = null; + + tryEncodingDecl (true); + + } catch (IOException e) { + error ("unsupported text encoding", + encoding, + null); + } + } + } + + + /** + * Check for an encoding declaration. This is the second part of the + * XML encoding autodetection algorithm, relying on detectEncoding to + * get to the point that this part can read any encoding declaration + * in the document (using only US-ASCII characters). + * + * <p> Because this part starts to fill parser buffers with this data, + * it's tricky to setup a reader so that Java's built-in decoders can be + * used for the character encodings that aren't built in to this parser + * (such as EUC-JP, KOI8-R, Big5, etc). + * + * @return any encoding in the declaration, uppercased; or null + * @see detectEncoding + */ + private String tryEncodingDecl (boolean ignoreEncoding) + throws SAXException, IOException + { + // Read the XML/text declaration. 
+ if (tryRead ("<?xml")) { + if (tryWhitespace ()) { + if (inputStack.size () > 0) { + return parseTextDecl (ignoreEncoding); + } else { + return parseXMLDecl (ignoreEncoding); + } + } else { + // <?xml-stylesheet ...?> or similar + unread ('l'); + unread ('m'); + unread ('x'); + unread ('?'); + unread ('<'); + } + } + return null; + } + + + /** + * Attempt to detect the encoding of an entity. + * <p>The trick here (as suggested in the XML standard) is that + * any entity not in UTF-8, or in UCS-2 with a byte-order mark, + * <b>must</b> begin with an XML declaration or an encoding + * declaration; we simply have to look for "<?xml" in various + * encodings. + * <p>This method has no way to distinguish among 8-bit encodings. + * Instead, it sets up for UTF-8, then (possibly) revises its assumption + * later in setupDecoding (). Any ASCII-derived 8-bit encoding + * should work, but most will be rejected later by setupDecoding (). + * @see #tryEncoding (byte[], byte, byte, byte, byte) + * @see #tryEncoding (byte[], byte, byte) + * @see #setupDecoding + */ + private void detectEncoding () + throws SAXException, IOException + { + byte signature[] = new byte [4]; + + // Read the first four bytes for + // autodetection. + is.mark (4); + is.read (signature); + is.reset (); + + // + // FIRST: four byte encodings (who uses these?) 
+ // + if (tryEncoding (signature, (byte) 0x00, (byte) 0x00, + (byte) 0x00, (byte) 0x3c)) { + // UCS-4 must begin with "<?xml" + // 0x00 0x00 0x00 0x3c: UCS-4, big-endian (1234) + // "UTF-32BE" + encoding = ENCODING_UCS_4_1234; + + } else if (tryEncoding (signature, (byte) 0x3c, (byte) 0x00, + (byte) 0x00, (byte) 0x00)) { + // 0x3c 0x00 0x00 0x00: UCS-4, little-endian (4321) + // "UTF-32LE" + encoding = ENCODING_UCS_4_4321; + + } else if (tryEncoding (signature, (byte) 0x00, (byte) 0x00, + (byte) 0x3c, (byte) 0x00)) { + // 0x00 0x00 0x3c 0x00: UCS-4, unusual (2143) + encoding = ENCODING_UCS_4_2143; + + } else if (tryEncoding (signature, (byte) 0x00, (byte) 0x3c, + (byte) 0x00, (byte) 0x00)) { + // 0x00 0x3c 0x00 0x00: UCS-4, unusual (3421) + encoding = ENCODING_UCS_4_3412; + + // 00 00 fe ff UCS_4_1234 (with BOM) + // ff fe 00 00 UCS_4_4321 (with BOM) + } + + // + // SECOND: two byte encodings + // note ... with 1/14/2000 errata the XML spec identifies some + // more "broken UTF-16" autodetection cases, with no XML decl, + // which we don't handle here (that's legal too). + // + else if (tryEncoding (signature, (byte) 0xfe, (byte) 0xff)) { + // UCS-2 with a byte-order marker. (UTF-16) + // 0xfe 0xff: UCS-2, big-endian (12) + encoding = ENCODING_UCS_2_12; + is.read (); is.read (); + + } else if (tryEncoding (signature, (byte) 0xff, (byte) 0xfe)) { + // UCS-2 with a byte-order marker. 
(UTF-16) + // 0xff 0xfe: UCS-2, little-endian (21) + encoding = ENCODING_UCS_2_21; + is.read (); is.read (); + + } else if (tryEncoding (signature, (byte) 0x00, (byte) 0x3c, + (byte) 0x00, (byte) 0x3f)) { + // UTF-16BE (otherwise, malformed UTF-16) + // 0x00 0x3c 0x00 0x3f: UCS-2, big-endian, no byte-order mark + encoding = ENCODING_UCS_2_12; + error ("no byte-order mark for UCS-2 entity"); + + } else if (tryEncoding (signature, (byte) 0x3c, (byte) 0x00, + (byte) 0x3f, (byte) 0x00)) { + // UTF-16LE (otherwise, malformed UTF-16) + // 0x3c 0x00 0x3f 0x00: UCS-2, little-endian, no byte-order mark + encoding = ENCODING_UCS_2_21; + error ("no byte-order mark for UCS-2 entity"); + } + + // + // THIRD: ASCII-derived encodings, fixed and variable lengths + // + else if (tryEncoding (signature, (byte) 0x3c, (byte) 0x3f, + (byte) 0x78, (byte) 0x6d)) { + // ASCII derived + // 0x3c 0x3f 0x78 0x6d: UTF-8 or other 8-bit markup (read ENCODING) + encoding = ENCODING_UTF_8; + prefetchASCIIEncodingDecl (); + + } else if (signature [0] == (byte) 0xef + && signature [1] == (byte) 0xbb + && signature [2] == (byte) 0xbf) { + // 0xef 0xbb 0xbf: UTF-8 BOM (not part of document text) + // this un-needed notion slipped into XML 2nd ed through a + // "non-normative" erratum; now required by MSFT and UDDI, + // and E22 made it normative. + encoding = ENCODING_UTF_8; + is.read (); is.read (); is.read (); + + } else { + // 4c 6f a7 94 ... we don't understand EBCDIC flavors + // ... but we COULD at least kick in some fixed code page + + // (default) UTF-8 without encoding/XML declaration + encoding = ENCODING_UTF_8; + } + } + + + /** + * Check for a four-byte signature. + * <p>Utility routine for detectEncoding (). + * <p>Always looks for some part of "<?XML" in a specific encoding. + * @param sig The first four bytes read. 
+ * @param b1 The first byte of the signature + * @param b2 The second byte of the signature + * @param b3 The third byte of the signature + * @param b4 The fourth byte of the signature + * @see #detectEncoding + */ + private static boolean tryEncoding ( + byte sig[], byte b1, byte b2, byte b3, byte b4) + { + return (sig [0] == b1 && sig [1] == b2 + && sig [2] == b3 && sig [3] == b4); + } + + + /** + * Check for a two-byte signature. + * <p>Looks for a UCS-2 byte-order mark. + * <p>Utility routine for detectEncoding (). + * @param sig The first four bytes read. + * @param b1 The first byte of the signature + * @param b2 The second byte of the signature + * @see #detectEncoding + */ + private static boolean tryEncoding (byte sig[], byte b1, byte b2) + { + return ((sig [0] == b1) && (sig [1] == b2)); + } + + + /** + * This method pushes a string back onto input. + * <p>It is useful either as the expansion of an internal entity, + * or for backtracking during the parse. + * <p>Call pushCharArray () to do the actual work. + * @param s The string to push back onto input. + * @see #pushCharArray + */ + private void pushString (String ename, String s) + throws SAXException + { + char ch[] = s.toCharArray (); + pushCharArray (ename, ch, 0, ch.length); + } + + + /** + * Push a new internal input source. + * <p>This method is useful for expanding an internal entity, + * or for unreading a string of characters. It creates a new + * readBuffer containing the characters in the array, instead + * of characters converted from an input byte stream. + * @param ch The char array to push. 
+ * @see #pushString + * @see #pushURL + * @see #readBuffer + * @see #sourceType + * @see #pushInput + */ + private void pushCharArray (String ename, char ch[], int start, int length) + throws SAXException + { + // Push the existing status + pushInput (ename); + if (ename != null && doReport) { + dataBufferFlush (); + handler.startInternalEntity (ename); + } + sourceType = INPUT_INTERNAL; + readBuffer = ch; + readBufferPos = start; + readBufferLength = length; + readBufferOverflow = -1; + } + + + /** + * Save the current input source onto the stack. + * <p>This method saves all of the global variables associated with + * the current input source, so that they can be restored when a new + * input source has finished. It also tests for entity recursion. + * <p>The method saves the following global variables onto a stack + * using a fixed-length array: + * <ol> + * <li>sourceType + * <li>externalEntity + * <li>readBuffer + * <li>readBufferPos + * <li>readBufferLength + * <li>line + * <li>encoding + * </ol> + * @param ename The name of the entity (if any) causing the new input. + * @see #popInput + * @see #sourceType + * @see #externalEntity + * @see #readBuffer + * @see #readBufferPos + * @see #readBufferLength + * @see #line + * @see #encoding + */ + private void pushInput (String ename) + throws SAXException + { + // Check for entity recursion. + if (ename != null) { + Enumeration entities = entityStack.elements (); + while (entities.hasMoreElements ()) { + String e = (String) entities.nextElement (); + if (e != null && e == ename) { + error ("recursive reference to entity", ename, null); + } + } + } + entityStack.push (ename); + + // Don't bother if there is no current input. + if (sourceType == INPUT_NONE) { + return; + } + + // Set up a snapshot of the current + // input source. 
+ Object input[] = new Object [12]; + + input [0] = new Integer (sourceType); + input [1] = externalEntity; + input [2] = readBuffer; + input [3] = new Integer (readBufferPos); + input [4] = new Integer (readBufferLength); + input [5] = new Integer (line); + input [6] = new Integer (encoding); + input [7] = new Integer (readBufferOverflow); + input [8] = is; + input [9] = new Integer (currentByteCount); + input [10] = new Integer (column); + input [11] = reader; + + // Push it onto the stack. + inputStack.push (input); + } + + + /** + * Restore a previous input source. + * <p>This method restores all of the global variables associated with + * the current input source. + * @exception java.io.EOFException + * If there are no more entries on the input stack. + * @see #pushInput + * @see #sourceType + * @see #externalEntity + * @see #readBuffer + * @see #readBufferPos + * @see #readBufferLength + * @see #line + * @see #encoding + */ + private void popInput () + throws SAXException, IOException + { + String ename = (String) entityStack.pop (); + + if (ename != null && doReport) + dataBufferFlush (); + switch (sourceType) { + case INPUT_STREAM: + handler.endExternalEntity (ename); + is.close (); + break; + case INPUT_READER: + handler.endExternalEntity (ename); + reader.close (); + break; + case INPUT_INTERNAL: + if (ename != null && doReport) + handler.endInternalEntity (ename); + break; + } + + // Throw an EOFException if there + // is nothing else to pop. 
+ if (inputStack.isEmpty ()) { + throw new EOFException ("no more input"); + } + + Object input [] = (Object[]) inputStack.pop (); + + sourceType = ((Integer) input [0]).intValue (); + externalEntity = (URLConnection) input [1]; + readBuffer = (char[]) input [2]; + readBufferPos = ((Integer) input [3]).intValue (); + readBufferLength = ((Integer) input [4]).intValue (); + line = ((Integer) input [5]).intValue (); + encoding = ((Integer) input [6]).intValue (); + readBufferOverflow = ((Integer) input [7]).intValue (); + is = (InputStream) input [8]; + currentByteCount = ((Integer) input [9]).intValue (); + column = ((Integer) input [10]).intValue (); + reader = (Reader) input [11]; + } + + + /** + * Return true if we can read the expected character. + * <p>Note that the character will be removed from the input stream + * on success, but will be put back on failure. Do not attempt to + * read the character again if the method succeeds. + * @param delim The character that should appear next. For a + * insensitive match, you must supply this in upper-case. + * @return true if the character was successfully read, or false if + * it was not. + * @see #tryRead (String) + */ + private boolean tryRead (char delim) + throws SAXException, IOException + { + char c; + + // Read the character + c = readCh (); + + // Test for a match, and push the character + // back if the match fails. + if (c == delim) { + return true; + } else { + unread (c); + return false; + } + } + + + /** + * Return true if we can read the expected string. + * <p>This is simply a convenience method. + * <p>Note that the string will be removed from the input stream + * on success, but will be put back on failure. Do not attempt to + * read the string again if the method succeeds. + * <p>This method will push back a character rather than an + * array whenever possible (probably the majority of cases). + * @param delim The string that should appear next. 
+ * @return true if the string was successfully read, or false if + * it was not. + * @see #tryRead (char) + */ + private boolean tryRead (String delim) + throws SAXException, IOException + { + return tryRead (delim.toCharArray ()); + } + + private boolean tryRead (char ch []) + throws SAXException, IOException + { + char c; + + // Compare the input, character- + // by character. + + for (int i = 0; i < ch.length; i++) { + c = readCh (); + if (c != ch [i]) { + unread (c); + if (i != 0) { + unread (ch, i); + } + return false; + } + } + return true; + } + + + + /** + * Return true if we can read some whitespace. + * <p>This is simply a convenience method. + * <p>This method will push back a character rather than an + * array whenever possible (probably the majority of cases). + * @return true if whitespace was found. + */ + private boolean tryWhitespace () + throws SAXException, IOException + { + char c; + c = readCh (); + if (isWhitespace (c)) { + skipWhitespace (); + return true; + } else { + unread (c); + return false; + } + } + + + /** + * Read all data until we find the specified string. + * This is useful for scanning CDATA sections and PIs. + * <p>This is inefficient right now, since it calls tryRead () + * for every character. + * @param delim The string delimiter + * @see #tryRead (String, boolean) + * @see #readCh + */ + private void parseUntil (String delim) + throws SAXException, IOException + { + parseUntil (delim.toCharArray ()); + } + + private void parseUntil (char delim []) + throws SAXException, IOException + { + char c; + int startLine = line; + + try { + while (!tryRead (delim)) { + c = readCh (); + dataBufferAppend (c); + } + } catch (EOFException e) { + error ("end of input while looking for delimiter " + + "(started on line " + startLine + + ')', null, new String (delim)); + } + } + + + ////////////////////////////////////////////////////////////////////// + // Low-level I/O. 
//////////////////////////////////////////////////////////////////////


/**
 * Prefetch US-ASCII XML/text decl from input stream into read buffer.
 * Doesn't buffer more than absolutely needed, so that when an encoding
 * decl says we need to create an InputStreamReader, we can discard our
 * buffer and reset().  Caller knows the first chars of the decl exist
 * in the input stream.
 * <p>Bytes are copied one at a time, each widened to a char, up to
 * and including the first '&gt;'.  End of stream, or filling the read
 * buffer before a '&gt;' is seen, is reported through error ().
 */
private void prefetchASCIIEncodingDecl ()
throws SAXException, IOException
{
    int		ch;
    readBufferPos = readBufferLength = 0;

    // Mark far enough ahead that everything buffered here can be
    // re-read after a reset ().
    is.mark (readBuffer.length);
    while (true) {
        ch = is.read ();
        readBuffer [readBufferLength++] = (char) ch;
        switch (ch) {
          case (int) '>':
            return;
          case -1:
            error ("file ends before end of XML or encoding declaration.",
                   null, "?>");
        }
        if (readBuffer.length == readBufferLength)
            error ("unfinished XML or encoding declaration");
    }
}

/**
 * Read a chunk of data from an external input source.
 * <p>This is simply a front-end that fills the rawReadBuffer
 * with bytes, then calls the appropriate encoding handler.
 * For a character-stream source the reader fills readBuffer directly
 * and only carriage-return filtering is applied.
 * @see #encoding
 * @see #rawReadBuffer
 * @see #readBuffer
 * @see #filterCR
 * @see #copyUtf8ReadBuffer
 * @see #copyIso8859_1ReadBuffer
 * @see #copyUcs2ReadBuffer
 * @see #copyUcs4ReadBuffer
 */
private void readDataChunk ()
throws SAXException, IOException
{
    int count;

    // See if we have any overflow (filterCR sets for CR at end)
    if (readBufferOverflow > -1) {
        readBuffer [0] = (char) readBufferOverflow;
        readBufferOverflow = -1;
        readBufferPos = 1;
        sawCR = true;
    } else {
        readBufferPos = 0;
        sawCR = false;
    }

    // input from a character stream.
    if (sourceType == INPUT_READER) {
        count = reader.read (readBuffer,
                        readBufferPos, READ_BUFFER_MAX - readBufferPos);
        // A negative count is end of stream:  only a carried-over
        // character (if any) remains in the buffer.
        if (count < 0)
            readBufferLength = readBufferPos;
        else
            readBufferLength = readBufferPos + count;
        if (readBufferLength > 0)
            filterCR (count >= 0);
        sawCR = false;
        return;
    }

    // Read as many bytes as possible into the raw buffer.
    count = is.read (rawReadBuffer, 0, READ_BUFFER_MAX);

    // Dispatch to an encoding-specific reader method to populate
    // the readBuffer.  In most parser speed profiles, these routines
    // show up at the top of the CPU usage chart.
    if (count > 0) {
        switch (encoding) {
          // one byte builtins
          case ENCODING_ASCII:
            copyIso8859_1ReadBuffer (count, (char) 0x0080);
            break;
          case ENCODING_UTF_8:
            copyUtf8ReadBuffer (count);
            break;
          case ENCODING_ISO_8859_1:
            copyIso8859_1ReadBuffer (count, (char) 0);
            break;

          // two byte builtins
          case ENCODING_UCS_2_12:
            copyUcs2ReadBuffer (count, 8, 0);
            break;
          case ENCODING_UCS_2_21:
            copyUcs2ReadBuffer (count, 0, 8);
            break;

          // four byte builtins
          case ENCODING_UCS_4_1234:
            copyUcs4ReadBuffer (count, 24, 16, 8, 0);
            break;
          case ENCODING_UCS_4_4321:
            copyUcs4ReadBuffer (count, 0, 8, 16, 24);
            break;
          case ENCODING_UCS_4_2143:
            copyUcs4ReadBuffer (count, 16, 24, 0, 8);
            break;
          case ENCODING_UCS_4_3412:
            copyUcs4ReadBuffer (count, 8, 0, 24, 16);
            break;
        }
    } else
        // Nothing was read:  only a carried-over character (if any)
        // is in the buffer.
        readBufferLength = readBufferPos;

    readBufferPos = 0;

    // Filter out all carriage returns if we've seen any
    // (including any saved from a previous read)
    if (sawCR) {
        filterCR (count >= 0);
        sawCR = false;

        // must actively report EOF, lest some CRs get lost.
        if (readBufferLength == 0 && count >= 0)
            readDataChunk ();
    }

    if (count > 0)
        currentByteCount += count;
}


/**
 * Filter carriage returns in the read buffer.
 * CRLF becomes LF; CR becomes LF.
+ * @param moreData true iff more data might come from the same source + * @see #readDataChunk + * @see #readBuffer + * @see #readBufferOverflow + */ + private void filterCR (boolean moreData) + { + int i, j; + + readBufferOverflow = -1; + +loop: + for (i = j = readBufferPos; j < readBufferLength; i++, j++) { + switch (readBuffer [j]) { + case '\r': + if (j == readBufferLength - 1) { + if (moreData) { + readBufferOverflow = '\r'; + readBufferLength--; + } else // CR at end of buffer + readBuffer [i++] = '\n'; + break loop; + } else if (readBuffer [j + 1] == '\n') { + j++; + } + readBuffer [i] = '\n'; + break; + + case '\n': + default: + readBuffer [i] = readBuffer [j]; + break; + } + } + readBufferLength = i; + } + + /** + * Convert a buffer of UTF-8-encoded bytes into UTF-16 characters. + * <p>When readDataChunk () calls this method, the raw bytes are in + * rawReadBuffer, and the final characters will appear in + * readBuffer. + * <p>Note that as of Unicode 3.1, good practice became a requirement, + * so that each Unicode character has exactly one UTF-8 representation. + * @param count The number of bytes to convert. + * @see #readDataChunk + * @see #rawReadBuffer + * @see #readBuffer + * @see #getNextUtf8Byte + */ + private void copyUtf8ReadBuffer (int count) + throws SAXException, IOException + { + int i = 0; + int j = readBufferPos; + int b1; + char c = 0; + + /* + // check once, so the runtime won't (if it's smart enough) + if (count < 0 || count > rawReadBuffer.length) + throw new ArrayIndexOutOfBoundsException (Integer.toString (count)); + */ + + while (i < count) { + b1 = rawReadBuffer [i++]; + + // Determine whether we are dealing + // with a one-, two-, three-, or four- + // byte sequence. 
+ if (b1 < 0) { + if ((b1 & 0xe0) == 0xc0) { + // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx + c = (char) (((b1 & 0x1f) << 6) + | getNextUtf8Byte (i++, count)); + if (c < 0x0080) + encodingError ("Illegal two byte UTF-8 sequence", + c, 0); + //Sec 2.11 + // [1] the two-character sequence #xD #xA + // [2] the two-character sequence #xD #x85 + if ((c == 0x0085 || c == 0x000a) && sawCR) + continue; + + // Sec 2.11 + // [3] the single character #x85 + + if(c == 0x0085 && xmlVersion == XML_11) + readBuffer[j++] = '\r'; + } else if ((b1 & 0xf0) == 0xe0) { + // 3-byte sequence: + // zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx + // most CJKV characters + c = (char) (((b1 & 0x0f) << 12) | + (getNextUtf8Byte (i++, count) << 6) | + getNextUtf8Byte (i++, count)); + //sec 2.11 + //[4] the single character #x2028 + if(c == 0x2028 && xmlVersion == XML_11){ + readBuffer[j++] = '\r'; + sawCR = true; + continue; + } + if (c < 0x0800 || (c >= 0xd800 && c <= 0xdfff)) + encodingError ("Illegal three byte UTF-8 sequence", + c, 0); + } else if ((b1 & 0xf8) == 0xf0) { + // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx + // = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx + // (uuuuu = wwww + 1) + // "Surrogate Pairs" ... from the "Astral Planes" + // Unicode 3.1 assigned the first characters there + int iso646 = b1 & 07; + iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count); + iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count); + iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count); + + if (iso646 <= 0xffff) { + encodingError ("Illegal four byte UTF-8 sequence", + iso646, 0); + } else { + if (iso646 > 0x0010ffff) + encodingError ( + "UTF-8 value out of range for Unicode", + iso646, 0); + iso646 -= 0x010000; + readBuffer [j++] = (char) (0xd800 | (iso646 >> 10)); + readBuffer [j++] = (char) (0xdc00 | (iso646 & 0x03ff)); + continue; + } + } else { + // The five and six byte encodings aren't supported; + // they exceed the Unicode (and XML) range. 
+ encodingError ( + "unsupported five or six byte UTF-8 sequence", + 0xff & b1, i); + // NOTREACHED + c = 0; + } + } else { + // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx + // (US-ASCII character, "common" case, one branch to here) + c = (char) b1; + } + readBuffer [j++] = c; + if (c == '\r') + sawCR = true; + } + // How many characters have we read? + readBufferLength = j; + } + + + /** + * Return the next byte value in a UTF-8 sequence. + * The byte comes from rawReadBuffer while pos is below count, + * otherwise it is read directly from the input stream. + * @param pos The current position in the rawReadBuffer. + * @param count The number of bytes in the rawReadBuffer + * @return The significant six bits of a non-initial byte in + * a UTF-8 sequence. + * @exception SAXException If the input ends inside the sequence, or + * the byte is not a continuation byte (10xxxxxx); both cases are + * reported through encodingError (). + * @exception IOException If reading from the input stream fails. + */ + private int getNextUtf8Byte (int pos, int count) + throws SAXException, IOException + { + int val; + + // Take a character from the buffer + // or from the actual input stream. + if (pos < count) { + val = rawReadBuffer [pos]; + } else { + val = is.read (); + if (val == -1) { + encodingError ("unfinished multi-byte UTF-8 sequence at EOF", + -1, pos); + } + } + + // Check for the correct bits at the start. + if ((val & 0xc0) != 0x80) { + encodingError ("bad continuation of multi-byte UTF-8 sequence", + val, pos + 1); + } + + // Return the significant bits. + return (val & 0x3f); + } + + + /** + * Convert a buffer of US-ASCII or ISO-8859-1-encoded bytes into + * UTF-16 characters. + * + * <p>When readDataChunk () calls this method, the raw bytes are in + * rawReadBuffer, and the final characters will appear in + * readBuffer. + * + * @param count The number of bytes to convert. + * @param mask Bits that must be clear in every input byte; a byte + * with any of them set is rejected. readDataChunk () passes 0x0080 + * for US-ASCII and 0 for ISO-8859-1.
+ * @see #readDataChunk + * @see #rawReadBuffer + * @see #readBuffer + */ + private void copyIso8859_1ReadBuffer (int count, char mask) + throws IOException + { + int i, j; + for (i = 0, j = readBufferPos; i < count; i++, j++) { + char c = (char) (rawReadBuffer [i] & 0xff); + if ((c & mask) != 0) + throw new CharConversionException ("non-ASCII character U+" + + Integer.toHexString (c)); + if (c == 0x0085 && xmlVersion == XML_11) + c = '\r'; + readBuffer [j] = c; + if (c == '\r') { + sawCR = true; + } + } + readBufferLength = j; + } + + + /** + * Convert a buffer of UCS-2-encoded bytes into UTF-16 characters + * (as used in Java string manipulation). + * + * <p>When readDataChunk () calls this method, the raw bytes are in + * rawReadBuffer, and the final characters will appear in + * readBuffer. + * @param count The number of bytes to convert. + * @param shift1 The number of bits to shift byte 1. + * @param shift2 The number of bits to shift byte 2 + * @see #readDataChunk + * @see #rawReadBuffer + * @see #readBuffer + */ + private void copyUcs2ReadBuffer (int count, int shift1, int shift2) + throws SAXException + { + int j = readBufferPos; + + if (count > 0 && (count % 2) != 0) { + encodingError ("odd number of bytes in UCS-2 encoding", -1, count); + } + // The loops are faster with less internal brancing; hence two + if (shift1 == 0) { // "UTF-16-LE" + for (int i = 0; i < count; i += 2) { + char c = (char) (rawReadBuffer [i + 1] << 8); + c |= 0xff & rawReadBuffer [i]; + readBuffer [j++] = c; + if (c == '\r') + sawCR = true; + } + } else { // "UTF-16-BE" + for (int i = 0; i < count; i += 2) { + char c = (char) (rawReadBuffer [i] << 8); + c |= 0xff & rawReadBuffer [i + 1]; + readBuffer [j++] = c; + if (c == '\r') + sawCR = true; + } + } + readBufferLength = j; + } + + + /** + * Convert a buffer of UCS-4-encoded bytes into UTF-16 characters. 
+ * + * <p>When readDataChunk () calls this method, the raw bytes are in + * rawReadBuffer, and the final characters will appear in + * readBuffer. + * <p>Java has Unicode chars, and this routine uses surrogate pairs + * for ISO-10646 values between 0x00010000 and 0x000fffff. An + * exception is thrown if the ISO-10646 character has no Unicode + * representation. + * + * @param count The number of bytes to convert. + * @param shift1 The number of bits to shift byte 1. + * @param shift2 The number of bits to shift byte 2 + * @param shift3 The number of bits to shift byte 2 + * @param shift4 The number of bits to shift byte 2 + * @see #readDataChunk + * @see #rawReadBuffer + * @see #readBuffer + */ + private void copyUcs4ReadBuffer (int count, int shift1, int shift2, + int shift3, int shift4) + throws SAXException + { + int j = readBufferPos; + + if (count > 0 && (count % 4) != 0) { + encodingError ( + "number of bytes in UCS-4 encoding not divisible by 4", + -1, count); + } + for (int i = 0; i < count; i += 4) { + int value = (((rawReadBuffer [i] & 0xff) << shift1) | + ((rawReadBuffer [i + 1] & 0xff) << shift2) | + ((rawReadBuffer [i + 2] & 0xff) << shift3) | + ((rawReadBuffer [i + 3] & 0xff) << shift4)); + if (value < 0x0000ffff) { + readBuffer [j++] = (char) value; + if (value == (int) '\r') { + sawCR = true; + } + } else if (value < 0x0010ffff) { + value -= 0x010000; + readBuffer [j++] = (char) (0xd8 | ((value >> 10) & 0x03ff)); + readBuffer [j++] = (char) (0xdc | (value & 0x03ff)); + } else { + encodingError ("UCS-4 value out of range for Unicode", + value, i); + } + } + readBufferLength = j; + } + + + /** + * Report a character encoding error. 
+ */ + private void encodingError (String message, int value, int offset) + throws SAXException + { + if (value != -1) + message = message + " (character code: 0x" + + Integer.toHexString (value) + ')'; + error (message); + } + + + ////////////////////////////////////////////////////////////////////// + // Local Variables. + ////////////////////////////////////////////////////////////////////// + + /** + * Re-initialize the variables for each parse. + */ + private void initializeVariables () + { + // First line + line = 1; + column = 0; + + // Set up the buffers for data and names + dataBufferPos = 0; + dataBuffer = new char [DATA_BUFFER_INITIAL]; + nameBufferPos = 0; + nameBuffer = new char [NAME_BUFFER_INITIAL]; + + // Set up the DTD hash tables + elementInfo = new Hashtable (); + entityInfo = new Hashtable (); + notationInfo = new Hashtable (); + skippedPE = false; + + // Set up the variables for the current + // element context. + currentElement = null; + currentElementContent = CONTENT_UNDECLARED; + + // Set up the input variables + sourceType = INPUT_NONE; + inputStack = new Stack (); + entityStack = new Stack (); + externalEntity = null; + tagAttributePos = 0; + tagAttributes = new String [100]; + rawReadBuffer = new byte [READ_BUFFER_MAX]; + readBufferOverflow = -1; + + scratch = new InputSource (); + + inLiteral = false; + expandPE = false; + peIsError = false; + + doReport = false; + + inCDATA = false; + + symbolTable = new Object [SYMBOL_TABLE_LENGTH][]; + } + + + // + // The current XML handler interface. + // + private SAXDriver handler; + + // + // I/O information. 
+ // + private Reader reader; // current reader + private InputStream is; // current input stream + private int line; // current line number + private int column; // current column number + private int sourceType; // type of input source + private Stack inputStack; // stack of input sources + private URLConnection externalEntity; // current external entity + private int encoding; // current character encoding + private int currentByteCount; // bytes read from current source + private InputSource scratch; // temporary + + // + // Buffers for decoded but unparsed character input. + // + private char readBuffer []; + private int readBufferPos; + private int readBufferLength; + private int readBufferOverflow; // CR held back by filterCR from last data chunk, else -1 + + + // + // Buffer for undecoded raw byte input. + // + private final static int READ_BUFFER_MAX = 16384; + private byte rawReadBuffer []; + + + // + // Buffer for attribute values, char refs, DTD stuff. + // + private static int DATA_BUFFER_INITIAL = 4096; + private char dataBuffer []; + private int dataBufferPos; + + // + // Buffer for parsed names. + // + private static int NAME_BUFFER_INITIAL = 1024; + private char nameBuffer []; + private int nameBufferPos; + + // + // Save any standalone flag + // + private boolean docIsStandalone; + + // + // Hashtables for DTD information on elements, entities, and notations. + // Populated until we start ignoring decls (because of skipping a PE) + // + private Hashtable elementInfo; + private Hashtable entityInfo; + private Hashtable notationInfo; + private boolean skippedPE; + + + // + // Element type currently in force. + // + private String currentElement; + private int currentElementContent; + + // + // Stack of entity names, to detect recursion. + // + private Stack entityStack; + + // + // PE expansion is enabled in most chunks of the DTD, not all. + // When it's enabled, literals are treated differently.
+ // + private boolean inLiteral; + private boolean expandPE; + private boolean peIsError; + + // + // can't report entity expansion inside two constructs: + // - attribute expansions (internal entities only) + // - markup declarations (parameter entities only) + // + private boolean doReport; + + // + // Symbol table, for caching interned names. + // + // These show up wherever XML names or nmtokens are used: naming elements, + // attributes, PIs, notations, entities, and enumerated attribute values. + // + // NOTE: This hashtable doesn't grow. The default size is intended to be + // rather large for most documents. Example: one snapshot of the DocBook + // XML 4.1 DTD used only about 350 such names. As a rule, only pathological + // documents (ones that don't reuse names) should ever see much collision. + // + // Be sure that SYMBOL_TABLE_LENGTH always stays prime, for best hashing. + // "2039" keeps the hash table size at about two memory pages on typical + // 32 bit hardware. + // + private final static int SYMBOL_TABLE_LENGTH = 2039; + + private Object symbolTable [][]; + + // + // Hash table of attributes found in current start tag. + // + private String tagAttributes []; + private int tagAttributePos; + + // + // Utility flag: have we noticed a CR while reading the last + // data chunk? If so, we will have to go back and normalise + // CR or CR/LF line ends. + // + private boolean sawCR; + + // + // Utility flag: are we in CDATA? If so, whitespace isn't ignorable. + // + private boolean inCDATA; + + // + // Xml version. + // + private static final int XML_10 = 0; + private static final int XML_11 = 1; + private int xmlVersion = XML_10; +} diff --git a/libjava/gnu/xml/aelfred2/XmlReader.java b/libjava/gnu/xml/aelfred2/XmlReader.java new file mode 100644 index 0000000..96c9c72 --- /dev/null +++ b/libjava/gnu/xml/aelfred2/XmlReader.java @@ -0,0 +1,315 @@ +/* XmlReader.java -- + Copyright (C) 1999,2000,2001 Free Software Foundation, Inc. 
+ +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA +02111-1307 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. + +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. */ + +package gnu.xml.aelfred2; + +import java.io.IOException; +import java.util.Locale; + +import org.xml.sax.*; +import org.xml.sax.ext.*; + +import gnu.xml.pipeline.EventFilter; +import gnu.xml.pipeline.ValidationConsumer; + + +/** + * This SAX2 parser optionally layers a validator over the Ælfred2 + * SAX2 parser. 
While this will not evaluate every XML validity constraint, + * it does support all the validity constraints that are of any real utility + * outside the strict SGML-compatible world. See the documentation for the + * SAXDriver class for information about the SAX2 features and properties + * that are supported, and documentation for the ValidationConsumer for + * information about what validity constraints may not be supported. + * (Ælfred2 tests some of those, even in non-validating mode, to + * achieve better conformance.) + * + * <p> Note that due to its internal construction, you can't change most + * handlers until parse() returns. This diverges slightly from SAX, which + * expects later binding to be supported. Early binding involves less + * runtime overhead, which is an issue for event pipelines as used inside + * this parser. Rather than relying on the parser to handle late binding + * to your own handlers, do it yourself. + * + * @see SAXDriver + * @see gnu.xml.pipeline.ValidationConsumer + * + * @author David Brownell + */ +public final class XmlReader implements XMLReader +{ + private SAXDriver aelfred2 = new SAXDriver (); + private EventFilter filter = new EventFilter (); + private boolean isValidating; + private boolean active; + + + /** Constructs a SAX Parser. */ + public XmlReader () + { } + + /** + * Constructs a SAX Parser, optionally treating validity errors + * as if they were fatal errors. + */ + public XmlReader (boolean invalidIsFatal) + { + if (invalidIsFatal) + setErrorHandler (new DefaultHandler2 () { + public void error (SAXParseException e) + throws SAXException + { throw e; } + }); + } + + /** + * <b>SAX2</b>: Returns the object used to report the logical + * content of an XML document. + */ + public ContentHandler getContentHandler () + { return filter.getContentHandler (); } + + /** + * <b>SAX2</b>: Assigns the object used to report the logical + * content of an XML document. 
+ * @exception IllegalStateException if called mid-parse + */ + public void setContentHandler (ContentHandler handler) + { + if (active) + throw new IllegalStateException ("already parsing"); + filter.setContentHandler (handler); + } + + /** + * <b>SAX2</b>: Returns the object used to process declarations related + * to notations and unparsed entities. + */ + public DTDHandler getDTDHandler () + { return filter.getDTDHandler (); } + + /** + * <b>SAX1</b> Assigns DTD handler + * @exception IllegalStateException if called mid-parse + */ + public void setDTDHandler (DTDHandler handler) + { + if (active) + throw new IllegalStateException ("already parsing"); + filter.setDTDHandler (handler); + } + + /** + * <b>SAX2</b>: Returns the object used when resolving external + * entities during parsing (both general and parameter entities). + */ + public EntityResolver getEntityResolver () + { return aelfred2.getEntityResolver (); } + + /** <b>SAX1</b> Assigns parser's entity resolver */ + public void setEntityResolver (EntityResolver handler) + { aelfred2.setEntityResolver (handler); } + + /** + * <b>SAX2</b>: Returns the object used to receive callbacks for XML + * errors of all levels (fatal, nonfatal, warning); this is never null; + */ + public ErrorHandler getErrorHandler () + { return aelfred2.getErrorHandler (); } + + /** + * <b>SAX1</b> Assigns error handler + * @exception IllegalStateException if called mid-parse + */ + public void setErrorHandler (ErrorHandler handler) + { + if (active) + throw new IllegalStateException ("already parsing"); + aelfred2.setErrorHandler (handler); + } + + /** + * <b>SAX2</b>: Assigns the specified property. 
+ * @exception IllegalStateException if called mid-parse + */ + public void setProperty (String propertyId, Object value) + throws SAXNotRecognizedException, SAXNotSupportedException + { + if (active) + throw new IllegalStateException ("already parsing"); + if (getProperty (propertyId) != value) + filter.setProperty (propertyId, value); + } + + /** + * <b>SAX2</b>: Returns the specified property. + */ + public Object getProperty (String propertyId) + throws SAXNotRecognizedException + { + if ((SAXDriver.PROPERTY + "declaration-handler") + .equals (propertyId) + || (SAXDriver.PROPERTY + "lexical-handler") + .equals (propertyId)) + return filter.getProperty (propertyId); + throw new SAXNotRecognizedException (propertyId); + } + + private void forceValidating () + throws SAXNotRecognizedException, SAXNotSupportedException + { + aelfred2.setFeature ( + SAXDriver.FEATURE + "namespace-prefixes", + true); + aelfred2.setFeature ( + SAXDriver.FEATURE + "external-general-entities", + true); + aelfred2.setFeature ( + SAXDriver.FEATURE + "external-parameter-entities", + true); + } + + /** + * <b>SAX2</b>: Sets the state of features supported in this parser. + * Note that this parser requires reporting of namespace prefixes when + * validating. + */ + public void setFeature (String featureId, boolean state) + throws SAXNotRecognizedException, SAXNotSupportedException + { + boolean value = getFeature (featureId); + + if (state == value) + return; + + if ((SAXDriver.FEATURE + "validation").equals (featureId)) { + if (active) + throw new SAXNotSupportedException ("already parsing"); + if (state) + forceValidating (); + isValidating = state; + } else + aelfred2.setFeature (featureId, state); + } + + /** + * <b>SAX2</b>: Tells whether this parser supports the specified feature. + * At this time, this directly parallels the underlying SAXDriver, + * except that validation is optionally supported. 
+ * + * @see SAXDriver + */ + public boolean getFeature (String featureId) + throws SAXNotRecognizedException, SAXNotSupportedException + { + if ((SAXDriver.FEATURE + "validation").equals (featureId)) + return isValidating; + + return aelfred2.getFeature (featureId); + } + + /** + * <b>SAX1</b>: Sets the locale used for diagnostics; currently, + * only locales using the English language are supported. + * @param locale The locale for which diagnostics will be generated + */ + public void setLocale (Locale locale) + throws SAXException + { aelfred2.setLocale (locale); } + + /** + * <b>SAX1</b>: Preferred API to parse an XML document, using a + * system identifier (URI). + */ + public void parse (String systemId) + throws SAXException, IOException + { + parse (new InputSource (systemId)); + } + + /** + * <b>SAX1</b>: Underlying API to parse an XML document, used + * directly when no URI is available. When this is invoked, + * and the parser is set to validate, some features will be + * automatically reset to appropriate values: for reporting + * namespace prefixes, and incorporating external entities. + * + * @param source The XML input source. + * + * @exception IllegalStateException if called mid-parse + * @exception SAXException The handlers may throw any SAXException, + * and the parser normally throws SAXParseException objects. + * @exception IOException IOExceptions are normally passed through + * the parser if there are problems reading the source document.
+ */ + public void parse (InputSource source) + throws SAXException, IOException + { + EventFilter next; + boolean nsdecls; + + synchronized (aelfred2) { + if (active) + throw new IllegalStateException ("already parsing"); + active = true; + } + + // set up the output pipeline + if (isValidating) { + forceValidating (); + next = new ValidationConsumer (filter); + } else + next = filter; + + // connect pipeline and error handler + // don't let _this_ call to bind() affect xmlns* attributes + nsdecls = aelfred2.getFeature ( + SAXDriver.FEATURE + "namespace-prefixes"); + EventFilter.bind (aelfred2, next); + if (!nsdecls) + aelfred2.setFeature ( + SAXDriver.FEATURE + "namespace-prefixes", + false); + + // parse, clean up + try { + aelfred2.parse (source); + } finally { + active = false; + } + } +} diff --git a/libjava/gnu/xml/aelfred2/package.html b/libjava/gnu/xml/aelfred2/package.html new file mode 100644 index 0000000..e204258 --- /dev/null +++ b/libjava/gnu/xml/aelfred2/package.html @@ -0,0 +1,506 @@ +<!DOCTYPE html PUBLIC + '-//W3C//DTD XHTML 1.0 Transitional//EN' + 'http://www.w3.org/TR/xhtml1/DTD/transitional.dtd'> + +<html><head> + <title>package overview</title> +<!-- +/* + * Copyright (C) 1999,2000,2001 The Free Software Foundation, Inc. + */ +--> +</head><body> + +<p> This package contains Ælfred2, which includes an +enhanced SAX2-compatible version of the Ælfred +non-validating XML parser, a modular (and hence optional) +DTD validating parser, and modular (and hence optional) +JAXP glue to those. +Use these like any other SAX2 parsers. 
</p> + +<ul> + <li><a href="#about">About Ælfred</a><ul> + <li><a href="#principles">Design Principles</a></li> + <li><a href="#name">About the Name Ælfred</a></li> + <li><a href="#encodings">Character Encodings</a></li> + <li><a href="#violations">Known Conformance Violations</a></li> + <li><a href="#copyright">Licensing</a></li> + </ul></li> + + <li><a href="#changes">Changes Since the Last Microstar Release</a><ul> + <li><a href="#sax2">SAX2 Support</a></li> + <li><a href="#validation">Validation</a></li> + <li><a href="#smaller">You Want Smaller?</a></li> + <li><a href="#bugfixes">Bugs Fixed</a></li> + </ul></li> + +</ul> + +<h2><a name="about">About Ælfred</a></h2> + +<p>Ælfred is an XML parser written in the Java programming language.</p> + +<h3><a name="principles">Design Principles</a></h3> + +<p>In most Java applets and applications, XML should not be the central +feature; instead, XML is the means to another end, such as loading +configuration information, reading meta-data, or parsing transactions.</p> + +<p> When an XML parser is only a single component of a much larger +program, it cannot be large, slow, or resource-intensive. With Java +applets, in particular, code size is a significant issue. The standard +modem is still not operating at 56 Kbaud, or sometimes even with data +compression. Assuming an uncompressed 28.8 Kbaud modem, only about +3 KBytes can be downloaded in one second; compression often doubles +that speed, but a V.90 modem may not provide another doubling. When +used with embedded processors, similar size concerns apply. </p> + +<p> Ælfred is designed for easy and efficient use over the Internet, +based on the following principles: </p> <ol> + +<li> Ælfred must be as small as possible, so that it doesn't add too + much to an applet's download time. </li> + +<li> Ælfred must use as few class files as possible, to minimize the + number of HTTP connections necessary. (The use of JAR files has made this + be less of a concern.)
</li> + +<li> Ælfred must be compatible with most or all Java implementations + and platforms. (Write once, run anywhere.) </li> + +<li> Ælfred must use as little memory as possible, so that it does + not take away resources from the rest of your program. (It doesn't force + you to use DOM or a similar costly data structure API.)</li> + +<li> Ælfred must run as fast as possible, so that it does not slow down + the rest of your program. </li> + +<li> Ælfred must produce correct output for well-formed and valid + documents, but need not reject every document that is not valid or + not well-formed. (In Ælfred2, correctness was a bigger concern + than in the original version; and a validation option is available.) </li> + +<li> Ælfred must provide full internationalization from the first + release. (Ælfred2 now automatically handles all encodings + supported by the underlying JVM; previous versions handled only + UTF-8, UTF-16, ASCII, and ISO-8859-1.)</li> + +</ol> + +<p>As you can see from this list, Ælfred is designed for production +use, but neither validation nor perfect conformance was a requirement. +Good validating parsers exist, including one in this package, +and you should use them as appropriate. (See conformance reviews +available at <a href="http://www.xml.com/">http://www.xml.com</a>) +</p> + +<p> One of the main goals of Ælfred2 was to significantly improve +conformance, while not significantly affecting the other goals stated above. +Since the only use of this parser is with SAX, some classes could be +removed, and so the overall size of Ælfred was actually reduced. +Subsequent performance work produced a notable speedup (over twenty +percent on larger files). That is, the tradeoffs between speed, size, and +conformance were re-targeted towards conformance and support of newer APIs +(SAX2), with a positive performance impact.
</p> + +<p> The role anticipated for this version of Ælfred is as a +lightweight Free Software SAX parser that can be used in essentially every +Java program where the handful of conformance violations (noted below) +are acceptable. +That certainly includes applets, and +nowadays one must also mention embedded systems as being even more +size-critical. +At this writing, all parsers that are more conformant are +significantly larger, even when counting the optional +validation support in this version of Ælfred. </p> + + +<h3><a name="name">About the Name <em>Ælfred</em></a></h3> + +<p>Ælfred the Great (AElfred in ASCII) was King of Wessex, and +some say King of England, at the time of his death in 899 AD. +Ælfred introduced a wide-spread literacy program in the hope that +his people would learn to read English, at least, if Latin was too +difficult for them. This Ælfred hopes to bring another sort of +literacy to Java, using XML, at least, if full SGML is too difficult.</p> + +<p>The initial Æ ligature ("AE") is also a reminder that XML is +not limited to ASCII.</p> + + +<h3><a name="encodings">Character Encodings</a></h3> + +<p> The Ælfred parser currently builds in support for a handful +of input encodings. Of course these include UTF-8 and UTF-16, which +all XML parsers are required to support:</p> <ul> + + <li> UTF-8 ... the standard eight bit encoding, used unless + you provide an encoding declaration or a MIME charset tag.</li> + + <li> US-ASCII ... an extremely common seven bit encoding, + which happens to be a subset of UTF-8 and ISO-8859-1 as well + as many other encodings. XHTML web pages using US-ASCII + (without an encoding declaration) are probably more + widely interoperable than those in any other encoding. </li> + + <li> ISO-8859-1 ... includes accented characters used in + much of western Europe (but excluding the Euro currency + symbol).</li> + + <li> UTF-16 ...
with several variants, this encodes each + sixteen bit Unicode character in sixteen bits of output. + Variants include UTF-16BE (big endian, no byte order mark), + UTF-16LE (little endian, no byte order mark), and + ISO-10646-UCS-2 (an older and less used encoding, using a + version of Unicode without surrogate pairs). This is + essentially the native encoding used by Java. </li> + + <li> ISO-10646-UCS-4 ... a seldom-used four byte encoding, + also known as UTF-32BE. Four byte order variants are supported, + including one known as UTF-32LE. Some operating systems + standardized on UCS-4 despite its significant size penalty, + in anticipation that Unicode (even with surrogate pairs) + would eventually become limiting. UCS-4 permits encoding + of non-Unicode characters, which Java can't represent (and + XML doesn't allow). + </li> + + </ul> + +<p> If you use any encoding other than UTF-8 or UTF-16 you should +make sure to label your data appropriately: </p> + +<blockquote> +<?xml version="1.0" encoding="<b>ISO-8859-15</b>"?> +</blockquote> + +<p> Encodings accessed through <code>java.io.InputStreamReader</code> +are now fully supported for both external labels (such as MIME types) +and internal types (as shown above). +There is one limitation in the support for internal labels: +the encodings must be derived from the US-ASCII encoding; +the EBCDIC family of encodings is not recognized. +Note that Java defines its +own encoding names, which don't always correspond to the standard +Internet encoding names defined by the IETF/IANA, and that Java +may even <em>require</em> use of nonstandard encoding names. +Please report +such problems; some of them can be worked around in this parser, +and many can be worked around by using external labels. +</p> + +<p>Note that if you are using the Euro symbol with a fixed-length +eight bit encoding, you should probably be using the encoding label +<em>iso-8859-15</em> or, with a Microsoft OS, <em>cp-1252</em>.
+Of course, UTF-8 and UTF-16 handle the Euro symbol directly. +</p> + + +<h3><a name="violations">Known Conformance Violations</a></h3> + +<p>Known conformance issues should be of negligible importance for +most applications, and include: </p><ul> + + <li> Rather than following the voluminous "Appendix B" rules about + what characters may appear in names (and name tokens), the Unicode + rules embedded in <em>java.lang.Character</em> are used. + This means mostly that some names are inappropriately accepted, + though a few are inappropriately rejected. (It's much simpler + to avoid that much special case code. Recent OASIS/NIST test + cases may have these rules be realistically testable.) </li> + + <li> Text containing "]]>" is not rejected unless it fully resides + in an internal buffer ... which is, thankfully, the typical case. This + text is illegal, but sometimes appears in illegal attempts to + nest CDATA sections. (Not catching that boundary condition + substantially simplifies parsing text.) </li> + + <li> Surrogate characters that aren't correctly paired are ignored + rather than rejected, unless they were encoded using UTF-8. (This + simplifies parsing text.) Unicode 3.1 assigned the first characters + to those character codes, in early 2001, so few documents (or tools) + use such characters in any case. </li> + + <li> Declarations following references to an undefined parameter + entity reference are not ignored. (Not maintaining and using state + about this validity error simplifies declaration handling; few + XML parsers address this constraint in any case.) </li> + + <li> Well formedness constraints for general entity references + are not enforced. (The code to handle the "content" production + is merged with the element parsing code, making it hard to reuse + for this additional situation.) </li> + +</ul> + +<p> When tested against the July 12, 1999 version of the OASIS +XML Conformance test suite, an earlier version passed 1057 of 1067 tests. 
+That contrasts with the original version, which passed 867. The +current parser is top-ranked in terms of conformance, as is its +validating sibling (which has some additional conformance violations +imposed on it by SAX2 API deficiencies as well as some of the more +curious SGML layering artifacts found in the XML specification). </p> + +<p> The XML 1.0 specification itself was not without problems, +and after some delays the W3C has come out with a revised +"second edition" specification. While that doesn't resolve all +the problems identified in the XML specification, many of the most +egregious problems have been resolved. (You still need to drink +magic Kool-Aid before some DTD-related issues make sense.) +To the extent possible, this parser conforms to that second +edition specification, and does well against corrected versions +of the OASIS/NIST XML conformance test cases. See <a href= +"http://xmlconf.sourceforge.net">http://xmlconf.sourceforge.net</a> +for more information about SAX2/XML conformance testing. </p> + + +<h3><a name="copyright">Copyright and distribution terms</a></h3> + +<p> +The software in this package is distributed under the GNU General Public +License (with a special exception described below). +</p> + +<p> +A copy of GNU General Public License (GPL) is included in this distribution, +in the file COPYING. If you do not have the source code, it is available at: + + <a href="http://www.gnu.org/software/classpath/">http://www.gnu.org/software/classpath/</a> +</p> + +<pre> + Linking this library statically or dynamically with other modules is + making a combined work based on this library. Thus, the terms and + conditions of the GNU General Public License cover the whole + combination. 
+ + As a special exception, the copyright holders of this library give you + permission to link this library with independent modules to produce an + executable, regardless of the license terms of these independent + modules, and to copy and distribute the resulting executable under + terms of your choice, provided that you also meet, for each linked + independent module, the terms and conditions of the license of that + module. An independent module is a module which is not derived from + or based on this library. If you modify this library, you may extend + this exception to your version of the library, but you are not + obligated to do so. If you do not wish to do so, delete this + exception statement from your version. + + Parts derived from code which carried the following notice: + + Copyright (c) 1997, 1998 by Microstar Software Ltd. + + AElfred is free for both commercial and non-commercial use and + redistribution, provided that Microstar's copyright and disclaimer are + retained intact. You are free to modify AElfred for your own use and + to redistribute AElfred with your modifications, provided that the + modifications are clearly documented. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + merchantability or fitness for a particular purpose. Please use it AT + YOUR OWN RISK. +</pre> + +<p> Some of this documentation was modified from the original +Ælfred README.txt file. All of it has been updated. </p> + +</p> + + +<h2><a name="changes">Changes Since the last Microstar Release</a></h2> + +<p> As noted above, Microstar has not updated this parser since +the summer of 1998, when it released version 1.2a on its web site. +This release is intended to benefit the developer community by +refocusing the API on SAX2, and improving conformance to the extent +that most developers should not need to use another XML parser. 
</p> + +<p> The code has been cleaned up (referring to the XML 1.0 spec in +all the production numbers in +comments, rather than some preliminary draft, for one example) and +has been sped up a bit as well. +JAXP support has been added, although developers are still +strongly encouraged to use the SAX2 APIs directly. </p> + + +<h3><a name="sax2">SAX2 Support</a></h3> + +<p> The original version of Ælfred did not support the +SAX2 APIs. </p> + +<p> This version supports the SAX2 APIs, exposing the standard +boolean feature descriptors. It supports the "DeclHandler" property +to provide access to all DTD declarations not already exposed +through the SAX1 API. The "LexicalHandler" property is supported, +exposing entity boundaries (including the unnamed external subset) and +things like comments and CDATA boundaries. SAX1 compatibility is +currently provided.</p> + + +<h3><a name="validation">Validation</a></h3> + +<p> In the 'pipeline' package in this same software distribution is an +<a href="../pipeline/ValidationConsumer.html">XML Validation component</a> +using any full SAX2 event stream (including all document type declarations) +to validate. There is now a <a href="XmlReader.html">XmlReader</a> class +which combines that class and this enhanced Ælfred parser, creating +an optionally validating SAX2 parser. </p> + +<p> As noted in the documentation for that validating component, certain +validity constraints can't reliably be tested by a layered validator. +These include all constraints relying on +layering violations (exposing XML at the level of tokens or below, +required since XML isn't a context-free grammar), some that +SAX2 doesn't support, and a few others. The resulting validating +parser is conformant enough for most applications that aren't doing +strange SGML tricks with DTDs. +Moreover, that validating filter can be used without +a parser ... any application component that emits SAX event streams +can DTD-validate its output on demand. 
</p> + +<h3><a name="smaller">You want Smaller?</a></h3> + +<p> You'll have noticed that the original version of Ælfred +had small size as a top goal. Ælfred2 normally includes a +DTD validation layer, but you can package without that. +Similarly, JAXP factory support is available but optional. +Then the main added costs due to this revision are for +supporting the SAX2 API itself; DTD validation is as +cleanly layered as allowed by SAX2.</p> + +<h3><a name="bugfixes">Bugs Fixed</a></h3> + +<p> Bugs fixed in Ælfred2 include: </p> + +<ol> + <li> Originally Ælfred didn't close file descriptors, which + led to file descriptor leakage on programs which ran for any + length of time. </li> + + <li> NOTATION declarations without system identifiers are + now handled correctly. </li> + + <li> DTD events are now reported for all invocations of a + given parser, not just the first one. </li> + + <li> More correct character handling: <ul> + + <li> Rejects out-of-range characters, both in text and in + character references. </li> + + <li> Correctly handles character references that expand to + surrogate pairs. </li> + + <li> Correctly handles UTF-8 encodings of surrogate pairs. </li> + + <li> Correctly handles Unicode 3.1 rules about illegal UTF-8 + encodings: there is only one legal encoding per character. </li> + + <li> PUBLIC identifiers are now rejected if they have illegal + characters. </li> + + <li> The parser is more correct about what characters are allowed + in names and name tokens. Uses Unicode rules (built in to Java) + rather than the voluminous XML rules, although some extensions + have been made to match XML rules more closely.</li> + + <li> Line ends are now normalized to newlines in all known + cases. </li> + + </ul></li> + + <li> Certain validity errors were previously treated as well + formedness violations. <ul> + + <li> Repeated declarations of an element type are no + longer fatal errors. 
</li> + + <li> Undeclared parameter entity references are no longer + fatal errors. </li> + + </ul></li> + + <li> Attribute handling is improved: <ul> + + <li> Whitespace must exist between attributes. </li> + + <li> Only one value for a given attribute is permitted. </li> + + <li> ATTLIST declarations don't need to declare attributes. </li> + + <li> Attribute values are normalized when required. </li> + + <li> Tabs in attribute values are normalized to spaces. </li> + + <li> Attribute values containing a literal "<" are rejected. </li> + + </ul></li> + + <li> More correct entity handling: <ul> + + <li> Whitespace must precede NDATA when declaring unparsed + entities.</li> + + <li> Parameter entity declarations may not have NDATA annotations. </li> + + <li> The XML specification has a bug in that it doesn't specify + that certain contexts exist within which parameter entity + expansion must not be performed. Lacking an official erratum, + this parser now disables such expansion inside comments, + processing instructions, ignored sections, public identifiers, + and parts of entity declarations. </li> + + <li> Entity expansions that include quote characters no longer + confuse parsing of strings using such expansions. </li> + + <li> Whitespace in the values of internal entities is not mapped + to space characters. </li> + + <li> General Entity references in attribute defaults within the + DTD now cause fatal errors when the entity is not defined at the + time it is referenced. </li> + + <li> Malformed general entity references in entity declarations are + now detected. </li> + + </ul></li> + + <li> Neither conditional sections + nor parameter entity references within markup declarations + are permitted in the internal subset. </li> + + <li> Processing instructions whose target names are "XML" + (ignoring case) are now rejected. </li> + + <li> Comments may not include "--".</li> + + <li> Most "]]>" sequences in text are rejected. 
</li> + + <li> Correct syntax for standalone declarations is enforced. </li> + + <li> Setting a locale for diagnostics only produces an exception + if the language of that locale isn't English. </li> + + <li> Some more encoding names are recognized. These include the + Unicode 3.0 variants of UTF-16 (UTF-16BE, UTF-16LE) as well as + US-ASCII and a few commonly seen synonyms. </li> + + <li> Text (from character content, PIs, or comments) large enough + not to fit into internal buffers is now handled correctly even in + some cases which were originally handled incorrectly.</li> + + <li> Content is now reported for element types for which attributes + have been declared, but no content model is known. (Such documents + are invalid, but may still be well formed.) </li> + +</ol> + +<p> Other bugs may also have been fixed. </p> + +<p> For better overall validation support, some of the validity +constraints that can't be verified using the SAX2 event stream +are now reported directly by Ælfred2. </p> + +</body></html> + |