/* Copyright (c) 2008 Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.google.gdata.util; import com.google.gdata.util.common.xml.XmlNamespace; import com.google.gdata.util.common.xml.XmlWriter; import com.google.gdata.util.common.xml.parsing.SecureGenericXMLFactory; import com.google.gdata.client.CoreErrorDomain; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.Locator; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.helpers.ParserAdapter; import java.io.IOException; import java.io.InputStream; import java.io.Reader; import java.io.StringWriter; import java.net.URI; import java.net.URISyntaxException; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.Stack; import java.util.logging.Level; import java.util.logging.Logger; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; /** * XML parser. *
* This is a thin layer on top of a SAX parser. The key concept * necessary to understand this parser is Element Handler. * Element handlers are type-specific parsers. Each handler instance * contains an instance of the Java type corresponding to the XML * type it parses. At any given time, one handler is active, and zero * or more handlers are kept on the stack. This corresponds directly * to the set of currently opened XML tags. *
* To use this parser, one must define an {@link * XmlParser.ElementHandler} type (usually one per XML schema type), * specify the root element handler, and pass a reader to the * {@link #parse(Reader, com.google.gdata.util.XmlParser.ElementHandler, String, * String)} method. *
* * * * @see XmlParser.ElementHandler */ public class XmlParser extends DefaultHandler { private static final Logger logger = Logger.getLogger(XmlParser.class.getName()); // The SAXParserFactory used to create underlying SAXParser instances. private static SAXParserFactory parserFactory; // Always return secure SAX parser, which is secured against XXE attacks private static SAXParserFactory getSAXParserFactory() throws ParserConfigurationException, SAXException { SAXParserFactory factory; try { factory = SecureGenericXMLFactory.getSAXParserFactory( SAXParserFactory.newInstance()); // "http://xml.org/sax/features/external-parameter-entities" is a feature // that is set by the SecureGenericXMLFactory and not supported on some // platform such as Android. // Unfortunately, the Android implementation doesn't throw an exception // when setting the feature. The exception is thrown when a new SAXParser // is instantiated in the newSAXParser method. // The following line check for such behavior. factory.newSAXParser(); } catch (ParserConfigurationException e) { // OK. Cannot create secure xml parser. Use insecure one. // The factory instantiated in the try block can't be reused due to // side-effect in SecureGenericXMLFactory.getSAXParserFactory. factory = SAXParserFactory.newInstance(); } factory.setNamespaceAware(true); return factory; } /** * Base class for custom element handlers. *
* To implement a new element handler, one must create a new class * extending this class, override {@link #getChildHandler} if nested * elements need to be parsed, override {@link #processAttribute} if * attributes need to be parsed, and override {@link * #processEndElement()} to receive the text() value and post-process * the element. *
* If the handler wishes to store unrecognized XML contents in an {@link * XmlBlob} value, it must call {@link #initializeXmlBlob} either in the * constructor, in parent's {@link #getChildHandler}, or in {@link * #processAttribute}. The resulting {@link XmlBlob} value is available * following the invocation of {@link #processEndElement()} * through the object passed to {@link #initializeXmlBlob}. *
*
* This class implements overridable methods to support unrecognized XML
* parsing if desired.
*
*
*/
public static class ElementHandler {
/** This element's QName. Used for error reporting. */
public String qName;
/**
 * This element's text() value. {@link #processEndElement()} inspects it to
 * decide whether unexpected text content must be rejected.
 */
public String value;
/** Temporary buffer for building up the text() value. */
private StringBuilder buffer;
/**
 * The current state of {@code xml:lang}.
 * See http://www.w3.org/TR/REC-xml/#sec-lang-tag for more information.
 */
public String xmlLang;
/**
 * The current state of {@code xml:base}. {@link #getAbsoluteUri} resolves
 * URIs against this value.
 * See http://www.cafeconleche.org/books/xmljava/chapters/ch03s03.html for
 * more information.
 */
public String xmlBase;
/** Keeps track of the element stack. */
ElementHandler parent;
/**
 * If the handler is parsing unrecognized XML, this object stores the
 * output. Assigned by {@link #initializeXmlBlob}; while it is {@code null}
 * the default three-argument {@code getChildHandler} rejects unknown child
 * elements.
 */
XmlBlob xmlBlob = null;
/**
 * Flag indicating whether it's still OK to call {@link #initializeXmlBlob}.
 * {@link #initializeXmlBlob} asserts that this flag is set.
 */
boolean okToInitializeXmlBlob = true;
/**
 * Flag indicating whether mixed content unrecognized XML is allowed. When
 * set, {@link #processEndElement()} accepts text() content.
 */
boolean mixedContent = false;
/**
 * Flag indicating whether unrecognized XML should be processed for
 * full-text indexing. If set, the resulting string ready for indexing is
 * stored in {@link XmlBlob#fullText}. Non-contiguous strings within this
 * index are separated by '\n'.
 */
boolean fullTextIndex = false;
/**
 * This element's inner XML writer. Used internally by XmlParser. Created by
 * {@link #initializeXmlBlob}.
 */
XmlWriter innerXml;
/** Namespaces used by this blob. */
Set
*
* The default implementation doesn't recognize anything. The result is a
* schema error unless the parent handler accepts unrecognized XML.
*
* {@link com.google.gdata.wireformats.XmlParser}.
* localname/namespace.
*
* @param namespace
* Child element's namespace URI.
*
* @param qualifiedName
* Child element's qualified name.
*
* @param localName
* Child element's local name.
*
* @param attrs
* Child element's attributes. These attributes will be
* communicated to the child element handler through its
* {@link #processAttribute} method. They are passed here because
* sometimes the value of some attribute determines the element's
* content type, so different element handlers may be needed.
*
* @return Child element's handler, or {@code null} if the child is
* unrecognized.
*
* @throws ParseException
* Invalid child element.
*
* @throws IOException
* Internal I/O exception (e.g., thrown by XML blob writer).
*/
public ElementHandler getChildHandler(String namespace,
String qualifiedName,
String localName,
Attributes attrs,
List
*
* The default implementation doesn't recognize anything. The result is a
* schema error unless the parent handler accepts unrecognized XML.
*
* {@link com.google.gdata.wireformats.XmlParser}.
* localname/namespace.
*
* @param namespace
* Child element namespace URI.
*
* @param localName
* Child element name.
*
* @param attrs
* Child element attributes. These attributes will be
* communicated to the child element handler through its
* {@link #processAttribute} method. They are passed here because
* sometimes the value of some attribute determines the element's
* content type, so different element handlers may be needed.
*
* @return Child element handler, or {@code null} if the child is
* unrecognized.
*
* @throws ParseException
* Invalid child element.
*
* @throws IOException
* Internal I/O exception (e.g., thrown by XML blob writer).
*/
public ElementHandler getChildHandler(String namespace,
    String localName,
    Attributes attrs)
    throws ParseException, IOException {
  // A handler that collects unrecognized XML into a blob tolerates unknown
  // children: report "no handler" by returning null.
  if (xmlBlob != null) {
    logger.fine("No child handler for " + localName +
        ". Treating as arbitrary foreign XML.");
    return null;
  }

  // No blob was configured, so an unknown child is a schema error.
  ParseException unrecognized = new ParseException(
      CoreErrorDomain.ERR.unrecognizedElement);
  unrecognized.setInternalReason("Unrecognized element '" + localName + "'.");
  throw unrecognized;
}
/**
 * Handles one attribute of this element. Intended as an extension point for
 * derived classes.
 *
 * The default implementation ignores the qualified name and forwards the
 * remaining arguments to
 * {@link #processAttribute(String, String, String)}.
 *
 * @param namespace
 *          Namespace URI of the attribute.
 *
 * @param qualifiedName
 *          Qualified name of the attribute.
 *
 * @param localName
 *          Local name of the attribute.
 *
 * @param attrValue
 *          Value of the attribute.
 *
 * @throws ParseException
 *           If the attribute is invalid.
 */
public void processAttribute(String namespace,
    String qualifiedName,
    String localName,
    String attrValue) throws ParseException {
  this.processAttribute(namespace, localName, attrValue);
}
/**
 * Handles one attribute of this element. Intended as an extension point for
 * derived classes; the default implementation does nothing.
 *
 * @param namespace
 *          Namespace URI of the attribute.
 *
 * @param localName
 *          Name of the attribute.
 *
 * @param value
 *          Value of the attribute.
 *
 * @throws ParseException
 *           If the attribute is invalid.
 */
public void processAttribute(String namespace,
    String localName,
    String value) throws ParseException {
  // Intentionally empty: attributes are ignored unless a subclass overrides.
}
/**
 * Invoked when this element's closing tag is reached.
 *
 * The default implementation rejects any non-whitespace text() content,
 * except when the handler has been configured to accept unrecognized XML
 * with mixed content.
 *
 * @throws ParseException
 *           If the element carries text content that is not allowed.
 */
public void processEndElement() throws ParseException {
  // Nothing to validate when mixed content is accepted or no text was seen.
  if (mixedContent || value == null) {
    return;
  }

  boolean hasText = value.trim().length() > 0;
  if (hasText) {
    throw new ParseException(CoreErrorDomain.ERR.textNotAllowed);
  }
}
/**
 * Configures this handler to capture all unrecognized XML into a blob.
 * Must be invoked from the constructor, from the parent element handler,
 * or from {@link #processAttribute}.
 *
 * @param xmlBlob
 *          Blob that receives the resulting XML.
 *
 * @param mixedContent
 *          Whether the handler accepts mixed content XML.
 *
 * @param fullTextIndex
 *          Whether unrecognized XML should also be processed for full-text
 *          indexing. If set, the resulting string ready for indexing is
 *          stored in {@link XmlBlob#fullText}.
 */
public void initializeXmlBlob(XmlBlob xmlBlob,
    boolean mixedContent,
    boolean fullTextIndex) {
  assert okToInitializeXmlBlob;

  this.xmlBlob = xmlBlob;
  this.mixedContent = mixedContent;

  innerXmlStringWriter = new StringWriter();
  try {
    innerXml = new XmlWriter(innerXmlStringWriter);
  } catch (IOException impossible) {
    // The XmlWriter constructor doesn't actually throw an IOException, so
    // once that constructor is fixed this catch block can be removed.
    throw new AssertionError(impossible);
  }

  this.fullTextIndex = fullTextIndex;
  if (this.fullTextIndex) {
    fullTextIndexWriter = new StringWriter();
  }
}
/**
 * Utility routine that combines the current state of {@code xml:base}
 * with the specified URI to obtain an absolute URI.
 *
 * See http://www.cafeconleche.org/books/xmljava/chapters/ch03s03.html
 * for more information.
 *
 * @param uriValue
 *          URI to be interpreted in the context of {@code xml:base}.
 *
 * @return Corresponding absolute URI.
 *
 * @throws ParseException
 *           Invalid URI. The underlying {@link URISyntaxException} is
 *           attached as the cause.
 */
public String getAbsoluteUri(String uriValue) throws ParseException {
  try {
    return getCumulativeXmlBase(xmlBase, uriValue);
  } catch (URISyntaxException e) {
    // Chain the original exception so its stack trace and the offending
    // input are not lost; the message seen by callers is unchanged.
    throw new ParseException(e.getMessage(), e);
  }
}
/**
 * Reads an xsd:boolean attribute (looked up with an empty namespace URI)
 * and converts it with {@link #parseBooleanValue}.
 *
 * @param attrs
 *          Attributes of the element to look the value up in.
 *
 * @param attrName
 *          Name of the attribute.
 *
 * @return the Boolean value if the attribute is present, or
 *         {@code null} otherwise.
 *
 * @throws ParseException if the attribute value is not a valid xsd:boolean.
 */
public Boolean getBooleanAttribute(Attributes attrs, String attrName)
    throws ParseException {
  String rawValue = attrs.getValue("", attrName);
  try {
    return parseBooleanValue(rawValue);
  } catch (ParseException ex) {
    // Replace the generic boolean error with one that names the attribute.
    ParseException invalid = new ParseException(
        CoreErrorDomain.ERR.invalidAttributeValue);
    invalid.setInternalReason("Invalid value for " + attrName +
        " attribute: " + rawValue);
    throw invalid;
  }
}
/**
 * Converts an xsd:boolean lexical value into a {@link Boolean}.
 *
 * Accepts "true"/"false" in any letter case as well as "1"/"0".
 *
 * @param value
 *          xsd:boolean value to parse; may be {@code null}.
 *
 * @return the Boolean value, or {@code null} when {@code value} is
 *         {@code null}.
 *
 * @throws ParseException if value is not a valid xsd:boolean.
 */
protected Boolean parseBooleanValue(String value)
    throws ParseException {
  if (value == null) {
    return null;
  }

  if (value.equals("1") || value.equalsIgnoreCase("true")) {
    return Boolean.TRUE;
  }
  if (value.equals("0") || value.equalsIgnoreCase("false")) {
    return Boolean.FALSE;
  }

  ParseException invalid = new ParseException(
      CoreErrorDomain.ERR.invalidBooleanAttribute);
  invalid.setInternalReason("Invalid value for boolean attribute: " + value);
  throw invalid;
}
}
/**
 * Root element handler.
 */
protected ElementHandler rootHandler;
/** Root element namespace URI. */
protected String rootNamespace;
/** Root element name. */
protected String rootElementName;
/**
 * Top of the element handler stack. Handlers below it are reachable through
 * {@link ElementHandler#parent}.
 */
ElementHandler curHandler;
/** Number of unrecognized elements on the stack. Starts at zero. */
int unrecognizedElements = 0;
/** Document locator used to get line and column numbers for SAX events. */
Locator locator;
/**
 * Records a namespace declaration encountered in the current parse stream.
 */
private static class NamespaceDecl {

  /** The declared namespace */
  XmlNamespace ns;

  /**
   * {@code true} if the namespace declaration occurs inside an XmlBlob.
   * Left at the default ({@code false}) by the constructor.
   */
  boolean inBlob;

  private NamespaceDecl(XmlNamespace ns) {
    this.ns = ns;
  }
}
/**
* Set of all namespace declarations valid at the current location.
* Includes namespace declarations from all ancestor elements.
*/
protected HashMap