StAXOMBuilder.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.axiom.om.impl.builder;

import org.apache.axiom.ext.stax.DTDReader;
import org.apache.axiom.om.DeferredParsingException;
import org.apache.axiom.om.OMAbstractFactory;
import org.apache.axiom.om.OMContainer;
import org.apache.axiom.om.OMDocument;
import org.apache.axiom.om.OMElement;
import org.apache.axiom.om.OMException;
import org.apache.axiom.om.OMFactory;
import org.apache.axiom.om.OMNode;
import org.apache.axiom.om.OMXMLBuilderFactory;
import org.apache.axiom.om.impl.OMContainerEx;
import org.apache.axiom.om.impl.OMElementEx;
import org.apache.axiom.om.util.StAXUtils;
import org.apache.axiom.util.stax.XMLEventUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.stream.Location;

import java.io.Closeable;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;

/**
 * Internal implementation class.
 */
/* Implementation note about error handling
 * ----------------------------------------
 * 
 * Usually, code that uses StAX directly just stops processing of an XML document
 * once the first parsing error has been reported. However, since Axiom
 * uses deferred parsing, and client code accesses the XML infoset using
 * an object model, things are more complicated. Indeed, if the XML
 * document is not well formed, the corresponding error might be reported
 * as a runtime exception by any call to a method of an OM node.
 * 
 * Typically the client code will have some error handling that will intercept
 * runtime exceptions and take appropriate action. Very often this error handling
 * code might want to access the object model again, for example to log the request that caused the
 * failure. This causes no problem except if the runtime exception was caused by a
 * parsing error, in which case Axiom would again try to pull events from the parser.
 * 
 * This would lead to a situation where Axiom accesses a parser that has reported a parsing
 * error before. While one would expect that after a first error reported by the parser, all
 * subsequent invocations of the parser will fail, this is not the case for all parsers
 * (at least not in all situations). Instead, the parser might be left in an inconsistent
 * state after the error. E.g. AXIOM-34 describes a case where Woodstox
 * encounters an error in XMLStreamReader#getText() but continues to return
 * (incorrect) events afterwards. The explanation for this behaviour might be that
 * the situation described here is quite uncommon when StAX is used directly (i.e. not through
 * Axiom).
 * 
 * To avoid this, the builder remembers exceptions thrown by the parser and rethrows
 * them during a call to next().
 */
public class StAXOMBuilder extends StAXBuilder {
    // Logger used for all diagnostic output of this builder.
    private static final Log log = LogFactory.getLog(StAXOMBuilder.class);
    
    // Enables the extra parser state check in next() and the dump in logParserState().
    // Initialized from the logger's debug setting; may be overridden with setDoDebug(boolean).
    private boolean doTrace = log.isDebugEnabled();
    
    /**
     * Counter appended to the prefixes generated by {@link #createPrefix()}.
     * 
     * @deprecated
     */
    private static int nsCount = 0;

    // namespaceURI interning
    // default is false because most XMLStreamReader implementations don't do interning
    // due to performance impacts.  Thus a customer should not assume that a namespace
    // on an OMElement is interned.
    private boolean namespaceURIInterning = false;
    
    // Token pulled from the parser by lookahead() that has not been consumed by parserNext()
    // yet. A negative value means that there is no pending look-ahead token.
    private int lookAheadToken = -1;
    
    /**
     * For internal use only. All arguments are passed unchanged to the superclass constructor.
     *
     * @param ombuilderFactory
     *            the factory handed to the superclass
     * @param parser
     *            the stream reader handed to the superclass
     * @param detachable
     *            handed to the superclass
     * @param closeable
     *            handed to the superclass
     */
    public StAXOMBuilder(OMFactory ombuilderFactory, XMLStreamReader parser, Detachable detachable,
            Closeable closeable) {
        super(ombuilderFactory, parser, detachable, closeable);
    }
    
    /**
     * Creates a builder from a factory and a stream reader; both are passed unchanged to the
     * superclass constructor.
     *
     * @param ombuilderFactory
     *            the factory handed to the superclass
     * @param parser
     *            the stream reader handed to the superclass
     * @deprecated Please use the {@link OMXMLBuilderFactory} API.
     */
    public StAXOMBuilder(OMFactory ombuilderFactory, XMLStreamReader parser) {
        super(ombuilderFactory, parser);
    }

    /**
     * For internal use only. Creates a builder whose current target is an already existing
     * element: the element level is set to 1, the element becomes the build target and its
     * namespace declarations, attributes and line number are read from the parser's current
     * event.
     *
     * @param factory
     *            the factory handed to the superclass
     * @param parser
     *            the stream reader; presumably positioned on the START_ELEMENT event of the
     *            element (TODO confirm against callers)
     * @param element
     *            the element to populate; it is cast to {@link OMContainerEx}
     * @param characterEncoding
     *            handed to the superclass
     */
    public StAXOMBuilder(OMFactory factory, 
                         XMLStreamReader parser, 
                         OMElement element, 
                         String characterEncoding) {
        // Use this constructor because the parser is already past the START_DOCUMENT state.
        super(factory, parser, characterEncoding, null, null);  
        elementLevel = 1;
        target = (OMContainerEx)element;
        populateOMElement(element);
    }
    
    /**
     * Same as {@link #StAXOMBuilder(OMFactory, XMLStreamReader, OMElement, String)} with a
     * <code>null</code> character encoding.
     *
     * @param factory
     *            the factory to use
     * @param parser
     *            the stream reader to pull events from
     * @param element
     *            the element to populate
     * @deprecated Please use the {@link OMXMLBuilderFactory} API.
     */
    public StAXOMBuilder(OMFactory factory, XMLStreamReader parser, OMElement element) {
        this(factory, parser, element, null);
    }

    /**
     * Creates a builder that reads from the file with the given path. The file is opened with a
     * {@link FileInputStream} and wrapped in a stream reader obtained from {@link StAXUtils}.
     *
     * @param filePath
     *            path of the file to read
     * @throws XMLStreamException
     *             if the stream reader can't be created
     * @throws FileNotFoundException
     *             if the file can't be opened
     * @deprecated Please use the {@link OMXMLBuilderFactory} API.
     */
    public StAXOMBuilder(String filePath) throws XMLStreamException, FileNotFoundException {
        // NOTE(review): the FileInputStream opened here is not retained by this class, so nothing
        // visible in this file closes it -- verify whether this leaks a file handle.
        this(StAXUtils.createXMLStreamReader(new FileInputStream(filePath)));
    }

    /**
     * Creates a builder for the given stream reader, using the factory returned by
     * {@link OMAbstractFactory#getOMFactory()}.
     *
     * @param parser
     *            the stream reader to pull events from
     * @deprecated Please use the {@link OMXMLBuilderFactory} API.
     */
    public StAXOMBuilder(XMLStreamReader parser) {
        this(OMAbstractFactory.getOMFactory(), parser);
    }

    /**
     * Creates a builder that reads from the given input stream. The stream is wrapped in a
     * stream reader obtained from {@link StAXUtils}.
     *
     * @param inStream
     *            the stream to read the document from
     * @throws XMLStreamException
     *             if the stream reader can't be created
     * @deprecated Please use the {@link OMXMLBuilderFactory} API.
     */
    public StAXOMBuilder(InputStream inStream) throws XMLStreamException {
        this(StAXUtils.createXMLStreamReader(inStream));
    }

    /**
     * Creates a builder by calling the no-argument superclass constructor; no parser or factory
     * is supplied here.
     *
     * @deprecated
     */
    public StAXOMBuilder() {
        super();
    }

    /**
     * Creates the document node for this builder.
     *
     * @return the document obtained from the factory, created with this builder as argument
     */
    protected OMDocument createDocument() {
        final OMDocument newDocument = omfactory.createOMDocument(this);
        return newDocument;
    }

    /**
     * Pulls the next event from the parser and creates the corresponding node (or, for
     * END_ELEMENT and END_DOCUMENT, marks the current container as complete).
     * <p>
     * If the build target becomes <code>null</code> before the end of the document is reached,
     * the remaining events are consumed and the builder is marked as done.
     *
     * @return the type of the event that has been processed (one of the event codes defined by
     *         {@link XMLStreamConstants})
     * @throws IllegalStateException
     *             if caching is disabled
     * @throws OMException
     *             if the builder is already done or if the parser reports an event type that
     *             is not handled by this builder
     */
    public int next() throws OMException {
        if (!cache) {
            throw new IllegalStateException("Can't process next node because caching is disabled");
        }
        if (done) {
            throw new OMException("Can't process next node because the builder is already done");
        }
        createDocumentIfNecessary();
        int token = parserNext();
        // Re-check the flag after pulling the event; if caching is off at this point, the event
        // is returned without creating a node for it.
        if (!cache) {
            return token;
        }

        // Note: if autoClose is enabled, then the parser may be null at this point
        if (doTrace && parser != null) {
            // The current token should be the same as the one just obtained. This bit of code
            // is used to detect invalid parser state. A mismatch is only logged; it is
            // deliberately not turned into an exception.
            int currentParserToken = parser.getEventType();
            if (currentParserToken != token) {
                log.debug("WARNING: The current state of the parser is not equal to the " +
                          "state just received from the parser. The current state in the parser is " +
                          XMLEventUtils.getEventTypeString(currentParserToken) + " the state just received is " +
                          XMLEventUtils.getEventTypeString(token));
            }

            // Now log the current state of the parser
            logParserState();
        }

        switch (token) {
            case XMLStreamConstants.START_ELEMENT: {
                elementLevel++;
                OMNode node = createNextOMElement();
                // If the node was created by a custom builder, then it will be complete;
                // in this case, the target doesn't change
                if (!node.isComplete()) {
                    target = (OMContainerEx)node;
                }
                break;
            }
            case XMLStreamConstants.CHARACTERS:
                createOMText(XMLStreamConstants.CHARACTERS);
                break;
            case XMLStreamConstants.CDATA:
                createOMText(XMLStreamConstants.CDATA);
                break;
            case XMLStreamConstants.END_ELEMENT:
                elementLevel--;
                endElement();
                break;
            case XMLStreamConstants.END_DOCUMENT:
                done = true;
                ((OMContainerEx) this.document).setComplete(true);
                target = null;
                break;
            case XMLStreamConstants.SPACE:
                createOMText(XMLStreamConstants.SPACE);
                break;
            case XMLStreamConstants.COMMENT:
                createComment();
                break;
            case XMLStreamConstants.DTD:
                createDTD();
                break;
            case XMLStreamConstants.PROCESSING_INSTRUCTION:
                createPI();
                break;
            case XMLStreamConstants.ENTITY_REFERENCE:
                createEntityReference();
                break;
            default :
                throw new OMException("Unexpected event type received from the parser: " + token);
        }

        if (target == null && !done) {
            // We get here if the document has been discarded (by getDocumentElement(true)
            // or because the builder is linked to an OMSourcedElement) and
            // we just processed the END_ELEMENT event for the root element. In this case, we consume
            // the remaining events until we reach the end of the document. This serves several purposes:
            //  * It allows us to detect documents that have an epilog that is not well formed.
            //  * Many parsers will perform some cleanup when the end of the document is reached.
            //    For example, Woodstox will recycle the symbol table if the parser gets past the
            //    last END_ELEMENT. This improves performance because Woodstox by default interns
            //    all symbols; if the symbol table can be recycled, then this reduces the number of
            //    calls to String#intern().
            //  * If autoClose is set, the parser will be closed so that even more resources
            //    can be released.
            while (parserNext() != XMLStreamConstants.END_DOCUMENT) {
                // Just loop
            }
            done = true;
        }

        return token;
    }
    
    /**
     * Creates the node for the current START_ELEMENT event, either through a
     * {@link CustomBuilder} or through the default mechanism ({@link #createOMElement()}).
     * <p>
     * The payload custom builder is selected for an element at level 1 if one is registered.
     * Otherwise, if custom builders are registered and the element level does not exceed the
     * configured maximum depth, a custom builder is looked up by namespace URI and local name.
     * If no custom builder is selected, or if the selected one returns <code>null</code>, the
     * default mechanism is used.
     *
     * @return the node created for the current element
     */
    protected OMNode createNextOMElement() {
        CustomBuilder selectedBuilder = null;
        if (elementLevel == 1 && customBuilderForPayload != null) {
            selectedBuilder = customBuilderForPayload;
        } else if (customBuilders != null && elementLevel <= maxDepthForCustomBuilders) {
            selectedBuilder = getCustomBuilder(parser.getNamespaceURI(), parser.getLocalName());
        }

        OMNode customNode = null;
        if (selectedBuilder != null) {
            customNode = createWithCustomBuilder(selectedBuilder, omfactory);
        }
        if (customNode != null) {
            // Decrease level since custom builder read the end element event
            elementLevel--;
            return customNode;
        }
        return createOMElement();
    }
    
    /**
     * Asks the given custom builder to create the node for the element the parser is currently
     * positioned on. The namespace URI (with <code>null</code> replaced by the empty string)
     * and the local name are read from the parser and passed to the custom builder together
     * with the current target, the parser and the factory.
     *
     * @param customBuilder
     *            the custom builder to invoke
     * @param factory
     *            the factory passed on to the custom builder
     * @return the node returned by the custom builder; <code>null</code> if it did not
     *         construct a node
     */
    protected OMNode createWithCustomBuilder(CustomBuilder customBuilder, OMFactory factory) {
        
        String namespace = parser.getNamespaceURI();
        if (namespace == null) {
            namespace = "";
        }
        String localPart = parser.getLocalName();
        
        if (log.isDebugEnabled()) {
            log.debug("Invoking CustomBuilder, " + customBuilder.toString() + 
                      ", to create the OMNode for {" + namespace + "}" + localPart);
        }
        
        // TODO: dirty hack part 1
        // The custom builder will use addNode to insert the new node into the tree. However,
        // addNode is expected to always add the new child at the end and will attempt to
        // build the parent node. We temporarily set complete to true to avoid this.
        // There is really an incompatibility between the contract of addNode and the
        // custom builder API. This should be fixed in Axiom 1.3.
        // NOTE(review): the flag is not restored if the custom builder throws -- confirm
        // whether that is intended.
        target.setComplete(true);
        
        OMNode node = customBuilder.create(namespace, localPart, target, parser, factory);
        
        // TODO: dirty hack part 2
        target.setComplete(false);
        
        if (log.isDebugEnabled()) {
            if (node != null) {
                log.debug("The CustomBuilder, " + customBuilder.toString() + 
                          ", successfully constructed the OMNode for {" + namespace + "}" + localPart);
            } else {
                log.debug("The CustomBuilder, " + customBuilder.toString() + 
                          " did not construct an OMNode for {" + namespace + "}" + localPart +
                          ". The OMNode will be constructed using the installed stax om builder");
            }
            log.debug("The current state of the parser is: ");
            logParserState();
        }
        return node;
    }
    
    /**
     * Dumps the current event of the parser to the log. Nothing is logged unless tracing has
     * been enabled (see {@link #setDoDebug(boolean)}).
     * <p>
     * This method is purely diagnostic. In particular, a failure to read the text of a DTD
     * event is logged instead of being propagated, so that enabling tracing doesn't cause
     * parsing to fail.
     */
    protected void logParserState() {
        if (!doTrace) {
            return;
        }
        int currentEvent = parser.getEventType();
        
        switch (currentEvent) {
        case XMLStreamConstants.START_ELEMENT:
            log.trace("START_ELEMENT: ");
            log.trace("  QName: " + parser.getName());
            break;
        case XMLStreamConstants.START_DOCUMENT:
            log.trace("START_DOCUMENT: ");
            break;
        case XMLStreamConstants.CHARACTERS:
            // The text is deliberately not logged; the original author notes that reading
            // it here can bust up a datahandler.
            log.trace("CHARACTERS: ");
            break;
        case XMLStreamConstants.CDATA:
            // The text is deliberately not logged (same reason as for CHARACTERS).
            log.trace("CDATA: ");
            break;
        case XMLStreamConstants.END_ELEMENT:
            log.trace("END_ELEMENT: ");
            log.trace("  QName: " + parser.getName());
            break;
        case XMLStreamConstants.END_DOCUMENT:
            log.trace("END_DOCUMENT: ");
            break;
        case XMLStreamConstants.SPACE:
            // The text is deliberately not logged.
            log.trace("SPACE: ");
            break;
        case XMLStreamConstants.COMMENT:
            // The text is deliberately not logged.
            log.trace("COMMENT: ");
            break;
        case XMLStreamConstants.DTD:
            log.trace("DTD: ");
            try {
                log.trace(   "[" + parser.getText() + "]");
            } catch (RuntimeException ex) {
                // getText() on a DOCTYPE may fail, e.g. when the parser attempts to load an
                // external subset (see getDTDText()). The caller handles that case on its
                // own; tracing must not abort the parsing.
                log.trace("   [DTD text not available: " + ex + "]");
            }
            break;
        case XMLStreamConstants.PROCESSING_INSTRUCTION:
            log.trace("PROCESSING_INSTRUCTION: ");
            log.trace("   [" + parser.getPITarget() + "][" +
                        parser.getPIData() + "]");
            break;
        case XMLStreamConstants.ENTITY_REFERENCE:
            log.trace("ENTITY_REFERENCE: ");
            log.trace("    " + parser.getLocalName() + "[" +
                        parser.getText() + "]");
            break;
        default :
            log.trace("UNKNOWN_STATE: " + currentEvent);
        }
    }

    /**
     * Fills an element with the data of the START_ELEMENT event the parser is positioned on:
     * namespace declarations and the element's own namespace first, then the attributes and
     * finally the line number, if the parser reports a location. This is used when the source
     * of data for an element needs to be parsed on demand. The supplied element must already be
     * set to the proper name.
     *
     * @param node element to be populated
     */
    private void populateOMElement(OMElement node) {
        processNamespaceData(node);
        processAttributes(node);

        final Location parserLocation = parser.getLocation();
        if (parserLocation == null) {
            // No location information available; leave the line number untouched
            return;
        }
        node.setLineNumber(parserLocation.getLineNumber());
    }

    /**
     * Creates the element for the current START_ELEMENT event using the default mechanism: the
     * element is instantiated by {@link #constructNode(OMContainer, String)} with the current
     * target and the local name reported by the parser, and then populated with namespace and
     * attribute data.
     * <p>
     * This method is not meant to be overridden. Override constructNode to create model
     * specific OMElement instances.
     *
     * @return the newly created element
     * @throws OMException
     */
    protected final OMNode createOMElement() throws OMException {
        final String localName = parser.getLocalName();
        final OMElement element = constructNode(target, localName);
        populateOMElement(element);
        return element;
    }

    /**
     * Instantiate the appropriate {@link OMElement} implementation for the current element. This
     * method may be overridden by subclasses to support model specific {@link OMElement} types. The
     * implementation of this method is expected to initialize the {@link OMElement} with the
     * specified local name and to add it to the specified parent. However, the implementation
     * should not set the namespace of the element or process the attributes of the element. This is
     * taken care of by the caller of this method.
     * <p>
     * The default implementation asks the factory to create the element with the given local
     * name and parent, and with this builder as third argument.
     * 
     * @param parent
     *            the parent for the element
     * @param elementName
     *            the local name for the element
     * @return the newly created {@link OMElement}; must not be <code>null</code>
     */
    protected OMElement constructNode(OMContainer parent, String elementName) {
        // Honor the arguments instead of re-reading the parser and the current target, so that
        // the method does what its contract says for any caller. createOMElement() passes
        // target and parser.getLocalName(), so the result is unchanged for that caller.
        return omfactory.createOMElement(elementName, parent, this);
    }
    
    /**
     * Creates a comment node for the current COMMENT event and adds it to the current target.
     * The content of the comment is the text reported by the parser.
     *
     * @return the node returned by the factory
     * @throws OMException
     */
    protected OMNode createComment() throws OMException {
        final String commentText = parser.getText();
        return omfactory.createOMComment(target, commentText, true);
    }

    /**
     * Creates a document type declaration node for the current DTD event and adds it to the
     * current target. Root name, public ID and system ID are obtained through the
     * {@link DTDReader} extension of the parser; the internal subset is the text returned by
     * the parser for the event, with an empty string normalized to <code>null</code>.
     *
     * @return the node returned by the factory
     * @throws OMException
     *             if the parser doesn't support the {@link DTDReader} extension
     */
    protected OMNode createDTD() throws OMException {
        DTDReader reader = null;
        try {
            reader = (DTDReader)parser.getProperty(DTDReader.PROPERTY);
        } catch (IllegalArgumentException ex) {
            // The property is not recognized by the parser; reader stays null
        }
        if (reader == null) {
            throw new OMException("Cannot create OMDocType because the XMLStreamReader doesn't support the DTDReader extension");
        }

        String subset = getDTDText();
        // Woodstox returns an empty string if there is no internal subset
        if ("".equals(subset)) {
            subset = null;
        }

        final String rootName = reader.getRootName();
        final String publicId = reader.getPublicId();
        final String systemId = reader.getSystemId();
        return omfactory.createOMDocType(target, rootName, publicId, systemId, subset, true);
    }
    
    /**
     * Gets the text of the current DTD event.
     * <p>
     * The getText() method for a DOCTYPE returns the 
     * subset of the DOCTYPE (not the direct infoset).
     * This may force the parser to get information from 
     * the network.
     * <p>
     * If getText() fails with a runtime exception and the parser reports that support for
     * external entities is disabled, the exception is logged at debug level and
     * <code>null</code> is returned. In all other cases the exception is rethrown.
     *
     * @return doctype subset; <code>null</code> if it could not be read and the failure has
     *         been ignored
     * @throws OMException
     */
    private String getDTDText() throws OMException { 
        String text = null;
        try {
            text = parser.getText();
        } catch (RuntimeException e) {
            // Woodstox (and perhaps other parsers)
            // attempts to load the external subset even if
            // external entities is false.  So ignore this error
            // if external entity support is explicitly disabled.
            Boolean b = (Boolean) parser.getProperty(
                   XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES);
            // Compare by value: the parser is not required to return the canonical
            // Boolean.TRUE instance, so an identity comparison could wrongly swallow
            // the exception.
            if (b == null || b.booleanValue()) {
                throw e;
            }
            if (log.isDebugEnabled()) {
                log.debug("An exception occurred while calling getText() for a DOCTYPE.  " +
                                "The exception is ignored because external " +
                                "entites support is disabled.  " +
                                "The ignored exception is " + e);
            }
        }
        return text;
    }

    /**
     * Creates a processing instruction node for the current PROCESSING_INSTRUCTION event and
     * adds it to the current target. Target and data of the processing instruction are those
     * reported by the parser.
     *
     * @return the node returned by the factory
     * @throws OMException
     */
    protected OMNode createPI() throws OMException {
        final String piTarget = parser.getPITarget();
        final String piData = parser.getPIData();
        return omfactory.createOMProcessingInstruction(target, piTarget, piData, true);
    }

    /**
     * Creates an entity reference node for the current ENTITY_REFERENCE event and adds it to
     * the current target. The local name and the text reported by the parser are passed to the
     * factory.
     *
     * @return the node returned by the factory
     */
    protected OMNode createEntityReference() {
        final String entityName = parser.getLocalName();
        final String entityText = parser.getText();
        return omfactory.createOMEntityReference(target, entityName, entityText, true);
    }
    
    /**
     * Handles an END_ELEMENT event: marks the current target as complete and moves the target
     * one level up. The caller is expected to have decremented the element level already.
     */
    private void endElement() {
        target.setComplete(true);
        if (elementLevel != 0) {
            // Still inside the tree: continue with the parent of the element just completed
            target = (OMContainerEx)((OMElement)target).getParent();
            return;
        }
        // The root element has been completed. This is relevant for OMSourcedElements and for
        // the case where the document has been discarded using getDocumentElement(true). In
        // these cases, this will actually set target to null. In all other cases, this has the
        // same effect as moving to the parent.
        target = (OMContainerEx)document;
    }

    /**
     * Gets the document element without discarding the document. Equivalent to
     * {@link #getDocumentElement(boolean) getDocumentElement(false)}.
     *
     * @return the document element
     */
    public OMElement getDocumentElement() {
        final boolean discardDocument = false;
        return getDocumentElement(discardDocument);
    }

    /**
     * Gets the document element and optionally discards the document it belongs to.
     *
     * @param discardDocument
     *            if <code>true</code>, the element is detached from its parent, the parent is
     *            discarded and the builder drops its reference to the document
     * @return the document element
     */
    public OMElement getDocumentElement(boolean discardDocument) {
        final OMElement root = getDocument().getOMDocumentElement();
        if (!discardDocument) {
            return root;
        }
        ((OMElementEx)root).detachAndDiscardParent();
        document = null;
        return root;
    }

    /**
     * Transfers the namespace information of the current START_ELEMENT event to the given
     * element: every namespace declaration reported by the parser is added to the element, and
     * the element's own namespace is then set from the namespace URI and prefix of the event.
     * <code>null</code> namespace URIs and prefixes in declarations are replaced by the empty
     * string.
     *
     * @param node
     *            the element to add the namespace information to
     */
    protected void processNamespaceData(OMElement node) {
        for (int index = 0, total = parser.getNamespaceCount(); index < total; index++) {
            String declaredPrefix = parser.getNamespacePrefix(index);
            String declaredURI = parser.getNamespaceURI(index);

            if (declaredURI == null) {
                // No need to care about interning here; String literals are always interned
                declaredURI = "";
            } else if (isNamespaceURIInterning()) {
                // NOTE_A:
                // By default most parsers don't intern the namespace.
                // Unfortunately the property to detect interning on the delegate parsers is hard to detect.
                // Woodstox has a proprietary property on the XMLInputFactory.
                // IBM has a proprietary property on the XMLStreamReader.
                // For now only force the interning if requested.
                declaredURI = declaredURI.intern();
            }

            ((OMElementEx)node).addNamespaceDeclaration(
                    declaredURI, declaredPrefix == null ? "" : declaredPrefix);
        }

        // Set the element's own namespace; interning is handled as described in NOTE_A above
        BuilderUtil.setNamespace(
                node, parser.getNamespaceURI(), parser.getPrefix(), isNamespaceURIInterning());
    }

    /**
     * Switches the tracing of the parser state on or off, overriding the value derived from
     * the logger configuration.
     *
     * @param doDebug
     *            <code>true</code> to enable tracing, <code>false</code> to disable it
     * @deprecated
     */
    public void setDoDebug(boolean doDebug) {
        doTrace = doDebug;
    }

    /**
     * Generates a prefix of the form <code>ns</code><i>N</i>, where <i>N</i> is the current
     * value of a class-wide counter that is incremented on every call.
     *
     * @return the generated prefix
     * @deprecated A builder doesn't need to generate prefixes.
     */
    protected String createPrefix() {
        final int sequence = nsCount;
        nsCount = sequence + 1;
        return "ns" + sequence;
    }

    /**
     * Enables or disables the interning of namespace URIs by this builder.
     *
     * @param b
     *            <code>true</code> if namespace URIs should be interned
     */
    public void setNamespaceURIInterning(boolean b) {
        namespaceURIInterning = b;
    }
    
    /**
     * Indicates whether this builder interns namespace URIs.
     *
     * @return <code>true</code> if namespace URI interning is enabled
     */
    public boolean isNamespaceURIInterning() {
        return namespaceURIInterning;
    }
    
    /**
     * Pushes the virtual parser ahead one token.
     * <p>
     * If a look-ahead token is pending, it is consumed and returned without touching the
     * underlying parser. Otherwise the next event is pulled from the parser. An exception
     * remembered from an earlier call is rethrown instead of accessing the parser again, and an
     * {@link XMLStreamException} thrown by the parser is remembered for that purpose. When the
     * end of the document is reached, the parser is closed if autoClose is set.
     *
     * @return next token
     * @throws DeferredParsingException
     *             if the parser throws (or has thrown before) an {@link XMLStreamException}
     * @throws OMException
     *             if END_DOCUMENT is reached while caching is enabled and the element level is
     *             not zero
     */
    int parserNext() {
        if (lookAheadToken >= 0) {
            final int pending = lookAheadToken;
            if (log.isDebugEnabled()) {
                log.debug("Consuming look-ahead token " + XMLEventUtils.getEventTypeString(pending));
            }
            lookAheadToken = -1; // Reset
            return pending;
        }

        try {
            if (parserException != null) {
                log.warn("Attempt to access a parser that has thrown a parse exception before; " +
                        "rethrowing the original exception.");
                if (parserException instanceof XMLStreamException) {
                    throw (XMLStreamException)parserException;
                }
                throw (RuntimeException)parserException;
            }

            final int event;
            try {
                event = parser.next();
            } catch (XMLStreamException ex) {
                // Remember the failure so that the parser is never accessed again
                parserException = ex;
                throw ex;
            }

            if (event != XMLStreamConstants.END_DOCUMENT) {
                return event;
            }
            if (cache && elementLevel != 0) {
                throw new OMException("Unexpected END_DOCUMENT event");
            }
            if (autoClose) {
                close();
            }
            return event;
        } catch (XMLStreamException ex) {
            throw new DeferredParsingException(ex);
        }
    }
    
    /**
     * Looks ahead to the next start element. Tokens are pulled from the parser one at a time.
     * A START_ELEMENT token is left pending (the corresponding node is not created) and ends
     * the search successfully. Any other token is processed by {@link #next()}; the search
     * gives up after processing an END_ELEMENT, START_DOCUMENT or END_DOCUMENT token.
     *
     * @return <code>true</code> if a start element has been found; <code>false</code> if the
     *         current scope was left before a start element was found
     */
    public boolean lookahead()  {
        for (;;) {
            if (lookAheadToken < 0) {
                lookAheadToken = parserNext();
            }
            final int peeked = lookAheadToken;

            if (peeked == XMLStreamConstants.START_ELEMENT) {
                log.debug("Performing look-ahead; START_ELEMENT found");
                return true;
            }

            final boolean leavingScope = peeked == XMLStreamConstants.END_ELEMENT
                    || peeked == XMLStreamConstants.START_DOCUMENT
                    || peeked == XMLStreamConstants.END_DOCUMENT;
            if (leavingScope && log.isDebugEnabled()) {
                log.debug("Performing look-ahead; " + XMLEventUtils.getEventTypeString(peeked) + " found");
            }

            // Let the builder process the pending token (whitespace, comments, end tags, ...)
            next();
            if (leavingScope) {
                return false;  // leaving scope...start element not found
            }
        }
    }
    
    /**
     * Check if the node for the current token has already been created or if the parser is ahead
     * of the builder.
     * 
     * @return A return value of <code>true</code> indicates that the parser is one token ahead
     *         of the builder, i.e. that the node for the current token has not been created yet.
     *         This state can only be reached by a call to {@link #lookahead()}, and the
     *         current token is always a {@link XMLStreamConstants#START_ELEMENT START_ELEMENT}.
     *         The information related to that element can be obtained by calls to
     *         {@link #getName()}, {@link #getNamespace()}, {@link #getPrefix()},
     *         {@link #getAttributeCount()}, {@link #getAttributeName(int)},
     *         {@link #getAttributeNamespace(int)}, {@link #getAttributePrefix(int)},
     *         {@link #getNamespaceCount()}, {@link #getNamespacePrefix(int)} and
     *         {@link #getNamespaceUri(int)}.
     *         <p>
     *         A return value of <code>false</code> indicates that the node corresponding to the
     *         current token held by the parser has already been created.
     */
    public boolean isLookahead() {
        // A pending look-ahead token is represented by a non-negative value
        final boolean tokenPending = !(lookAheadToken < 0);
        return tokenPending;
    }
}