/*
 * Copyright (c) 1998, 2011, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package com.sun.xml.internal.dtdparser;

import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.Hashtable;
import java.util.Locale;

/**
 * This entity resolver class provides a number of utilities which can help
 * management of external parsed entities in XML.  These are commonly used
 * to hold markup declarations that are to be used as part of a Document
 * Type Declaration (DTD), or to hold text marked up with XML.
 *
 * <p>Features include:
 * <ul>
 * <li>Static factory methods for constructing SAX <code>InputSource</code>
 * objects from files, URLs, or MIME-typed byte streams.</li>
 * <li>Character encoding selection based on the MIME content type
 * (including its <code>charset=...</code> attribute) and the URL scheme,
 * falling back to autodetection when no encoding can be determined.</li>
 * <li>The use of MIME typing may optionally be disabled (see
 * {@link #setIgnoringMIME}), forcing autodetection, to work around servers
 * which report incorrect MIME types or encodings.</li>
 * <li>Catalog entries can map public identifiers to local URIs or to Java
 * resources, which are then used in preference to system identifiers.</li>
 * </ul>
 *
 * <p>Subclasses can perform tasks such as supporting new URI schemes for
 * URIs which are not URLs, such as URNs (see RFC 2396) or for accessing
 * MIME entities which are part of a <em>multipart/related</em> group
 * (see RFC 2387).  They may also be used to support particular catalog
 * syntaxes, such as the SGML/Open Catalog (SOCAT) which supports the SGML
 * notion of "Formal Public Identifiers" (FPIs).
 *
 * @author David Brownell
 * @author Janet Koenig
 * @version 1.3 00/02/24
 */
public class Resolver implements EntityResolver {
    private boolean ignoringMIME;

    // table mapping public IDs to (local) URIs
    private Hashtable id2uri;

    // tables mapping public IDs to resources and classloaders
    private Hashtable id2resource;
    private Hashtable id2loader;

    //
    // table of MIME content types (less attributes!) known
    // to be mostly "OK" to use with XML MIME entities.  the
    // idea is to rule out obvious braindamage ("image/jpg")
    // not the subtle stuff ("text/html") that might actually
    // be (or become) safe.
    //
    private static final String[] types = {
        "application/xml",
        "text/xml",
        "text/plain",
        "text/html",                // commonly mis-inferred
        "application/x-netcdf",     // this is often illegal XML
        "content/unknown"
    };

    /**
     * Constructs a resolver.
     */
    public Resolver() {
    }

    /**
     * Returns an input source, using the MIME type information and URL
     * scheme to statically determine the correct character encoding if
     * possible and otherwise autodetecting it.  MIME carefully specifies
     * the character encoding defaults, and how attributes of the content
     * type can change it.  XML further specifies two mandatory encodings
     * (UTF-8 and UTF-16), and includes an XML declaration which can be
     * used to internally label most documents encoded using US-ASCII
     * supersets (such as Shift_JIS, EUC-JP, ISO-2022-*, ISO-8859-*, and
     * more).
     *
     * <p>This method can be used to access XML documents which do not
     * have URIs (such as servlet input streams, or most JavaMail message
     * entities) and to support access methods such as HTTP POST or PUT.
     * (URLs normally return content using the GET method.)
     *
     * <p>The caller should set the system ID in order for relative URIs
     * found in this document to be interpreted correctly.  In some cases,
     * a custom resolver will need to be used; for example, documents
     * may be grouped in a single MIME "multipart/related" bundle, and
     * relative URLs would refer to other documents in that bundle.
     *
     * <p>The supplied stream is not closed by this method.
     *
     * @param contentType The MIME content type for the source for which
     *                    an InputSource is desired, such as <em>text/xml;charset=utf-8</em>.
     *                    May be null, in which case no type check is done and no
     *                    charset is passed to the reader.
     * @param stream      The input byte stream for the input source.
     * @param checkType   If true, this verifies that the content type is known
     *                    to support XML documents, such as <em>application/xml</em>.
     * @param scheme      Unless this is "file", unspecified MIME types
     *                    default to US-ASCII.  Files are always autodetected since most
     *                    file systems discard character encoding information.
     * @return an input source whose character stream, byte stream and
     *         encoding have been set
     * @throws IOException if <code>checkType</code> is true and the content
     *                     type is not in the list of accepted types, or if
     *                     the reader cannot be created
     */
    public static InputSource createInputSource(String contentType,
                                                InputStream stream,
                                                boolean checkType,
                                                String scheme) throws IOException {
        InputSource retval;
        String charset = null;

        if (contentType != null) {
            int index;

            // MIME type names are ASCII; use a fixed locale so that e.g. a
            // Turkish default locale does not turn "TEXT/XML" into "text/xml"
            // with a dotless i.
            contentType = contentType.toLowerCase(Locale.ENGLISH);
            index = contentType.indexOf(';');
            if (index != -1) {
                // use "charset=..." if it's available
                charset = parseCharset(contentType.substring(index + 1));
                contentType = contentType.substring(0, index);
            }
            // Trim before comparing, so "text/xml ; charset=..." is accepted.
            contentType = contentType.trim();

            //
            // Check MIME type.
            //
            if (checkType) {
                boolean isOK = false;
                for (int i = 0; i < types.length; i++) {
                    if (types[i].equals(contentType)) {
                        isOK = true;
                        break;
                    }
                }
                if (!isOK) {
                    throw new IOException("Not XML: " + contentType);
                }
            }

            //
            // "text/*" MIME types have hard-wired character set
            // defaults, as specified in the RFCs.  For XML, we
            // ignore the system "file.encoding" property since
            // autodetection is more correct.
            //
            if (charset == null) {
                if (contentType.startsWith("text/")) {
                    if (!"file".equalsIgnoreCase(scheme)) {
                        charset = "US-ASCII";
                    }
                }
                // "application/*" has no default
            }
        }

        retval = new InputSource(XmlReader.createReader(stream, charset));
        retval.setByteStream(stream);
        retval.setEncoding(charset);
        return retval;
    }

    /**
     * Extracts the value of the "charset" attribute from the attribute
     * portion of a content type (the text following the first ';').
     *
     * @param attributes the (already lowercased) content type attributes
     * @return the trimmed charset value, or null if there is no
     *         "charset" attribute followed by an '=' sign
     */
    private static String parseCharset(String attributes) {
        int index = attributes.indexOf("charset");
        if (index == -1) {
            return null;
        }
        attributes = attributes.substring(index + 7);

        // strip out subsequent attributes
        if ((index = attributes.indexOf(';')) != -1) {
            attributes = attributes.substring(0, index);
        }

        // find start of value
        if ((index = attributes.indexOf('=')) == -1) {
            return null;
        }
        attributes = attributes.substring(index + 1);

        // strip out rfc822 comments
        if ((index = attributes.indexOf('(')) != -1) {
            attributes = attributes.substring(0, index);
        }

        // double quotes are optional
        if ((index = attributes.indexOf('"')) != -1) {
            attributes = attributes.substring(index + 1);
            // A missing closing quote used to cause a
            // StringIndexOutOfBoundsException; tolerate it by taking
            // the remainder as the value.
            int closing = attributes.indexOf('"');
            if (closing != -1) {
                attributes = attributes.substring(0, closing);
            }
        }

        // XXX "\;", "\)" etc were mishandled above
        return attributes.trim();
    }

    /**
     * Wraps the stream in an input source whose reader autodetects the
     * character encoding.  If the reader cannot be created the stream is
     * closed before the failure propagates, so that it is not leaked.
     */
    private static InputSource createAutodetectingSource(InputStream stream)
            throws IOException {
        boolean created = false;
        try {
            InputSource retval = new InputSource(XmlReader.createReader(stream));
            created = true;
            return retval;
        } finally {
            if (!created) {
                try {
                    stream.close();
                } catch (IOException ignored) {
                    // the original failure is the one worth reporting
                }
            }
        }
    }


    /**
     * Creates an input source from a given URI.
     *
     * @param uri       the URI (system ID) for the entity
     * @param checkType if true, the MIME content type reported for the
     *                  entity is used to determine the character encoding;
     *                  if false the encoding is autodetected.  Note that
     *                  the content type is <em>not</em> validated against
     *                  the list of known XML types in either case.
     * @return an input source whose system ID is the URL reported by the
     *         opened connection
     * @throws IOException if the connection cannot be opened or read
     */
    static public InputSource createInputSource(URL uri, boolean checkType)
            throws IOException {

        URLConnection conn = uri.openConnection();
        InputSource retval;

        if (checkType) {
            String contentType = conn.getContentType();
            retval = createInputSource(contentType, conn.getInputStream(),
                    false, uri.getProtocol());
        } else {
            retval = createAutodetectingSource(conn.getInputStream());
        }
        retval.setSystemId(conn.getURL().toString());
        return retval;
    }


    /**
     * Creates an input source from a given file, autodetecting
     * the character encoding.
     *
     * @param file the file to read
     * @return an input source whose system ID is a "file:" URI built from
     *         the file's absolute path
     * @throws IOException if the file cannot be opened or read
     */
    static public InputSource createInputSource(File file)
            throws IOException {
        InputSource retval;
        String path;

        retval = createAutodetectingSource(new FileInputStream(file));

        // On JDK 1.2 and later, simplify this:
        //    "path = file.toURL ().toString ()".
        path = file.getAbsolutePath();
        if (File.separatorChar != '/') {
            path = path.replace(File.separatorChar, '/');
        }
        if (!path.startsWith("/")) {
            path = "/" + path;
        }
        if (!path.endsWith("/") && file.isDirectory()) {
            path = path + "/";
        }

        retval.setSystemId("file:" + path);
        return retval;
    }


    /**
     * SAX:
     * Resolve the given entity into an input source.  If the name can't
     * be mapped to a preferred form of the entity, the URI is used.  To
     * resolve the entity, first a local catalog mapping names to URIs is
     * consulted.  If no mapping is found there, a catalog mapping names
     * to java resources is consulted.  Finally, if neither mapping found
     * a copy of the entity, the specified URI is used.
     *
     * <p>When a URI is used, <code>createInputSource</code> is used to
     * correctly deduce the character encoding used by this entity.
     * No MIME type checking is done.
     *
     * @param name Used to find alternate copies of the entity, when
     *             this value is non-null; this is the XML "public ID".
     * @param uri  Used when no alternate copy of the entity is found;
     *             this is the XML "system ID", normally a URI.
     * @return the resolved input source, or null if no mapping exists for
     *         <code>name</code> and <code>uri</code> is null
     * @throws IOException if the entity cannot be opened or read
     */
    public InputSource resolveEntity(String name, String uri)
            throws IOException {
        InputSource retval;
        String mappedURI = name2uri(name);
        InputStream stream;

        // prefer explicit URI mappings, then bundled resources...
        if (mappedURI == null && (stream = mapResource(name)) != null) {
            uri = "java:resource:" + (String) id2resource.get(name);
            retval = createAutodetectingSource(stream);

            // ...and treat all URIs the same (as URLs for now).
        } else {
            URL url;
            URLConnection conn;

            if (mappedURI != null) {
                uri = mappedURI;
            } else if (uri == null) {
                return null;
            }

            url = new URL(uri);
            conn = url.openConnection();
            uri = conn.getURL().toString();
            if (ignoringMIME) {
                retval = createAutodetectingSource(conn.getInputStream());
            } else {
                String contentType = conn.getContentType();
                retval = createInputSource(contentType,
                        conn.getInputStream(),
                        false, url.getProtocol());
            }
        }
        retval.setSystemId(uri);
        retval.setPublicId(name);
        return retval;
    }


    /**
     * Returns true if this resolver is ignoring MIME types in the documents
     * it returns, to work around bugs in how servers have reported the
     * documents' MIME types.
     */
    public boolean isIgnoringMIME() {
        return ignoringMIME;
    }

    /**
     * Tells the resolver whether to ignore MIME types in the documents it
     * retrieves.  Many web servers incorrectly assign text documents a
     * default character encoding, even when that is incorrect.  For example,
     * all HTTP text documents default to use ISO-8859-1 (used for Western
     * European languages), and other MIME sources default text documents
     * to use US-ASCII (a seven bit encoding).  For XML documents which
     * include text encoding declarations (as most should do), these server
     * bugs can be worked around by ignoring the MIME type entirely.
     *
     * @param value true to ignore MIME types and always autodetect
     */
    public void setIgnoringMIME(boolean value) {
        ignoringMIME = value;
    }


    // maps the public ID to an alternate URI, if one is registered
    private String name2uri(String publicId) {
        if (publicId == null || id2uri == null) {
            return null;
        }
        return (String) id2uri.get(publicId);
    }


    /**
     * Registers the given public ID as corresponding to a particular
     * URI, typically a local copy.  This URI will be used in preference
     * to ones provided as system IDs in XML entity declarations.  This
     * mechanism would most typically be used for Document Type Definitions
     * (DTDs), where the public IDs are formally managed and versioned.
     *
     * @param publicId The managed public ID being mapped
     * @param uri      The URI of the preferred copy of that entity
     */
    public void registerCatalogEntry(String publicId,
                                     String uri) {
        if (id2uri == null) {
            id2uri = new Hashtable(17);
        }
        id2uri.put(publicId, uri);
    }


    // return the resource as a stream
    private InputStream mapResource(String publicId) {
        if (publicId == null || id2resource == null) {
            return null;
        }

        String resourceName = (String) id2resource.get(publicId);
        ClassLoader loader = null;

        if (resourceName == null) {
            return null;
        }

        if (id2loader != null) {
            loader = (ClassLoader) id2loader.get(publicId);
        }
        if (loader == null) {
            return ClassLoader.getSystemResourceAsStream(resourceName);
        }
        return loader.getResourceAsStream(resourceName);
    }

    /**
     * Registers a given public ID as corresponding to a particular Java
     * resource in a given class loader, typically distributed with a
     * software package.  This resource will be preferred over system IDs
     * included in XML documents.  This mechanism should most typically be
     * used for Document Type Definitions (DTDs), where the public IDs are
     * formally managed and versioned.
     *
     * <p>If a mapping to a URI has been provided, that mapping takes
     * precedence over this one.
     *
     * @param publicId     The managed public ID being mapped
     * @param resourceName The name of the Java resource
     * @param loader       The class loader holding the resource, or null if
     *                     it is a system resource.
     */
    public void registerCatalogEntry(String publicId,
                                     String resourceName,
                                     ClassLoader loader) {
        if (id2resource == null) {
            id2resource = new Hashtable(17);
        }
        id2resource.put(publicId, resourceName);

        if (loader != null) {
            if (id2loader == null) {
                id2loader = new Hashtable(17);
            }
            id2loader.put(publicId, loader);
        } else if (id2loader != null) {
            // Re-registering as a system resource must drop any loader
            // left over from an earlier registration of this public ID.
            id2loader.remove(publicId);
        }
    }
}