aoqi@0: /* aoqi@0: * Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved. aoqi@0: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. aoqi@0: * aoqi@0: * This code is free software; you can redistribute it and/or modify it aoqi@0: * under the terms of the GNU General Public License version 2 only, as aoqi@0: * published by the Free Software Foundation. Oracle designates this aoqi@0: * particular file as subject to the "Classpath" exception as provided aoqi@0: * by Oracle in the LICENSE file that accompanied this code. aoqi@0: * aoqi@0: * This code is distributed in the hope that it will be useful, but WITHOUT aoqi@0: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or aoqi@0: * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License aoqi@0: * version 2 for more details (a copy is included in the LICENSE file that aoqi@0: * accompanied this code). aoqi@0: * aoqi@0: * You should have received a copy of the GNU General Public License version aoqi@0: * 2 along with this work; if not, write to the Free Software Foundation, aoqi@0: * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. aoqi@0: * aoqi@0: * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA aoqi@0: * or visit www.oracle.com if you need additional information or have any aoqi@0: * questions. aoqi@0: */ aoqi@0: aoqi@0: package com.sun.xml.internal.dtdparser; aoqi@0: aoqi@0: import org.xml.sax.EntityResolver; aoqi@0: import org.xml.sax.InputSource; aoqi@0: aoqi@0: import java.io.File; aoqi@0: import java.io.FileInputStream; aoqi@0: import java.io.IOException; aoqi@0: import java.io.InputStream; aoqi@0: import java.net.URL; aoqi@0: import java.net.URLConnection; aoqi@0: import java.util.Hashtable; aoqi@0: aoqi@0: /** aoqi@0: * This entity resolver class provides a number of utilities which can help aoqi@0: * managment of external parsed entities in XML. These are commonly used aoqi@0: * to hold markup declarations that are to be used as part of a Document aoqi@0: * Type Declaration (DTD), or to hold text marked up with XML. aoqi@0: *

aoqi@0: *

Features include:

aoqi@0: *

aoqi@0: *

Subclasses can perform tasks such as supporting new URI schemes for aoqi@0: * URIs which are not URLs, such as URNs (see RFC 2396) or for accessing aoqi@0: * MIME entities which are part of a multipart/related group aoqi@0: * (see RFC 2387). They may also be used to support particular catalog aoqi@0: * syntaxes, such as the aoqi@0: * SGML/Open Catalog (SOCAT) which supports the SGML notion of "Formal aoqi@0: * Public Identifiers (FPIs). aoqi@0: * aoqi@0: * @author David Brownell aoqi@0: * @author Janet Koenig aoqi@0: * @version 1.3 00/02/24 aoqi@0: */ aoqi@0: public class Resolver implements EntityResolver { aoqi@0: private boolean ignoringMIME; aoqi@0: aoqi@0: // table mapping public IDs to (local) URIs aoqi@0: private Hashtable id2uri; aoqi@0: aoqi@0: // tables mapping public IDs to resources and classloaders aoqi@0: private Hashtable id2resource; aoqi@0: private Hashtable id2loader; aoqi@0: aoqi@0: // aoqi@0: // table of MIME content types (less attributes!) known aoqi@0: // to be mostly "OK" to use with XML MIME entities. the aoqi@0: // idea is to rule out obvious braindamage ("image/jpg") aoqi@0: // not the subtle stuff ("text/html") that might actually aoqi@0: // be (or become) safe. aoqi@0: // aoqi@0: private static final String types [] = { aoqi@0: "application/xml", aoqi@0: "text/xml", aoqi@0: "text/plain", aoqi@0: "text/html", // commonly mis-inferred aoqi@0: "application/x-netcdf", // this is often illegal XML aoqi@0: "content/unknown" aoqi@0: }; aoqi@0: aoqi@0: /** aoqi@0: * Constructs a resolver. aoqi@0: */ aoqi@0: public Resolver() { aoqi@0: } aoqi@0: aoqi@0: /** aoqi@0: * Returns an input source, using the MIME type information and URL aoqi@0: * scheme to statically determine the correct character encoding if aoqi@0: * possible and otherwise autodetecting it. MIME carefully specifies aoqi@0: * the character encoding defaults, and how attributes of the content aoqi@0: * type can change it. XML further specifies two mandatory encodings aoqi@0: * (UTF-8 and UTF-16), and includes an XML declaration which can be aoqi@0: * used to internally label most documents encoded using US-ASCII aoqi@0: * supersets (such as Shift_JIS, EUC-JP, ISO-2022-*, ISO-8859-*, and aoqi@0: * more). aoqi@0: *

aoqi@0: *

This method can be used to access XML documents which do not aoqi@0: * have URIs (such as servlet input streams, or most JavaMail message aoqi@0: * entities) and to support access methods such as HTTP POST or PUT. aoqi@0: * (URLs normally return content using the GET method.) aoqi@0: *

aoqi@0: *

The caller should set the system ID in order for relative URIs aoqi@0: * found in this document to be interpreted correctly. In some cases, aoqi@0: * a custom resolver will need to be used; for example, documents aoqi@0: * may be grouped in a single MIME "multipart/related" bundle, and aoqi@0: * relative URLs would refer to other documents in that bundle. aoqi@0: * aoqi@0: * @param contentType The MIME content type for the source for which aoqi@0: * an InputSource is desired, such as text/xml;charset=utf-8. aoqi@0: * @param stream The input byte stream for the input source. aoqi@0: * @param checkType If true, this verifies that the content type is known aoqi@0: * to support XML documents, such as application/xml. aoqi@0: * @param scheme Unless this is "file", unspecified MIME types aoqi@0: * default to US-ASCII. Files are always autodetected since most aoqi@0: * file systems discard character encoding information. aoqi@0: */ aoqi@0: public static InputSource createInputSource(String contentType, aoqi@0: InputStream stream, aoqi@0: boolean checkType, aoqi@0: String scheme) throws IOException { aoqi@0: InputSource retval; aoqi@0: String charset = null; aoqi@0: aoqi@0: if (contentType != null) { aoqi@0: int index; aoqi@0: aoqi@0: contentType = contentType.toLowerCase(); aoqi@0: index = contentType.indexOf(';'); aoqi@0: if (index != -1) { aoqi@0: String attributes; aoqi@0: aoqi@0: attributes = contentType.substring(index + 1); aoqi@0: contentType = contentType.substring(0, index); aoqi@0: aoqi@0: // use "charset=..." if it's available aoqi@0: index = attributes.indexOf("charset"); aoqi@0: if (index != -1) { aoqi@0: attributes = attributes.substring(index + 7); aoqi@0: // strip out subsequent attributes aoqi@0: if ((index = attributes.indexOf(';')) != -1) aoqi@0: attributes = attributes.substring(0, index); aoqi@0: // find start of value aoqi@0: if ((index = attributes.indexOf('=')) != -1) { aoqi@0: attributes = attributes.substring(index + 1); aoqi@0: // strip out rfc822 comments aoqi@0: if ((index = attributes.indexOf('(')) != -1) aoqi@0: attributes = attributes.substring(0, index); aoqi@0: // double quotes are optional aoqi@0: if ((index = attributes.indexOf('"')) != -1) { aoqi@0: attributes = attributes.substring(index + 1); aoqi@0: attributes = attributes.substring(0, aoqi@0: attributes.indexOf('"')); aoqi@0: } aoqi@0: charset = attributes.trim(); aoqi@0: // XXX "\;", "\)" etc were mishandled above aoqi@0: } aoqi@0: } aoqi@0: } aoqi@0: aoqi@0: // aoqi@0: // Check MIME type. aoqi@0: // aoqi@0: if (checkType) { aoqi@0: boolean isOK = false; aoqi@0: for (int i = 0; i < types.length; i++) aoqi@0: if (types[i].equals(contentType)) { aoqi@0: isOK = true; aoqi@0: break; aoqi@0: } aoqi@0: if (!isOK) aoqi@0: throw new IOException("Not XML: " + contentType); aoqi@0: } aoqi@0: aoqi@0: // aoqi@0: // "text/*" MIME types have hard-wired character set aoqi@0: // defaults, as specified in the RFCs. For XML, we aoqi@0: // ignore the system "file.encoding" property since aoqi@0: // autodetection is more correct. aoqi@0: // aoqi@0: if (charset == null) { aoqi@0: contentType = contentType.trim(); aoqi@0: if (contentType.startsWith("text/")) { aoqi@0: if (!"file".equalsIgnoreCase(scheme)) aoqi@0: charset = "US-ASCII"; aoqi@0: } aoqi@0: // "application/*" has no default aoqi@0: } aoqi@0: } aoqi@0: aoqi@0: retval = new InputSource(XmlReader.createReader(stream, charset)); aoqi@0: retval.setByteStream(stream); aoqi@0: retval.setEncoding(charset); aoqi@0: return retval; aoqi@0: } aoqi@0: aoqi@0: aoqi@0: /** aoqi@0: * Creates an input source from a given URI. aoqi@0: * aoqi@0: * @param uri the URI (system ID) for the entity aoqi@0: * @param checkType if true, the MIME content type for the entity aoqi@0: * is checked for document type and character set encoding. aoqi@0: */ aoqi@0: static public InputSource createInputSource(URL uri, boolean checkType) aoqi@0: throws IOException { aoqi@0: aoqi@0: URLConnection conn = uri.openConnection(); aoqi@0: InputSource retval; aoqi@0: aoqi@0: if (checkType) { aoqi@0: String contentType = conn.getContentType(); aoqi@0: retval = createInputSource(contentType, conn.getInputStream(), aoqi@0: false, uri.getProtocol()); aoqi@0: } else { aoqi@0: retval = new InputSource(XmlReader.createReader(conn.getInputStream())); aoqi@0: } aoqi@0: retval.setSystemId(conn.getURL().toString()); aoqi@0: return retval; aoqi@0: } aoqi@0: aoqi@0: aoqi@0: /** aoqi@0: * Creates an input source from a given file, autodetecting aoqi@0: * the character encoding. aoqi@0: */ aoqi@0: static public InputSource createInputSource(File file) aoqi@0: throws IOException { aoqi@0: InputSource retval; aoqi@0: String path; aoqi@0: aoqi@0: retval = new InputSource(XmlReader.createReader(new FileInputStream(file))); aoqi@0: aoqi@0: // On JDK 1.2 and later, simplify this: aoqi@0: // "path = file.toURL ().toString ()". aoqi@0: path = file.getAbsolutePath(); aoqi@0: if (File.separatorChar != '/') aoqi@0: path = path.replace(File.separatorChar, '/'); aoqi@0: if (!path.startsWith("/")) aoqi@0: path = "/" + path; aoqi@0: if (!path.endsWith("/") && file.isDirectory()) aoqi@0: path = path + "/"; aoqi@0: aoqi@0: retval.setSystemId("file:" + path); aoqi@0: return retval; aoqi@0: } aoqi@0: aoqi@0: aoqi@0: /** aoqi@0: * SAX: aoqi@0: * Resolve the given entity into an input source. If the name can't aoqi@0: * be mapped to a preferred form of the entity, the URI is used. To aoqi@0: * resolve the entity, first a local catalog mapping names to URIs is aoqi@0: * consulted. If no mapping is found there, a catalog mapping names aoqi@0: * to java resources is consulted. Finally, if neither mapping found aoqi@0: * a copy of the entity, the specified URI is used. aoqi@0: *

aoqi@0: *

When a URI is used, aoqi@0: * createInputSource is used to correctly deduce the character aoqi@0: * encoding used by this entity. No MIME type checking is done. aoqi@0: * aoqi@0: * @param name Used to find alternate copies of the entity, when aoqi@0: * this value is non-null; this is the XML "public ID". aoqi@0: * @param uri Used when no alternate copy of the entity is found; aoqi@0: * this is the XML "system ID", normally a URI. aoqi@0: */ aoqi@0: public InputSource resolveEntity(String name, String uri) aoqi@0: throws IOException { aoqi@0: InputSource retval; aoqi@0: String mappedURI = name2uri(name); aoqi@0: InputStream stream; aoqi@0: aoqi@0: // prefer explicit URI mappings, then bundled resources... aoqi@0: if (mappedURI == null && (stream = mapResource(name)) != null) { aoqi@0: uri = "java:resource:" + (String) id2resource.get(name); aoqi@0: retval = new InputSource(XmlReader.createReader(stream)); aoqi@0: aoqi@0: // ...and treat all URIs the same (as URLs for now). aoqi@0: } else { aoqi@0: URL url; aoqi@0: URLConnection conn; aoqi@0: aoqi@0: if (mappedURI != null) aoqi@0: uri = mappedURI; aoqi@0: else if (uri == null) aoqi@0: return null; aoqi@0: aoqi@0: url = new URL(uri); aoqi@0: conn = url.openConnection(); aoqi@0: uri = conn.getURL().toString(); aoqi@0: // System.out.println ("++ URI: " + url); aoqi@0: if (ignoringMIME) aoqi@0: retval = new InputSource(XmlReader.createReader(conn.getInputStream())); aoqi@0: else { aoqi@0: String contentType = conn.getContentType(); aoqi@0: retval = createInputSource(contentType, aoqi@0: conn.getInputStream(), aoqi@0: false, url.getProtocol()); aoqi@0: } aoqi@0: } aoqi@0: retval.setSystemId(uri); aoqi@0: retval.setPublicId(name); aoqi@0: return retval; aoqi@0: } aoqi@0: aoqi@0: aoqi@0: /** aoqi@0: * Returns true if this resolver is ignoring MIME types in the documents aoqi@0: * it returns, to work around bugs in how servers have reported the aoqi@0: * documents' MIME types. aoqi@0: */ aoqi@0: public boolean isIgnoringMIME() { aoqi@0: return ignoringMIME; aoqi@0: } aoqi@0: aoqi@0: /** aoqi@0: * Tells the resolver whether to ignore MIME types in the documents it aoqi@0: * retrieves. Many web servers incorrectly assign text documents a aoqi@0: * default character encoding, even when that is incorrect. For example, aoqi@0: * all HTTP text documents default to use ISO-8859-1 (used for Western aoqi@0: * European languages), and other MIME sources default text documents aoqi@0: * to use US-ASCII (a seven bit encoding). For XML documents which aoqi@0: * include text encoding declarations (as most should do), these server aoqi@0: * bugs can be worked around by ignoring the MIME type entirely. aoqi@0: */ aoqi@0: public void setIgnoringMIME(boolean value) { aoqi@0: ignoringMIME = value; aoqi@0: } aoqi@0: aoqi@0: aoqi@0: // maps the public ID to an alternate URI, if one is registered aoqi@0: private String name2uri(String publicId) { aoqi@0: if (publicId == null || id2uri == null) aoqi@0: return null; aoqi@0: return (String) id2uri.get(publicId); aoqi@0: } aoqi@0: aoqi@0: aoqi@0: /** aoqi@0: * Registers the given public ID as corresponding to a particular aoqi@0: * URI, typically a local copy. This URI will be used in preference aoqi@0: * to ones provided as system IDs in XML entity declarations. This aoqi@0: * mechanism would most typically be used for Document Type Definitions aoqi@0: * (DTDs), where the public IDs are formally managed and versioned. aoqi@0: * aoqi@0: * @param publicId The managed public ID being mapped aoqi@0: * @param uri The URI of the preferred copy of that entity aoqi@0: */ aoqi@0: public void registerCatalogEntry(String publicId, aoqi@0: String uri) { aoqi@0: if (id2uri == null) aoqi@0: id2uri = new Hashtable(17); aoqi@0: id2uri.put(publicId, uri); aoqi@0: } aoqi@0: aoqi@0: aoqi@0: // return the resource as a stream aoqi@0: private InputStream mapResource(String publicId) { aoqi@0: // System.out.println ("++ PUBLIC: " + publicId); aoqi@0: if (publicId == null || id2resource == null) aoqi@0: return null; aoqi@0: aoqi@0: String resourceName = (String) id2resource.get(publicId); aoqi@0: ClassLoader loader = null; aoqi@0: aoqi@0: if (resourceName == null) aoqi@0: return null; aoqi@0: // System.out.println ("++ Resource: " + resourceName); aoqi@0: aoqi@0: if (id2loader != null) aoqi@0: loader = (ClassLoader) id2loader.get(publicId); aoqi@0: // System.out.println ("++ Loader: " + loader); aoqi@0: if (loader == null) aoqi@0: return ClassLoader.getSystemResourceAsStream(resourceName); aoqi@0: return loader.getResourceAsStream(resourceName); aoqi@0: } aoqi@0: aoqi@0: /** aoqi@0: * Registers a given public ID as corresponding to a particular Java aoqi@0: * resource in a given class loader, typically distributed with a aoqi@0: * software package. This resource will be preferred over system IDs aoqi@0: * included in XML documents. This mechanism should most typically be aoqi@0: * used for Document Type Definitions (DTDs), where the public IDs are aoqi@0: * formally managed and versioned. aoqi@0: *

aoqi@0: *

If a mapping to a URI has been provided, that mapping takes aoqi@0: * precedence over this one. aoqi@0: * aoqi@0: * @param publicId The managed public ID being mapped aoqi@0: * @param resourceName The name of the Java resource aoqi@0: * @param loader The class loader holding the resource, or null if aoqi@0: * it is a system resource. aoqi@0: */ aoqi@0: public void registerCatalogEntry(String publicId, aoqi@0: String resourceName, aoqi@0: ClassLoader loader) { aoqi@0: if (id2resource == null) aoqi@0: id2resource = new Hashtable(17); aoqi@0: id2resource.put(publicId, resourceName); aoqi@0: aoqi@0: if (loader != null) { aoqi@0: if (id2loader == null) aoqi@0: id2loader = new Hashtable(17); aoqi@0: id2loader.put(publicId, loader); aoqi@0: } aoqi@0: } aoqi@0: }