aoqi@0: /* aoqi@0: * Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved. aoqi@0: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. aoqi@0: * aoqi@0: * This code is free software; you can redistribute it and/or modify it aoqi@0: * under the terms of the GNU General Public License version 2 only, as aoqi@0: * published by the Free Software Foundation. Oracle designates this aoqi@0: * particular file as subject to the "Classpath" exception as provided aoqi@0: * by Oracle in the LICENSE file that accompanied this code. aoqi@0: * aoqi@0: * This code is distributed in the hope that it will be useful, but WITHOUT aoqi@0: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or aoqi@0: * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License aoqi@0: * version 2 for more details (a copy is included in the LICENSE file that aoqi@0: * accompanied this code). aoqi@0: * aoqi@0: * You should have received a copy of the GNU General Public License version aoqi@0: * 2 along with this work; if not, write to the Free Software Foundation, aoqi@0: * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. aoqi@0: * aoqi@0: * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA aoqi@0: * or visit www.oracle.com if you need additional information or have any aoqi@0: * questions. aoqi@0: */ aoqi@0: aoqi@0: package com.sun.xml.internal.dtdparser; aoqi@0: aoqi@0: aoqi@0: /** aoqi@0: * Methods in this class are used to determine whether characters may aoqi@0: * appear in certain roles in XML documents. Such methods are used aoqi@0: * both to parse and to create such documents. aoqi@0: * aoqi@0: * @author David Brownell aoqi@0: * @version 1.1, 00/08/05 aoqi@0: */ aoqi@0: public class XmlChars { aoqi@0: // can't construct instances aoqi@0: private XmlChars() { aoqi@0: } aoqi@0: aoqi@0: /** aoqi@0: * Returns true if the argument, a UCS-4 character code, is valid in aoqi@0: * XML documents. Unicode characters fit into the low sixteen aoqi@0: * bits of a UCS-4 character, and pairs of Unicode surrogate aoqi@0: * characters can be combined to encode UCS-4 characters in aoqi@0: * documents containing only Unicode. (The char datatype aoqi@0: * in the Java Programming Language represents Unicode characters, aoqi@0: * including unpaired surrogates.) aoqi@0: *

aoqi@0: *

In XML, UCS-4 characters can also be encoded by the use of aoqi@0: * character references such as &#x12345678;, which aoqi@0: * happens to refer to a character that is disallowed in XML documents. aoqi@0: * UCS-4 characters allowed in XML documents can be expressed with aoqi@0: * one or two Unicode characters. aoqi@0: * aoqi@0: * @param ucs4char The 32-bit UCS-4 character being tested. aoqi@0: */ aoqi@0: static public boolean isChar(int ucs4char) { aoqi@0: // [2] Char ::= #x0009 | #x000A | #x000D aoqi@0: // | [#x0020-#xD7FF] aoqi@0: // ... surrogates excluded! aoqi@0: // | [#xE000-#xFFFD] aoqi@0: // | [#x10000-#x10ffff] aoqi@0: return ((ucs4char >= 0x0020 && ucs4char <= 0xD7FF) aoqi@0: || ucs4char == 0x000A || ucs4char == 0x0009 aoqi@0: || ucs4char == 0x000D aoqi@0: || (ucs4char >= 0xE000 && ucs4char <= 0xFFFD) aoqi@0: || (ucs4char >= 0x10000 && ucs4char <= 0x10ffff)); aoqi@0: } aoqi@0: aoqi@0: /** aoqi@0: * Returns true if the character is allowed to be a non-initial aoqi@0: * character in names according to the XML recommendation. aoqi@0: * aoqi@0: * @see #isNCNameChar(char) aoqi@0: * @see #isLetter(char) aoqi@0: */ aoqi@0: public static boolean isNameChar(char c) { aoqi@0: // [4] NameChar ::= Letter | Digit | '.' | '_' | ':' aoqi@0: // | CombiningChar | Extender aoqi@0: aoqi@0: if (isLetter2(c)) aoqi@0: return true; aoqi@0: else if (c == '>') aoqi@0: return false; aoqi@0: else if (c == '.' || c == '-' || c == '_' || c == ':' aoqi@0: || isExtender(c)) aoqi@0: return true; aoqi@0: else aoqi@0: return false; aoqi@0: } aoqi@0: aoqi@0: /** aoqi@0: * Returns true if the character is allowed to be a non-initial aoqi@0: * character in unscoped names according to the rules of the XML aoqi@0: * Namespaces proposed recommendation. Except for precluding aoqi@0: * the colon (used to separate names from their scopes) these aoqi@0: * characters are just as allowed by the XML recommendation. aoqi@0: * aoqi@0: * @see #isNameChar(char) aoqi@0: * @see #isLetter(char) aoqi@0: */ aoqi@0: public static boolean isNCNameChar(char c) { aoqi@0: // [NC 5] NCNameChar ::= Letter | Digit | '.' | '_' aoqi@0: // | CombiningChar | Extender aoqi@0: return c != ':' && isNameChar(c); aoqi@0: } aoqi@0: aoqi@0: /** aoqi@0: * Returns true if the character is allowed where XML supports aoqi@0: * whitespace characters, false otherwise. aoqi@0: */ aoqi@0: public static boolean isSpace(char c) { aoqi@0: return c == ' ' || c == '\t' || c == '\n' || c == '\r'; aoqi@0: } aoqi@0: aoqi@0: aoqi@0: /* aoqi@0: * NOTE: java.lang.Character.getType() values are: aoqi@0: * aoqi@0: * UNASSIGNED = 0, aoqi@0: * aoqi@0: * UPPERCASE_LETTER = 1, // Lu aoqi@0: * LOWERCASE_LETTER = 2, // Ll aoqi@0: * TITLECASE_LETTER = 3, // Lt aoqi@0: * MODIFIER_LETTER = 4, // Lm aoqi@0: * OTHER_LETTER = 5, // Lo aoqi@0: * NON_SPACING_MARK = 6, // Mn aoqi@0: * ENCLOSING_MARK = 7, // Me aoqi@0: * COMBINING_SPACING_MARK = 8, // Mc aoqi@0: * DECIMAL_DIGIT_NUMBER = 9, // Nd aoqi@0: * LETTER_NUMBER = 10, // Nl aoqi@0: * OTHER_NUMBER = 11, // No aoqi@0: * SPACE_SEPARATOR = 12, // Zs aoqi@0: * LINE_SEPARATOR = 13, // Zl aoqi@0: * PARAGRAPH_SEPARATOR = 14, // Zp aoqi@0: * CONTROL = 15, // Cc aoqi@0: * FORMAT = 16, // Cf aoqi@0: * // 17 reserved for proposed Ci category aoqi@0: * PRIVATE_USE = 18, // Co aoqi@0: * SURROGATE = 19, // Cs aoqi@0: * DASH_PUNCTUATION = 20, // Pd aoqi@0: * START_PUNCTUATION = 21, // Ps aoqi@0: * END_PUNCTUATION = 22, // Pe aoqi@0: * CONNECTOR_PUNCTUATION = 23, // Pc aoqi@0: * OTHER_PUNCTUATION = 24, // Po aoqi@0: * MATH_SYMBOL = 25, // Sm aoqi@0: * CURRENCY_SYMBOL = 26, // Sc aoqi@0: * MODIFIER_SYMBOL = 27, // Sk aoqi@0: * OTHER_SYMBOL = 28; // So aoqi@0: */ aoqi@0: aoqi@0: /** aoqi@0: * Returns true if the character is an XML "letter". XML Names must aoqi@0: * start with Letters or a few other characters, but other characters aoqi@0: * in names must only satisfy the isNameChar predicate. aoqi@0: * aoqi@0: * @see #isNameChar(char) aoqi@0: * @see #isNCNameChar(char) aoqi@0: */ aoqi@0: public static boolean isLetter(char c) { aoqi@0: // [84] Letter ::= BaseChar | Ideographic aoqi@0: // [85] BaseChar ::= ... too much to repeat aoqi@0: // [86] Ideographic ::= ... too much to repeat aoqi@0: aoqi@0: // aoqi@0: // Optimize the typical case. aoqi@0: // aoqi@0: if (c >= 'a' && c <= 'z') aoqi@0: return true; aoqi@0: if (c == '/') aoqi@0: return false; aoqi@0: if (c >= 'A' && c <= 'Z') aoqi@0: return true; aoqi@0: aoqi@0: // aoqi@0: // Since the tables are too ridiculous to use in code, aoqi@0: // we're using the footnotes here to drive this test. aoqi@0: // aoqi@0: switch (Character.getType(c)) { aoqi@0: // app. B footnote says these are 'name start' aoqi@0: // chars' ... aoqi@0: case Character.LOWERCASE_LETTER: // Ll aoqi@0: case Character.UPPERCASE_LETTER: // Lu aoqi@0: case Character.OTHER_LETTER: // Lo aoqi@0: case Character.TITLECASE_LETTER: // Lt aoqi@0: case Character.LETTER_NUMBER: // Nl aoqi@0: aoqi@0: // OK, here we just have some exceptions to check... aoqi@0: return !isCompatibilityChar(c) aoqi@0: // per "5.14 of Unicode", rule out some combiners aoqi@0: && !(c >= 0x20dd && c <= 0x20e0); aoqi@0: aoqi@0: default: aoqi@0: // check for some exceptions: these are "alphabetic" aoqi@0: return ((c >= 0x02bb && c <= 0x02c1) aoqi@0: || c == 0x0559 || c == 0x06e5 || c == 0x06e6); aoqi@0: } aoqi@0: } aoqi@0: aoqi@0: // aoqi@0: // XML 1.0 discourages "compatibility" characters in names; these aoqi@0: // were defined to permit passing through some information stored in aoqi@0: // older non-Unicode character sets. These always have alternative aoqi@0: // representations in Unicode, e.g. using combining chars. aoqi@0: // aoqi@0: private static boolean isCompatibilityChar(char c) { aoqi@0: // the numerous comparisions here seem unavoidable, aoqi@0: // but the switch can reduce the number which must aoqi@0: // actually be executed. aoqi@0: aoqi@0: switch ((c >> 8) & 0x0ff) { aoqi@0: case 0x00: aoqi@0: // ISO Latin/1 has a few compatibility characters aoqi@0: return c == 0x00aa || c == 0x00b5 || c == 0x00ba; aoqi@0: aoqi@0: case 0x01: aoqi@0: // as do Latin Extended A and (parts of) B aoqi@0: return (c >= 0x0132 && c <= 0x0133) aoqi@0: || (c >= 0x013f && c <= 0x0140) aoqi@0: || c == 0x0149 aoqi@0: || c == 0x017f aoqi@0: || (c >= 0x01c4 && c <= 0x01cc) aoqi@0: || (c >= 0x01f1 && c <= 0x01f3); aoqi@0: aoqi@0: case 0x02: aoqi@0: // some spacing modifiers aoqi@0: return (c >= 0x02b0 && c <= 0x02b8) aoqi@0: || (c >= 0x02e0 && c <= 0x02e4); aoqi@0: aoqi@0: case 0x03: aoqi@0: return c == 0x037a; // Greek aoqi@0: aoqi@0: case 0x05: aoqi@0: return c == 0x0587; // Armenian aoqi@0: aoqi@0: case 0x0e: aoqi@0: return c >= 0x0edc && c <= 0x0edd; // Laotian aoqi@0: aoqi@0: case 0x11: aoqi@0: // big chunks of Hangul Jamo are all "compatibility" aoqi@0: return c == 0x1101 aoqi@0: || c == 0x1104 aoqi@0: || c == 0x1108 aoqi@0: || c == 0x110a aoqi@0: || c == 0x110d aoqi@0: || (c >= 0x1113 && c <= 0x113b) aoqi@0: || c == 0x113d aoqi@0: || c == 0x113f aoqi@0: || (c >= 0x1141 && c <= 0x114b) aoqi@0: || c == 0x114d aoqi@0: || c == 0x114f aoqi@0: || (c >= 0x1151 && c <= 0x1153) aoqi@0: || (c >= 0x1156 && c <= 0x1158) aoqi@0: || c == 0x1162 aoqi@0: || c == 0x1164 aoqi@0: || c == 0x1166 aoqi@0: || c == 0x1168 aoqi@0: || (c >= 0x116a && c <= 0x116c) aoqi@0: || (c >= 0x116f && c <= 0x1171) aoqi@0: || c == 0x1174 aoqi@0: || (c >= 0x1176 && c <= 0x119d) aoqi@0: || (c >= 0x119f && c <= 0x11a2) aoqi@0: || (c >= 0x11a9 && c <= 0x11aa) aoqi@0: || (c >= 0x11ac && c <= 0x11ad) aoqi@0: || (c >= 0x11b0 && c <= 0x11b6) aoqi@0: || c == 0x11b9 aoqi@0: || c == 0x11bb aoqi@0: || (c >= 0x11c3 && c <= 0x11ea) aoqi@0: || (c >= 0x11ec && c <= 0x11ef) aoqi@0: || (c >= 0x11f1 && c <= 0x11f8) aoqi@0: ; aoqi@0: aoqi@0: case 0x20: aoqi@0: return c == 0x207f; // superscript aoqi@0: aoqi@0: case 0x21: aoqi@0: return aoqi@0: // various letterlike symbols aoqi@0: c == 0x2102 aoqi@0: || c == 0x2107 aoqi@0: || (c >= 0x210a && c <= 0x2113) aoqi@0: || c == 0x2115 aoqi@0: || (c >= 0x2118 && c <= 0x211d) aoqi@0: || c == 0x2124 aoqi@0: || c == 0x2128 aoqi@0: || (c >= 0x212c && c <= 0x212d) aoqi@0: || (c >= 0x212f && c <= 0x2138) aoqi@0: aoqi@0: // most Roman numerals (less 1K, 5K, 10K) aoqi@0: || (c >= 0x2160 && c <= 0x217f) aoqi@0: ; aoqi@0: aoqi@0: case 0x30: aoqi@0: // some Hiragana aoqi@0: return c >= 0x309b && c <= 0x309c; aoqi@0: aoqi@0: case 0x31: aoqi@0: // all Hangul Compatibility Jamo aoqi@0: return c >= 0x3131 && c <= 0x318e; aoqi@0: aoqi@0: case 0xf9: aoqi@0: case 0xfa: aoqi@0: case 0xfb: aoqi@0: case 0xfc: aoqi@0: case 0xfd: aoqi@0: case 0xfe: aoqi@0: case 0xff: aoqi@0: // the whole "compatibility" area is for that purpose! aoqi@0: return true; aoqi@0: aoqi@0: default: aoqi@0: // most of Unicode isn't flagged as being for compatibility aoqi@0: return false; aoqi@0: } aoqi@0: } aoqi@0: aoqi@0: // guts of isNameChar/isNCNameChar aoqi@0: private static boolean isLetter2(char c) { aoqi@0: // [84] Letter ::= BaseChar | Ideographic aoqi@0: // [85] BaseChar ::= ... too much to repeat aoqi@0: // [86] Ideographic ::= ... too much to repeat aoqi@0: // [87] CombiningChar ::= ... too much to repeat aoqi@0: aoqi@0: // aoqi@0: // Optimize the typical case. aoqi@0: // aoqi@0: if (c >= 'a' && c <= 'z') aoqi@0: return true; aoqi@0: if (c == '>') aoqi@0: return false; aoqi@0: if (c >= 'A' && c <= 'Z') aoqi@0: return true; aoqi@0: aoqi@0: // aoqi@0: // Since the tables are too ridiculous to use in code, aoqi@0: // we're using the footnotes here to drive this test. aoqi@0: // aoqi@0: switch (Character.getType(c)) { aoqi@0: // app. B footnote says these are 'name start' aoqi@0: // chars' ... aoqi@0: case Character.LOWERCASE_LETTER: // Ll aoqi@0: case Character.UPPERCASE_LETTER: // Lu aoqi@0: case Character.OTHER_LETTER: // Lo aoqi@0: case Character.TITLECASE_LETTER: // Lt aoqi@0: case Character.LETTER_NUMBER: // Nl aoqi@0: // ... and these are name characters 'other aoqi@0: // than name start characters' aoqi@0: case Character.COMBINING_SPACING_MARK: // Mc aoqi@0: case Character.ENCLOSING_MARK: // Me aoqi@0: case Character.NON_SPACING_MARK: // Mn aoqi@0: case Character.MODIFIER_LETTER: // Lm aoqi@0: case Character.DECIMAL_DIGIT_NUMBER: // Nd aoqi@0: aoqi@0: // OK, here we just have some exceptions to check... aoqi@0: return !isCompatibilityChar(c) aoqi@0: // per "5.14 of Unicode", rule out some combiners aoqi@0: && !(c >= 0x20dd && c <= 0x20e0); aoqi@0: aoqi@0: default: aoqi@0: // added a character ... aoqi@0: return c == 0x0387; aoqi@0: } aoqi@0: } aoqi@0: aoqi@0: private static boolean isDigit(char c) { aoqi@0: // [88] Digit ::= ... aoqi@0: aoqi@0: // aoqi@0: // java.lang.Character.isDigit is correct from the XML point aoqi@0: // of view except that it allows "fullwidth" digits. aoqi@0: // aoqi@0: return Character.isDigit(c) aoqi@0: && !((c >= 0xff10) && (c <= 0xff19)); aoqi@0: } aoqi@0: aoqi@0: private static boolean isExtender(char c) { aoqi@0: // [89] Extender ::= ... aoqi@0: return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387 aoqi@0: || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 aoqi@0: || c == 0x3005 || (c >= 0x3031 && c <= 0x3035) aoqi@0: || (c >= 0x309d && c <= 0x309e) aoqi@0: || (c >= 0x30fc && c <= 0x30fe) aoqi@0: ; aoqi@0: } aoqi@0: }