/*
 * Copyright (c) 1998, 2011, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package com.sun.xml.internal.dtdparser;


/**
 * Methods in this class are used to determine whether characters may
 * appear in certain roles in XML documents.  Such methods are used
 * both to parse and to create such documents.
/**
 * Static predicates that classify characters by the role they may play in
 * an XML document: legal document character, name character, letter, or
 * whitespace.  The same tests serve both for parsing documents and for
 * generating them.
 *
 * <p>The class holds no state and cannot be instantiated.
 *
 * @author David Brownell
 * @version 1.1, 00/08/05
 */
public final class XmlChars {

    // can't construct instances
    private XmlChars() {
    }

    /**
     * Returns true if the argument, a UCS-4 character code, is valid in
     * XML documents.  Unicode characters fit into the low sixteen
     * bits of a UCS-4 character, and pairs of Unicode <em>surrogate
     * characters</em> can be combined to encode UCS-4 characters in
     * documents containing only Unicode.  (The {@code char} datatype
     * in the Java Programming Language represents Unicode characters,
     * including unpaired surrogates.)
     *
     * <p>In XML, UCS-4 characters can also be encoded by the use of
     * <em>character references</em> such as {@code &#x12345678;}, which
     * happens to refer to a character that is disallowed in XML documents.
     * UCS-4 characters allowed in XML documents can be expressed with
     * one or two Unicode characters.
     *
     * @param ucs4char The 32-bit UCS-4 character being tested.
     * @return true if the code point matches production [2] (Char) below;
     *         false otherwise, including for negative values
     */
    public static boolean isChar(int ucs4char) {
        // [2] Char ::= #x0009 | #x000A | #x000D
        //            | [#x0020-#xD7FF]
        //      ... surrogates excluded!
        //            | [#xE000-#xFFFD]
        //            | [#x10000-#x10ffff]
        return (ucs4char >= 0x0020 && ucs4char <= 0xD7FF)
                || ucs4char == 0x000A
                || ucs4char == 0x0009
                || ucs4char == 0x000D
                || (ucs4char >= 0xE000 && ucs4char <= 0xFFFD)
                || (ucs4char >= 0x10000 && ucs4char <= 0x10ffff);
    }

    /**
     * Returns true if the character is allowed to be a non-initial
     * character in names according to the XML recommendation.
     *
     * @param c the character being tested
     * @return true if {@code c} may appear after the first character of a name
     * @see #isNCNameChar(char)
     * @see #isLetter(char)
     */
    public static boolean isNameChar(char c) {
        // [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':'
        //                | CombiningChar | Extender

        // Letters, digits and combining characters are all handled by
        // isLetter2 (its category switch includes Nd, Mc, Me, Mn and Lm).
        if (isLetter2(c)) {
            return true;
        }
        // Cheap early rejection; '>' satisfies none of the tests below.
        if (c == '>') {
            return false;
        }
        return c == '.' || c == '-' || c == '_' || c == ':'
                || isExtender(c);
    }

    /**
     * Returns true if the character is allowed to be a non-initial
     * character in unscoped names according to the rules of the XML
     * Namespaces proposed recommendation.  Except for precluding
     * the colon (used to separate names from their scopes) these
     * characters are just as allowed by the XML recommendation.
     *
     * @param c the character being tested
     * @return true if {@code c} is not a colon and satisfies
     *         {@link #isNameChar(char)}
     * @see #isNameChar(char)
     * @see #isLetter(char)
     */
    public static boolean isNCNameChar(char c) {
        // [NC 5] NCNameChar ::= Letter | Digit | '.' | '-' | '_'
        //                     | CombiningChar | Extender
        return c != ':' && isNameChar(c);
    }

    /**
     * Returns true if the character is allowed where XML supports
     * whitespace characters, false otherwise.
     *
     * @param c the character being tested
     * @return true for space, tab, line feed and carriage return only
     */
    public static boolean isSpace(char c) {
        return c == ' ' || c == '\t' || c == '\n' || c == '\r';
    }


    /*
     * NOTE:  java.lang.Character.getType() values are:
     *
     * UNASSIGNED                    = 0,
     *
     * UPPERCASE_LETTER            = 1,    // Lu
     * LOWERCASE_LETTER            = 2,    // Ll
     * TITLECASE_LETTER            = 3,    // Lt
     * MODIFIER_LETTER             = 4,    // Lm
     * OTHER_LETTER                = 5,    // Lo
     * NON_SPACING_MARK            = 6,    // Mn
     * ENCLOSING_MARK              = 7,    // Me
     * COMBINING_SPACING_MARK      = 8,    // Mc
     * DECIMAL_DIGIT_NUMBER        = 9,    // Nd
     * LETTER_NUMBER               = 10,   // Nl
     * OTHER_NUMBER                = 11,   // No
     * SPACE_SEPARATOR             = 12,   // Zs
     * LINE_SEPARATOR              = 13,   // Zl
     * PARAGRAPH_SEPARATOR         = 14,   // Zp
     * CONTROL                     = 15,   // Cc
     * FORMAT                      = 16,   // Cf
     *                         // 17 reserved for proposed Ci category
     * PRIVATE_USE                 = 18,   // Co
     * SURROGATE                   = 19,   // Cs
     * DASH_PUNCTUATION            = 20,   // Pd
     * START_PUNCTUATION           = 21,   // Ps
     * END_PUNCTUATION             = 22,   // Pe
     * CONNECTOR_PUNCTUATION       = 23,   // Pc
     * OTHER_PUNCTUATION           = 24,   // Po
     * MATH_SYMBOL                 = 25,   // Sm
     * CURRENCY_SYMBOL             = 26,   // Sc
     * MODIFIER_SYMBOL             = 27,   // Sk
     * OTHER_SYMBOL                = 28;   // So
     */

    /**
     * Returns true if the character is an XML "letter".  XML Names must
     * start with Letters or a few other characters, but other characters
     * in names must only satisfy the <em>isNameChar</em> predicate.
     *
     * @param c the character being tested
     * @return true if {@code c} is treated as a Letter by the tests below
     * @see #isNameChar(char)
     * @see #isNCNameChar(char)
     */
    public static boolean isLetter(char c) {
        // [84] Letter ::= BaseChar | Ideographic
        // [85] BaseChar ::= ... too much to repeat
        // [86] Ideographic ::= ... too much to repeat

        //
        // Optimize the typical case.
        //
        if (c >= 'a' && c <= 'z') {
            return true;
        }
        if (c == '/') {
            return false;
        }
        if (c >= 'A' && c <= 'Z') {
            return true;
        }

        //
        // Since the tables are too ridiculous to use in code,
        // we're using the footnotes here to drive this test.
        //
        switch (Character.getType(c)) {
            // app. B footnote says these are 'name start'
            // chars' ...
            case Character.LOWERCASE_LETTER:        // Ll
            case Character.UPPERCASE_LETTER:        // Lu
            case Character.OTHER_LETTER:            // Lo
            case Character.TITLECASE_LETTER:        // Lt
            case Character.LETTER_NUMBER:           // Nl
                // OK, here we just have some exceptions to check...
                return !isExcludedFromNames(c);

            default:
                // check for some exceptions:  these are "alphabetic"
                return (c >= 0x02bb && c <= 0x02c1)
                        || c == 0x0559 || c == 0x06e5 || c == 0x06e6;
        }
    }

    /**
     * Exception test shared by {@link #isLetter(char)} and
     * {@code isLetter2}: a character whose Unicode category would
     * otherwise qualify is still rejected when this returns true.
     *
     * @param c the character being tested
     * @return true for compatibility characters and for the combiners
     *         in the range 0x20dd-0x20e0
     */
    private static boolean isExcludedFromNames(char c) {
        return isCompatibilityChar(c)
                // per "5.14 of Unicode", rule out some combiners
                || (c >= 0x20dd && c <= 0x20e0);
    }

    //
    // XML 1.0 discourages "compatibility" characters in names; these
    // were defined to permit passing through some information stored in
    // older non-Unicode character sets.  These always have alternative
    // representations in Unicode, e.g. using combining chars.
    //
    private static boolean isCompatibilityChar(char c) {
        // the numerous comparisons here seem unavoidable,
        // but the switch can reduce the number which must
        // actually be executed.

        // Dispatch on the high byte of the character.
        switch ((c >> 8) & 0x0ff) {
            case 0x00:
                // ISO Latin/1 has a few compatibility characters
                return c == 0x00aa || c == 0x00b5 || c == 0x00ba;

            case 0x01:
                // as do Latin Extended A and (parts of) B
                return (c >= 0x0132 && c <= 0x0133)
                        || (c >= 0x013f && c <= 0x0140)
                        || c == 0x0149
                        || c == 0x017f
                        || (c >= 0x01c4 && c <= 0x01cc)
                        || (c >= 0x01f1 && c <= 0x01f3);

            case 0x02:
                // some spacing modifiers
                return (c >= 0x02b0 && c <= 0x02b8)
                        || (c >= 0x02e0 && c <= 0x02e4);

            case 0x03:
                return c == 0x037a;                     // Greek

            case 0x05:
                return c == 0x0587;                     // Armenian

            case 0x0e:
                return c >= 0x0edc && c <= 0x0edd;      // Laotian

            case 0x11:
                // big chunks of Hangul Jamo are all "compatibility"
                return c == 0x1101
                        || c == 0x1104
                        || c == 0x1108
                        || c == 0x110a
                        || c == 0x110d
                        || (c >= 0x1113 && c <= 0x113b)
                        || c == 0x113d
                        || c == 0x113f
                        || (c >= 0x1141 && c <= 0x114b)
                        || c == 0x114d
                        || c == 0x114f
                        || (c >= 0x1151 && c <= 0x1153)
                        || (c >= 0x1156 && c <= 0x1158)
                        || c == 0x1162
                        || c == 0x1164
                        || c == 0x1166
                        || c == 0x1168
                        || (c >= 0x116a && c <= 0x116c)
                        || (c >= 0x116f && c <= 0x1171)
                        || c == 0x1174
                        || (c >= 0x1176 && c <= 0x119d)
                        || (c >= 0x119f && c <= 0x11a2)
                        || (c >= 0x11a9 && c <= 0x11aa)
                        || (c >= 0x11ac && c <= 0x11ad)
                        || (c >= 0x11b0 && c <= 0x11b6)
                        || c == 0x11b9
                        || c == 0x11bb
                        || (c >= 0x11c3 && c <= 0x11ea)
                        || (c >= 0x11ec && c <= 0x11ef)
                        || (c >= 0x11f1 && c <= 0x11f8);

            case 0x20:
                return c == 0x207f;                     // superscript

            case 0x21:
                return
                        // various letterlike symbols
                        c == 0x2102
                        || c == 0x2107
                        || (c >= 0x210a && c <= 0x2113)
                        || c == 0x2115
                        || (c >= 0x2118 && c <= 0x211d)
                        || c == 0x2124
                        || c == 0x2128
                        || (c >= 0x212c && c <= 0x212d)
                        || (c >= 0x212f && c <= 0x2138)

                        // most Roman numerals (less 1K, 5K, 10K)
                        || (c >= 0x2160 && c <= 0x217f);

            case 0x30:
                // some Hiragana
                return c >= 0x309b && c <= 0x309c;

            case 0x31:
                // all Hangul Compatibility Jamo
                return c >= 0x3131 && c <= 0x318e;

            case 0xf9:
            case 0xfa:
            case 0xfb:
            case 0xfc:
            case 0xfd:
            case 0xfe:
            case 0xff:
                // the whole "compatibility" area is for that purpose!
                return true;

            default:
                // most of Unicode isn't flagged as being for compatibility
                return false;
        }
    }

    // guts of isNameChar/isNCNameChar
    private static boolean isLetter2(char c) {
        // [84] Letter ::= BaseChar | Ideographic
        // [85] BaseChar ::= ... too much to repeat
        // [86] Ideographic ::= ... too much to repeat
        // [87] CombiningChar ::= ... too much to repeat

        //
        // Optimize the typical case.
        //
        if (c >= 'a' && c <= 'z') {
            return true;
        }
        if (c == '>') {
            return false;
        }
        if (c >= 'A' && c <= 'Z') {
            return true;
        }

        //
        // Since the tables are too ridiculous to use in code,
        // we're using the footnotes here to drive this test.
        //
        switch (Character.getType(c)) {
            // app. B footnote says these are 'name start'
            // chars' ...
            case Character.LOWERCASE_LETTER:        // Ll
            case Character.UPPERCASE_LETTER:        // Lu
            case Character.OTHER_LETTER:            // Lo
            case Character.TITLECASE_LETTER:        // Lt
            case Character.LETTER_NUMBER:           // Nl
            // ... and these are name characters 'other
            // than name start characters'
            case Character.COMBINING_SPACING_MARK:  // Mc
            case Character.ENCLOSING_MARK:          // Me
            case Character.NON_SPACING_MARK:        // Mn
            case Character.MODIFIER_LETTER:         // Lm
            case Character.DECIMAL_DIGIT_NUMBER:    // Nd
                // OK, here we just have some exceptions to check...
                return !isExcludedFromNames(c);

            default:
                // added a character ...
                return c == 0x0387;
        }
    }

    // Not referenced by any other method of this class; digits reach
    // isNameChar through the Nd case of isLetter2 instead.
    private static boolean isDigit(char c) {
        // [88] Digit ::= ...

        //
        // java.lang.Character.isDigit is correct from the XML point
        // of view except that it allows "fullwidth" digits.
        //
        return Character.isDigit(c)
                && !((c >= 0xff10) && (c <= 0xff19));
    }

    // True for the characters enumerated by the Extender production.
    private static boolean isExtender(char c) {
        // [89] Extender ::= ...
        return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
                || c == 0x0640 || c == 0x0e46 || c == 0x0ec6
                || c == 0x3005 || (c >= 0x3031 && c <= 0x3035)
                || (c >= 0x309d && c <= 0x309e)
                || (c >= 0x30fc && c <= 0x30fe);
    }
}