1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/src/share/jaxws_classes/com/sun/xml/internal/dtdparser/XmlChars.java Wed Apr 27 01:27:09 2016 +0800 1.3 @@ -0,0 +1,387 @@ 1.4 +/* 1.5 + * Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved. 1.6 + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 1.7 + * 1.8 + * This code is free software; you can redistribute it and/or modify it 1.9 + * under the terms of the GNU General Public License version 2 only, as 1.10 + * published by the Free Software Foundation. Oracle designates this 1.11 + * particular file as subject to the "Classpath" exception as provided 1.12 + * by Oracle in the LICENSE file that accompanied this code. 1.13 + * 1.14 + * This code is distributed in the hope that it will be useful, but WITHOUT 1.15 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 1.16 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 1.17 + * version 2 for more details (a copy is included in the LICENSE file that 1.18 + * accompanied this code). 1.19 + * 1.20 + * You should have received a copy of the GNU General Public License version 1.21 + * 2 along with this work; if not, write to the Free Software Foundation, 1.22 + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 1.23 + * 1.24 + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 1.25 + * or visit www.oracle.com if you need additional information or have any 1.26 + * questions. 1.27 + */ 1.28 + 1.29 +package com.sun.xml.internal.dtdparser; 1.30 + 1.31 + 1.32 +/** 1.33 + * Methods in this class are used to determine whether characters may 1.34 + * appear in certain roles in XML documents. Such methods are used 1.35 + * both to parse and to create such documents. 1.36 + * 1.37 + * @author David Brownell 1.38 + * @version 1.1, 00/08/05 1.39 + */ 1.40 +public class XmlChars { 1.41 + // can't construct instances 1.42 + private XmlChars() { 1.43 + } 1.44 + 1.45 + /** 1.46 + * Returns true if the argument, a UCS-4 character code, is valid in 1.47 + * XML documents. Unicode characters fit into the low sixteen 1.48 + * bits of a UCS-4 character, and pairs of Unicode <em>surrogate 1.49 + * characters</em> can be combined to encode UCS-4 characters in 1.50 + * documents containing only Unicode. (The <code>char</code> datatype 1.51 + * in the Java Programming Language represents Unicode characters, 1.52 + * including unpaired surrogates.) 1.53 + * <p/> 1.54 + * <P> In XML, UCS-4 characters can also be encoded by the use of 1.55 + * <em>character references</em> such as <b>&#x12345678;</b>, which 1.56 + * happens to refer to a character that is disallowed in XML documents. 1.57 + * UCS-4 characters allowed in XML documents can be expressed with 1.58 + * one or two Unicode characters. 1.59 + * 1.60 + * @param ucs4char The 32-bit UCS-4 character being tested. 1.61 + */ 1.62 + static public boolean isChar(int ucs4char) { 1.63 + // [2] Char ::= #x0009 | #x000A | #x000D 1.64 + // | [#x0020-#xD7FF] 1.65 + // ... surrogates excluded! 1.66 + // | [#xE000-#xFFFD] 1.67 + // | [#x10000-#x10ffff] 1.68 + return ((ucs4char >= 0x0020 && ucs4char <= 0xD7FF) 1.69 + || ucs4char == 0x000A || ucs4char == 0x0009 1.70 + || ucs4char == 0x000D 1.71 + || (ucs4char >= 0xE000 && ucs4char <= 0xFFFD) 1.72 + || (ucs4char >= 0x10000 && ucs4char <= 0x10ffff)); 1.73 + } 1.74 + 1.75 + /** 1.76 + * Returns true if the character is allowed to be a non-initial 1.77 + * character in names according to the XML recommendation. 1.78 + * 1.79 + * @see #isNCNameChar(char) 1.80 + * @see #isLetter(char) 1.81 + */ 1.82 + public static boolean isNameChar(char c) { 1.83 + // [4] NameChar ::= Letter | Digit | '.' | '_' | ':' 1.84 + // | CombiningChar | Extender 1.85 + 1.86 + if (isLetter2(c)) 1.87 + return true; 1.88 + else if (c == '>') 1.89 + return false; 1.90 + else if (c == '.' || c == '-' || c == '_' || c == ':' 1.91 + || isExtender(c)) 1.92 + return true; 1.93 + else 1.94 + return false; 1.95 + } 1.96 + 1.97 + /** 1.98 + * Returns true if the character is allowed to be a non-initial 1.99 + * character in unscoped names according to the rules of the XML 1.100 + * Namespaces proposed recommendation. Except for precluding 1.101 + * the colon (used to separate names from their scopes) these 1.102 + * characters are just as allowed by the XML recommendation. 1.103 + * 1.104 + * @see #isNameChar(char) 1.105 + * @see #isLetter(char) 1.106 + */ 1.107 + public static boolean isNCNameChar(char c) { 1.108 + // [NC 5] NCNameChar ::= Letter | Digit | '.' | '_' 1.109 + // | CombiningChar | Extender 1.110 + return c != ':' && isNameChar(c); 1.111 + } 1.112 + 1.113 + /** 1.114 + * Returns true if the character is allowed where XML supports 1.115 + * whitespace characters, false otherwise. 1.116 + */ 1.117 + public static boolean isSpace(char c) { 1.118 + return c == ' ' || c == '\t' || c == '\n' || c == '\r'; 1.119 + } 1.120 + 1.121 + 1.122 + /* 1.123 + * NOTE: java.lang.Character.getType() values are: 1.124 + * 1.125 + * UNASSIGNED = 0, 1.126 + * 1.127 + * UPPERCASE_LETTER = 1, // Lu 1.128 + * LOWERCASE_LETTER = 2, // Ll 1.129 + * TITLECASE_LETTER = 3, // Lt 1.130 + * MODIFIER_LETTER = 4, // Lm 1.131 + * OTHER_LETTER = 5, // Lo 1.132 + * NON_SPACING_MARK = 6, // Mn 1.133 + * ENCLOSING_MARK = 7, // Me 1.134 + * COMBINING_SPACING_MARK = 8, // Mc 1.135 + * DECIMAL_DIGIT_NUMBER = 9, // Nd 1.136 + * LETTER_NUMBER = 10, // Nl 1.137 + * OTHER_NUMBER = 11, // No 1.138 + * SPACE_SEPARATOR = 12, // Zs 1.139 + * LINE_SEPARATOR = 13, // Zl 1.140 + * PARAGRAPH_SEPARATOR = 14, // Zp 1.141 + * CONTROL = 15, // Cc 1.142 + * FORMAT = 16, // Cf 1.143 + * // 17 reserved for proposed Ci category 1.144 + * PRIVATE_USE = 18, // Co 1.145 + * SURROGATE = 19, // Cs 1.146 + * DASH_PUNCTUATION = 20, // Pd 1.147 + * START_PUNCTUATION = 21, // Ps 1.148 + * END_PUNCTUATION = 22, // Pe 1.149 + * CONNECTOR_PUNCTUATION = 23, // Pc 1.150 + * OTHER_PUNCTUATION = 24, // Po 1.151 + * MATH_SYMBOL = 25, // Sm 1.152 + * CURRENCY_SYMBOL = 26, // Sc 1.153 + * MODIFIER_SYMBOL = 27, // Sk 1.154 + * OTHER_SYMBOL = 28; // So 1.155 + */ 1.156 + 1.157 + /** 1.158 + * Returns true if the character is an XML "letter". XML Names must 1.159 + * start with Letters or a few other characters, but other characters 1.160 + * in names must only satisfy the <em>isNameChar</em> predicate. 1.161 + * 1.162 + * @see #isNameChar(char) 1.163 + * @see #isNCNameChar(char) 1.164 + */ 1.165 + public static boolean isLetter(char c) { 1.166 + // [84] Letter ::= BaseChar | Ideographic 1.167 + // [85] BaseChar ::= ... too much to repeat 1.168 + // [86] Ideographic ::= ... too much to repeat 1.169 + 1.170 + // 1.171 + // Optimize the typical case. 1.172 + // 1.173 + if (c >= 'a' && c <= 'z') 1.174 + return true; 1.175 + if (c == '/') 1.176 + return false; 1.177 + if (c >= 'A' && c <= 'Z') 1.178 + return true; 1.179 + 1.180 + // 1.181 + // Since the tables are too ridiculous to use in code, 1.182 + // we're using the footnotes here to drive this test. 1.183 + // 1.184 + switch (Character.getType(c)) { 1.185 + // app. B footnote says these are 'name start' 1.186 + // chars' ... 1.187 + case Character.LOWERCASE_LETTER: // Ll 1.188 + case Character.UPPERCASE_LETTER: // Lu 1.189 + case Character.OTHER_LETTER: // Lo 1.190 + case Character.TITLECASE_LETTER: // Lt 1.191 + case Character.LETTER_NUMBER: // Nl 1.192 + 1.193 + // OK, here we just have some exceptions to check... 1.194 + return !isCompatibilityChar(c) 1.195 + // per "5.14 of Unicode", rule out some combiners 1.196 + && !(c >= 0x20dd && c <= 0x20e0); 1.197 + 1.198 + default: 1.199 + // check for some exceptions: these are "alphabetic" 1.200 + return ((c >= 0x02bb && c <= 0x02c1) 1.201 + || c == 0x0559 || c == 0x06e5 || c == 0x06e6); 1.202 + } 1.203 + } 1.204 + 1.205 + // 1.206 + // XML 1.0 discourages "compatibility" characters in names; these 1.207 + // were defined to permit passing through some information stored in 1.208 + // older non-Unicode character sets. These always have alternative 1.209 + // representations in Unicode, e.g. using combining chars. 1.210 + // 1.211 + private static boolean isCompatibilityChar(char c) { 1.212 + // the numerous comparisions here seem unavoidable, 1.213 + // but the switch can reduce the number which must 1.214 + // actually be executed. 1.215 + 1.216 + switch ((c >> 8) & 0x0ff) { 1.217 + case 0x00: 1.218 + // ISO Latin/1 has a few compatibility characters 1.219 + return c == 0x00aa || c == 0x00b5 || c == 0x00ba; 1.220 + 1.221 + case 0x01: 1.222 + // as do Latin Extended A and (parts of) B 1.223 + return (c >= 0x0132 && c <= 0x0133) 1.224 + || (c >= 0x013f && c <= 0x0140) 1.225 + || c == 0x0149 1.226 + || c == 0x017f 1.227 + || (c >= 0x01c4 && c <= 0x01cc) 1.228 + || (c >= 0x01f1 && c <= 0x01f3); 1.229 + 1.230 + case 0x02: 1.231 + // some spacing modifiers 1.232 + return (c >= 0x02b0 && c <= 0x02b8) 1.233 + || (c >= 0x02e0 && c <= 0x02e4); 1.234 + 1.235 + case 0x03: 1.236 + return c == 0x037a; // Greek 1.237 + 1.238 + case 0x05: 1.239 + return c == 0x0587; // Armenian 1.240 + 1.241 + case 0x0e: 1.242 + return c >= 0x0edc && c <= 0x0edd; // Laotian 1.243 + 1.244 + case 0x11: 1.245 + // big chunks of Hangul Jamo are all "compatibility" 1.246 + return c == 0x1101 1.247 + || c == 0x1104 1.248 + || c == 0x1108 1.249 + || c == 0x110a 1.250 + || c == 0x110d 1.251 + || (c >= 0x1113 && c <= 0x113b) 1.252 + || c == 0x113d 1.253 + || c == 0x113f 1.254 + || (c >= 0x1141 && c <= 0x114b) 1.255 + || c == 0x114d 1.256 + || c == 0x114f 1.257 + || (c >= 0x1151 && c <= 0x1153) 1.258 + || (c >= 0x1156 && c <= 0x1158) 1.259 + || c == 0x1162 1.260 + || c == 0x1164 1.261 + || c == 0x1166 1.262 + || c == 0x1168 1.263 + || (c >= 0x116a && c <= 0x116c) 1.264 + || (c >= 0x116f && c <= 0x1171) 1.265 + || c == 0x1174 1.266 + || (c >= 0x1176 && c <= 0x119d) 1.267 + || (c >= 0x119f && c <= 0x11a2) 1.268 + || (c >= 0x11a9 && c <= 0x11aa) 1.269 + || (c >= 0x11ac && c <= 0x11ad) 1.270 + || (c >= 0x11b0 && c <= 0x11b6) 1.271 + || c == 0x11b9 1.272 + || c == 0x11bb 1.273 + || (c >= 0x11c3 && c <= 0x11ea) 1.274 + || (c >= 0x11ec && c <= 0x11ef) 1.275 + || (c >= 0x11f1 && c <= 0x11f8) 1.276 + ; 1.277 + 1.278 + case 0x20: 1.279 + return c == 0x207f; // superscript 1.280 + 1.281 + case 0x21: 1.282 + return 1.283 + // various letterlike symbols 1.284 + c == 0x2102 1.285 + || c == 0x2107 1.286 + || (c >= 0x210a && c <= 0x2113) 1.287 + || c == 0x2115 1.288 + || (c >= 0x2118 && c <= 0x211d) 1.289 + || c == 0x2124 1.290 + || c == 0x2128 1.291 + || (c >= 0x212c && c <= 0x212d) 1.292 + || (c >= 0x212f && c <= 0x2138) 1.293 + 1.294 + // most Roman numerals (less 1K, 5K, 10K) 1.295 + || (c >= 0x2160 && c <= 0x217f) 1.296 + ; 1.297 + 1.298 + case 0x30: 1.299 + // some Hiragana 1.300 + return c >= 0x309b && c <= 0x309c; 1.301 + 1.302 + case 0x31: 1.303 + // all Hangul Compatibility Jamo 1.304 + return c >= 0x3131 && c <= 0x318e; 1.305 + 1.306 + case 0xf9: 1.307 + case 0xfa: 1.308 + case 0xfb: 1.309 + case 0xfc: 1.310 + case 0xfd: 1.311 + case 0xfe: 1.312 + case 0xff: 1.313 + // the whole "compatibility" area is for that purpose! 1.314 + return true; 1.315 + 1.316 + default: 1.317 + // most of Unicode isn't flagged as being for compatibility 1.318 + return false; 1.319 + } 1.320 + } 1.321 + 1.322 + // guts of isNameChar/isNCNameChar 1.323 + private static boolean isLetter2(char c) { 1.324 + // [84] Letter ::= BaseChar | Ideographic 1.325 + // [85] BaseChar ::= ... too much to repeat 1.326 + // [86] Ideographic ::= ... too much to repeat 1.327 + // [87] CombiningChar ::= ... too much to repeat 1.328 + 1.329 + // 1.330 + // Optimize the typical case. 1.331 + // 1.332 + if (c >= 'a' && c <= 'z') 1.333 + return true; 1.334 + if (c == '>') 1.335 + return false; 1.336 + if (c >= 'A' && c <= 'Z') 1.337 + return true; 1.338 + 1.339 + // 1.340 + // Since the tables are too ridiculous to use in code, 1.341 + // we're using the footnotes here to drive this test. 1.342 + // 1.343 + switch (Character.getType(c)) { 1.344 + // app. B footnote says these are 'name start' 1.345 + // chars' ... 1.346 + case Character.LOWERCASE_LETTER: // Ll 1.347 + case Character.UPPERCASE_LETTER: // Lu 1.348 + case Character.OTHER_LETTER: // Lo 1.349 + case Character.TITLECASE_LETTER: // Lt 1.350 + case Character.LETTER_NUMBER: // Nl 1.351 + // ... and these are name characters 'other 1.352 + // than name start characters' 1.353 + case Character.COMBINING_SPACING_MARK: // Mc 1.354 + case Character.ENCLOSING_MARK: // Me 1.355 + case Character.NON_SPACING_MARK: // Mn 1.356 + case Character.MODIFIER_LETTER: // Lm 1.357 + case Character.DECIMAL_DIGIT_NUMBER: // Nd 1.358 + 1.359 + // OK, here we just have some exceptions to check... 1.360 + return !isCompatibilityChar(c) 1.361 + // per "5.14 of Unicode", rule out some combiners 1.362 + && !(c >= 0x20dd && c <= 0x20e0); 1.363 + 1.364 + default: 1.365 + // added a character ... 1.366 + return c == 0x0387; 1.367 + } 1.368 + } 1.369 + 1.370 + private static boolean isDigit(char c) { 1.371 + // [88] Digit ::= ... 1.372 + 1.373 + // 1.374 + // java.lang.Character.isDigit is correct from the XML point 1.375 + // of view except that it allows "fullwidth" digits. 1.376 + // 1.377 + return Character.isDigit(c) 1.378 + && !((c >= 0xff10) && (c <= 0xff19)); 1.379 + } 1.380 + 1.381 + private static boolean isExtender(char c) { 1.382 + // [89] Extender ::= ... 1.383 + return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387 1.384 + || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 1.385 + || c == 0x3005 || (c >= 0x3031 && c <= 0x3035) 1.386 + || (c >= 0x309d && c <= 0x309e) 1.387 + || (c >= 0x30fc && c <= 0x30fe) 1.388 + ; 1.389 + } 1.390 +}