src/share/jaxws_classes/com/sun/xml/internal/dtdparser/XmlChars.java

Thu, 31 Aug 2017 15:18:52 +0800

author
aoqi
date
Thu, 31 Aug 2017 15:18:52 +0800
changeset 637
9c07ef4934dd
parent 397
b99d7e355d4b
parent 0
373ffda63c9a
permissions
-rw-r--r--

merge

aoqi@0 1 /*
aoqi@0 2 * Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved.
aoqi@0 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
aoqi@0 4 *
aoqi@0 5 * This code is free software; you can redistribute it and/or modify it
aoqi@0 6 * under the terms of the GNU General Public License version 2 only, as
aoqi@0 7 * published by the Free Software Foundation. Oracle designates this
aoqi@0 8 * particular file as subject to the "Classpath" exception as provided
aoqi@0 9 * by Oracle in the LICENSE file that accompanied this code.
aoqi@0 10 *
aoqi@0 11 * This code is distributed in the hope that it will be useful, but WITHOUT
aoqi@0 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
aoqi@0 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
aoqi@0 14 * version 2 for more details (a copy is included in the LICENSE file that
aoqi@0 15 * accompanied this code).
aoqi@0 16 *
aoqi@0 17 * You should have received a copy of the GNU General Public License version
aoqi@0 18 * 2 along with this work; if not, write to the Free Software Foundation,
aoqi@0 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
aoqi@0 20 *
aoqi@0 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
aoqi@0 22 * or visit www.oracle.com if you need additional information or have any
aoqi@0 23 * questions.
aoqi@0 24 */
aoqi@0 25
aoqi@0 26 package com.sun.xml.internal.dtdparser;
aoqi@0 27
aoqi@0 28
/**
 * Methods in this class are used to determine whether characters may
 * appear in certain roles in XML documents.  Such methods are used
 * both to parse and to create such documents.
 *
 * <p>All methods are static; the class cannot be instantiated.
 *
 * @author David Brownell
 * @version 1.1, 00/08/05
 */
public final class XmlChars {

    // can't construct instances
    private XmlChars() {
    }

    /**
     * Returns true if the argument, a UCS-4 character code, is valid in
     * XML documents.  Unicode characters fit into the low sixteen
     * bits of a UCS-4 character, and pairs of Unicode <em>surrogate
     * characters</em> can be combined to encode UCS-4 characters in
     * documents containing only Unicode.  (The <code>char</code> datatype
     * in the Java Programming Language represents Unicode characters,
     * including unpaired surrogates.)
     *
     * <p>In XML, UCS-4 characters can also be encoded by the use of
     * <em>character references</em> such as <b>&amp;#x12345678;</b>, which
     * happens to refer to a character that is disallowed in XML documents.
     * UCS-4 characters allowed in XML documents can be expressed with
     * one or two Unicode characters.
     *
     * @param ucs4char The 32-bit UCS-4 character being tested.
     * @return true if the code point matches the XML <em>Char</em>
     *         production, false otherwise (including negative values and
     *         values above 0x10FFFF).
     */
    public static boolean isChar(int ucs4char) {
        // [2] Char ::= #x0009 | #x000A | #x000D
        //            | [#x0020-#xD7FF]
        //              ... surrogates excluded!
        //            | [#xE000-#xFFFD]
        //            | [#x10000-#x10ffff]
        return (ucs4char >= 0x0020 && ucs4char <= 0xD7FF)
                || ucs4char == 0x000A
                || ucs4char == 0x0009
                || ucs4char == 0x000D
                || (ucs4char >= 0xE000 && ucs4char <= 0xFFFD)
                || (ucs4char >= 0x10000 && ucs4char <= 0x10ffff);
    }

    /**
     * Returns true if the character is allowed to be a non-initial
     * character in names according to the XML recommendation.
     *
     * @param c the character being tested.
     * @return true if the character matches the <em>NameChar</em> production.
     * @see #isNCNameChar(char)
     * @see #isLetter(char)
     */
    public static boolean isNameChar(char c) {
        // [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':'
        //                | CombiningChar | Extender
        //
        // isLetter2 decides Letter, Digit and CombiningChar from the
        // Unicode general category; the punctuation and the extenders
        // are tested explicitly.
        if (isLetter2(c)) {
            return true;
        }
        return c == '.' || c == '-' || c == '_' || c == ':'
                || isExtender(c);
    }

    /**
     * Returns true if the character is allowed to be a non-initial
     * character in unscoped names according to the rules of the XML
     * Namespaces proposed recommendation.  Except for precluding
     * the colon (used to separate names from their scopes) these
     * characters are just as allowed by the XML recommendation.
     *
     * @param c the character being tested.
     * @return true if the character is a name character other than ':'.
     * @see #isNameChar(char)
     * @see #isLetter(char)
     */
    public static boolean isNCNameChar(char c) {
        // [NC 5] NCNameChar ::= Letter | Digit | '.' | '-' | '_'
        //                     | CombiningChar | Extender
        return c != ':' && isNameChar(c);
    }

    /**
     * Returns true if the character is allowed where XML supports
     * whitespace characters, false otherwise.
     *
     * @param c the character being tested.
     * @return true for space, tab, line feed and carriage return.
     */
    public static boolean isSpace(char c) {
        return c == ' ' || c == '\t' || c == '\n' || c == '\r';
    }


    /*
     * NOTE:  java.lang.Character.getType() values are:
     *
     * UNASSIGNED                    = 0,
     *
     * UPPERCASE_LETTER            = 1,    // Lu
     * LOWERCASE_LETTER            = 2,    // Ll
     * TITLECASE_LETTER            = 3,    // Lt
     * MODIFIER_LETTER             = 4,    // Lm
     * OTHER_LETTER                = 5,    // Lo
     * NON_SPACING_MARK            = 6,    // Mn
     * ENCLOSING_MARK              = 7,    // Me
     * COMBINING_SPACING_MARK      = 8,    // Mc
     * DECIMAL_DIGIT_NUMBER        = 9,    // Nd
     * LETTER_NUMBER               = 10,   // Nl
     * OTHER_NUMBER                = 11,   // No
     * SPACE_SEPARATOR             = 12,   // Zs
     * LINE_SEPARATOR              = 13,   // Zl
     * PARAGRAPH_SEPARATOR         = 14,   // Zp
     * CONTROL                     = 15,   // Cc
     * FORMAT                      = 16,   // Cf
     *                         // 17 reserved for proposed Ci category
     * PRIVATE_USE                 = 18,   // Co
     * SURROGATE                   = 19,   // Cs
     * DASH_PUNCTUATION            = 20,   // Pd
     * START_PUNCTUATION           = 21,   // Ps
     * END_PUNCTUATION             = 22,   // Pe
     * CONNECTOR_PUNCTUATION       = 23,   // Pc
     * OTHER_PUNCTUATION           = 24,   // Po
     * MATH_SYMBOL                 = 25,   // Sm
     * CURRENCY_SYMBOL             = 26,   // Sc
     * MODIFIER_SYMBOL             = 27,   // Sk
     * OTHER_SYMBOL                = 28;   // So
     */

    /**
     * Returns true if the character is an XML "letter".  XML Names must
     * start with Letters or a few other characters, but other characters
     * in names must only satisfy the <em>isNameChar</em> predicate.
     *
     * @param c the character being tested.
     * @return true if the character matches the <em>Letter</em> production.
     * @see #isNameChar(char)
     * @see #isNCNameChar(char)
     */
    public static boolean isLetter(char c) {
        // [84] Letter ::= BaseChar | Ideographic
        // [85] BaseChar ::= ... too much to repeat
        // [86] Ideographic ::= ... too much to repeat

        //
        // Optimize the typical case.
        //
        if (c >= 'a' && c <= 'z') {
            return true;
        }
        if (c == '/') {
            return false;
        }
        if (c >= 'A' && c <= 'Z') {
            return true;
        }

        //
        // Since the tables are too ridiculous to use in code,
        // we're using the footnotes here to drive this test.
        //
        switch (Character.getType(c)) {
            // app. B footnote says these are 'name start'
            // chars' ...
            case Character.LOWERCASE_LETTER:            // Ll
            case Character.UPPERCASE_LETTER:            // Lu
            case Character.OTHER_LETTER:                // Lo
            case Character.TITLECASE_LETTER:            // Lt
            case Character.LETTER_NUMBER:               // Nl
                // OK, here we just have some exceptions to check...
                return !isExcludedFromNames(c);

            default:
                // check for some exceptions:  these are "alphabetic"
                return (c >= 0x02bb && c <= 0x02c1)
                        || c == 0x0559 || c == 0x06e5 || c == 0x06e6;
        }
    }

    //
    // Exceptions shared by isLetter and isLetter2:  a character whose
    // general category would otherwise qualify it is still rejected when
    // it is a "compatibility" character or, per "5.14 of Unicode", one
    // of the combiners U+20DD..U+20E0.
    //
    private static boolean isExcludedFromNames(char c) {
        return isCompatibilityChar(c)
                || (c >= 0x20dd && c <= 0x20e0);
    }

    //
    // XML 1.0 discourages "compatibility" characters in names; these
    // were defined to permit passing through some information stored in
    // older non-Unicode character sets.  These always have alternative
    // representations in Unicode, e.g. using combining chars.
    //
    private static boolean isCompatibilityChar(char c) {
        // the numerous comparisons here seem unavoidable,
        // but the switch can reduce the number which must
        // actually be executed.

        // dispatch on the high byte of the character
        switch ((c >> 8) & 0x0ff) {
            case 0x00:
                // ISO Latin/1 has a few compatibility characters
                return c == 0x00aa || c == 0x00b5 || c == 0x00ba;

            case 0x01:
                // as do Latin Extended A and (parts of) B
                return (c >= 0x0132 && c <= 0x0133)
                        || (c >= 0x013f && c <= 0x0140)
                        || c == 0x0149
                        || c == 0x017f
                        || (c >= 0x01c4 && c <= 0x01cc)
                        || (c >= 0x01f1 && c <= 0x01f3);

            case 0x02:
                // some spacing modifiers
                return (c >= 0x02b0 && c <= 0x02b8)
                        || (c >= 0x02e0 && c <= 0x02e4);

            case 0x03:
                return c == 0x037a;                     // Greek

            case 0x05:
                return c == 0x0587;                     // Armenian

            case 0x0e:
                return c >= 0x0edc && c <= 0x0edd;      // Laotian

            case 0x11:
                // big chunks of Hangul Jamo are all "compatibility"
                return c == 0x1101
                        || c == 0x1104
                        || c == 0x1108
                        || c == 0x110a
                        || c == 0x110d
                        || (c >= 0x1113 && c <= 0x113b)
                        || c == 0x113d
                        || c == 0x113f
                        || (c >= 0x1141 && c <= 0x114b)
                        || c == 0x114d
                        || c == 0x114f
                        || (c >= 0x1151 && c <= 0x1153)
                        || (c >= 0x1156 && c <= 0x1158)
                        || c == 0x1162
                        || c == 0x1164
                        || c == 0x1166
                        || c == 0x1168
                        || (c >= 0x116a && c <= 0x116c)
                        || (c >= 0x116f && c <= 0x1171)
                        || c == 0x1174
                        || (c >= 0x1176 && c <= 0x119d)
                        || (c >= 0x119f && c <= 0x11a2)
                        || (c >= 0x11a9 && c <= 0x11aa)
                        || (c >= 0x11ac && c <= 0x11ad)
                        || (c >= 0x11b0 && c <= 0x11b6)
                        || c == 0x11b9
                        || c == 0x11bb
                        || (c >= 0x11c3 && c <= 0x11ea)
                        || (c >= 0x11ec && c <= 0x11ef)
                        || (c >= 0x11f1 && c <= 0x11f8);

            case 0x20:
                return c == 0x207f;                     // superscript

            case 0x21:
                return
                        // various letterlike symbols
                        c == 0x2102
                        || c == 0x2107
                        || (c >= 0x210a && c <= 0x2113)
                        || c == 0x2115
                        || (c >= 0x2118 && c <= 0x211d)
                        || c == 0x2124
                        || c == 0x2128
                        || (c >= 0x212c && c <= 0x212d)
                        || (c >= 0x212f && c <= 0x2138)

                        // most Roman numerals (less 1K, 5K, 10K)
                        || (c >= 0x2160 && c <= 0x217f);

            case 0x30:
                // some Hiragana
                return c >= 0x309b && c <= 0x309c;

            case 0x31:
                // all Hangul Compatibility Jamo
                return c >= 0x3131 && c <= 0x318e;

            case 0xf9:
            case 0xfa:
            case 0xfb:
            case 0xfc:
            case 0xfd:
            case 0xfe:
            case 0xff:
                // the whole "compatibility" area is for that purpose!
                return true;

            default:
                // most of Unicode isn't flagged as being for compatibility
                return false;
        }
    }

    // guts of isNameChar/isNCNameChar
    private static boolean isLetter2(char c) {
        // [84] Letter ::= BaseChar | Ideographic
        // [85] BaseChar ::= ... too much to repeat
        // [86] Ideographic ::= ... too much to repeat
        // [87] CombiningChar ::= ... too much to repeat

        //
        // Optimize the typical case.
        //
        if (c >= 'a' && c <= 'z') {
            return true;
        }
        if (c == '>') {
            return false;
        }
        if (c >= 'A' && c <= 'Z') {
            return true;
        }

        //
        // Since the tables are too ridiculous to use in code,
        // we're using the footnotes here to drive this test.
        //
        switch (Character.getType(c)) {
            // app. B footnote says these are 'name start'
            // chars' ...
            case Character.LOWERCASE_LETTER:            // Ll
            case Character.UPPERCASE_LETTER:            // Lu
            case Character.OTHER_LETTER:                // Lo
            case Character.TITLECASE_LETTER:            // Lt
            case Character.LETTER_NUMBER:               // Nl
            // ... and these are name characters 'other
            // than name start characters'
            case Character.COMBINING_SPACING_MARK:      // Mc
            case Character.ENCLOSING_MARK:              // Me
            case Character.NON_SPACING_MARK:            // Mn
            case Character.MODIFIER_LETTER:             // Lm
            case Character.DECIMAL_DIGIT_NUMBER:        // Nd
                // OK, here we just have some exceptions to check...
                return !isExcludedFromNames(c);

            default:
                // added a character ...
                return c == 0x0387;
        }
    }

    // Not called from within this class; isLetter2 accepts digits through
    // the Nd general category instead.
    private static boolean isDigit(char c) {
        // [88] Digit ::= ...

        //
        // java.lang.Character.isDigit is correct from the XML point
        // of view except that it allows "fullwidth" digits.
        //
        return Character.isDigit(c)
                && !((c >= 0xff10) && (c <= 0xff19));
    }

    private static boolean isExtender(char c) {
        // [89] Extender ::= ...
        return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
                || c == 0x0640 || c == 0x0e46 || c == 0x0ec6
                || c == 0x3005 || (c >= 0x3031 && c <= 0x3035)
                || (c >= 0x309d && c <= 0x309e)
                || (c >= 0x30fc && c <= 0x30fe);
    }
}

mercurial