src/share/jaxws_classes/com/sun/xml/internal/dtdparser/XmlChars.java

Tue, 06 Mar 2012 16:09:35 -0800

author
ohair
date
Tue, 06 Mar 2012 16:09:35 -0800
changeset 286
f50545b5e2f1
child 397
b99d7e355d4b
permissions
-rw-r--r--

7150322: Stop using drop source bundles in jaxws
Reviewed-by: darcy, ohrstrom

ohair@286 1 /*
ohair@286 2 * Copyright (c) 1998, 2011, Oracle and/or its affiliates. All rights reserved.
ohair@286 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
ohair@286 4 *
ohair@286 5 * This code is free software; you can redistribute it and/or modify it
ohair@286 6 * under the terms of the GNU General Public License version 2 only, as
ohair@286 7 * published by the Free Software Foundation. Oracle designates this
ohair@286 8 * particular file as subject to the "Classpath" exception as provided
ohair@286 9 * by Oracle in the LICENSE file that accompanied this code.
ohair@286 10 *
ohair@286 11 * This code is distributed in the hope that it will be useful, but WITHOUT
ohair@286 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
ohair@286 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
ohair@286 14 * version 2 for more details (a copy is included in the LICENSE file that
ohair@286 15 * accompanied this code).
ohair@286 16 *
ohair@286 17 * You should have received a copy of the GNU General Public License version
ohair@286 18 * 2 along with this work; if not, write to the Free Software Foundation,
ohair@286 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
ohair@286 20 *
ohair@286 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
ohair@286 22 * or visit www.oracle.com if you need additional information or have any
ohair@286 23 * questions.
ohair@286 24 */
ohair@286 25
ohair@286 26 package com.sun.xml.internal.dtdparser;
ohair@286 27
ohair@286 28
/**
 * Methods in this class are used to determine whether characters may
 * appear in certain roles in XML documents.  Such methods are used
 * both to parse and to create such documents.
 *
 * <p>All methods are static and keep no state; the class cannot be
 * instantiated.
 *
 * @author David Brownell
 * @version 1.1, 00/08/05
 */
public final class XmlChars {
    // Static utility holder: can't construct instances.
    private XmlChars() {
    }

    /**
     * Returns true if the argument, a UCS-4 character code, is valid in
     * XML documents.  Unicode characters fit into the low sixteen
     * bits of a UCS-4 character, and pairs of Unicode <em>surrogate
     * characters</em> can be combined to encode UCS-4 characters in
     * documents containing only Unicode.  (The <code>char</code> datatype
     * in the Java Programming Language represents Unicode characters,
     * including unpaired surrogates.)
     *
     * <p>In XML, UCS-4 characters can also be encoded by the use of
     * <em>character references</em> such as <b>&amp;#x12345678;</b>, which
     * happens to refer to a character that is disallowed in XML documents.
     * UCS-4 characters allowed in XML documents can be expressed with
     * one or two Unicode characters.
     *
     * @param ucs4char The 32-bit UCS-4 character being tested.
     * @return true if the code point matches the XML <em>Char</em>
     *         production; false otherwise (including for negative values,
     *         surrogate code units, 0xFFFE and 0xFFFF).
     */
    public static boolean isChar(int ucs4char) {
        // [2] Char ::= #x0009 | #x000A | #x000D
        //            | [#x0020-#xD7FF]
        //            ... surrogates excluded!
        //            | [#xE000-#xFFFD]
        //            | [#x10000-#x10ffff]
        // The most common range is tested first.
        return (ucs4char >= 0x0020 && ucs4char <= 0xD7FF)
                || ucs4char == 0x000A
                || ucs4char == 0x0009
                || ucs4char == 0x000D
                || (ucs4char >= 0xE000 && ucs4char <= 0xFFFD)
                || (ucs4char >= 0x10000 && ucs4char <= 0x10ffff);
    }

    /**
     * Returns true if the character is allowed to be a non-initial
     * character in names according to the XML recommendation.
     *
     * @param c the character being tested.
     * @return true if the character may appear after the first
     *         character of an XML name.
     * @see #isNCNameChar(char)
     * @see #isLetter(char)
     */
    public static boolean isNameChar(char c) {
        // [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':'
        //                | CombiningChar | Extender
        //
        // isLetter2 covers Letter, Digit and CombiningChar; the
        // punctuation and the extenders are checked explicitly.
        return isLetter2(c)
                || c == '.' || c == '-' || c == '_' || c == ':'
                || isExtender(c);
    }

    /**
     * Returns true if the character is allowed to be a non-initial
     * character in unscoped names according to the rules of the XML
     * Namespaces proposed recommendation.  Except for precluding
     * the colon (used to separate names from their scopes) these
     * characters are just as allowed by the XML recommendation.
     *
     * @param c the character being tested.
     * @return true if the character is a name character other than
     *         the colon.
     * @see #isNameChar(char)
     * @see #isLetter(char)
     */
    public static boolean isNCNameChar(char c) {
        // [NC 5] NCNameChar ::= Letter | Digit | '.' | '-' | '_'
        //                     | CombiningChar | Extender
        return c != ':' && isNameChar(c);
    }

    /**
     * Returns true if the character is allowed where XML supports
     * whitespace characters, false otherwise.
     *
     * @param c the character being tested.
     * @return true for space, tab, line feed and carriage return only.
     */
    public static boolean isSpace(char c) {
        return c == ' ' || c == '\t' || c == '\n' || c == '\r';
    }


    /*
     * NOTE:  java.lang.Character.getType() values are:
     *
     * UNASSIGNED                    = 0,
     *
     * UPPERCASE_LETTER            = 1,    // Lu
     * LOWERCASE_LETTER            = 2,    // Ll
     * TITLECASE_LETTER            = 3,    // Lt
     * MODIFIER_LETTER             = 4,    // Lm
     * OTHER_LETTER                = 5,    // Lo
     * NON_SPACING_MARK            = 6,    // Mn
     * ENCLOSING_MARK              = 7,    // Me
     * COMBINING_SPACING_MARK      = 8,    // Mc
     * DECIMAL_DIGIT_NUMBER        = 9,    // Nd
     * LETTER_NUMBER               = 10,   // Nl
     * OTHER_NUMBER                = 11,   // No
     * SPACE_SEPARATOR             = 12,   // Zs
     * LINE_SEPARATOR              = 13,   // Zl
     * PARAGRAPH_SEPARATOR         = 14,   // Zp
     * CONTROL                     = 15,   // Cc
     * FORMAT                      = 16,   // Cf
     *                         // 17 reserved for proposed Ci category
     * PRIVATE_USE                 = 18,   // Co
     * SURROGATE                   = 19,   // Cs
     * DASH_PUNCTUATION            = 20,   // Pd
     * START_PUNCTUATION           = 21,   // Ps
     * END_PUNCTUATION             = 22,   // Pe
     * CONNECTOR_PUNCTUATION       = 23,   // Pc
     * OTHER_PUNCTUATION           = 24,   // Po
     * MATH_SYMBOL                 = 25,   // Sm
     * CURRENCY_SYMBOL             = 26,   // Sc
     * MODIFIER_SYMBOL             = 27,   // Sk
     * OTHER_SYMBOL                = 28;   // So
     */

    /**
     * Returns true if the character is an XML "letter".  XML Names must
     * start with Letters or a few other characters, but other characters
     * in names must only satisfy the <em>isNameChar</em> predicate.
     *
     * @param c the character being tested.
     * @return true if the character is treated as an XML letter.
     * @see #isNameChar(char)
     * @see #isNCNameChar(char)
     */
    public static boolean isLetter(char c) {
        // [84] Letter ::= BaseChar | Ideographic
        // [85] BaseChar ::= ... too much to repeat
        // [86] Ideographic ::= ... too much to repeat

        //
        // Optimize the typical case: ASCII letters, plus a quick
        // rejection of '/' before consulting the Unicode tables.
        //
        if (c >= 'a' && c <= 'z') {
            return true;
        }
        if (c == '/') {
            return false;
        }
        if (c >= 'A' && c <= 'Z') {
            return true;
        }

        //
        // Since the tables are too ridiculous to use in code,
        // we're using the footnotes here to drive this test.
        //
        switch (Character.getType(c)) {
            // app. B footnote says these are 'name start'
            // chars' ...
            case Character.LOWERCASE_LETTER:        // Ll
            case Character.UPPERCASE_LETTER:        // Lu
            case Character.OTHER_LETTER:            // Lo
            case Character.TITLECASE_LETTER:        // Lt
            case Character.LETTER_NUMBER:           // Nl
                // OK, here we just have some exceptions to check...
                return !isExcludedFromNames(c);

            default:
                // check for some exceptions:  these are "alphabetic"
                return (c >= 0x02bb && c <= 0x02c1)
                        || c == 0x0559 || c == 0x06e5 || c == 0x06e6;
        }
    }

    //
    // Exceptions shared by isLetter and isLetter2: a character whose
    // Unicode category would otherwise qualify is still rejected when it
    // is a "compatibility" character or, per "5.14 of Unicode", one of
    // the combiners in 0x20dd..0x20e0.
    //
    private static boolean isExcludedFromNames(char c) {
        return isCompatibilityChar(c)
                || (c >= 0x20dd && c <= 0x20e0);
    }

    //
    // XML 1.0 discourages "compatibility" characters in names; these
    // were defined to permit passing through some information stored in
    // older non-Unicode character sets.  These always have alternative
    // representations in Unicode, e.g. using combining chars.
    //
    private static boolean isCompatibilityChar(char c) {
        // the numerous comparisons here seem unavoidable,
        // but the switch can reduce the number which must
        // actually be executed.  The switch selects on the high
        // byte of the character.

        switch ((c >> 8) & 0x0ff) {
            case 0x00:
                // ISO Latin/1 has a few compatibility characters
                return c == 0x00aa || c == 0x00b5 || c == 0x00ba;

            case 0x01:
                // as do Latin Extended A and (parts of) B
                return (c >= 0x0132 && c <= 0x0133)
                        || (c >= 0x013f && c <= 0x0140)
                        || c == 0x0149
                        || c == 0x017f
                        || (c >= 0x01c4 && c <= 0x01cc)
                        || (c >= 0x01f1 && c <= 0x01f3);

            case 0x02:
                // some spacing modifiers
                return (c >= 0x02b0 && c <= 0x02b8)
                        || (c >= 0x02e0 && c <= 0x02e4);

            case 0x03:
                return c == 0x037a;            // Greek

            case 0x05:
                return c == 0x0587;            // Armenian

            case 0x0e:
                return c >= 0x0edc && c <= 0x0edd;    // Laotian

            case 0x11:
                // big chunks of Hangul Jamo are all "compatibility"
                return c == 0x1101
                        || c == 0x1104
                        || c == 0x1108
                        || c == 0x110a
                        || c == 0x110d
                        || (c >= 0x1113 && c <= 0x113b)
                        || c == 0x113d
                        || c == 0x113f
                        || (c >= 0x1141 && c <= 0x114b)
                        || c == 0x114d
                        || c == 0x114f
                        || (c >= 0x1151 && c <= 0x1153)
                        || (c >= 0x1156 && c <= 0x1158)
                        || c == 0x1162
                        || c == 0x1164
                        || c == 0x1166
                        || c == 0x1168
                        || (c >= 0x116a && c <= 0x116c)
                        || (c >= 0x116f && c <= 0x1171)
                        || c == 0x1174
                        || (c >= 0x1176 && c <= 0x119d)
                        || (c >= 0x119f && c <= 0x11a2)
                        || (c >= 0x11a9 && c <= 0x11aa)
                        || (c >= 0x11ac && c <= 0x11ad)
                        || (c >= 0x11b0 && c <= 0x11b6)
                        || c == 0x11b9
                        || c == 0x11bb
                        || (c >= 0x11c3 && c <= 0x11ea)
                        || (c >= 0x11ec && c <= 0x11ef)
                        || (c >= 0x11f1 && c <= 0x11f8);

            case 0x20:
                return c == 0x207f;            // superscript

            case 0x21:
                return
                        // various letterlike symbols
                        c == 0x2102
                        || c == 0x2107
                        || (c >= 0x210a && c <= 0x2113)
                        || c == 0x2115
                        || (c >= 0x2118 && c <= 0x211d)
                        || c == 0x2124
                        || c == 0x2128
                        || (c >= 0x212c && c <= 0x212d)
                        || (c >= 0x212f && c <= 0x2138)

                        // most Roman numerals (less 1K, 5K, 10K)
                        || (c >= 0x2160 && c <= 0x217f);

            case 0x30:
                // some Hiragana
                return c >= 0x309b && c <= 0x309c;

            case 0x31:
                // all Hangul Compatibility Jamo
                return c >= 0x3131 && c <= 0x318e;

            case 0xf9:
            case 0xfa:
            case 0xfb:
            case 0xfc:
            case 0xfd:
            case 0xfe:
            case 0xff:
                // the whole "compatibility" area is for that purpose!
                return true;

            default:
                // most of Unicode isn't flagged as being for compatibility
                return false;
        }
    }

    // guts of isNameChar/isNCNameChar
    private static boolean isLetter2(char c) {
        // [84] Letter ::= BaseChar | Ideographic
        // [85] BaseChar ::= ... too much to repeat
        // [86] Ideographic ::= ... too much to repeat
        // [87] CombiningChar ::= ... too much to repeat

        //
        // Optimize the typical case: ASCII letters, plus a quick
        // rejection of '>' before consulting the Unicode tables.
        //
        if (c >= 'a' && c <= 'z') {
            return true;
        }
        if (c == '>') {
            return false;
        }
        if (c >= 'A' && c <= 'Z') {
            return true;
        }

        //
        // Since the tables are too ridiculous to use in code,
        // we're using the footnotes here to drive this test.
        //
        switch (Character.getType(c)) {
            // app. B footnote says these are 'name start'
            // chars' ...
            case Character.LOWERCASE_LETTER:        // Ll
            case Character.UPPERCASE_LETTER:        // Lu
            case Character.OTHER_LETTER:            // Lo
            case Character.TITLECASE_LETTER:        // Lt
            case Character.LETTER_NUMBER:           // Nl
            // ... and these are name characters 'other
            // than name start characters'
            case Character.COMBINING_SPACING_MARK:  // Mc
            case Character.ENCLOSING_MARK:          // Me
            case Character.NON_SPACING_MARK:        // Mn
            case Character.MODIFIER_LETTER:         // Lm
            case Character.DECIMAL_DIGIT_NUMBER:    // Nd
                // OK, here we just have some exceptions to check...
                return !isExcludedFromNames(c);

            default:
                // added a character ...
                return c == 0x0387;
        }
    }

    // NOTE: not referenced by any other method of this class.
    private static boolean isDigit(char c) {
        // [88] Digit ::= ...

        //
        // java.lang.Character.isDigit is correct from the XML point
        // of view except that it allows "fullwidth" digits.
        //
        return Character.isDigit(c)
                && !((c >= 0xff10) && (c <= 0xff19));
    }

    private static boolean isExtender(char c) {
        // [89] Extender ::= ...
        return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
                || c == 0x0640 || c == 0x0e46 || c == 0x0ec6
                || c == 0x3005 || (c >= 0x3031 && c <= 0x3035)
                || (c >= 0x309d && c <= 0x309e)
                || (c >= 0x30fc && c <= 0x30fe);
    }
}

mercurial