Tue, 06 Mar 2012 16:09:35 -0800
7150322: Stop using drop source bundles in jaxws
Reviewed-by: darcy, ohrstrom
ohair@286 | 1 | /* |
ohair@286 | 2 | * Copyright (c) 1998, 2011, Oracle and/or its affiliates. All rights reserved. |
ohair@286 | 3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
ohair@286 | 4 | * |
ohair@286 | 5 | * This code is free software; you can redistribute it and/or modify it |
ohair@286 | 6 | * under the terms of the GNU General Public License version 2 only, as |
ohair@286 | 7 | * published by the Free Software Foundation. Oracle designates this |
ohair@286 | 8 | * particular file as subject to the "Classpath" exception as provided |
ohair@286 | 9 | * by Oracle in the LICENSE file that accompanied this code. |
ohair@286 | 10 | * |
ohair@286 | 11 | * This code is distributed in the hope that it will be useful, but WITHOUT |
ohair@286 | 12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
ohair@286 | 13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
ohair@286 | 14 | * version 2 for more details (a copy is included in the LICENSE file that |
ohair@286 | 15 | * accompanied this code). |
ohair@286 | 16 | * |
ohair@286 | 17 | * You should have received a copy of the GNU General Public License version |
ohair@286 | 18 | * 2 along with this work; if not, write to the Free Software Foundation, |
ohair@286 | 19 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
ohair@286 | 20 | * |
ohair@286 | 21 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
ohair@286 | 22 | * or visit www.oracle.com if you need additional information or have any |
ohair@286 | 23 | * questions. |
ohair@286 | 24 | */ |
ohair@286 | 25 | |
ohair@286 | 26 | package com.sun.xml.internal.dtdparser; |
ohair@286 | 27 | |
ohair@286 | 28 | |
/**
 * Methods in this class are used to determine whether characters may
 * appear in certain roles in XML documents.  Such methods are used
 * both to parse and to create such documents.
 *
 * <p>The class holds only static predicates and cannot be instantiated.
 *
 * @author David Brownell
 * @version 1.1, 00/08/05
 */
public final class XmlChars {

    // can't construct instances
    private XmlChars() {
    }

    /**
     * Returns true if the argument, a UCS-4 character code, is valid in
     * XML documents.  Unicode characters fit into the low sixteen
     * bits of a UCS-4 character, and pairs of Unicode <em>surrogate
     * characters</em> can be combined to encode UCS-4 characters in
     * documents containing only Unicode.  (The <code>char</code> datatype
     * in the Java Programming Language represents Unicode characters,
     * including unpaired surrogates.)
     *
     * <p>In XML, UCS-4 characters can also be encoded by the use of
     * <em>character references</em> such as <b>&amp;#x12345678;</b>, which
     * happens to refer to a character that is disallowed in XML documents.
     * UCS-4 characters allowed in XML documents can be expressed with
     * one or two Unicode characters.
     *
     * @param ucs4char The 32-bit UCS-4 character being tested.
     * @return true if the code matches the XML <em>Char</em> production;
     *         false otherwise (including for negative values).
     */
    public static boolean isChar(int ucs4char) {
        // [2] Char ::= #x0009 | #x000A | #x000D
        //            | [#x0020-#xD7FF]
        //              ... surrogates excluded!
        //            | [#xE000-#xFFFD]
        //            | [#x10000-#x10ffff]
        return (ucs4char >= 0x0020 && ucs4char <= 0xD7FF)
                || ucs4char == 0x000A
                || ucs4char == 0x0009
                || ucs4char == 0x000D
                || (ucs4char >= 0xE000 && ucs4char <= 0xFFFD)
                || (ucs4char >= 0x10000 && ucs4char <= 0x10ffff);
    }

    /**
     * Returns true if the character is allowed to be a non-initial
     * character in names according to the XML recommendation.
     *
     * @param c the character being tested.
     * @return true if the character may appear after the first
     *         character of an XML name.
     * @see #isNCNameChar(char)
     * @see #isLetter(char)
     */
    public static boolean isNameChar(char c) {
        // [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':'
        //                | CombiningChar | Extender

        // Letters, digits and combining characters are all recognized
        // by isLetter2(), which also rejects '>' on its fast path.
        if (isLetter2(c)) {
            return true;
        }
        return c == '.' || c == '-' || c == '_' || c == ':'
                || isExtender(c);
    }

    /**
     * Returns true if the character is allowed to be a non-initial
     * character in unscoped names according to the rules of the XML
     * Namespaces proposed recommendation.  Except for precluding
     * the colon (used to separate names from their scopes) these
     * characters are just as allowed by the XML recommendation.
     *
     * @param c the character being tested.
     * @return true if the character is not a colon and satisfies
     *         {@link #isNameChar(char)}.
     * @see #isNameChar(char)
     * @see #isLetter(char)
     */
    public static boolean isNCNameChar(char c) {
        // [NC 5] NCNameChar ::= Letter | Digit | '.' | '-' | '_'
        //                     | CombiningChar | Extender
        return c != ':' && isNameChar(c);
    }

    /**
     * Returns true if the character is allowed where XML supports
     * whitespace characters, false otherwise.
     *
     * @param c the character being tested.
     * @return true for space, tab, line feed and carriage return.
     */
    public static boolean isSpace(char c) {
        return c == ' ' || c == '\t' || c == '\n' || c == '\r';
    }


    /*
     * NOTE:  java.lang.Character.getType() values are:
     *
     * UNASSIGNED                    = 0,
     *
     * UPPERCASE_LETTER            = 1,    // Lu
     * LOWERCASE_LETTER            = 2,    // Ll
     * TITLECASE_LETTER            = 3,    // Lt
     * MODIFIER_LETTER             = 4,    // Lm
     * OTHER_LETTER                = 5,    // Lo
     * NON_SPACING_MARK            = 6,    // Mn
     * ENCLOSING_MARK              = 7,    // Me
     * COMBINING_SPACING_MARK      = 8,    // Mc
     * DECIMAL_DIGIT_NUMBER        = 9,    // Nd
     * LETTER_NUMBER               = 10,   // Nl
     * OTHER_NUMBER                = 11,   // No
     * SPACE_SEPARATOR             = 12,   // Zs
     * LINE_SEPARATOR              = 13,   // Zl
     * PARAGRAPH_SEPARATOR         = 14,   // Zp
     * CONTROL                     = 15,   // Cc
     * FORMAT                      = 16,   // Cf
     *                         // 17 reserved for proposed Ci category
     * PRIVATE_USE                 = 18,   // Co
     * SURROGATE                   = 19,   // Cs
     * DASH_PUNCTUATION            = 20,   // Pd
     * START_PUNCTUATION           = 21,   // Ps
     * END_PUNCTUATION             = 22,   // Pe
     * CONNECTOR_PUNCTUATION       = 23,   // Pc
     * OTHER_PUNCTUATION           = 24,   // Po
     * MATH_SYMBOL                 = 25,   // Sm
     * CURRENCY_SYMBOL             = 26,   // Sc
     * MODIFIER_SYMBOL             = 27,   // Sk
     * OTHER_SYMBOL                = 28;   // So
     */

    /**
     * Returns true if the character is an XML "letter".  XML Names must
     * start with Letters or a few other characters, but other characters
     * in names must only satisfy the <em>isNameChar</em> predicate.
     *
     * @param c the character being tested.
     * @return true if the character is treated as an XML letter.
     * @see #isNameChar(char)
     * @see #isNCNameChar(char)
     */
    public static boolean isLetter(char c) {
        // [84] Letter ::= BaseChar | Ideographic
        // [85] BaseChar ::= ... too much to repeat
        // [86] Ideographic ::= ... too much to repeat

        //
        // Optimize the typical case.
        //
        if (c >= 'a' && c <= 'z') {
            return true;
        }
        if (c == '/') {
            return false;
        }
        if (c >= 'A' && c <= 'Z') {
            return true;
        }

        //
        // Since the tables are too ridiculous to use in code,
        // we're using the footnotes here to drive this test.
        //
        switch (Character.getType(c)) {
            // app. B footnote says these are 'name start'
            // chars' ...
            case Character.LOWERCASE_LETTER:        // Ll
            case Character.UPPERCASE_LETTER:        // Lu
            case Character.OTHER_LETTER:            // Lo
            case Character.TITLECASE_LETTER:        // Lt
            case Character.LETTER_NUMBER:           // Nl
                // OK, here we just have some exceptions to check...
                return passesNameExclusions(c);

            default:
                // check for some exceptions:  these are "alphabetic"
                return (c >= 0x02bb && c <= 0x02c1)
                        || c == 0x0559 || c == 0x06e5 || c == 0x06e6;
        }
    }

    //
    // Exceptions shared by isLetter() and isLetter2() for characters
    // whose general category would otherwise qualify them: the
    // character must not be a "compatibility" character and must not
    // be one of the combiners U+20DD..U+20E0 (ruled out per
    // "5.14 of Unicode").
    //
    private static boolean passesNameExclusions(char c) {
        return !isCompatibilityChar(c)
                && !(c >= 0x20dd && c <= 0x20e0);
    }

    //
    // XML 1.0 discourages "compatibility" characters in names; these
    // were defined to permit passing through some information stored in
    // older non-Unicode character sets.  These always have alternative
    // representations in Unicode, e.g. using combining chars.
    //
    private static boolean isCompatibilityChar(char c) {
        // the numerous comparisons here seem unavoidable,
        // but the switch (on the high byte of the character) can
        // reduce the number which must actually be executed.

        switch ((c >> 8) & 0x0ff) {
            case 0x00:
                // ISO Latin/1 has a few compatibility characters
                return c == 0x00aa || c == 0x00b5 || c == 0x00ba;

            case 0x01:
                // as do Latin Extended A and (parts of) B
                return (c >= 0x0132 && c <= 0x0133)
                        || (c >= 0x013f && c <= 0x0140)
                        || c == 0x0149
                        || c == 0x017f
                        || (c >= 0x01c4 && c <= 0x01cc)
                        || (c >= 0x01f1 && c <= 0x01f3);

            case 0x02:
                // some spacing modifiers
                return (c >= 0x02b0 && c <= 0x02b8)
                        || (c >= 0x02e0 && c <= 0x02e4);

            case 0x03:
                return c == 0x037a;                     // Greek

            case 0x05:
                return c == 0x0587;                     // Armenian

            case 0x0e:
                return c >= 0x0edc && c <= 0x0edd;      // Laotian

            case 0x11:
                // big chunks of Hangul Jamo are all "compatibility"
                return c == 0x1101
                        || c == 0x1104
                        || c == 0x1108
                        || c == 0x110a
                        || c == 0x110d
                        || (c >= 0x1113 && c <= 0x113b)
                        || c == 0x113d
                        || c == 0x113f
                        || (c >= 0x1141 && c <= 0x114b)
                        || c == 0x114d
                        || c == 0x114f
                        || (c >= 0x1151 && c <= 0x1153)
                        || (c >= 0x1156 && c <= 0x1158)
                        || c == 0x1162
                        || c == 0x1164
                        || c == 0x1166
                        || c == 0x1168
                        || (c >= 0x116a && c <= 0x116c)
                        || (c >= 0x116f && c <= 0x1171)
                        || c == 0x1174
                        || (c >= 0x1176 && c <= 0x119d)
                        || (c >= 0x119f && c <= 0x11a2)
                        || (c >= 0x11a9 && c <= 0x11aa)
                        || (c >= 0x11ac && c <= 0x11ad)
                        || (c >= 0x11b0 && c <= 0x11b6)
                        || c == 0x11b9
                        || c == 0x11bb
                        || (c >= 0x11c3 && c <= 0x11ea)
                        || (c >= 0x11ec && c <= 0x11ef)
                        || (c >= 0x11f1 && c <= 0x11f8);

            case 0x20:
                return c == 0x207f;                     // superscript

            case 0x21:
                return
                        // various letterlike symbols
                        c == 0x2102
                        || c == 0x2107
                        || (c >= 0x210a && c <= 0x2113)
                        || c == 0x2115
                        || (c >= 0x2118 && c <= 0x211d)
                        || c == 0x2124
                        || c == 0x2128
                        || (c >= 0x212c && c <= 0x212d)
                        || (c >= 0x212f && c <= 0x2138)

                        // most Roman numerals (less 1K, 5K, 10K)
                        || (c >= 0x2160 && c <= 0x217f);

            case 0x30:
                // some Hiragana
                return c >= 0x309b && c <= 0x309c;

            case 0x31:
                // all Hangul Compatibility Jamo
                return c >= 0x3131 && c <= 0x318e;

            case 0xf9:
            case 0xfa:
            case 0xfb:
            case 0xfc:
            case 0xfd:
            case 0xfe:
            case 0xff:
                // the whole "compatibility" area is for that purpose!
                return true;

            default:
                // most of Unicode isn't flagged as being for compatibility
                return false;
        }
    }

    // guts of isNameChar/isNCNameChar
    private static boolean isLetter2(char c) {
        // [84] Letter ::= BaseChar | Ideographic
        // [85] BaseChar ::= ... too much to repeat
        // [86] Ideographic ::= ... too much to repeat
        // [87] CombiningChar ::= ... too much to repeat

        //
        // Optimize the typical case.
        //
        if (c >= 'a' && c <= 'z') {
            return true;
        }
        if (c == '>') {
            return false;
        }
        if (c >= 'A' && c <= 'Z') {
            return true;
        }

        //
        // Since the tables are too ridiculous to use in code,
        // we're using the footnotes here to drive this test.
        //
        switch (Character.getType(c)) {
            // app. B footnote says these are 'name start'
            // chars' ...
            case Character.LOWERCASE_LETTER:        // Ll
            case Character.UPPERCASE_LETTER:        // Lu
            case Character.OTHER_LETTER:            // Lo
            case Character.TITLECASE_LETTER:        // Lt
            case Character.LETTER_NUMBER:           // Nl
            // ... and these are name characters 'other
            // than name start characters'
            case Character.COMBINING_SPACING_MARK:  // Mc
            case Character.ENCLOSING_MARK:          // Me
            case Character.NON_SPACING_MARK:        // Mn
            case Character.MODIFIER_LETTER:         // Lm
            case Character.DECIMAL_DIGIT_NUMBER:    // Nd
                // OK, here we just have some exceptions to check...
                return passesNameExclusions(c);

            default:
                // added a character ...
                return c == 0x0387;
        }
    }

    // NOTE: nothing in this class calls this helper; isLetter2()
    // accepts digits through the DECIMAL_DIGIT_NUMBER category.
    private static boolean isDigit(char c) {
        // [88] Digit ::= ...

        //
        // java.lang.Character.isDigit is correct from the XML point
        // of view except that it allows "fullwidth" digits.
        //
        return Character.isDigit(c)
                && !((c >= 0xff10) && (c <= 0xff19));
    }

    private static boolean isExtender(char c) {
        // [89] Extender ::= ...
        return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
                || c == 0x0640 || c == 0x0e46 || c == 0x0ec6
                || c == 0x3005 || (c >= 0x3031 && c <= 0x3035)
                || (c >= 0x309d && c <= 0x309e)
                || (c >= 0x30fc && c <= 0x30fe);
    }
}