src/share/jaxws_classes/com/sun/xml/internal/dtdparser/XmlChars.java

Thu, 31 Aug 2017 15:18:52 +0800

author
aoqi
date
Thu, 31 Aug 2017 15:18:52 +0800
changeset 637
9c07ef4934dd
parent 397
b99d7e355d4b
parent 0
373ffda63c9a
permissions
-rw-r--r--

merge

     1 /*
     2  * Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved.
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     4  *
     5  * This code is free software; you can redistribute it and/or modify it
     6  * under the terms of the GNU General Public License version 2 only, as
     7  * published by the Free Software Foundation.  Oracle designates this
     8  * particular file as subject to the "Classpath" exception as provided
     9  * by Oracle in the LICENSE file that accompanied this code.
    10  *
    11  * This code is distributed in the hope that it will be useful, but WITHOUT
    12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    14  * version 2 for more details (a copy is included in the LICENSE file that
    15  * accompanied this code).
    16  *
    17  * You should have received a copy of the GNU General Public License version
    18  * 2 along with this work; if not, write to the Free Software Foundation,
    19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    20  *
    21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    22  * or visit www.oracle.com if you need additional information or have any
    23  * questions.
    24  */
    26 package com.sun.xml.internal.dtdparser;
    29 /**
    30  * Methods in this class are used to determine whether characters may
    31  * appear in certain roles in XML documents.  Such methods are used
    32  * both to parse and to create such documents.
    33  *
    34  * @author David Brownell
    35  * @version 1.1, 00/08/05
    36  */
    37 public class XmlChars {
    38     // can't construct instances
    39     private XmlChars() {
    40     }
    42     /**
    43      * Returns true if the argument, a UCS-4 character code, is valid in
    44      * XML documents.  Unicode characters fit into the low sixteen
    45      * bits of a UCS-4 character, and pairs of Unicode <em>surrogate
    46      * characters</em> can be combined to encode UCS-4 characters in
    47      * documents containing only Unicode.  (The <code>char</code> datatype
    48      * in the Java Programming Language represents Unicode characters,
    49      * including unpaired surrogates.)
    50      * <p/>
    51      * <P> In XML, UCS-4 characters can also be encoded by the use of
    52      * <em>character references</em> such as <b>&amp;#x12345678;</b>, which
    53      * happens to refer to a character that is disallowed in XML documents.
    54      * UCS-4 characters allowed in XML documents can be expressed with
    55      * one or two Unicode characters.
    56      *
    57      * @param ucs4char The 32-bit UCS-4 character being tested.
    58      */
    59     static public boolean isChar(int ucs4char) {
    60         // [2] Char ::= #x0009 | #x000A | #x000D
    61         //            | [#x0020-#xD7FF]
    62         //    ... surrogates excluded!
    63         //            | [#xE000-#xFFFD]
    64         //             | [#x10000-#x10ffff]
    65         return ((ucs4char >= 0x0020 && ucs4char <= 0xD7FF)
    66                 || ucs4char == 0x000A || ucs4char == 0x0009
    67                 || ucs4char == 0x000D
    68                 || (ucs4char >= 0xE000 && ucs4char <= 0xFFFD)
    69                 || (ucs4char >= 0x10000 && ucs4char <= 0x10ffff));
    70     }
    72     /**
    73      * Returns true if the character is allowed to be a non-initial
    74      * character in names according to the XML recommendation.
    75      *
    76      * @see #isNCNameChar(char)
    77      * @see #isLetter(char)
    78      */
    79     public static boolean isNameChar(char c) {
    80         // [4] NameChar ::= Letter | Digit | '.' | '_' | ':'
    81         //            | CombiningChar | Extender
    83         if (isLetter2(c))
    84             return true;
    85         else if (c == '>')
    86             return false;
    87         else if (c == '.' || c == '-' || c == '_' || c == ':'
    88                 || isExtender(c))
    89             return true;
    90         else
    91             return false;
    92     }
    94     /**
    95      * Returns true if the character is allowed to be a non-initial
    96      * character in unscoped names according to the rules of the XML
    97      * Namespaces proposed recommendation.  Except for precluding
    98      * the colon (used to separate names from their scopes) these
    99      * characters are just as allowed by the XML recommendation.
   100      *
   101      * @see #isNameChar(char)
   102      * @see #isLetter(char)
   103      */
   104     public static boolean isNCNameChar(char c) {
   105         // [NC 5] NCNameChar ::= Letter | Digit | '.' | '_'
   106         //            | CombiningChar | Extender
   107         return c != ':' && isNameChar(c);
   108     }
   110     /**
   111      * Returns true if the character is allowed where XML supports
   112      * whitespace characters, false otherwise.
   113      */
   114     public static boolean isSpace(char c) {
   115         return c == ' ' || c == '\t' || c == '\n' || c == '\r';
   116     }
   119     /*
   120      * NOTE:  java.lang.Character.getType() values are:
   121      *
   122      * UNASSIGNED                    = 0,
   123      *
   124      * UPPERCASE_LETTER            = 1,    // Lu
   125      * LOWERCASE_LETTER            = 2,    // Ll
   126      * TITLECASE_LETTER            = 3,    // Lt
   127      * MODIFIER_LETTER             = 4,    // Lm
   128      * OTHER_LETTER                = 5,    // Lo
   129      * NON_SPACING_MARK            = 6,    // Mn
   130      * ENCLOSING_MARK              = 7,    // Me
   131      * COMBINING_SPACING_MARK      = 8,    // Mc
   132      * DECIMAL_DIGIT_NUMBER        = 9,    // Nd
   133      * LETTER_NUMBER               = 10,   // Nl
   134      * OTHER_NUMBER                = 11,   // No
   135      * SPACE_SEPARATOR             = 12,   // Zs
   136      * LINE_SEPARATOR              = 13,   // Zl
   137      * PARAGRAPH_SEPARATOR         = 14,   // Zp
   138      * CONTROL                     = 15,   // Cc
   139      * FORMAT                      = 16,   // Cf
   140      *                         // 17 reserved for proposed Ci category
   141      * PRIVATE_USE                 = 18,   // Co
   142      * SURROGATE                   = 19,   // Cs
   143      * DASH_PUNCTUATION            = 20,   // Pd
   144      * START_PUNCTUATION           = 21,   // Ps
   145      * END_PUNCTUATION             = 22,   // Pe
   146      * CONNECTOR_PUNCTUATION       = 23,   // Pc
   147      * OTHER_PUNCTUATION           = 24,   // Po
   148      * MATH_SYMBOL                 = 25,   // Sm
   149      * CURRENCY_SYMBOL             = 26,   // Sc
   150      * MODIFIER_SYMBOL             = 27,   // Sk
   151      * OTHER_SYMBOL                = 28;   // So
   152      */
   154     /**
   155      * Returns true if the character is an XML "letter".  XML Names must
   156      * start with Letters or a few other characters, but other characters
   157      * in names must only satisfy the <em>isNameChar</em> predicate.
   158      *
   159      * @see #isNameChar(char)
   160      * @see #isNCNameChar(char)
   161      */
   162     public static boolean isLetter(char c) {
   163         // [84] Letter ::= BaseChar | Ideographic
   164         // [85] BaseChar ::= ... too much to repeat
   165         // [86] Ideographic ::= ... too much to repeat
   167         //
   168         // Optimize the typical case.
   169         //
   170         if (c >= 'a' && c <= 'z')
   171             return true;
   172         if (c == '/')
   173             return false;
   174         if (c >= 'A' && c <= 'Z')
   175             return true;
   177         //
   178         // Since the tables are too ridiculous to use in code,
   179         // we're using the footnotes here to drive this test.
   180         //
   181         switch (Character.getType(c)) {
   182         // app. B footnote says these are 'name start'
   183         // chars' ...
   184         case Character.LOWERCASE_LETTER:        // Ll
   185         case Character.UPPERCASE_LETTER:        // Lu
   186         case Character.OTHER_LETTER:            // Lo
   187         case Character.TITLECASE_LETTER:        // Lt
   188         case Character.LETTER_NUMBER:            // Nl
   190             // OK, here we just have some exceptions to check...
   191             return !isCompatibilityChar(c)
   192                     // per "5.14 of Unicode", rule out some combiners
   193                     && !(c >= 0x20dd && c <= 0x20e0);
   195         default:
   196             // check for some exceptions:  these are "alphabetic"
   197             return ((c >= 0x02bb && c <= 0x02c1)
   198                     || c == 0x0559 || c == 0x06e5 || c == 0x06e6);
   199         }
   200     }
   202     //
   203     // XML 1.0 discourages "compatibility" characters in names; these
   204     // were defined to permit passing through some information stored in
   205     // older non-Unicode character sets.  These always have alternative
   206     // representations in Unicode, e.g. using combining chars.
   207     //
   208     private static boolean isCompatibilityChar(char c) {
   209         // the numerous comparisions here seem unavoidable,
   210         // but the switch can reduce the number which must
   211         // actually be executed.
   213         switch ((c >> 8) & 0x0ff) {
   214         case 0x00:
   215             // ISO Latin/1 has a few compatibility characters
   216             return c == 0x00aa || c == 0x00b5 || c == 0x00ba;
   218         case 0x01:
   219             // as do Latin Extended A and (parts of) B
   220             return (c >= 0x0132 && c <= 0x0133)
   221                     || (c >= 0x013f && c <= 0x0140)
   222                     || c == 0x0149
   223                     || c == 0x017f
   224                     || (c >= 0x01c4 && c <= 0x01cc)
   225                     || (c >= 0x01f1 && c <= 0x01f3);
   227         case 0x02:
   228             // some spacing modifiers
   229             return (c >= 0x02b0 && c <= 0x02b8)
   230                     || (c >= 0x02e0 && c <= 0x02e4);
   232         case 0x03:
   233             return c == 0x037a;            // Greek
   235         case 0x05:
   236             return c == 0x0587;            // Armenian
   238         case 0x0e:
   239             return c >= 0x0edc && c <= 0x0edd;    // Laotian
   241         case 0x11:
   242             // big chunks of Hangul Jamo are all "compatibility"
   243             return c == 0x1101
   244                     || c == 0x1104
   245                     || c == 0x1108
   246                     || c == 0x110a
   247                     || c == 0x110d
   248                     || (c >= 0x1113 && c <= 0x113b)
   249                     || c == 0x113d
   250                     || c == 0x113f
   251                     || (c >= 0x1141 && c <= 0x114b)
   252                     || c == 0x114d
   253                     || c == 0x114f
   254                     || (c >= 0x1151 && c <= 0x1153)
   255                     || (c >= 0x1156 && c <= 0x1158)
   256                     || c == 0x1162
   257                     || c == 0x1164
   258                     || c == 0x1166
   259                     || c == 0x1168
   260                     || (c >= 0x116a && c <= 0x116c)
   261                     || (c >= 0x116f && c <= 0x1171)
   262                     || c == 0x1174
   263                     || (c >= 0x1176 && c <= 0x119d)
   264                     || (c >= 0x119f && c <= 0x11a2)
   265                     || (c >= 0x11a9 && c <= 0x11aa)
   266                     || (c >= 0x11ac && c <= 0x11ad)
   267                     || (c >= 0x11b0 && c <= 0x11b6)
   268                     || c == 0x11b9
   269                     || c == 0x11bb
   270                     || (c >= 0x11c3 && c <= 0x11ea)
   271                     || (c >= 0x11ec && c <= 0x11ef)
   272                     || (c >= 0x11f1 && c <= 0x11f8)
   273                     ;
   275         case 0x20:
   276             return c == 0x207f;            // superscript
   278         case 0x21:
   279             return
   280                     // various letterlike symbols
   281                     c == 0x2102
   282                     || c == 0x2107
   283                     || (c >= 0x210a && c <= 0x2113)
   284                     || c == 0x2115
   285                     || (c >= 0x2118 && c <= 0x211d)
   286                     || c == 0x2124
   287                     || c == 0x2128
   288                     || (c >= 0x212c && c <= 0x212d)
   289                     || (c >= 0x212f && c <= 0x2138)
   291                     // most Roman numerals (less 1K, 5K, 10K)
   292                     || (c >= 0x2160 && c <= 0x217f)
   293                     ;
   295         case 0x30:
   296             // some Hiragana
   297             return c >= 0x309b && c <= 0x309c;
   299         case 0x31:
   300             // all Hangul Compatibility Jamo
   301             return c >= 0x3131 && c <= 0x318e;
   303         case 0xf9:
   304         case 0xfa:
   305         case 0xfb:
   306         case 0xfc:
   307         case 0xfd:
   308         case 0xfe:
   309         case 0xff:
   310             // the whole "compatibility" area is for that purpose!
   311             return true;
   313         default:
   314             // most of Unicode isn't flagged as being for compatibility
   315             return false;
   316         }
   317     }
   319     // guts of isNameChar/isNCNameChar
   320     private static boolean isLetter2(char c) {
   321         // [84] Letter ::= BaseChar | Ideographic
   322         // [85] BaseChar ::= ... too much to repeat
   323         // [86] Ideographic ::= ... too much to repeat
   324         // [87] CombiningChar ::= ... too much to repeat
   326         //
   327         // Optimize the typical case.
   328         //
   329         if (c >= 'a' && c <= 'z')
   330             return true;
   331         if (c == '>')
   332             return false;
   333         if (c >= 'A' && c <= 'Z')
   334             return true;
   336         //
   337         // Since the tables are too ridiculous to use in code,
   338         // we're using the footnotes here to drive this test.
   339         //
   340         switch (Character.getType(c)) {
   341         // app. B footnote says these are 'name start'
   342         // chars' ...
   343         case Character.LOWERCASE_LETTER:        // Ll
   344         case Character.UPPERCASE_LETTER:        // Lu
   345         case Character.OTHER_LETTER:            // Lo
   346         case Character.TITLECASE_LETTER:        // Lt
   347         case Character.LETTER_NUMBER:            // Nl
   348             // ... and these are name characters 'other
   349             // than name start characters'
   350         case Character.COMBINING_SPACING_MARK:    // Mc
   351         case Character.ENCLOSING_MARK:        // Me
   352         case Character.NON_SPACING_MARK:        // Mn
   353         case Character.MODIFIER_LETTER:        // Lm
   354         case Character.DECIMAL_DIGIT_NUMBER:        // Nd
   356             // OK, here we just have some exceptions to check...
   357             return !isCompatibilityChar(c)
   358                     // per "5.14 of Unicode", rule out some combiners
   359                     && !(c >= 0x20dd && c <= 0x20e0);
   361         default:
   362             // added a character ...
   363             return c == 0x0387;
   364         }
   365     }
   367     private static boolean isDigit(char c) {
   368         // [88] Digit ::= ...
   370         //
   371         // java.lang.Character.isDigit is correct from the XML point
   372         // of view except that it allows "fullwidth" digits.
   373         //
   374         return Character.isDigit(c)
   375                 && !((c >= 0xff10) && (c <= 0xff19));
   376     }
   378     private static boolean isExtender(char c) {
   379         // [89] Extender ::= ...
   380         return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
   381                 || c == 0x0640 || c == 0x0e46 || c == 0x0ec6
   382                 || c == 0x3005 || (c >= 0x3031 && c <= 0x3035)
   383                 || (c >= 0x309d && c <= 0x309e)
   384                 || (c >= 0x30fc && c <= 0x30fe)
   385                 ;
   386     }
   387 }

mercurial