src/share/classes/com/sun/codemodel/internal/util/Surrogate.java

Sat, 01 Dec 2007 00:00:00 +0000

author
duke
date
Sat, 01 Dec 2007 00:00:00 +0000
changeset 1
0961a4a21176
child 45
31822b475baa
permissions
-rw-r--r--

Initial load

     1 /*
     2  * Copyright 2006 Sun Microsystems, Inc.  All Rights Reserved.
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     4  *
     5  * This code is free software; you can redistribute it and/or modify it
     6  * under the terms of the GNU General Public License version 2 only, as
     7  * published by the Free Software Foundation.  Sun designates this
     8  * particular file as subject to the "Classpath" exception as provided
     9  * by Sun in the LICENSE file that accompanied this code.
    10  *
    11  * This code is distributed in the hope that it will be useful, but WITHOUT
    12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    14  * version 2 for more details (a copy is included in the LICENSE file that
    15  * accompanied this code).
    16  *
    17  * You should have received a copy of the GNU General Public License version
    18  * 2 along with this work; if not, write to the Free Software Foundation,
    19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    20  *
    21  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
    22  * CA 95054 USA or visit www.sun.com if you need additional information or
    23  * have any questions.
    24  */
    26 package com.sun.codemodel.internal.util;
    28 import java.nio.CharBuffer;
    29 import java.nio.charset.CoderResult;
    /**
     * Utility class for dealing with UTF-16 surrogates: static helpers for
     * classifying and converting surrogate code units, plus the stateful
     * {@link Parser} and {@link Generator} helpers used when decoding and
     * encoding surrogate pairs.
     *
     * @author Mark Reinhold
     */
    class Surrogate {

        // Static utility holder; never instantiated.
        private Surrogate() { }

        // UTF-16 surrogate-character ranges
        //
        public static final char MIN_HIGH = '\uD800';
        public static final char MAX_HIGH = '\uDBFF';
        public static final char MIN_LOW  = '\uDC00';
        public static final char MAX_LOW  = '\uDFFF';
        public static final char MIN = MIN_HIGH;
        public static final char MAX = MAX_LOW;

        // Range of UCS-4 values that need surrogates in UTF-16
        // (0x10000 through 0x10FFFF inclusive)
        //
        public static final int UCS4_MIN = 0x10000;
        public static final int UCS4_MAX = (1 << 20) + UCS4_MIN - 1;

        /**
         * Tells whether or not the given UTF-16 value is a high surrogate.
         *
         * @param  c  The value to test
         * @return  true if c lies in the range MIN_HIGH..MAX_HIGH
         */
        public static boolean isHigh(int c) {
            return (MIN_HIGH <= c) && (c <= MAX_HIGH);
        }

        /**
         * Tells whether or not the given UTF-16 value is a low surrogate.
         *
         * @param  c  The value to test
         * @return  true if c lies in the range MIN_LOW..MAX_LOW
         */
        public static boolean isLow(int c) {
            return (MIN_LOW <= c) && (c <= MAX_LOW);
        }

        /**
         * Tells whether or not the given UTF-16 value is a surrogate character.
         *
         * @param  c  The value to test
         * @return  true if c lies in the range MIN..MAX
         */
        public static boolean is(int c) {
            return (MIN <= c) && (c <= MAX);
        }

        /**
         * Tells whether or not the given UCS-4 character must be represented as a
         * surrogate pair in UTF-16.
         *
         * @param  uc  The UCS-4 value to test
         * @return  true if uc lies in the range UCS4_MIN..UCS4_MAX
         */
        public static boolean neededFor(int uc) {
            return (uc >= UCS4_MIN) && (uc <= UCS4_MAX);
        }

        /**
         * Returns the high UTF-16 surrogate for the given UCS-4 character.
         * The argument is not range-checked; callers are expected to pass a
         * value for which {@link #neededFor(int)} is true.
         */
        public static char high(int uc) {
            // Upper ten bits of the offset from UCS4_MIN select the high surrogate.
            return (char)(0xd800 | (((uc - UCS4_MIN) >> 10) & 0x3ff));
        }

        /**
         * Returns the low UTF-16 surrogate for the given UCS-4 character.
         * The argument is not range-checked; callers are expected to pass a
         * value for which {@link #neededFor(int)} is true.
         */
        public static char low(int uc) {
            // Lower ten bits of the offset from UCS4_MIN select the low surrogate.
            return (char)(0xdc00 | ((uc - UCS4_MIN) & 0x3ff));
        }

        /**
         * Converts the given surrogate pair into a 32-bit UCS-4 character.
         * The arguments are not validated; only their low ten bits are used.
         *
         * @param  c  The high surrogate
         * @param  d  The low surrogate
         */
        public static int toUCS4(char c, char d) {
            return (((c & 0x3ff) << 10) | (d & 0x3ff)) + UCS4_MIN;
        }

        /**
         * Surrogate parsing support.  Charset implementations may use instances of
         * this class to handle the details of parsing UTF-16 surrogate pairs.
         *
         * <p> Instances hold the result of the most recent parse in plain mutable
         * fields.  After a failed parse only error() is updated; character() and
         * isPair() keep the values of the last successful parse.
         */
        public static class Parser {

            public Parser() { }

            private int character;          // UCS-4
            private CoderResult error = CoderResult.UNDERFLOW;
            private boolean isPair;

            /**
             * Returns the UCS-4 character previously parsed.
             */
            public int character() {
                return character;
            }

            /**
             * Tells whether or not the previously-parsed UCS-4 character was
             * originally represented by a surrogate pair.
             */
            public boolean isPair() {
                return isPair;
            }

            /**
             * Returns the number of UTF-16 characters consumed by the previous
             * parse.
             */
            public int increment() {
                return isPair ? 2 : 1;
            }

            /**
             * If the previous parse operation detected an error, return the object
             * describing that error.  Returns null after a successful parse.
             */
            public CoderResult error() {
                return error;
            }

            /**
             * Returns an unmappable-input result object, with the appropriate
             * input length, for the previously-parsed character.
             */
            public CoderResult unmappableResult() {
                return CoderResult.unmappableForLength(isPair ? 2 : 1);
            }

            /**
             * Parses a UCS-4 character from the given source buffer, handling
             * surrogates.
             *
             * <p> When c is a high surrogate and the buffer has input remaining,
             * one character is read from the buffer (advancing its position) even
             * if that character turns out not to be a low surrogate.
             *
             * @param  c    The first character
             * @param  in   The source buffer, from which one more character
             *              will be consumed if c is a high surrogate
             *
             * @return   Either a parsed UCS-4 character, in which case the isPair()
             *           and increment() methods will return meaningful values, or
             *           -1, in which case error() will return a descriptive result
             *           object
             */
            public int parse(char c, CharBuffer in) {
                if (isHigh(c)) {
                    if (!in.hasRemaining()) {
                        // The second half of the pair has not arrived yet.
                        error = CoderResult.UNDERFLOW;
                        return -1;
                    }
                    char d = in.get();
                    if (isLow(d)) {
                        character = toUCS4(c, d);
                        isPair = true;
                        error = null;
                        return character;
                    }
                    // High surrogate not followed by a low surrogate.
                    error = CoderResult.malformedForLength(1);
                    return -1;
                }
                if (isLow(c)) {
                    // Low surrogate with no preceding high surrogate.
                    error = CoderResult.malformedForLength(1);
                    return -1;
                }
                character = c;
                isPair = false;
                error = null;
                return character;
            }

            /**
             * Parses a UCS-4 character from the given source array, handling
             * surrogates.  The array is only read, never modified.
             *
             * @param  c    The first character
             * @param  ia   The input array, from which the character at index
             *              ip + 1 will be read if c is a high surrogate
             * @param  ip   The input index
             * @param  il   The input limit
             *
             * @return   Either a parsed UCS-4 character, in which case the isPair()
             *           and increment() methods will return meaningful values, or
             *           -1, in which case error() will return a descriptive result
             *           object
             */
            public int parse(char c, char[] ia, int ip, int il) {
                if (isHigh(c)) {
                    if (il - ip < 2) {
                        // The second half of the pair lies beyond the limit.
                        error = CoderResult.UNDERFLOW;
                        return -1;
                    }
                    char d = ia[ip + 1];
                    if (isLow(d)) {
                        character = toUCS4(c, d);
                        isPair = true;
                        error = null;
                        return character;
                    }
                    // High surrogate not followed by a low surrogate.
                    error = CoderResult.malformedForLength(1);
                    return -1;
                }
                if (isLow(c)) {
                    // Low surrogate with no preceding high surrogate.
                    error = CoderResult.malformedForLength(1);
                    return -1;
                }
                character = c;
                isPair = false;
                error = null;
                return character;
            }

        }

        /**
         * Surrogate generation support.  Charset implementations may use instances
         * of this class to handle the details of generating UTF-16 surrogate
         * pairs.
         */
        public static class Generator {

            public Generator() { }

            private CoderResult error = CoderResult.OVERFLOW;

            /**
             * If the previous generation operation detected an error, return the
             * object describing that error.  Returns null after a successful
             * generation.
             */
            public CoderResult error() {
                return error;
            }

            /**
             * Generates one or two UTF-16 characters to represent the given UCS-4
             * character.
             *
             * <p> Values outside the range 0..UCS4_MAX (including negative
             * values) are reported as unmappable, values inside the surrogate
             * range as malformed, and insufficient room in the destination as
             * overflow.  Nothing is written to the destination on failure.
             *
             * @param  uc   The UCS-4 character
             * @param  len  The number of input bytes from which the UCS-4 value
             *              was constructed (used when creating result objects)
             * @param  dst  The destination buffer, to which one or two UTF-16
             *              characters will be written
             *
             * @return   Either a positive count of the number of UTF-16 characters
             *           written to the destination buffer, or -1, in which case
             *           error() will return a descriptive result object
             */
            public int generate(int uc, int len, CharBuffer dst) {
                // Negative values must be rejected here: otherwise they would
                // fall into the single-char path and be truncated by the cast.
                if (uc < 0 || uc > UCS4_MAX) {
                    error = CoderResult.unmappableForLength(len);
                    return -1;
                }
                if (uc < UCS4_MIN) {
                    // Fits in a single UTF-16 character.
                    if (is(uc)) {
                        error = CoderResult.malformedForLength(len);
                        return -1;
                    }
                    if (dst.remaining() < 1) {
                        error = CoderResult.OVERFLOW;
                        return -1;
                    }
                    dst.put((char)uc);
                    error = null;
                    return 1;
                }
                // Needs a surrogate pair; check room for both halves up front so
                // that a lone high surrogate is never written.
                if (dst.remaining() < 2) {
                    error = CoderResult.OVERFLOW;
                    return -1;
                }
                dst.put(high(uc));
                dst.put(low(uc));
                error = null;
                return 2;
            }

            /**
             * Generates one or two UTF-16 characters to represent the given UCS-4
             * character.
             *
             * <p> Values outside the range 0..UCS4_MAX (including negative
             * values) are reported as unmappable, values inside the surrogate
             * range as malformed, and insufficient room in the destination as
             * overflow.  Nothing is written to the destination on failure.
             *
             * @param  uc   The UCS-4 character
             * @param  len  The number of input bytes from which the UCS-4 value
             *              was constructed (used when creating result objects)
             * @param  da   The destination array, to which one or two UTF-16
             *              characters will be written
             * @param  dp   The destination position
             * @param  dl   The destination limit
             *
             * @return   Either a positive count of the number of UTF-16 characters
             *           written to the destination buffer, or -1, in which case
             *           error() will return a descriptive result object
             */
            public int generate(int uc, int len, char[] da, int dp, int dl) {
                // Negative values must be rejected here: otherwise they would
                // fall into the single-char path and be truncated by the cast.
                if (uc < 0 || uc > UCS4_MAX) {
                    error = CoderResult.unmappableForLength(len);
                    return -1;
                }
                if (uc < UCS4_MIN) {
                    // Fits in a single UTF-16 character.
                    if (is(uc)) {
                        error = CoderResult.malformedForLength(len);
                        return -1;
                    }
                    if (dl - dp < 1) {
                        error = CoderResult.OVERFLOW;
                        return -1;
                    }
                    da[dp] = (char)uc;
                    error = null;
                    return 1;
                }
                // Needs a surrogate pair; check room for both halves up front so
                // that a lone high surrogate is never written.
                if (dl - dp < 2) {
                    error = CoderResult.OVERFLOW;
                    return -1;
                }
                da[dp] = high(uc);
                da[dp + 1] = low(uc);
                error = null;
                return 2;
            }

        }

    }

mercurial