src/share/vm/utilities/utf8.cpp

Thu, 20 Nov 2008 16:56:09 -0800

author
ysr
date
Thu, 20 Nov 2008 16:56:09 -0800
changeset 888
c96030fff130
parent 435
a61af66fc99e
child 1907
c18cbe5936b8
permissions
-rw-r--r--

6684579: SoftReference processing can be made more efficient
Summary: For current soft-ref clearing policies, we can decide at marking time if a soft-reference will definitely not be cleared, postponing the decision of whether it will definitely be cleared to the final reference processing phase. This can be especially beneficial in the case of concurrent collectors where the marking is usually concurrent but reference processing is usually not.
Reviewed-by: jmasa

     1 /*
     2  * Copyright 1997-2004 Sun Microsystems, Inc.  All Rights Reserved.
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     4  *
     5  * This code is free software; you can redistribute it and/or modify it
     6  * under the terms of the GNU General Public License version 2 only, as
     7  * published by the Free Software Foundation.
     8  *
     9  * This code is distributed in the hope that it will be useful, but WITHOUT
    10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    12  * version 2 for more details (a copy is included in the LICENSE file that
    13  * accompanied this code).
    14  *
    15  * You should have received a copy of the GNU General Public License version
    16  * 2 along with this work; if not, write to the Free Software Foundation,
    17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    18  *
    19  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
    20  * CA 95054 USA or visit www.sun.com if you need additional information or
    21  * have any questions.
    22  *
    23  */
    25 # include "incls/_precompiled.incl"
    26 # include "incls/_utf8.cpp.incl"
    28 // Assume the utf8 string is in legal form and has been
    29 // checked in the class file parser/format checker.
    30 char* UTF8::next(const char* str, jchar* value) {
    31   unsigned const char *ptr = (const unsigned char *)str;
    32   unsigned char ch, ch2, ch3;
    33   int length = -1;              /* bad length */
    34   jchar result;
    35   switch ((ch = ptr[0]) >> 4) {
    36     default:
    37     result = ch;
    38     length = 1;
    39     break;
    41   case 0x8: case 0x9: case 0xA: case 0xB: case 0xF:
    42     /* Shouldn't happen. */
    43     break;
    45   case 0xC: case 0xD:
    46     /* 110xxxxx  10xxxxxx */
    47     if (((ch2 = ptr[1]) & 0xC0) == 0x80) {
    48       unsigned char high_five = ch & 0x1F;
    49       unsigned char low_six = ch2 & 0x3F;
    50       result = (high_five << 6) + low_six;
    51       length = 2;
    52       break;
    53     }
    54     break;
    56   case 0xE:
    57     /* 1110xxxx 10xxxxxx 10xxxxxx */
    58     if (((ch2 = ptr[1]) & 0xC0) == 0x80) {
    59       if (((ch3 = ptr[2]) & 0xC0) == 0x80) {
    60         unsigned char high_four = ch & 0x0f;
    61         unsigned char mid_six = ch2 & 0x3f;
    62         unsigned char low_six = ch3 & 0x3f;
    63         result = (((high_four << 6) + mid_six) << 6) + low_six;
    64         length = 3;
    65       }
    66     }
    67     break;
    68   } /* end of switch */
    70   if (length <= 0) {
    71     *value = ptr[0];    /* default bad result; */
    72     return (char*)(ptr + 1); // make progress somehow
    73   }
    75   *value = result;
    77   // The assert is correct but the .class file is wrong
    78   // assert(UNICODE::utf8_size(result) == length, "checking reverse computation");
    79   return (char *)(ptr + length);
    80 }
    82 char* UTF8::next_character(const char* str, jint* value) {
    83   unsigned const char *ptr = (const unsigned char *)str;
    84   /* See if it's legal supplementary character:
    85      11101101 1010xxxx 10xxxxxx 11101101 1011xxxx 10xxxxxx */
    86   if (is_supplementary_character(ptr)) {
    87     *value = get_supplementary_character(ptr);
    88     return (char *)(ptr + 6);
    89   }
    90   jchar result;
    91   char* next_ch = next(str, &result);
    92   *value = result;
    93   return next_ch;
    94 }
    96 // Count bytes of the form 10xxxxxx and deduct this count
    97 // from the total byte count.  The utf8 string must be in
    98 // legal form which has been verified in the format checker.
    99 int UTF8::unicode_length(const char* str, int len) {
   100   int num_chars = len;
   101   for (int i = 0; i < len; i++) {
   102     if ((str[i] & 0xC0) == 0x80) {
   103       --num_chars;
   104     }
   105   }
   106   return num_chars;
   107 }
   109 // Count bytes of the utf8 string except those in form
   110 // 10xxxxxx which only appear in multibyte characters.
   111 // The utf8 string must be in legal form and has been
   112 // verified in the format checker.
   113 int UTF8::unicode_length(const char* str) {
   114   int num_chars = 0;
   115   for (const char* p = str; *p; p++) {
   116     if (((*p) & 0xC0) != 0x80) {
   117       num_chars++;
   118     }
   119   }
   120   return num_chars;
   121 }
   123 // Writes a jchar a utf8 and returns the end
   124 static u_char* utf8_write(u_char* base, jchar ch) {
   125   if ((ch != 0) && (ch <=0x7f)) {
   126     base[0] = (u_char) ch;
   127     return base + 1;
   128   }
   130   if (ch <= 0x7FF) {
   131     /* 11 bits or less. */
   132     unsigned char high_five = ch >> 6;
   133     unsigned char low_six = ch & 0x3F;
   134     base[0] = high_five | 0xC0; /* 110xxxxx */
   135     base[1] = low_six | 0x80;   /* 10xxxxxx */
   136     return base + 2;
   137   }
   138   /* possibly full 16 bits. */
   139   char high_four = ch >> 12;
   140   char mid_six = (ch >> 6) & 0x3F;
   141   char low_six = ch & 0x3f;
   142   base[0] = high_four | 0xE0; /* 1110xxxx */
   143   base[1] = mid_six | 0x80;   /* 10xxxxxx */
   144   base[2] = low_six | 0x80;   /* 10xxxxxx */
   145   return base + 3;
   146 }
   148 void UTF8::convert_to_unicode(const char* utf8_str, jchar* unicode_str, int unicode_length) {
   149   unsigned char ch;
   150   const char *ptr = (const char *)utf8_str;
   151   int index = 0;
   153   /* ASCII case loop optimization */
   154   for (; index < unicode_length; index++) {
   155     if((ch = ptr[0]) > 0x7F) { break; }
   156     unicode_str[index] = ch;
   157     ptr = (const char *)(ptr + 1);
   158   }
   160   for (; index < unicode_length; index++) {
   161     ptr = UTF8::next(ptr, &unicode_str[index]);
   162   }
   163 }
   165 // Returns NULL if 'c' it not found. This only works as long
   166 // as 'c' is an ASCII character
   167 jbyte* UTF8::strrchr(jbyte* base, int length, jbyte c) {
   168   assert(length >= 0, "sanity check");
   169   assert(c >= 0, "does not work for non-ASCII characters");
   170   // Skip backwards in string until 'c' is found or end is reached
   171   while(--length >= 0 && base[length] != c);
   172   return (length < 0) ? NULL : &base[length];
   173 }
   175 bool UTF8::equal(jbyte* base1, int length1, jbyte* base2, int length2) {
   176   // Length must be the same
   177   if (length1 != length2) return false;
   178   for (int i = 0; i < length1; i++) {
   179     if (base1[i] != base2[i]) return false;
   180   }
   181   return true;
   182 }
   184 bool UTF8::is_supplementary_character(const unsigned char* str) {
   185   return ((str[0] & 0xFF) == 0xED) && ((str[1] & 0xF0) == 0xA0) && ((str[2] & 0xC0) == 0x80)
   186       && ((str[3] & 0xFF) == 0xED) && ((str[4] & 0xF0) == 0xB0) && ((str[5] & 0xC0) == 0x80);
   187 }
   189 jint UTF8::get_supplementary_character(const unsigned char* str) {
   190   return 0x10000 + ((str[1] & 0x0f) << 16) + ((str[2] & 0x3f) << 10)
   191                  + ((str[4] & 0x0f) << 6)  + (str[5] & 0x3f);
   192 }
   195 //-------------------------------------------------------------------------------------
   198 int UNICODE::utf8_size(jchar c) {
   199   if ((0x0001 <= c) && (c <= 0x007F)) return 1;
   200   if (c <= 0x07FF) return 2;
   201   return 3;
   202 }
   204 int UNICODE::utf8_length(jchar* base, int length) {
   205   int result = 0;
   206   for (int index = 0; index < length; index++) {
   207     jchar c = base[index];
   208     if ((0x0001 <= c) && (c <= 0x007F)) result += 1;
   209     else if (c <= 0x07FF) result += 2;
   210     else result += 3;
   211   }
   212   return result;
   213 }
   215 char* UNICODE::as_utf8(jchar* base, int length) {
   216   int utf8_len = utf8_length(base, length);
   217   u_char* result = NEW_RESOURCE_ARRAY(u_char, utf8_len + 1);
   218   u_char* p = result;
   219   for (int index = 0; index < length; index++) {
   220     p = utf8_write(p, base[index]);
   221   }
   222   *p = '\0';
   223   assert(p == &result[utf8_len], "length prediction must be correct");
   224   return (char*) result;
   225 }
   227 char* UNICODE::as_utf8(jchar* base, int length, char* buf, int buflen) {
   228   u_char* p = (u_char*)buf;
   229   u_char* end = (u_char*)buf + buflen;
   230   for (int index = 0; index < length; index++) {
   231     jchar c = base[index];
   232     if (p + utf8_size(c) >= end) break;      // string is truncated
   233     p = utf8_write(p, base[index]);
   234   }
   235   *p = '\0';
   236   return buf;
   237 }
   239 void UNICODE::convert_to_utf8(const jchar* base, int length, char* utf8_buffer) {
   240   for(int index = 0; index < length; index++) {
   241     utf8_buffer = (char*)utf8_write((u_char*)utf8_buffer, base[index]);
   242   }
   243   *utf8_buffer = '\0';
   244 }

mercurial