src/share/vm/utilities/utf8.cpp

Thu, 20 Nov 2008 16:56:09 -0800

author
ysr
date
Thu, 20 Nov 2008 16:56:09 -0800
changeset 888
c96030fff130
parent 435
a61af66fc99e
child 1907
c18cbe5936b8
permissions
-rw-r--r--

6684579: SoftReference processing can be made more efficient
Summary: For current soft-ref clearing policies, we can decide at marking time if a soft-reference will definitely not be cleared, postponing the decision of whether it will definitely be cleared to the final reference processing phase. This can be especially beneficial in the case of concurrent collectors where the marking is usually concurrent but reference processing is usually not.
Reviewed-by: jmasa

duke@435 1 /*
duke@435 2 * Copyright 1997-2004 Sun Microsystems, Inc. All Rights Reserved.
duke@435 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
duke@435 4 *
duke@435 5 * This code is free software; you can redistribute it and/or modify it
duke@435 6 * under the terms of the GNU General Public License version 2 only, as
duke@435 7 * published by the Free Software Foundation.
duke@435 8 *
duke@435 9 * This code is distributed in the hope that it will be useful, but WITHOUT
duke@435 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
duke@435 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
duke@435 12 * version 2 for more details (a copy is included in the LICENSE file that
duke@435 13 * accompanied this code).
duke@435 14 *
duke@435 15 * You should have received a copy of the GNU General Public License version
duke@435 16 * 2 along with this work; if not, write to the Free Software Foundation,
duke@435 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
duke@435 18 *
duke@435 19 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
duke@435 20 * CA 95054 USA or visit www.sun.com if you need additional information or
duke@435 21 * have any questions.
duke@435 22 *
duke@435 23 */
duke@435 24
duke@435 25 # include "incls/_precompiled.incl"
duke@435 26 # include "incls/_utf8.cpp.incl"
duke@435 27
duke@435 28 // Assume the utf8 string is in legal form and has been
duke@435 29 // checked in the class file parser/format checker.
duke@435 30 char* UTF8::next(const char* str, jchar* value) {
duke@435 31 unsigned const char *ptr = (const unsigned char *)str;
duke@435 32 unsigned char ch, ch2, ch3;
duke@435 33 int length = -1; /* bad length */
duke@435 34 jchar result;
duke@435 35 switch ((ch = ptr[0]) >> 4) {
duke@435 36 default:
duke@435 37 result = ch;
duke@435 38 length = 1;
duke@435 39 break;
duke@435 40
duke@435 41 case 0x8: case 0x9: case 0xA: case 0xB: case 0xF:
duke@435 42 /* Shouldn't happen. */
duke@435 43 break;
duke@435 44
duke@435 45 case 0xC: case 0xD:
duke@435 46 /* 110xxxxx 10xxxxxx */
duke@435 47 if (((ch2 = ptr[1]) & 0xC0) == 0x80) {
duke@435 48 unsigned char high_five = ch & 0x1F;
duke@435 49 unsigned char low_six = ch2 & 0x3F;
duke@435 50 result = (high_five << 6) + low_six;
duke@435 51 length = 2;
duke@435 52 break;
duke@435 53 }
duke@435 54 break;
duke@435 55
duke@435 56 case 0xE:
duke@435 57 /* 1110xxxx 10xxxxxx 10xxxxxx */
duke@435 58 if (((ch2 = ptr[1]) & 0xC0) == 0x80) {
duke@435 59 if (((ch3 = ptr[2]) & 0xC0) == 0x80) {
duke@435 60 unsigned char high_four = ch & 0x0f;
duke@435 61 unsigned char mid_six = ch2 & 0x3f;
duke@435 62 unsigned char low_six = ch3 & 0x3f;
duke@435 63 result = (((high_four << 6) + mid_six) << 6) + low_six;
duke@435 64 length = 3;
duke@435 65 }
duke@435 66 }
duke@435 67 break;
duke@435 68 } /* end of switch */
duke@435 69
duke@435 70 if (length <= 0) {
duke@435 71 *value = ptr[0]; /* default bad result; */
duke@435 72 return (char*)(ptr + 1); // make progress somehow
duke@435 73 }
duke@435 74
duke@435 75 *value = result;
duke@435 76
duke@435 77 // The assert is correct but the .class file is wrong
duke@435 78 // assert(UNICODE::utf8_size(result) == length, "checking reverse computation");
duke@435 79 return (char *)(ptr + length);
duke@435 80 }
duke@435 81
duke@435 82 char* UTF8::next_character(const char* str, jint* value) {
duke@435 83 unsigned const char *ptr = (const unsigned char *)str;
duke@435 84 /* See if it's legal supplementary character:
duke@435 85 11101101 1010xxxx 10xxxxxx 11101101 1011xxxx 10xxxxxx */
duke@435 86 if (is_supplementary_character(ptr)) {
duke@435 87 *value = get_supplementary_character(ptr);
duke@435 88 return (char *)(ptr + 6);
duke@435 89 }
duke@435 90 jchar result;
duke@435 91 char* next_ch = next(str, &result);
duke@435 92 *value = result;
duke@435 93 return next_ch;
duke@435 94 }
duke@435 95
duke@435 96 // Count bytes of the form 10xxxxxx and deduct this count
duke@435 97 // from the total byte count. The utf8 string must be in
duke@435 98 // legal form which has been verified in the format checker.
duke@435 99 int UTF8::unicode_length(const char* str, int len) {
duke@435 100 int num_chars = len;
duke@435 101 for (int i = 0; i < len; i++) {
duke@435 102 if ((str[i] & 0xC0) == 0x80) {
duke@435 103 --num_chars;
duke@435 104 }
duke@435 105 }
duke@435 106 return num_chars;
duke@435 107 }
duke@435 108
duke@435 109 // Count bytes of the utf8 string except those in form
duke@435 110 // 10xxxxxx which only appear in multibyte characters.
duke@435 111 // The utf8 string must be in legal form and has been
duke@435 112 // verified in the format checker.
duke@435 113 int UTF8::unicode_length(const char* str) {
duke@435 114 int num_chars = 0;
duke@435 115 for (const char* p = str; *p; p++) {
duke@435 116 if (((*p) & 0xC0) != 0x80) {
duke@435 117 num_chars++;
duke@435 118 }
duke@435 119 }
duke@435 120 return num_chars;
duke@435 121 }
duke@435 122
duke@435 123 // Writes a jchar a utf8 and returns the end
duke@435 124 static u_char* utf8_write(u_char* base, jchar ch) {
duke@435 125 if ((ch != 0) && (ch <=0x7f)) {
duke@435 126 base[0] = (u_char) ch;
duke@435 127 return base + 1;
duke@435 128 }
duke@435 129
duke@435 130 if (ch <= 0x7FF) {
duke@435 131 /* 11 bits or less. */
duke@435 132 unsigned char high_five = ch >> 6;
duke@435 133 unsigned char low_six = ch & 0x3F;
duke@435 134 base[0] = high_five | 0xC0; /* 110xxxxx */
duke@435 135 base[1] = low_six | 0x80; /* 10xxxxxx */
duke@435 136 return base + 2;
duke@435 137 }
duke@435 138 /* possibly full 16 bits. */
duke@435 139 char high_four = ch >> 12;
duke@435 140 char mid_six = (ch >> 6) & 0x3F;
duke@435 141 char low_six = ch & 0x3f;
duke@435 142 base[0] = high_four | 0xE0; /* 1110xxxx */
duke@435 143 base[1] = mid_six | 0x80; /* 10xxxxxx */
duke@435 144 base[2] = low_six | 0x80; /* 10xxxxxx */
duke@435 145 return base + 3;
duke@435 146 }
duke@435 147
duke@435 148 void UTF8::convert_to_unicode(const char* utf8_str, jchar* unicode_str, int unicode_length) {
duke@435 149 unsigned char ch;
duke@435 150 const char *ptr = (const char *)utf8_str;
duke@435 151 int index = 0;
duke@435 152
duke@435 153 /* ASCII case loop optimization */
duke@435 154 for (; index < unicode_length; index++) {
duke@435 155 if((ch = ptr[0]) > 0x7F) { break; }
duke@435 156 unicode_str[index] = ch;
duke@435 157 ptr = (const char *)(ptr + 1);
duke@435 158 }
duke@435 159
duke@435 160 for (; index < unicode_length; index++) {
duke@435 161 ptr = UTF8::next(ptr, &unicode_str[index]);
duke@435 162 }
duke@435 163 }
duke@435 164
duke@435 165 // Returns NULL if 'c' it not found. This only works as long
duke@435 166 // as 'c' is an ASCII character
duke@435 167 jbyte* UTF8::strrchr(jbyte* base, int length, jbyte c) {
duke@435 168 assert(length >= 0, "sanity check");
duke@435 169 assert(c >= 0, "does not work for non-ASCII characters");
duke@435 170 // Skip backwards in string until 'c' is found or end is reached
duke@435 171 while(--length >= 0 && base[length] != c);
duke@435 172 return (length < 0) ? NULL : &base[length];
duke@435 173 }
duke@435 174
duke@435 175 bool UTF8::equal(jbyte* base1, int length1, jbyte* base2, int length2) {
duke@435 176 // Length must be the same
duke@435 177 if (length1 != length2) return false;
duke@435 178 for (int i = 0; i < length1; i++) {
duke@435 179 if (base1[i] != base2[i]) return false;
duke@435 180 }
duke@435 181 return true;
duke@435 182 }
duke@435 183
duke@435 184 bool UTF8::is_supplementary_character(const unsigned char* str) {
duke@435 185 return ((str[0] & 0xFF) == 0xED) && ((str[1] & 0xF0) == 0xA0) && ((str[2] & 0xC0) == 0x80)
duke@435 186 && ((str[3] & 0xFF) == 0xED) && ((str[4] & 0xF0) == 0xB0) && ((str[5] & 0xC0) == 0x80);
duke@435 187 }
duke@435 188
duke@435 189 jint UTF8::get_supplementary_character(const unsigned char* str) {
duke@435 190 return 0x10000 + ((str[1] & 0x0f) << 16) + ((str[2] & 0x3f) << 10)
duke@435 191 + ((str[4] & 0x0f) << 6) + (str[5] & 0x3f);
duke@435 192 }
duke@435 193
duke@435 194
duke@435 195 //-------------------------------------------------------------------------------------
duke@435 196
duke@435 197
duke@435 198 int UNICODE::utf8_size(jchar c) {
duke@435 199 if ((0x0001 <= c) && (c <= 0x007F)) return 1;
duke@435 200 if (c <= 0x07FF) return 2;
duke@435 201 return 3;
duke@435 202 }
duke@435 203
duke@435 204 int UNICODE::utf8_length(jchar* base, int length) {
duke@435 205 int result = 0;
duke@435 206 for (int index = 0; index < length; index++) {
duke@435 207 jchar c = base[index];
duke@435 208 if ((0x0001 <= c) && (c <= 0x007F)) result += 1;
duke@435 209 else if (c <= 0x07FF) result += 2;
duke@435 210 else result += 3;
duke@435 211 }
duke@435 212 return result;
duke@435 213 }
duke@435 214
duke@435 215 char* UNICODE::as_utf8(jchar* base, int length) {
duke@435 216 int utf8_len = utf8_length(base, length);
duke@435 217 u_char* result = NEW_RESOURCE_ARRAY(u_char, utf8_len + 1);
duke@435 218 u_char* p = result;
duke@435 219 for (int index = 0; index < length; index++) {
duke@435 220 p = utf8_write(p, base[index]);
duke@435 221 }
duke@435 222 *p = '\0';
duke@435 223 assert(p == &result[utf8_len], "length prediction must be correct");
duke@435 224 return (char*) result;
duke@435 225 }
duke@435 226
duke@435 227 char* UNICODE::as_utf8(jchar* base, int length, char* buf, int buflen) {
duke@435 228 u_char* p = (u_char*)buf;
duke@435 229 u_char* end = (u_char*)buf + buflen;
duke@435 230 for (int index = 0; index < length; index++) {
duke@435 231 jchar c = base[index];
duke@435 232 if (p + utf8_size(c) >= end) break; // string is truncated
duke@435 233 p = utf8_write(p, base[index]);
duke@435 234 }
duke@435 235 *p = '\0';
duke@435 236 return buf;
duke@435 237 }
duke@435 238
duke@435 239 void UNICODE::convert_to_utf8(const jchar* base, int length, char* utf8_buffer) {
duke@435 240 for(int index = 0; index < length; index++) {
duke@435 241 utf8_buffer = (char*)utf8_write((u_char*)utf8_buffer, base[index]);
duke@435 242 }
duke@435 243 *utf8_buffer = '\0';
duke@435 244 }

mercurial