src/share/vm/utilities/utf8.cpp

Mon, 12 Nov 2012 14:03:53 -0800

author
minqi
date
Mon, 12 Nov 2012 14:03:53 -0800
changeset 4267
bd7a7ce2e264
parent 2708
1d1603768966
child 4851
8c03fc47511d
permissions
-rw-r--r--

6830717: replay of compilations would help with debugging
Summary: When a Java process crashes in a compiler thread, replaying the compilation helps find the root cause. This is done by using the SA to dump application class data and replay data from the core dump, then using a debug version of the JVM to recompile the problematic Java method.
Reviewed-by: kvn, twisti, sspitsyn
Contributed-by: yumin.qi@oracle.com

     1 /*
     2  * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     4  *
     5  * This code is free software; you can redistribute it and/or modify it
     6  * under the terms of the GNU General Public License version 2 only, as
     7  * published by the Free Software Foundation.
     8  *
     9  * This code is distributed in the hope that it will be useful, but WITHOUT
    10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    12  * version 2 for more details (a copy is included in the LICENSE file that
    13  * accompanied this code).
    14  *
    15  * You should have received a copy of the GNU General Public License version
    16  * 2 along with this work; if not, write to the Free Software Foundation,
    17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    18  *
    19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    20  * or visit www.oracle.com if you need additional information or have any
    21  * questions.
    22  *
    23  */
    25 #include "precompiled.hpp"
    26 #include "utilities/utf8.hpp"
    28 // Assume the utf8 string is in legal form and has been
    29 // checked in the class file parser/format checker.
    30 char* UTF8::next(const char* str, jchar* value) {
    31   unsigned const char *ptr = (const unsigned char *)str;
    32   unsigned char ch, ch2, ch3;
    33   int length = -1;              /* bad length */
    34   jchar result;
    35   switch ((ch = ptr[0]) >> 4) {
    36     default:
    37     result = ch;
    38     length = 1;
    39     break;
    41   case 0x8: case 0x9: case 0xA: case 0xB: case 0xF:
    42     /* Shouldn't happen. */
    43     break;
    45   case 0xC: case 0xD:
    46     /* 110xxxxx  10xxxxxx */
    47     if (((ch2 = ptr[1]) & 0xC0) == 0x80) {
    48       unsigned char high_five = ch & 0x1F;
    49       unsigned char low_six = ch2 & 0x3F;
    50       result = (high_five << 6) + low_six;
    51       length = 2;
    52       break;
    53     }
    54     break;
    56   case 0xE:
    57     /* 1110xxxx 10xxxxxx 10xxxxxx */
    58     if (((ch2 = ptr[1]) & 0xC0) == 0x80) {
    59       if (((ch3 = ptr[2]) & 0xC0) == 0x80) {
    60         unsigned char high_four = ch & 0x0f;
    61         unsigned char mid_six = ch2 & 0x3f;
    62         unsigned char low_six = ch3 & 0x3f;
    63         result = (((high_four << 6) + mid_six) << 6) + low_six;
    64         length = 3;
    65       }
    66     }
    67     break;
    68   } /* end of switch */
    70   if (length <= 0) {
    71     *value = ptr[0];    /* default bad result; */
    72     return (char*)(ptr + 1); // make progress somehow
    73   }
    75   *value = result;
    77   // The assert is correct but the .class file is wrong
    78   // assert(UNICODE::utf8_size(result) == length, "checking reverse computation");
    79   return (char *)(ptr + length);
    80 }
    82 char* UTF8::next_character(const char* str, jint* value) {
    83   unsigned const char *ptr = (const unsigned char *)str;
    84   /* See if it's legal supplementary character:
    85      11101101 1010xxxx 10xxxxxx 11101101 1011xxxx 10xxxxxx */
    86   if (is_supplementary_character(ptr)) {
    87     *value = get_supplementary_character(ptr);
    88     return (char *)(ptr + 6);
    89   }
    90   jchar result;
    91   char* next_ch = next(str, &result);
    92   *value = result;
    93   return next_ch;
    94 }
    96 // Count bytes of the form 10xxxxxx and deduct this count
    97 // from the total byte count.  The utf8 string must be in
    98 // legal form which has been verified in the format checker.
    99 int UTF8::unicode_length(const char* str, int len) {
   100   int num_chars = len;
   101   for (int i = 0; i < len; i++) {
   102     if ((str[i] & 0xC0) == 0x80) {
   103       --num_chars;
   104     }
   105   }
   106   return num_chars;
   107 }
   109 // Count bytes of the utf8 string except those in form
   110 // 10xxxxxx which only appear in multibyte characters.
   111 // The utf8 string must be in legal form and has been
   112 // verified in the format checker.
   113 int UTF8::unicode_length(const char* str) {
   114   int num_chars = 0;
   115   for (const char* p = str; *p; p++) {
   116     if (((*p) & 0xC0) != 0x80) {
   117       num_chars++;
   118     }
   119   }
   120   return num_chars;
   121 }
   123 // Writes a jchar a utf8 and returns the end
   124 static u_char* utf8_write(u_char* base, jchar ch) {
   125   if ((ch != 0) && (ch <=0x7f)) {
   126     base[0] = (u_char) ch;
   127     return base + 1;
   128   }
   130   if (ch <= 0x7FF) {
   131     /* 11 bits or less. */
   132     unsigned char high_five = ch >> 6;
   133     unsigned char low_six = ch & 0x3F;
   134     base[0] = high_five | 0xC0; /* 110xxxxx */
   135     base[1] = low_six | 0x80;   /* 10xxxxxx */
   136     return base + 2;
   137   }
   138   /* possibly full 16 bits. */
   139   char high_four = ch >> 12;
   140   char mid_six = (ch >> 6) & 0x3F;
   141   char low_six = ch & 0x3f;
   142   base[0] = high_four | 0xE0; /* 1110xxxx */
   143   base[1] = mid_six | 0x80;   /* 10xxxxxx */
   144   base[2] = low_six | 0x80;   /* 10xxxxxx */
   145   return base + 3;
   146 }
   148 void UTF8::convert_to_unicode(const char* utf8_str, jchar* unicode_str, int unicode_length) {
   149   unsigned char ch;
   150   const char *ptr = utf8_str;
   151   int index = 0;
   153   /* ASCII case loop optimization */
   154   for (; index < unicode_length; index++) {
   155     if((ch = ptr[0]) > 0x7F) { break; }
   156     unicode_str[index] = ch;
   157     ptr = (const char *)(ptr + 1);
   158   }
   160   for (; index < unicode_length; index++) {
   161     ptr = UTF8::next(ptr, &unicode_str[index]);
   162   }
   163 }
   165 // returns the quoted ascii length of a 0-terminated utf8 string
   166 int UTF8::quoted_ascii_length(const char* utf8_str, int utf8_length) {
   167   const char *ptr = utf8_str;
   168   const char* end = ptr + utf8_length;
   169   int result = 0;
   170   while (ptr < end) {
   171     jchar c;
   172     ptr = UTF8::next(ptr, &c);
   173     if (c >= 32 && c < 127) {
   174       result++;
   175     } else {
   176       result += 6;
   177     }
   178   }
   179   return result;
   180 }
   182 // converts a utf8 string to quoted ascii
   183 void UTF8::as_quoted_ascii(const char* utf8_str, char* buf, int buflen) {
   184   const char *ptr = utf8_str;
   185   char* p = buf;
   186   char* end = buf + buflen;
   187   while (*ptr != '\0') {
   188     jchar c;
   189     ptr = UTF8::next(ptr, &c);
   190     if (c >= 32 && c < 127) {
   191       if (p + 1 >= end) break;      // string is truncated
   192       *p++ = (char)c;
   193     } else {
   194       if (p + 6 >= end) break;      // string is truncated
   195       sprintf(p, "\\u%04x", c);
   196       p += 6;
   197     }
   198   }
   199   *p = '\0';
   200 }
   203 const char* UTF8::from_quoted_ascii(const char* quoted_ascii_str) {
   204   const char *ptr = quoted_ascii_str;
   205   char* result = NULL;
   206   while (*ptr != '\0') {
   207     char c = *ptr;
   208     if (c < 32 || c >= 127) break;
   209   }
   210   if (*ptr == '\0') {
   211     // nothing to do so return original string
   212     return quoted_ascii_str;
   213   }
   214   // everything up to this point was ok.
   215   int length = ptr - quoted_ascii_str;
   216   char* buffer = NULL;
   217   for (int round = 0; round < 2; round++) {
   218     while (*ptr != '\0') {
   219       if (*ptr != '\\') {
   220         if (buffer != NULL) {
   221           buffer[length] = *ptr;
   222         }
   223         length++;
   224       } else {
   225         switch (ptr[1]) {
   226           case 'u': {
   227             ptr += 2;
   228             jchar value=0;
   229             for (int i=0; i<4; i++) {
   230               char c = *ptr++;
   231               switch (c) {
   232                 case '0': case '1': case '2': case '3': case '4':
   233                 case '5': case '6': case '7': case '8': case '9':
   234                   value = (value << 4) + c - '0';
   235                   break;
   236                 case 'a': case 'b': case 'c':
   237                 case 'd': case 'e': case 'f':
   238                   value = (value << 4) + 10 + c - 'a';
   239                   break;
   240                 case 'A': case 'B': case 'C':
   241                 case 'D': case 'E': case 'F':
   242                   value = (value << 4) + 10 + c - 'A';
   243                   break;
   244                 default:
   245                   ShouldNotReachHere();
   246               }
   247             }
   248             if (buffer == NULL) {
   249               char utf8_buffer[4];
   250               char* next = (char*)utf8_write((u_char*)utf8_buffer, value);
   251               length += next - utf8_buffer;
   252             } else {
   253               char* next = (char*)utf8_write((u_char*)&buffer[length], value);
   254               length += next - &buffer[length];
   255             }
   256             break;
   257           }
   258           case 't': if (buffer != NULL) buffer[length] = '\t'; ptr += 2; length++; break;
   259           case 'n': if (buffer != NULL) buffer[length] = '\n'; ptr += 2; length++; break;
   260           case 'r': if (buffer != NULL) buffer[length] = '\r'; ptr += 2; length++; break;
   261           case 'f': if (buffer != NULL) buffer[length] = '\f'; ptr += 2; length++; break;
   262           default:
   263             ShouldNotReachHere();
   264         }
   265       }
   266     }
   267     if (round == 0) {
   268       buffer = NEW_RESOURCE_ARRAY(char, length + 1);
   269       ptr = quoted_ascii_str;
   270     } else {
   271       buffer[length] = '\0';
   272     }
   273   }
   274   return buffer;
   275 }
   278 // Returns NULL if 'c' it not found. This only works as long
   279 // as 'c' is an ASCII character
   280 const jbyte* UTF8::strrchr(const jbyte* base, int length, jbyte c) {
   281   assert(length >= 0, "sanity check");
   282   assert(c >= 0, "does not work for non-ASCII characters");
   283   // Skip backwards in string until 'c' is found or end is reached
   284   while(--length >= 0 && base[length] != c);
   285   return (length < 0) ? NULL : &base[length];
   286 }
   288 bool UTF8::equal(const jbyte* base1, int length1, const jbyte* base2, int length2) {
   289   // Length must be the same
   290   if (length1 != length2) return false;
   291   for (int i = 0; i < length1; i++) {
   292     if (base1[i] != base2[i]) return false;
   293   }
   294   return true;
   295 }
   297 bool UTF8::is_supplementary_character(const unsigned char* str) {
   298   return ((str[0] & 0xFF) == 0xED) && ((str[1] & 0xF0) == 0xA0) && ((str[2] & 0xC0) == 0x80)
   299       && ((str[3] & 0xFF) == 0xED) && ((str[4] & 0xF0) == 0xB0) && ((str[5] & 0xC0) == 0x80);
   300 }
   302 jint UTF8::get_supplementary_character(const unsigned char* str) {
   303   return 0x10000 + ((str[1] & 0x0f) << 16) + ((str[2] & 0x3f) << 10)
   304                  + ((str[4] & 0x0f) << 6)  + (str[5] & 0x3f);
   305 }
   308 //-------------------------------------------------------------------------------------
   311 int UNICODE::utf8_size(jchar c) {
   312   if ((0x0001 <= c) && (c <= 0x007F)) return 1;
   313   if (c <= 0x07FF) return 2;
   314   return 3;
   315 }
   317 int UNICODE::utf8_length(jchar* base, int length) {
   318   int result = 0;
   319   for (int index = 0; index < length; index++) {
   320     jchar c = base[index];
   321     if ((0x0001 <= c) && (c <= 0x007F)) result += 1;
   322     else if (c <= 0x07FF) result += 2;
   323     else result += 3;
   324   }
   325   return result;
   326 }
   328 char* UNICODE::as_utf8(jchar* base, int length) {
   329   int utf8_len = utf8_length(base, length);
   330   u_char* result = NEW_RESOURCE_ARRAY(u_char, utf8_len + 1);
   331   u_char* p = result;
   332   for (int index = 0; index < length; index++) {
   333     p = utf8_write(p, base[index]);
   334   }
   335   *p = '\0';
   336   assert(p == &result[utf8_len], "length prediction must be correct");
   337   return (char*) result;
   338 }
   340 char* UNICODE::as_utf8(jchar* base, int length, char* buf, int buflen) {
   341   u_char* p = (u_char*)buf;
   342   u_char* end = (u_char*)buf + buflen;
   343   for (int index = 0; index < length; index++) {
   344     jchar c = base[index];
   345     if (p + utf8_size(c) >= end) break;      // string is truncated
   346     p = utf8_write(p, base[index]);
   347   }
   348   *p = '\0';
   349   return buf;
   350 }
   352 void UNICODE::convert_to_utf8(const jchar* base, int length, char* utf8_buffer) {
   353   for(int index = 0; index < length; index++) {
   354     utf8_buffer = (char*)utf8_write((u_char*)utf8_buffer, base[index]);
   355   }
   356   *utf8_buffer = '\0';
   357 }
   359 // returns the quoted ascii length of a unicode string
   360 int UNICODE::quoted_ascii_length(jchar* base, int length) {
   361   int result = 0;
   362   for (int i = 0; i < length; i++) {
   363     jchar c = base[i];
   364     if (c >= 32 && c < 127) {
   365       result++;
   366     } else {
   367       result += 6;
   368     }
   369   }
   370   return result;
   371 }
   373 // converts a utf8 string to quoted ascii
   374 void UNICODE::as_quoted_ascii(const jchar* base, int length, char* buf, int buflen) {
   375   char* p = buf;
   376   char* end = buf + buflen;
   377   for (int index = 0; index < length; index++) {
   378     jchar c = base[index];
   379     if (c >= 32 && c < 127) {
   380       if (p + 1 >= end) break;      // string is truncated
   381       *p++ = (char)c;
   382     } else {
   383       if (p + 6 >= end) break;      // string is truncated
   384       sprintf(p, "\\u%04x", c);
   385       p += 6;
   386     }
   387   }
   388   *p = '\0';
   389 }

mercurial