Merge

changeset   3106:7ffacbb338d4
parent      3105:c26de9aef2ed
parent      3104:2090c623107e
child       3107:7b5c767f229c
author      never
date        Sat, 03 Sep 2011 09:56:57 -0700
summary     Merge

     1.1 --- a/agent/src/share/classes/sun/jvm/hotspot/interpreter/BytecodeLoadConstant.java	Fri Sep 02 20:58:21 2011 -0700
     1.2 +++ b/agent/src/share/classes/sun/jvm/hotspot/interpreter/BytecodeLoadConstant.java	Sat Sep 03 09:56:57 2011 -0700
     1.3 @@ -90,7 +90,7 @@
     1.4             jcode == Bytecodes._ldc2_w;
     1.5      if (! codeOk) return false;
     1.6  
     1.7 -    ConstantTag ctag = method().getConstants().getTagAt(rawIndex());
     1.8 +    ConstantTag ctag = method().getConstants().getTagAt(poolIndex());
     1.9      if (jcode == Bytecodes._ldc2_w) {
    1.10         // has to be double or long
    1.11         return (ctag.isDouble() || ctag.isLong()) ? true: false;
     2.1 --- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Fri Sep 02 20:58:21 2011 -0700
     2.2 +++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Sat Sep 03 09:56:57 2011 -0700
     2.3 @@ -1124,6 +1124,126 @@
     2.4      }
     2.5    }
     2.6  
     2.7 +  //
     2.8 +  // Generate main code for disjoint arraycopy
     2.9 +  //
    2.10 +  typedef void (StubGenerator::*CopyLoopFunc)(Register from, Register to, Register count, int count_dec,
    2.11 +                                              Label& L_loop, bool use_prefetch, bool use_bis);
    2.12 +
    2.13 +  void disjoint_copy_core(Register from, Register to, Register count, int log2_elem_size,
    2.14 +                          int iter_size, CopyLoopFunc copy_loop_func) {
    2.15 +    Label L_copy;
    2.16 +
    2.17 +    assert(log2_elem_size <= 3, "the following code should be changed");
    2.18 +    int count_dec = 16>>log2_elem_size;
    2.19 +
    2.20 +    int prefetch_dist = MAX2(ArraycopySrcPrefetchDistance, ArraycopyDstPrefetchDistance);
    2.21 +    assert(prefetch_dist < 4096, "invalid value");
    2.22 +    prefetch_dist = (prefetch_dist + (iter_size-1)) & (-iter_size); // round up to one iteration copy size
    2.23 +    int prefetch_count = (prefetch_dist >> log2_elem_size); // elements count
    2.24 +
    2.25 +    if (UseBlockCopy) {
    2.26 +      Label L_block_copy, L_block_copy_prefetch, L_skip_block_copy;
    2.27 +
    2.28 +      // 64 bytes tail + bytes copied in one loop iteration
    2.29 +      int tail_size = 64 + iter_size;
    2.30 +      int block_copy_count = (MAX2(tail_size, (int)BlockCopyLowLimit)) >> log2_elem_size;
    2.31 +      // Use BIS copy only for big arrays since it requires membar.
    2.32 +      __ set(block_copy_count, O4);
    2.33 +      __ cmp_and_br_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_skip_block_copy);
    2.34 +      // This code is for disjoint source and destination:
    2.35 +      //   to <= from || to >= from+count
    2.36 +      // but BIS will stomp over 'from' if (to > from-tail_size && to <= from)
    2.37 +      __ sub(from, to, O4);
    2.38 +      __ srax(O4, 4, O4); // divide by 16 since the following short branch has only 5 bits for its immediate.
    2.39 +      __ cmp_and_br_short(O4, (tail_size>>4), Assembler::lessEqualUnsigned, Assembler::pn, L_skip_block_copy);
    2.40 +
    2.41 +      __ wrasi(G0, Assembler::ASI_ST_BLKINIT_PRIMARY);
    2.42 +      // BIS should not be used to copy tail (64 bytes+iter_size)
    2.43 +      // to avoid zeroing of following values.
    2.44 +      __ sub(count, (tail_size>>log2_elem_size), count); // count remains >= 0
    2.45 +
    2.46 +      if (prefetch_count > 0) { // rounded up to one iteration count
    2.47 +        // Do prefetching only if copy size is bigger
    2.48 +        // than prefetch distance.
    2.49 +        __ set(prefetch_count, O4);
    2.50 +        __ cmp_and_brx_short(count, O4, Assembler::less, Assembler::pt, L_block_copy);
    2.51 +        __ sub(count, prefetch_count, count);
    2.52 +
    2.53 +        (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy_prefetch, true, true);
    2.54 +        __ add(count, prefetch_count, count); // restore count
    2.55 +
    2.56 +      } // prefetch_count > 0
    2.57 +
    2.58 +      (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy, false, true);
    2.59 +      __ add(count, (tail_size>>log2_elem_size), count); // restore count
    2.60 +
    2.61 +      __ wrasi(G0, Assembler::ASI_PRIMARY_NOFAULT);
    2.62 +      // BIS needs membar.
    2.63 +      __ membar(Assembler::StoreLoad);
    2.64 +      // Copy tail
    2.65 +      __ ba_short(L_copy);
    2.66 +
    2.67 +      __ BIND(L_skip_block_copy);
    2.68 +    } // UseBlockCopy
    2.69 +
    2.70 +    if (prefetch_count > 0) { // rounded up to one iteration count
    2.71 +      // Do prefetching only if copy size is bigger
    2.72 +      // than prefetch distance.
    2.73 +      __ set(prefetch_count, O4);
    2.74 +      __ cmp_and_brx_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_copy);
    2.75 +      __ sub(count, prefetch_count, count);
    2.76 +
    2.77 +      Label L_copy_prefetch;
    2.78 +      (this->*copy_loop_func)(from, to, count, count_dec, L_copy_prefetch, true, false);
    2.79 +      __ add(count, prefetch_count, count); // restore count
    2.80 +
    2.81 +    } // prefetch_count > 0
    2.82 +
    2.83 +    (this->*copy_loop_func)(from, to, count, count_dec, L_copy, false, false);
    2.84 +  }
    2.85 +
    2.86 +
    2.87 +
    2.88 +  //
    2.89 +  // Helper methods for copy_16_bytes_forward_with_shift()
    2.90 +  //
    2.91 +  void copy_16_bytes_shift_loop(Register from, Register to, Register count, int count_dec,
    2.92 +                                Label& L_loop, bool use_prefetch, bool use_bis) {
    2.93 +
    2.94 +    const Register left_shift  = G1; // left  shift bit counter
    2.95 +    const Register right_shift = G5; // right shift bit counter
    2.96 +
    2.97 +    __ align(OptoLoopAlignment);
    2.98 +    __ BIND(L_loop);
    2.99 +    if (use_prefetch) {
   2.100 +      if (ArraycopySrcPrefetchDistance > 0) {
   2.101 +        __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
   2.102 +      }
   2.103 +      if (ArraycopyDstPrefetchDistance > 0) {
   2.104 +        __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
   2.105 +      }
   2.106 +    }
   2.107 +    __ ldx(from, 0, O4);
   2.108 +    __ ldx(from, 8, G4);
   2.109 +    __ inc(to, 16);
   2.110 +    __ inc(from, 16);
   2.111 +    __ deccc(count, count_dec); // Can we do next iteration after this one?
   2.112 +    __ srlx(O4, right_shift, G3);
   2.113 +    __ bset(G3, O3);
   2.114 +    __ sllx(O4, left_shift,  O4);
   2.115 +    __ srlx(G4, right_shift, G3);
   2.116 +    __ bset(G3, O4);
   2.117 +    if (use_bis) {
   2.118 +      __ stxa(O3, to, -16);
   2.119 +      __ stxa(O4, to, -8);
   2.120 +    } else {
   2.121 +      __ stx(O3, to, -16);
   2.122 +      __ stx(O4, to, -8);
   2.123 +    }
   2.124 +    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
   2.125 +    __ delayed()->sllx(G4, left_shift,  O3);
   2.126 +  }
   2.127  
   2.128    // Copy big chunks forward with shift
   2.129    //
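
The new disjoint_copy_core above is a driver rather than a copy loop itself: it rounds the prefetch distance up to a whole iteration, then instantiates the loop body it is handed up to three times (prefetch + BIS, BIS only, plain), each bound to its own label, through a pointer to a StubGenerator member function. A stripped-down sketch of that pointer-to-member dispatch and the round-up step, with invented class and member names (not HotSpot's):

    #include <cstdio>

    class Gen {
     public:
      // Same shape as CopyLoopFunc: the driver stays generic while the
      // loop body (and its prefetch/BIS variants) is supplied by the caller.
      typedef void (Gen::*LoopFunc)(bool use_prefetch, bool use_bis);

      void copy_loop(bool use_prefetch, bool use_bis) {
        std::printf("emit loop variant: prefetch=%d bis=%d\n",
                    use_prefetch, use_bis);
      }

      void drive(LoopFunc f, int prefetch_dist, int iter_size) {
        // Round up to one iteration's copy size, as the driver above does:
        // (x + (n-1)) & -n rounds x up to a multiple of the power of two n,
        // e.g. (100 + 63) & -64 == 128.
        prefetch_dist = (prefetch_dist + (iter_size - 1)) & (-iter_size);
        if (prefetch_dist > 0)
          (this->*f)(true, false);   // prefetching variant
        (this->*f)(false, false);    // plain tail variant
      }
    };

    int main() {
      Gen g;
      g.drive(&Gen::copy_loop, 100, 64);
      return 0;
    }
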
   2.130 @@ -1135,64 +1255,51 @@
   2.131    //   L_copy_bytes - copy exit label
   2.132    //
   2.133    void copy_16_bytes_forward_with_shift(Register from, Register to,
   2.134 -                     Register count, int count_dec, Label& L_copy_bytes) {
   2.135 -    Label L_loop, L_aligned_copy, L_copy_last_bytes;
   2.136 +                     Register count, int log2_elem_size, Label& L_copy_bytes) {
   2.137 +    Label L_aligned_copy, L_copy_last_bytes;
   2.138 +    assert(log2_elem_size <= 3, "the following code should be changed");
   2.139 +    int count_dec = 16>>log2_elem_size;
   2.140  
   2.141      // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
   2.142 -      __ andcc(from, 7, G1); // misaligned bytes
   2.143 -      __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
   2.144 -      __ delayed()->nop();
   2.145 +    __ andcc(from, 7, G1); // misaligned bytes
   2.146 +    __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
   2.147 +    __ delayed()->nop();
   2.148  
   2.149      const Register left_shift  = G1; // left  shift bit counter
   2.150      const Register right_shift = G5; // right shift bit counter
   2.151  
   2.152 -      __ sll(G1, LogBitsPerByte, left_shift);
   2.153 -      __ mov(64, right_shift);
   2.154 -      __ sub(right_shift, left_shift, right_shift);
   2.155 +    __ sll(G1, LogBitsPerByte, left_shift);
   2.156 +    __ mov(64, right_shift);
   2.157 +    __ sub(right_shift, left_shift, right_shift);
   2.158  
   2.159      //
   2.160      // Load 2 aligned 8-bytes chunks and use one from previous iteration
   2.161      // to form 2 aligned 8-bytes chunks to store.
   2.162      //
   2.163 -      __ deccc(count, count_dec); // Pre-decrement 'count'
   2.164 -      __ andn(from, 7, from);     // Align address
   2.165 -      __ ldx(from, 0, O3);
   2.166 -      __ inc(from, 8);
   2.167 -      __ align(OptoLoopAlignment);
   2.168 -    __ BIND(L_loop);
   2.169 -      __ ldx(from, 0, O4);
   2.170 -      __ deccc(count, count_dec); // Can we do next iteration after this one?
   2.171 -      __ ldx(from, 8, G4);
   2.172 -      __ inc(to, 16);
   2.173 -      __ inc(from, 16);
   2.174 -      __ sllx(O3, left_shift,  O3);
   2.175 -      __ srlx(O4, right_shift, G3);
   2.176 -      __ bset(G3, O3);
   2.177 -      __ stx(O3, to, -16);
   2.178 -      __ sllx(O4, left_shift,  O4);
   2.179 -      __ srlx(G4, right_shift, G3);
   2.180 -      __ bset(G3, O4);
   2.181 -      __ stx(O4, to, -8);
   2.182 -      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
   2.183 -      __ delayed()->mov(G4, O3);
   2.184 -
   2.185 -      __ inccc(count, count_dec>>1 ); // + 8 bytes
   2.186 -      __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
   2.187 -      __ delayed()->inc(count, count_dec>>1); // restore 'count'
   2.188 -
   2.189 -      // copy 8 bytes, part of them already loaded in O3
   2.190 -      __ ldx(from, 0, O4);
   2.191 -      __ inc(to, 8);
   2.192 -      __ inc(from, 8);
   2.193 -      __ sllx(O3, left_shift,  O3);
   2.194 -      __ srlx(O4, right_shift, G3);
   2.195 -      __ bset(O3, G3);
   2.196 -      __ stx(G3, to, -8);
   2.197 +    __ dec(count, count_dec);   // Pre-decrement 'count'
   2.198 +    __ andn(from, 7, from);     // Align address
   2.199 +    __ ldx(from, 0, O3);
   2.200 +    __ inc(from, 8);
   2.201 +    __ sllx(O3, left_shift,  O3);
   2.202 +
   2.203 +    disjoint_copy_core(from, to, count, log2_elem_size, 16, copy_16_bytes_shift_loop);
   2.204 +
   2.205 +    __ inccc(count, count_dec>>1 ); // + 8 bytes
   2.206 +    __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
   2.207 +    __ delayed()->inc(count, count_dec>>1); // restore 'count'
   2.208 +
   2.209 +    // copy 8 bytes, part of them already loaded in O3
   2.210 +    __ ldx(from, 0, O4);
   2.211 +    __ inc(to, 8);
   2.212 +    __ inc(from, 8);
   2.213 +    __ srlx(O4, right_shift, G3);
   2.214 +    __ bset(O3, G3);
   2.215 +    __ stx(G3, to, -8);
   2.216  
   2.217      __ BIND(L_copy_last_bytes);
   2.218 -      __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
   2.219 -      __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
   2.220 -      __ delayed()->sub(from, right_shift, from);       // restore address
   2.221 +    __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
   2.222 +    __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
   2.223 +    __ delayed()->sub(from, right_shift, from);       // restore address
   2.224  
   2.225      __ BIND(L_aligned_copy);
   2.226    }
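
copy_16_bytes_forward_with_shift (and the copy_16_bytes_shift_loop helper it now dispatches to) handles the case where source and destination disagree in alignment mod 8: it only issues aligned 8-byte loads and reassembles each output word from two neighbouring input words with a left/right shift pair. On big-endian SPARC, for a source that starts mis bytes (1..7) past an 8-byte boundary, the merged word is (prev << 8*mis) | (next >> (64 - 8*mis)); the aligned case branches off earlier, which also avoids the undefined 64-bit shift. A small host-side check of that arithmetic, using hand-packed big-endian words:

    #include <cstdint>
    #include <cstdio>

    // Merge two aligned big-endian words when the source starts 'mis'
    // bytes (1..7) past an 8-byte boundary; left/right mirror the
    // G1/G5 shift counters set up above.
    static uint64_t merge(uint64_t prev, uint64_t next, int mis) {
      int left  = 8 * mis;       // bits of 'prev' already consumed
      int right = 64 - left;     // bits still needed from 'next'
      return (prev << left) | (next >> right);
    }

    int main() {
      uint64_t w0 = 0x0001020304050607ULL;  // bytes 0..7, big-endian
      uint64_t w1 = 0x08090a0b0c0d0e0fULL;  // bytes 8..15
      // Source starts at byte 3: expect bytes 3..10.
      std::printf("%016llx\n", (unsigned long long)merge(w0, w1, 3));
      // Prints 030405060708090a.
      return 0;
    }
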
   2.227 @@ -1348,7 +1455,7 @@
   2.228        // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
   2.229        // Also jump over aligned copy after the copy with shift completed.
   2.230  
   2.231 -      copy_16_bytes_forward_with_shift(from, to, count, 16, L_copy_byte);
   2.232 +      copy_16_bytes_forward_with_shift(from, to, count, 0, L_copy_byte);
   2.233      }
   2.234  
   2.235      // Both arrays are 8-byte aligned, copy 16 bytes at a time
   2.236 @@ -1576,7 +1683,7 @@
   2.237        // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
   2.238        // Also jump over aligned copy after the copy with shift completed.
   2.239  
   2.240 -      copy_16_bytes_forward_with_shift(from, to, count, 8, L_copy_2_bytes);
   2.241 +      copy_16_bytes_forward_with_shift(from, to, count, 1, L_copy_2_bytes);
   2.242      }
   2.243  
   2.244      // Both arrays are 8-byte aligned, copy 16 bytes at a time
   2.245 @@ -1950,6 +2057,45 @@
   2.246    }
   2.247  
   2.248    //
   2.249 +  // Helper methods for generate_disjoint_int_copy_core()
   2.250 +  //
   2.251 +  void copy_16_bytes_loop(Register from, Register to, Register count, int count_dec,
   2.252 +                          Label& L_loop, bool use_prefetch, bool use_bis) {
   2.253 +
   2.254 +    __ align(OptoLoopAlignment);
   2.255 +    __ BIND(L_loop);
   2.256 +    if (use_prefetch) {
   2.257 +      if (ArraycopySrcPrefetchDistance > 0) {
   2.258 +        __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
   2.259 +      }
   2.260 +      if (ArraycopyDstPrefetchDistance > 0) {
   2.261 +        __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
   2.262 +      }
   2.263 +    }
   2.264 +    __ ldx(from, 4, O4);
   2.265 +    __ ldx(from, 12, G4);
   2.266 +    __ inc(to, 16);
   2.267 +    __ inc(from, 16);
   2.268 +    __ deccc(count, 4); // Can we do next iteration after this one?
   2.269 +
   2.270 +    __ srlx(O4, 32, G3);
   2.271 +    __ bset(G3, O3);
   2.272 +    __ sllx(O4, 32, O4);
   2.273 +    __ srlx(G4, 32, G3);
   2.274 +    __ bset(G3, O4);
   2.275 +    if (use_bis) {
   2.276 +      __ stxa(O3, to, -16);
   2.277 +      __ stxa(O4, to, -8);
   2.278 +    } else {
   2.279 +      __ stx(O3, to, -16);
   2.280 +      __ stx(O4, to, -8);
   2.281 +    }
   2.282 +    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
   2.283 +    __ delayed()->sllx(G4, 32,  O3);
   2.284 +
   2.285 +  }
   2.286 +
   2.287 +  //
   2.288    //  Generate core code for disjoint int copy (and oop copy on 32-bit).
   2.289    //  If "aligned" is true, the "from" and "to" addresses are assumed
   2.290    //  to be heapword aligned.
   2.291 @@ -1962,7 +2108,7 @@
   2.292    void generate_disjoint_int_copy_core(bool aligned) {
   2.293  
   2.294      Label L_skip_alignment, L_aligned_copy;
   2.295 -    Label L_copy_16_bytes,  L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
   2.296 +    Label L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
   2.297  
   2.298      const Register from      = O0;   // source array address
   2.299      const Register to        = O1;   // destination array address
   2.300 @@ -2013,30 +2159,16 @@
   2.301  
   2.302      // copy with shift 4 elements (16 bytes) at a time
   2.303        __ dec(count, 4);   // The cmp at the beginning guarantees count >= 4
   2.304 -
   2.305 -      __ align(OptoLoopAlignment);
   2.306 -    __ BIND(L_copy_16_bytes);
   2.307 -      __ ldx(from, 4, O4);
   2.308 -      __ deccc(count, 4); // Can we do next iteration after this one?
   2.309 -      __ ldx(from, 12, G4);
   2.310 -      __ inc(to, 16);
   2.311 -      __ inc(from, 16);
   2.312 -      __ sllx(O3, 32, O3);
   2.313 -      __ srlx(O4, 32, G3);
   2.314 -      __ bset(G3, O3);
   2.315 -      __ stx(O3, to, -16);
   2.316 -      __ sllx(O4, 32, O4);
   2.317 -      __ srlx(G4, 32, G3);
   2.318 -      __ bset(G3, O4);
   2.319 -      __ stx(O4, to, -8);
   2.320 -      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
   2.321 -      __ delayed()->mov(G4, O3);
   2.322 +      __ sllx(O3, 32,  O3);
   2.323 +
   2.324 +      disjoint_copy_core(from, to, count, 2, 16, copy_16_bytes_loop);
   2.325  
   2.326        __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
   2.327        __ delayed()->inc(count, 4); // restore 'count'
   2.328  
   2.329      __ BIND(L_aligned_copy);
   2.330 -    }
   2.331 +    } // !aligned
   2.332 +
   2.333      // copy 4 elements (16 bytes) at a time
   2.334        __ and3(count, 1, G4); // Save
   2.335        __ srl(count, 1, count);
   2.336 @@ -2223,6 +2355,38 @@
   2.337    }
   2.338  
   2.339    //
   2.340 +  // Helper methods for generate_disjoint_long_copy_core()
   2.341 +  //
   2.342 +  void copy_64_bytes_loop(Register from, Register to, Register count, int count_dec,
   2.343 +                          Label& L_loop, bool use_prefetch, bool use_bis) {
   2.344 +    __ align(OptoLoopAlignment);
   2.345 +    __ BIND(L_loop);
   2.346 +    for (int off = 0; off < 64; off += 16) {
   2.347 +      if (use_prefetch && (off & 31) == 0) {
   2.348 +        if (ArraycopySrcPrefetchDistance > 0) {
   2.349 +          __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
   2.350 +        }
   2.351 +        if (ArraycopyDstPrefetchDistance > 0) {
   2.352 +          __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
   2.353 +        }
   2.354 +      }
   2.355 +      __ ldx(from,  off+0, O4);
   2.356 +      __ ldx(from,  off+8, O5);
   2.357 +      if (use_bis) {
   2.358 +        __ stxa(O4, to,  off+0);
   2.359 +        __ stxa(O5, to,  off+8);
   2.360 +      } else {
   2.361 +        __ stx(O4, to,  off+0);
   2.362 +        __ stx(O5, to,  off+8);
   2.363 +      }
   2.364 +    }
   2.365 +    __ deccc(count, 8);
   2.366 +    __ inc(from, 64);
   2.367 +    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
   2.368 +    __ delayed()->inc(to, 64);
   2.369 +  }
   2.370 +
   2.371 +  //
   2.372    //  Generate core code for disjoint long copy (and oop copy on 64-bit).
   2.373    //  "aligned" is ignored, because we must make the stronger
   2.374    //  assumption that both addresses are always 64-bit aligned.
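
copy_64_bytes_loop interleaves prefetches with the loads and stores: once per 32 bytes of the 64-byte unrolled body ((off & 31) == 0), it prefetches ArraycopySrcPrefetchDistance / ArraycopyDstPrefetchDistance bytes ahead on both streams. The same idea in portable C++, using GCC/Clang's __builtin_prefetch in place of the SPARC prefetch instruction (the distances and the memcpy standing in for the four load/store pairs are illustrative):

    #include <cstdint>
    #include <cstring>

    // Copy 'count' 8-byte words, 64 bytes per iteration, prefetching a
    // fixed distance ahead on both streams. Assumes count % 8 == 0; the
    // stub handles remainders separately.
    static void copy_words(const uint64_t* from, uint64_t* to, size_t count) {
      const size_t src_dist = 256, dst_dist = 256;  // illustrative distances
      for (size_t i = 0; i + 8 <= count; i += 8) {
        __builtin_prefetch((const char*)(from + i) + src_dist, 0, 0);  // read
        __builtin_prefetch((char*)(to + i) + dst_dist, 1, 0);          // write
        std::memcpy(to + i, from + i, 64);
      }
    }

    int main() {
      uint64_t src[64], dst[64];
      for (int i = 0; i < 64; i++) src[i] = i;
      copy_words(src, dst, 64);
      return dst[63] == 63 ? 0 : 1;
    }
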
   2.375 @@ -2261,38 +2425,28 @@
   2.376      const Register offset0 = O4;  // element offset
   2.377      const Register offset8 = O5;  // next element offset
   2.378  
   2.379 -      __ deccc(count, 2);
   2.380 -      __ mov(G0, offset0);   // offset from start of arrays (0)
   2.381 -      __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
   2.382 -      __ delayed()->add(offset0, 8, offset8);
   2.383 +    __ deccc(count, 2);
   2.384 +    __ mov(G0, offset0);   // offset from start of arrays (0)
   2.385 +    __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
   2.386 +    __ delayed()->add(offset0, 8, offset8);
   2.387  
   2.388      // Copy by 64 bytes chunks
   2.389 -    Label L_copy_64_bytes;
   2.390 +
   2.391      const Register from64 = O3;  // source address
   2.392      const Register to64   = G3;  // destination address
   2.393 -      __ subcc(count, 6, O3);
   2.394 -      __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes );
   2.395 -      __ delayed()->mov(to,   to64);
   2.396 -      // Now we can use O4(offset0), O5(offset8) as temps
   2.397 -      __ mov(O3, count);
   2.398 -      __ mov(from, from64);
   2.399 -
   2.400 -      __ align(OptoLoopAlignment);
   2.401 -    __ BIND(L_copy_64_bytes);
   2.402 -      for( int off = 0; off < 64; off += 16 ) {
   2.403 -        __ ldx(from64,  off+0, O4);
   2.404 -        __ ldx(from64,  off+8, O5);
   2.405 -        __ stx(O4, to64,  off+0);
   2.406 -        __ stx(O5, to64,  off+8);
   2.407 -      }
   2.408 -      __ deccc(count, 8);
   2.409 -      __ inc(from64, 64);
   2.410 -      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_64_bytes);
   2.411 -      __ delayed()->inc(to64, 64);
   2.412 +    __ subcc(count, 6, O3);
   2.413 +    __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes );
   2.414 +    __ delayed()->mov(to,   to64);
   2.415 +    // Now we can use O4(offset0), O5(offset8) as temps
   2.416 +    __ mov(O3, count);
   2.417 +    // count >= 0 (original count - 8)
   2.418 +    __ mov(from, from64);
   2.419 +
   2.420 +    disjoint_copy_core(from64, to64, count, 3, 64, copy_64_bytes_loop);
   2.421  
   2.422        // Restore O4(offset0), O5(offset8)
   2.423        __ sub(from64, from, offset0);
   2.424 -      __ inccc(count, 6);
   2.425 +      __ inccc(count, 6); // restore count
   2.426        __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
   2.427        __ delayed()->add(offset0, 8, offset8);
   2.428  
     3.1 --- a/src/cpu/sparc/vm/vm_version_sparc.cpp	Fri Sep 02 20:58:21 2011 -0700
     3.2 +++ b/src/cpu/sparc/vm/vm_version_sparc.cpp	Sat Sep 03 09:56:57 2011 -0700
     3.3 @@ -75,6 +75,24 @@
     3.4      FLAG_SET_DEFAULT(AllocatePrefetchStyle, 1);
     3.5    }
     3.6  
     3.7 +  if (has_v9()) {
     3.8 +    assert(ArraycopySrcPrefetchDistance < 4096, "invalid value");
     3.9 +    if (ArraycopySrcPrefetchDistance >= 4096)
    3.10 +      ArraycopySrcPrefetchDistance = 4064;
    3.11 +    assert(ArraycopyDstPrefetchDistance < 4096, "invalid value");
    3.12 +    if (ArraycopyDstPrefetchDistance >= 4096)
    3.13 +      ArraycopyDstPrefetchDistance = 4064;
    3.14 +  } else {
    3.15 +    if (ArraycopySrcPrefetchDistance > 0) {
    3.16 +      warning("prefetch instructions are not available on this CPU");
    3.17 +      FLAG_SET_DEFAULT(ArraycopySrcPrefetchDistance, 0);
    3.18 +    }
    3.19 +    if (ArraycopyDstPrefetchDistance > 0) {
    3.20 +      warning("prefetch instructions are not available on this CPU");
    3.21 +      FLAG_SET_DEFAULT(ArraycopyDstPrefetchDistance, 0);
    3.22 +    }
    3.23 +  }
    3.24 +
    3.25    UseSSE = 0; // Only on x86 and x64
    3.26  
    3.27    _supports_cx8 = has_v9();
    3.28 @@ -180,6 +198,16 @@
    3.29      FLAG_SET_DEFAULT(UseBlockZeroing, false);
    3.30    }
    3.31  
    3.32 +  assert(BlockCopyLowLimit > 0, "invalid value");
    3.33 +  if (has_block_zeroing()) { // has_blk_init() && is_T4(): core's local L2 cache
    3.34 +    if (FLAG_IS_DEFAULT(UseBlockCopy)) {
    3.35 +      FLAG_SET_DEFAULT(UseBlockCopy, true);
    3.36 +    }
    3.37 +  } else if (UseBlockCopy) {
    3.38 +    warning("BIS instructions are not available or expensive on this CPU");
    3.39 +    FLAG_SET_DEFAULT(UseBlockCopy, false);
    3.40 +  }
    3.41 +
    3.42  #ifdef COMPILER2
    3.43    // T4 and newer Sparc cpus have fast RDPC.
    3.44    if (has_fast_rdpc() && FLAG_IS_DEFAULT(UseRDPCForConstantTableBase)) {
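
The vm_version_sparc changes follow the usual HotSpot pattern for validating user-settable flags: assert the invariant in debug builds, then clamp with a warning or fallback in product builds, here keeping the prefetch distance below 4096 so it fits the prefetch instruction's immediate field. A minimal standalone sketch of that clamp, with an invented helper name (4064 is the fallback value the patch uses):

    #include <cstdio>

    // HotSpot's debug-only assert fires first in debug builds;
    // product builds take this clamp path instead.
    static unsigned clamp_prefetch_distance(unsigned d) {
      return (d >= 4096) ? 4064 : d;
    }

    int main() {
      std::printf("%u\n", clamp_prefetch_distance(512));   // 512 (unchanged)
      std::printf("%u\n", clamp_prefetch_distance(8192));  // 4064 (clamped)
      return 0;
    }
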
     4.1 --- a/src/share/vm/runtime/globals.hpp	Fri Sep 02 20:58:21 2011 -0700
     4.2 +++ b/src/share/vm/runtime/globals.hpp	Sat Sep 03 09:56:57 2011 -0700
     4.3 @@ -1985,6 +1985,12 @@
     4.4    product(intx, BlockZeroingLowLimit, 2048,                                 \
     4.5            "Minimum size in bytes when block zeroing will be used")          \
     4.6                                                                              \
     4.7 +  product(bool, UseBlockCopy, false,                                        \
     4.8 +          "Use special cpu instructions for block copy")                    \
     4.9 +                                                                            \
    4.10 +  product(intx, BlockCopyLowLimit, 2048,                                    \
    4.11 +          "Minimum size in bytes when block copy will be used")             \
    4.12 +                                                                            \
    4.13    product(bool, PrintRevisitStats, false,                                   \
    4.14            "Print revisit (klass and MDO) stack related information")        \
    4.15                                                                              \
    4.16 @@ -2918,6 +2924,12 @@
    4.17    product(intx,  ReadPrefetchInstr, 0,                                      \
    4.18            "Prefetch instruction to prefetch ahead")                         \
    4.19                                                                              \
    4.20 +  product(uintx,  ArraycopySrcPrefetchDistance, 0,                          \
    4.21 +          "Distance to prefetch source array in arraycopy")                 \
    4.22 +                                                                            \
    4.23 +  product(uintx,  ArraycopyDstPrefetchDistance, 0,                          \
    4.24 +          "Distance to prefetch destination array in arraycopy")            \
    4.25 +                                                                            \
    4.26    /* deoptimization */                                                      \
    4.27    develop(bool, TraceDeoptimization, false,                                 \
    4.28            "Trace deoptimization")                                           \
