1.1 --- a/src/cpu/x86/vm/macroAssembler_x86.cpp Thu Jan 03 15:09:55 2013 -0800 1.2 +++ b/src/cpu/x86/vm/macroAssembler_x86.cpp Thu Jan 03 16:30:47 2013 -0800 1.3 @@ -6011,29 +6011,53 @@ 1.4 { 1.5 assert( UseSSE >= 2, "supported cpu only" ); 1.6 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes; 1.7 - // Fill 32-byte chunks 1.8 movdl(xtmp, value); 1.9 - pshufd(xtmp, xtmp, 0); 1.10 - 1.11 - subl(count, 8 << shift); 1.12 - jcc(Assembler::less, L_check_fill_8_bytes); 1.13 - align(16); 1.14 - 1.15 - BIND(L_fill_32_bytes_loop); 1.16 - 1.17 - if (UseUnalignedLoadStores) { 1.18 - movdqu(Address(to, 0), xtmp); 1.19 - movdqu(Address(to, 16), xtmp); 1.20 + if (UseAVX >= 2 && UseUnalignedLoadStores) { 1.21 + // Fill 64-byte chunks 1.22 + Label L_fill_64_bytes_loop, L_check_fill_32_bytes; 1.23 + vpbroadcastd(xtmp, xtmp); 1.24 + 1.25 + subl(count, 16 << shift); 1.26 + jcc(Assembler::less, L_check_fill_32_bytes); 1.27 + align(16); 1.28 + 1.29 + BIND(L_fill_64_bytes_loop); 1.30 + vmovdqu(Address(to, 0), xtmp); 1.31 + vmovdqu(Address(to, 32), xtmp); 1.32 + addptr(to, 64); 1.33 + subl(count, 16 << shift); 1.34 + jcc(Assembler::greaterEqual, L_fill_64_bytes_loop); 1.35 + 1.36 + BIND(L_check_fill_32_bytes); 1.37 + addl(count, 8 << shift); 1.38 + jccb(Assembler::less, L_check_fill_8_bytes); 1.39 + vmovdqu(Address(to, 0), xtmp); 1.40 + addptr(to, 32); 1.41 + subl(count, 8 << shift); 1.42 } else { 1.43 - movq(Address(to, 0), xtmp); 1.44 - movq(Address(to, 8), xtmp); 1.45 - movq(Address(to, 16), xtmp); 1.46 - movq(Address(to, 24), xtmp); 1.47 + // Fill 32-byte chunks 1.48 + pshufd(xtmp, xtmp, 0); 1.49 + 1.50 + subl(count, 8 << shift); 1.51 + jcc(Assembler::less, L_check_fill_8_bytes); 1.52 + align(16); 1.53 + 1.54 + BIND(L_fill_32_bytes_loop); 1.55 + 1.56 + if (UseUnalignedLoadStores) { 1.57 + movdqu(Address(to, 0), xtmp); 1.58 + movdqu(Address(to, 16), xtmp); 1.59 + } else { 1.60 + movq(Address(to, 0), xtmp); 1.61 + movq(Address(to, 8), xtmp); 1.62 + movq(Address(to, 16), xtmp); 1.63 + movq(Address(to, 24), xtmp); 1.64 + } 1.65 + 1.66 + addptr(to, 32); 1.67 + subl(count, 8 << shift); 1.68 + jcc(Assembler::greaterEqual, L_fill_32_bytes_loop); 1.69 } 1.70 - 1.71 - addptr(to, 32); 1.72 - subl(count, 8 << shift); 1.73 - jcc(Assembler::greaterEqual, L_fill_32_bytes_loop); 1.74 BIND(L_check_fill_8_bytes); 1.75 addl(count, 8 << shift); 1.76 jccb(Assembler::zero, L_exit);