src/cpu/x86/vm/macroAssembler_x86.cpp

changeset 4411
e2e6bf86682c
parent 4410
00af3a3a8df4
child 4412
ffa87474d7a4
     1.1 --- a/src/cpu/x86/vm/macroAssembler_x86.cpp	Thu Jan 03 15:09:55 2013 -0800
     1.2 +++ b/src/cpu/x86/vm/macroAssembler_x86.cpp	Thu Jan 03 16:30:47 2013 -0800
     1.3 @@ -6011,29 +6011,53 @@
     1.4      {
     1.5        assert( UseSSE >= 2, "supported cpu only" );
     1.6        Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
     1.7 -      // Fill 32-byte chunks
     1.8        movdl(xtmp, value);
     1.9 -      pshufd(xtmp, xtmp, 0);
    1.10 -
    1.11 -      subl(count, 8 << shift);
    1.12 -      jcc(Assembler::less, L_check_fill_8_bytes);
    1.13 -      align(16);
    1.14 -
    1.15 -      BIND(L_fill_32_bytes_loop);
    1.16 -
    1.17 -      if (UseUnalignedLoadStores) {
    1.18 -        movdqu(Address(to, 0), xtmp);
    1.19 -        movdqu(Address(to, 16), xtmp);
    1.20 +      if (UseAVX >= 2 && UseUnalignedLoadStores) {
    1.21 +        // Fill 64-byte chunks
    1.22 +        Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
    1.23 +        vpbroadcastd(xtmp, xtmp);
    1.24 +
    1.25 +        subl(count, 16 << shift);
    1.26 +        jcc(Assembler::less, L_check_fill_32_bytes);
    1.27 +        align(16);
    1.28 +
    1.29 +        BIND(L_fill_64_bytes_loop);
    1.30 +        vmovdqu(Address(to, 0), xtmp);
    1.31 +        vmovdqu(Address(to, 32), xtmp);
    1.32 +        addptr(to, 64);
    1.33 +        subl(count, 16 << shift);
    1.34 +        jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
    1.35 +
    1.36 +        BIND(L_check_fill_32_bytes);
    1.37 +        addl(count, 8 << shift);
    1.38 +        jccb(Assembler::less, L_check_fill_8_bytes);
    1.39 +        vmovdqu(Address(to, 0), xtmp);
    1.40 +        addptr(to, 32);
    1.41 +        subl(count, 8 << shift);
    1.42        } else {
    1.43 -        movq(Address(to, 0), xtmp);
    1.44 -        movq(Address(to, 8), xtmp);
    1.45 -        movq(Address(to, 16), xtmp);
    1.46 -        movq(Address(to, 24), xtmp);
    1.47 +        // Fill 32-byte chunks
    1.48 +        pshufd(xtmp, xtmp, 0);
    1.49 +
    1.50 +        subl(count, 8 << shift);
    1.51 +        jcc(Assembler::less, L_check_fill_8_bytes);
    1.52 +        align(16);
    1.53 +
    1.54 +        BIND(L_fill_32_bytes_loop);
    1.55 +
    1.56 +        if (UseUnalignedLoadStores) {
    1.57 +          movdqu(Address(to, 0), xtmp);
    1.58 +          movdqu(Address(to, 16), xtmp);
    1.59 +        } else {
    1.60 +          movq(Address(to, 0), xtmp);
    1.61 +          movq(Address(to, 8), xtmp);
    1.62 +          movq(Address(to, 16), xtmp);
    1.63 +          movq(Address(to, 24), xtmp);
    1.64 +        }
    1.65 +
    1.66 +        addptr(to, 32);
    1.67 +        subl(count, 8 << shift);
    1.68 +        jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
    1.69        }
    1.70 -
    1.71 -      addptr(to, 32);
    1.72 -      subl(count, 8 << shift);
    1.73 -      jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
    1.74        BIND(L_check_fill_8_bytes);
    1.75        addl(count, 8 << shift);
    1.76        jccb(Assembler::zero, L_exit);

mercurial