# HG changeset patch # User jiangshaofeng # Date 1474343301 -28800 # Node ID 89e1dfe996be2a1366f7a84a3f8dacf85dfa9ef2 # Parent 68d7c979cca6a711647bfe73afaf622383758ca5 #4537 Rewrite generate_disjoint_byte_copy Eliminated unaligned accesses and optimized the copy algorithm. The same as changeset 114. The number of unaligned accesses does not increase, and the change has passed the SPECjvm2008 test. 20% speedup on the test program. The test program: public class ByteCopyTest{ public static void main(String args[]){ int count = 100000; char []A = new char[count]; char []B = new char[count]; for(int i = 0; i < count; i++){ A[i] = (char)(i % 26 + 97); } long startTime = System.nanoTime(); System.arraycopy(A, 0, B, 0, count); long endTime = System.nanoTime(); System.out.println(endTime - startTime); } } diff -r 68d7c979cca6 -r 89e1dfe996be src/cpu/mips/vm/stubGenerator_mips_64.cpp --- a/src/cpu/mips/vm/stubGenerator_mips_64.cpp Sun Sep 18 13:43:10 2016 +0800 +++ b/src/cpu/mips/vm/stubGenerator_mips_64.cpp Tue Sep 20 11:48:21 2016 +0800 @@ -629,85 +629,176 @@ // disjoint_byte_copy_entry is set to the no-overlap entry point // used by generate_conjoint_byte_copy().
// - address generate_disjoint_byte_copy(bool aligned, const char *name) { - StubCodeMark mark(this, "StubRoutines", name); - __ align(CodeEntryAlignment); - address start = __ pc(); - Label l_0, l_1, l_2, l_3, l_4, l_5, l_6; - - __ push(T3); - __ push(T0); - __ push(T1); - __ push(T8); - __ move(T3, A0); - __ move(T0, A1); - __ move(T1, A2); - __ move(T8, T1); // original count in T1 - __ daddi(AT, T1, -3); - __ blez(AT, l_4); - __ delayed()->nop(); - if (!aligned) { - //TODO: copy 8 bytes at one time - // 2016/5/8 Jin: only when src and dest has the same alignment can we do lw/sw */ - __ andi(AT, T3, 3); - __ andi(T9, T0, 3); - __ bne(AT, T9, l_5); - __ delayed()->nop(); - - // align source address at dword address boundary - __ move(T1, 4); - __ sub(T1, T1, T3); - __ andi(T1, T1, 3); - __ beq(T1, R0, l_1); - __ delayed()->nop(); - __ sub(T8,T8,T1); - __ bind(l_0); - __ lb(AT, T3, 0); - __ sb(AT, T0, 0); - __ addi(T3, T3, 1); - __ addi(T0, T0, 1); - __ addi(T1 ,T1, -1); - __ bne(T1, R0, l_0); - __ delayed()->nop(); - __ bind(l_1); - __ move(T1, T8); - } - __ shr(T1, 2); - __ beq(T1, R0, l_4); // no dwords to move - __ delayed()->nop(); - // copy aligned dwords - __ bind(l_2); - __ align(16); - __ bind(l_3); - __ lw(AT, T3, 0); - __ sw(AT, T0, 0 ); - __ addi(T3, T3, 4); - __ addi(T0, T0, 4); - __ addi(T1, T1, -1); - __ bne(T1, R0, l_3); - __ delayed()->nop(); - __ bind(l_4); - __ move(T1, T8); - __ andi(T1, T1, 3); - __ beq(T1, R0, l_6); - __ delayed()->nop(); - // copy suffix - __ bind(l_5); - __ lb(AT, T3, 0); - __ sb(AT, T0, 0); - __ addi(T3, T3, 1); - __ addi(T0, T0, 1); - __ addi(T1, T1, -1); - __ bne(T1, R0, l_5 ); - __ delayed()->nop(); - __ bind(l_6); - __ pop(T8); - __ pop(T1); - __ pop(T0); - __ pop(T3); - __ jr(RA); - __ delayed()->nop(); - return start; + address generate_disjoint_byte_copy(bool aligned, const char * name) { // emits the disjoint byte-copy stub and returns its entry address; the byte count is taken from A2 + StubCodeMark mark(this, "StubRoutines", name); + __ align(CodeEntryAlignment); + + + Register tmp1 = T0; // source address (from), loaded from A0 + Register tmp2 = T1; // destination address (to), loaded from A1; tmp3 (T3) holds the remaining byte count from A2
+ Register tmp3 = T3; + + address start = __ pc(); + + __ push(tmp1); + __ push(tmp2); + __ push(tmp3); + __ move(tmp1, A0); + __ move(tmp2, A1); + __ move(tmp3, A2); + + + Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10, l_11; + Label l_debug; + + __ daddi(AT, tmp3, -9); // AT = tmp3-9: counts of 9 or fewer go straight to the byte loop at l_9; the alignment steps below consume at most 7 bytes (1, 2 and 4). NOTE(review): the exact choice of 9 is not explained here + __ blez(AT, l_9); + __ delayed()->nop(); + + if (!aligned) { + __ xorr(AT, tmp1, tmp2); + __ andi(AT, AT, 1); + __ bne(AT, R0, l_9); // if arrays don't have the same alignment mod 2, do 1 element copy + __ delayed()->nop(); + + __ andi(AT, tmp1, 1); + __ beq(AT, R0, l_10); // copy 1 element if necessary to align to 2 bytes + __ delayed()->nop(); + + __ lb(AT, tmp1, 0); + __ daddi(tmp1, tmp1, 1); + __ sb(AT, tmp2, 0); + __ daddi(tmp2, tmp2, 1); + __ daddi(tmp3, tmp3, -1); + __ bind(l_10); + + __ xorr(AT, tmp1, tmp2); + __ andi(AT, AT, 3); + __ bne(AT, R0, l_1); // if arrays don't have the same alignment mod 4, copy 2 bytes at a time (l_1) + __ delayed()->nop(); + + // At this point it is guaranteed that both, from and to have the same alignment mod 4. + + // Copy 2 elements if necessary to align to 4 bytes. + __ andi(AT, tmp1, 3); + __ beq(AT, R0, l_2); + __ delayed()->nop(); + + __ lhu(AT, tmp1, 0); + __ daddi(tmp1, tmp1, 2); + __ sh(AT, tmp2, 0); + __ daddi(tmp2, tmp2, 2); + __ daddi(tmp3, tmp3, -2); + __ bind(l_2); + + // At this point the positions of both, from and to, are at least 4 byte aligned. + + // Copy 4 elements at a time. + // Align to 8 bytes, but only if both, from and to, have same alignment mod 8. + __ xorr(AT, tmp1, tmp2); + __ andi(AT, AT, 7); + __ bne(AT, R0, l_6); // not same alignment mod 8 -> copy 4 bytes at a time (l_6); from and to cannot both be 8-byte aligned + __ delayed()->nop(); + + // Copy 4 elements if necessary to align to 8 bytes.
+ __ andi(AT, tmp1, 7); + __ beq(AT, R0, l_7); + __ delayed()->nop(); + + __ lw(AT, tmp1, 0); + __ daddi(tmp3, tmp3, -4); + __ sw(AT, tmp2, 0); + { // FasterArrayCopy + __ daddi(tmp1, tmp1, 4); + __ daddi(tmp2, tmp2, 4); + } + } + + __ bind(l_7); + + // Copy 8 bytes at a time with ld/sd. With aligned == false this point is only reached + // after from and to were both brought to 8-byte alignment above. NOTE(review): the aligned == true path presumably relies on the caller for that; TODO confirm. + + { // FasterArrayCopy + __ daddi(AT, tmp3, -7); + __ blez(AT, l_6); // fewer than 8 bytes remain -> go to the 4-byte loop + __ delayed()->nop(); + + __ bind(l_8); + // For Loongson, there is 128-bit memory access. TODO + __ ld(AT, tmp1, 0); + __ sd(AT, tmp2, 0); + __ daddi(tmp1, tmp1, 8); + __ daddi(tmp2, tmp2, 8); + __ daddi(tmp3, tmp3, -8); + __ daddi(AT, tmp3, -8); + __ bgez(AT, l_8); // loop while at least 8 bytes remain + __ delayed()->nop(); + } + __ bind(l_6); + + // copy 4 bytes at a time + { // FasterArrayCopy + __ daddi(AT, tmp3, -3); + __ blez(AT, l_1); // fewer than 4 bytes remain -> go to the 2-byte loop + __ delayed()->nop(); + + __ bind(l_3); + __ lw(AT, tmp1, 0); + __ sw(AT, tmp2, 0); + __ daddi(tmp1, tmp1, 4); + __ daddi(tmp2, tmp2, 4); + __ daddi(tmp3, tmp3, -4); + __ daddi(AT, tmp3, -4); + __ bgez(AT, l_3); // loop while at least 4 bytes remain + __ delayed()->nop(); + + } + + // copy 2 bytes at a time + __ bind(l_1); + { + __ daddi(AT, tmp3, -1); + __ blez(AT, l_9); // fewer than 2 bytes remain -> go to the byte loop + __ delayed()->nop(); + + __ bind(l_5); + __ lhu(AT, tmp1, 0); + __ daddi(tmp3, tmp3, -2); + __ sh(AT, tmp2, 0); + __ daddi(tmp1, tmp1, 2); + __ daddi(tmp2, tmp2, 2); + __ daddi(AT, tmp3, -2); + __ bgez(AT, l_5); // loop while at least 2 bytes remain + __ delayed()->nop(); + } + + // copy 1 byte at a time + __ bind(l_9); + __ beq(R0, tmp3, l_4); // nothing left to copy + __ delayed()->nop(); + + { + __ bind(l_11); + __ lb(AT, tmp1, 0); + __ daddi(tmp3, tmp3, -1); + __ sb(AT, tmp2, 0); + __ daddi(tmp1, tmp1, 1); + __ daddi(tmp2, tmp2, 1); + __ daddi(AT, tmp3, -1); + __ bgez(AT, l_11); // loop while at least 1 byte remains + __ delayed()->nop(); + } + + __ bind(l_4); + __ pop(tmp3); // restore in the reverse order of the pushes at entry + __ pop(tmp2); + __ pop(tmp1); + + __ jr(RA); + __ delayed()->nop(); + + return start; } // Arguments: