diff -r ffa87474d7a4 -r 038dd2875b94 src/cpu/x86/vm/macroAssembler_x86.cpp --- a/src/cpu/x86/vm/macroAssembler_x86.cpp Mon Jan 07 14:08:28 2013 -0800 +++ b/src/cpu/x86/vm/macroAssembler_x86.cpp Tue Jan 08 11:30:51 2013 -0800 @@ -5675,42 +5675,114 @@ testl(cnt2, cnt2); jcc(Assembler::zero, LENGTH_DIFF_LABEL); - // Load first characters + // Compare first characters load_unsigned_short(result, Address(str1, 0)); load_unsigned_short(cnt1, Address(str2, 0)); - - // Compare first characters subl(result, cnt1); jcc(Assembler::notZero, POP_LABEL); - decrementl(cnt2); - jcc(Assembler::zero, LENGTH_DIFF_LABEL); - - { - // Check after comparing first character to see if strings are equivalent - Label LSkip2; - // Check if the strings start at same location - cmpptr(str1, str2); - jccb(Assembler::notEqual, LSkip2); - - // Check if the length difference is zero (from stack) - cmpl(Address(rsp, 0), 0x0); - jcc(Assembler::equal, LENGTH_DIFF_LABEL); - - // Strings might not be equivalent - bind(LSkip2); - } + cmpl(cnt2, 1); + jcc(Assembler::equal, LENGTH_DIFF_LABEL); + + // Check if the strings start at the same location. + cmpptr(str1, str2); + jcc(Assembler::equal, LENGTH_DIFF_LABEL); Address::ScaleFactor scale = Address::times_2; int stride = 8; - // Advance to next element - addptr(str1, 16/stride); - addptr(str2, 16/stride); - - if (UseSSE42Intrinsics) { + if (UseAVX >= 2) { + Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; + Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; + Label COMPARE_TAIL_LONG; + int pcmpmask = 0x19; + + // Setup to compare 16-chars (32-bytes) vectors, + // start from first character again because it has aligned address. + int stride2 = 16; + int adr_stride = stride << scale; + int adr_stride2 = stride2 << scale; + + assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); + // rax and rdx are used by pcmpestri as elements counters + movl(result, cnt2); + andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count + jcc(Assembler::zero, COMPARE_TAIL_LONG); + + // fast path : compare first 2 8-char vectors. + bind(COMPARE_16_CHARS); + movdqu(vec1, Address(str1, 0)); + pcmpestri(vec1, Address(str2, 0), pcmpmask); + jccb(Assembler::below, COMPARE_INDEX_CHAR); + + movdqu(vec1, Address(str1, adr_stride)); + pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); + jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); + addl(cnt1, stride); + + // Compare the characters at index in cnt1 + bind(COMPARE_INDEX_CHAR); //cnt1 has the offset of the mismatching character + load_unsigned_short(result, Address(str1, cnt1, scale)); + load_unsigned_short(cnt2, Address(str2, cnt1, scale)); + subl(result, cnt2); + jmp(POP_LABEL); + + // Setup the registers to start vector comparison loop + bind(COMPARE_WIDE_VECTORS); + lea(str1, Address(str1, result, scale)); + lea(str2, Address(str2, result, scale)); + subl(result, stride2); + subl(cnt2, stride2); + jccb(Assembler::zero, COMPARE_WIDE_TAIL); + negptr(result); + + // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) + bind(COMPARE_WIDE_VECTORS_LOOP); + vmovdqu(vec1, Address(str1, result, scale)); + vpxor(vec1, Address(str2, result, scale)); + vptest(vec1, vec1); + jccb(Assembler::notZero, VECTOR_NOT_EQUAL); + addptr(result, stride2); + subl(cnt2, stride2); + jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); + + // compare wide vectors tail + bind(COMPARE_WIDE_TAIL); + testptr(result, result); + jccb(Assembler::zero, LENGTH_DIFF_LABEL); + + movl(result, stride2); + movl(cnt2, result); + negptr(result); + jmpb(COMPARE_WIDE_VECTORS_LOOP); + + // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. + bind(VECTOR_NOT_EQUAL); + lea(str1, Address(str1, result, scale)); + lea(str2, Address(str2, result, scale)); + jmp(COMPARE_16_CHARS); + + // Compare tail chars, length between 1 to 15 chars + bind(COMPARE_TAIL_LONG); + movl(cnt2, result); + cmpl(cnt2, stride); + jccb(Assembler::less, COMPARE_SMALL_STR); + + movdqu(vec1, Address(str1, 0)); + pcmpestri(vec1, Address(str2, 0), pcmpmask); + jcc(Assembler::below, COMPARE_INDEX_CHAR); + subptr(cnt2, stride); + jccb(Assembler::zero, LENGTH_DIFF_LABEL); + lea(str1, Address(str1, result, scale)); + lea(str2, Address(str2, result, scale)); + negptr(cnt2); + jmpb(WHILE_HEAD_LABEL); + + bind(COMPARE_SMALL_STR); + } else if (UseSSE42Intrinsics) { Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; int pcmpmask = 0x19; - // Setup to compare 16-byte vectors + // Setup to compare 8-char (16-byte) vectors, + // start from first character again because it has aligned address. movl(result, cnt2); andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count jccb(Assembler::zero, COMPARE_TAIL); @@ -5742,7 +5814,7 @@ jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); // compare wide vectors tail - testl(result, result); + testptr(result, result); jccb(Assembler::zero, LENGTH_DIFF_LABEL); movl(cnt2, stride); @@ -5754,21 +5826,20 @@ // Mismatched characters in the vectors bind(VECTOR_NOT_EQUAL); - addptr(result, cnt1); - movptr(cnt2, result); - load_unsigned_short(result, Address(str1, cnt2, scale)); - load_unsigned_short(cnt1, Address(str2, cnt2, scale)); - subl(result, cnt1); + addptr(cnt1, result); + load_unsigned_short(result, Address(str1, cnt1, scale)); + load_unsigned_short(cnt2, Address(str2, cnt1, scale)); + subl(result, cnt2); jmpb(POP_LABEL); bind(COMPARE_TAIL); // limit is zero movl(cnt2, result); // Fallthru to tail compare } - // Shift str2 and str1 to the end of the arrays, negate min - lea(str1, Address(str1, cnt2, scale, 0)); - lea(str2, Address(str2, cnt2, scale, 0)); + lea(str1, Address(str1, cnt2, scale)); + lea(str2, Address(str2, cnt2, scale)); + decrementl(cnt2); // first character was compared already negptr(cnt2); // Compare the rest of the elements @@ -5833,7 +5904,44 @@ shll(limit, 1); // byte count != 0 movl(result, limit); // copy - if (UseSSE42Intrinsics) { + if (UseAVX >= 2) { + // With AVX2, use 32-byte vector compare + Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; + + // Compare 32-byte vectors + andl(result, 0x0000001e); // tail count (in bytes) + andl(limit, 0xffffffe0); // vector count (in bytes) + jccb(Assembler::zero, COMPARE_TAIL); + + lea(ary1, Address(ary1, limit, Address::times_1)); + lea(ary2, Address(ary2, limit, Address::times_1)); + negptr(limit); + + bind(COMPARE_WIDE_VECTORS); + vmovdqu(vec1, Address(ary1, limit, Address::times_1)); + vmovdqu(vec2, Address(ary2, limit, Address::times_1)); + vpxor(vec1, vec2); + + vptest(vec1, vec1); + jccb(Assembler::notZero, FALSE_LABEL); + addptr(limit, 32); + jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); + + testl(result, result); + jccb(Assembler::zero, TRUE_LABEL); + + vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); + vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); + vpxor(vec1, vec2); + + vptest(vec1, vec1); + jccb(Assembler::notZero, FALSE_LABEL); + jmpb(TRUE_LABEL); + + bind(COMPARE_TAIL); // limit is zero + movl(limit, result); + // Fallthru to tail compare + } else if (UseSSE42Intrinsics) { // With SSE4.2, use double quad vector compare Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;