6386 bind(L_copy_1_char_exit); |
6395 bind(L_copy_1_char_exit); |
6387 addptr(result, len); // len is negative count of not processed elements |
6396 addptr(result, len); // len is negative count of not processed elements |
6388 bind(L_done); |
6397 bind(L_done); |
6389 } |
6398 } |
6390 |
6399 |
|
6400 /** |

6401 * Emits code to update CRC-32 with a byte value according to constants in table |

6402 * |

6403 * @param [in,out]crc Register containing the crc. |

6404 * @param [in]val Register containing the byte to fold into the CRC. |

6405 * @param [in]table Register containing the table of crc constants. |

6406 * |

6407 * uint32_t crc; |

6408 * val = crc_table[(val ^ crc) & 0xFF]; |

6409 * crc = val ^ (crc >> 8); |

6410 * |

 * Note: the pseudo-code above describes the value that ends up in crc. The
 * emitted sequence never loads the table entry into val; val is overwritten
 * with the masked index ((val ^ crc) & 0xFF) and the table entry is XORed
 * straight from memory into crc. Callers must treat val as clobbered.
 * The table is addressed as table + val * 4 (Address::times_4, displacement 0).

6411 */ |

6412 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { |

// val = (val ^ crc) & 0xFF: any bits of val above the low byte are discarded
// by the mask, so the caller does not need to zero-extend the input byte.
6413 xorl(val, crc); |

6414 andl(val, 0xFF); |

6415 shrl(crc, 8); // unsigned shift |

// crc ^= entry selected by the index now held in val
6416 xorl(crc, Address(table, val, Address::times_4, 0)); |

6417 } |
|
6418 |
|
6419 /** |

6420 * Fold 128-bit data chunk |

 *
 * Variant that takes the next data chunk from memory at Address(buf, offset).
 * Emitted sequence, in order:
 *   xtmp = vpclmulhdq(xK, xcrc)
 *   xcrc = vpclmulldq(xK, xcrc)
 *   xcrc = xcrc ^ memory operand at Address(buf, offset)
 *   xcrc = xcrc ^ xtmp
 *
 * @param xcrc   in/out: running folded value, replaced by the new folded value
 * @param xK     folding constants; only read here
 * @param xtmp   scratch register, overwritten
 * @param buf    base register for the data chunk
 * @param offset displacement passed to Address(buf, offset)
 *
 * NOTE(review): vpclmulhdq and vpclmulldq are defined outside this view;
 * presumably they are carry-less multiplies of the high and low quadwords
 * respectively (matching the bit-range comments below) -- confirm.

6421 */ |

6422 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) { |

6423 vpclmulhdq(xtmp, xK, xcrc); // [127:64] |

6424 vpclmulldq(xcrc, xK, xcrc); // [63:0] |

// false selects the non-256-bit form, per the inline argument comment
6425 vpxor(xcrc, xcrc, Address(buf, offset), false /* vector256 */); |

6426 pxor(xcrc, xtmp); |

6427 } |
|
6428 |
|
/**
 * Fold 128-bit data chunk held in a register.
 *
 * Same sequence as the memory-operand overload, except that the chunk to be
 * combined is already in xbuf:
 *   xtmp = vpclmulhdq(xK, xcrc)
 *   xcrc = vpclmulldq(xK, xcrc)
 *   xcrc = xcrc ^ xbuf ^ xtmp
 *
 * @param xcrc in/out: running folded value, replaced by the new folded value
 * @param xK   folding constants; only read here
 * @param xtmp scratch register, overwritten
 * @param xbuf data chunk to fold in; only read here
 */
6429 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) { |

6430 vpclmulhdq(xtmp, xK, xcrc); |

6431 vpclmulldq(xcrc, xK, xcrc); |

6432 pxor(xcrc, xbuf); |

6433 pxor(xcrc, xtmp); |

6434 } |
|
6435 |
|
6436 /** |

6437 * 8-bit folds to compute 32-bit CRC |

6438 * |

6439 * uint64_t xcrc; |

6440 * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8); |

 *
 * XMM variant: performs one 8-bit fold step on the value held in xcrc.
 *
 * @param xcrc  in/out: value being reduced; shifted right by one byte and
 *              XORed with the selected table entry
 * @param table register holding the base address of the lookup table, which
 *              is indexed with a scale of 4 (Address::times_4)
 * @param xtmp  scratch XMM register, overwritten with the table entry
 * @param tmp   scratch general register, overwritten with the table index
 *
 * NOTE(review): the pseudo-code declares xcrc as uint64_t while psrldq is
 * applied to the whole XMM register; presumably the upper half is zero at
 * the call site -- confirm against the caller.

6441 */ |

6442 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) { |

// tmp = low byte of xcrc, used as the table index
6443 movdl(tmp, xcrc); |

6444 andl(tmp, 0xFF); |

// xtmp = table entry for that byte
6445 movdl(xtmp, Address(table, tmp, Address::times_4, 0)); |

6446 psrldq(xcrc, 1); // unsigned shift one byte |

6447 pxor(xcrc, xtmp); |

6448 } |
|
6449 |
|
6450 /** |

6451 * uint32_t crc; |

6452 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8); |

 *
 * General-register variant of the 8-bit fold step.
 *
 * @param crc   in/out: value being reduced; shifted right by 8 bits and XORed
 *              with the selected table entry
 * @param table register holding the base address of the lookup table, which
 *              is indexed with a scale of 4 (Address::times_4)
 * @param tmp   scratch register, overwritten with the table index (crc & 0xFF)

6453 */ |

6454 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) { |

// tmp = crc & 0xFF, captured before crc is shifted
6455 movl(tmp, crc); |

6456 andl(tmp, 0xFF); |

6457 shrl(crc, 8); |

6458 xorl(crc, Address(table, tmp, Address::times_4, 0)); |

6459 } |
|
6460 |
|
6461 /** |

 * Emits the CRC-32 kernel: updates crc with len bytes starting at buf.
 *

6462 * @param crc register containing existing CRC (32-bit) |

6463 * @param buf register pointing to input byte buffer (byte*) |

6464 * @param len register containing number of bytes |

6465 * @param table register that will contain address of CRC table |

6466 * @param tmp scratch register |

 *
 * Besides the explicit arguments the emitted code uses rax as a scratch
 * register (hence its presence in assert_different_registers) and the
 * hard-coded vector registers xmm0 through xmm5. buf and len are modified.
 *
 * Phases of the emitted code:
 *   1. Load the table address and invert crc.
 *   2. If len < 16, process everything in the byte-wise tail loop.
 *   3. Otherwise consume single bytes until buf is 16-byte aligned.
 *   4. Process the data in 16-byte chunks: four chunks per iteration while
 *      at least four more remain, then collapse the four accumulators into
 *      xmm1, then fold any remaining chunks one at a time.
 *   5. Reduce the 128-bit value in xmm1 to 32 bits in crc.
 *   6. Process the remaining (len & 0xf) bytes in the tail loop.
 *   7. Invert crc again.

6467 */ |

6468 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) { |

6469 assert_different_registers(crc, buf, len, table, tmp, rax); |

6470 |

6471 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned; |

6472 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop; |

6473 |

6474 lea(table, ExternalAddress(StubRoutines::crc_table_addr())); |

6475 notl(crc); // ~crc |

// Fewer than 16 bytes: skip alignment and vector folding entirely.
6476 cmpl(len, 16); |

6477 jcc(Assembler::less, L_tail); |

6478 |

6479 // Align buffer to 16 bytes |

// Only the low 4 bits of the address matter for the alignment test.
6480 movl(tmp, buf); |

6481 andl(tmp, 0xF); |

6482 jccb(Assembler::zero, L_aligned); |

// tmp = (buf & 0xF) - 16, i.e. minus the number of bytes up to the next
// 16-byte boundary; adding it to len removes those bytes from the count.
6483 subl(tmp, 16); |

6484 addl(len, tmp); |

6485 |

6486 align(4); |

6487 BIND(L_align_loop); |

// The sign extension is harmless: update_byte_crc32 masks the index with 0xFF.
6488 movsbl(rax, Address(buf, 0)); // load byte with sign extension |

6489 update_byte_crc32(crc, rax, table); |

6490 increment(buf); |

// tmp counts up from its negative start value; loop until it reaches zero.
6491 incrementl(tmp); |

6492 jccb(Assembler::less, L_align_loop); |

6493 |

6494 BIND(L_aligned); |

// tmp keeps the byte count so the tail length can be recovered at
// L_tail_restore; len becomes the number of whole 16-byte chunks.
6495 movl(tmp, len); // save |

6496 shrl(len, 4); |

6497 jcc(Assembler::zero, L_tail_restore); |

6498 |

6499 // Fold crc into first bytes of vector |

// Load the first chunk into xmm1 and XOR crc into its lowest 32-bit lane
// (dword index 0), going through rax.
6500 movdqa(xmm1, Address(buf, 0)); |

6501 movdl(rax, xmm1); |

6502 xorl(crc, rax); |

6503 pinsrd(xmm1, crc, 0); |

6504 addptr(buf, 16); |

// From here on len is kept as (chunks still to be read) - 3; L_fold_tail
// adds the 3 back. Fewer than 4 chunks in total: go straight to the
// one-chunk-at-a-time loop.
6505 subl(len, 4); // len > 0 |

6506 jcc(Assembler::less, L_fold_tail); |

6507 |

// Prime three more accumulators so that four streams can be folded in parallel.
6508 movdqa(xmm2, Address(buf, 0)); |

6509 movdqa(xmm3, Address(buf, 16)); |

6510 movdqa(xmm4, Address(buf, 32)); |

6511 addptr(buf, 48); |

// Three more chunks consumed; if fewer than four remain, skip the wide loop.
6512 subl(len, 3); |

6513 jcc(Assembler::lessEqual, L_fold_512b); |

6514 |

6515 // Fold total 512 bits of polynomial on each iteration, |

6516 // 128 bits per each of 4 parallel streams. |

// xmm0 = constants at byte offset 32 of the crc_by128_masks table.
6517 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32)); |

6518 |

6519 align(32); |

6520 BIND(L_fold_512b_loop); |

// xmm5 is the shared scratch register for all four fold steps.
6521 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0); |

6522 fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16); |

6523 fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32); |

6524 fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48); |

6525 addptr(buf, 64); |

6526 subl(len, 4); |

6527 jcc(Assembler::greater, L_fold_512b_loop); |

6528 |

6529 // Fold 512 bits to 128 bits. |

6530 BIND(L_fold_512b); |

// Switch to the constants at byte offset 16 and merge xmm2, xmm3, xmm4
// into xmm1 one after another.
6531 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16)); |

6532 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2); |

6533 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3); |

6534 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4); |

6535 |

6536 // Fold the rest of 128 bits data chunks |

6537 BIND(L_fold_tail); |

// Undo the -3 bias: len is now the number of chunks still unread.
6538 addl(len, 3); |

6539 jccb(Assembler::lessEqual, L_fold_128b); |

// Reloaded because this label is also reached directly from the early exit
// above, where xmm0 has not been set up yet.
6540 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16)); |

6541 |

6542 BIND(L_fold_tail_loop); |

6543 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0); |

6544 addptr(buf, 16); |

6545 decrementl(len); |

6546 jccb(Assembler::greater, L_fold_tail_loop); |

6547 |

6548 // Fold 128 bits in xmm1 down into 32 bits in crc register. |

6549 BIND(L_fold_128b); |

// xmm0 = constants at byte offset 0 of the crc_by128_masks table.
// NOTE(review): the exact meaning of these constants and of the 0x1
// selector is defined outside this view -- confirm before changing.
6550 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr())); |

6551 vpclmulqdq(xmm2, xmm0, xmm1, 0x1); |

6552 vpand(xmm3, xmm0, xmm2, false /* vector256 */); |

6553 vpclmulqdq(xmm0, xmm0, xmm3, 0x1); |

6554 psrldq(xmm1, 8); |

6555 psrldq(xmm2, 4); |

6556 pxor(xmm0, xmm1); |

6557 pxor(xmm0, xmm2); |

6558 |

6559 // 8 8-bit folds to compute 32-bit CRC. |

// Four folds on the vector register, then four on the general register.
// The loops run at code-generation time, so the fold sequence is emitted
// unrolled. xmm1 and rax serve as scratch.
6560 for (int j = 0; j < 4; j++) { |

6561 fold_8bit_crc32(xmm0, table, xmm1, rax); |

6562 } |

6563 movdl(crc, xmm0); // mov 32 bits to general register |

6564 for (int j = 0; j < 4; j++) { |

6565 fold_8bit_crc32(crc, table, rax); |

6566 } |

6567 |

6568 BIND(L_tail_restore); |

// Recover the byte count saved at L_aligned.
6569 movl(len, tmp); // restore |

6570 BIND(L_tail); |

// Only the bytes beyond the last whole 16-byte chunk are left.
6571 andl(len, 0xf); |

6572 jccb(Assembler::zero, L_exit); |

6573 |

6574 // Fold the rest of bytes |

6575 align(4); |

6576 BIND(L_tail_loop); |

6577 movsbl(rax, Address(buf, 0)); // load byte with sign extension |

6578 update_byte_crc32(crc, rax, table); |

6579 increment(buf); |

6580 decrementl(len); |

6581 jccb(Assembler::greater, L_tail_loop); |

6582 |

6583 BIND(L_exit); |

6584 notl(crc); // ~crc |

6585 } |
|
6586 |
6391 #undef BIND |
6587 #undef BIND |
6392 #undef BLOCK_COMMENT |
6588 #undef BLOCK_COMMENT |
6393 |
6589 |
6394 |
6590 |
6395 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) { |
6591 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) { |