7291 bind(L_copy_1_char_exit); |
7291 bind(L_copy_1_char_exit); |
7292 addptr(result, len); // len is negative count of not processed elements |
7292 addptr(result, len); // len is negative count of not processed elements |
7293 bind(L_done); |
7293 bind(L_done); |
7294 } |
7294 } |
7295 |
7295 |
|
7296 #ifdef _LP64 |
|
7297 /** |

7298 * Helper for multiply_to_len(). |
 *
 * Emits code that adds src1 and then src2 into dest_lo using 64-bit adds,
 * folding the carry flag produced by each addition into dest_hi, so that
 * dest_hi:dest_lo behaves as a 128-bit accumulator.

7299 */ |

7300 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) { |

// Each adcq must directly follow its addq: it consumes the carry flag that addq just set.
7301 addq(dest_lo, src1); |

7302 adcq(dest_hi, 0); |

7303 addq(dest_lo, src2); |

7304 adcq(dest_hi, 0); |

7305 } |
|
7306 |
|
7307 /** |

7308 * Multiply 64 bit by 64 bit first loop. |
 *
 * Emits the loop that multiplies one chunk of x (loaded into x_xstart) by
 * successive chunks of y, writing the low 64 bits of every partial product
 * into z and keeping the high 64 bits in carry for the next iteration.
 *
 * The arrays are addressed as 32-bit words (Address::times_4); two adjacent
 * words are loaded as one 64-bit value and their halves swapped with rorq.
 * product is used as the implicit mulq operand (rax, per the comment on the
 * mulq below) and rdx is clobbered.
 *
 * The trailing "z[xstart] = carry" of the pseudocode is not emitted here;
 * on exit the value is still held in carry.

7309 */ |

7310 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart, |

7311 Register y, Register y_idx, Register z, |

7312 Register carry, Register product, |

7313 Register idx, Register kdx) { |

7314 // |

7315 // jlong carry, x[], y[], z[]; |

7316 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { |

7317 // huge_128 product = y[idx] * x[xstart] + carry; |

7318 // z[kdx] = (jlong)product; |

7319 // carry = (jlong)(product >>> 64); |

7320 // } |

7321 // z[xstart] = carry; |

7322 // |

7323 |

7324 Label L_first_loop, L_first_loop_exit; |

7325 Label L_one_x, L_one_y, L_multiply; |

7326 |

// Step xstart back one more word so that x[xstart], x[xstart+1] form a 64-bit pair.
// A negative result means only a single 32-bit word of x is available (L_one_x).
7327 decrementl(xstart); |

7328 jcc(Assembler::negative, L_one_x); |

7329 |

7330 movq(x_xstart, Address(x, xstart, Address::times_4, 0)); |

7331 rorq(x_xstart, 32); // convert big-endian to little-endian |

7332 |

7333 bind(L_first_loop); |

// idx is decremented twice per iteration (two 32-bit words of y per multiply).
// Negative after the first decrement: y is exhausted.
// Negative after the second: exactly one 32-bit word of y is left (L_one_y).
7334 decrementl(idx); |

7335 jcc(Assembler::negative, L_first_loop_exit); |

7336 decrementl(idx); |

7337 jcc(Assembler::negative, L_one_y); |

7338 movq(y_idx, Address(y, idx, Address::times_4, 0)); |

7339 rorq(y_idx, 32); // convert big-endian to little-endian |

7340 bind(L_multiply); |

7341 movq(product, x_xstart); |

7342 mulq(y_idx); // product(rax) * y_idx -> rdx:rax |

// Add the incoming carry to the 128-bit product held in rdx:product.
7343 addq(product, carry); |

7344 adcq(rdx, 0); |

// Store the low 64 bits as two 32-bit words: low half at z[kdx+1], high half at z[kdx].
7345 subl(kdx, 2); |

7346 movl(Address(z, kdx, Address::times_4, 4), product); |

7347 shrq(product, 32); |

7348 movl(Address(z, kdx, Address::times_4, 0), product); |

// The high 64 bits become the carry for the next iteration.
7349 movq(carry, rdx); |

7350 jmp(L_first_loop); |

7351 |

// Single remaining word of y: 32-bit load of y[0], then rejoin the multiply.
7352 bind(L_one_y); |

7353 movl(y_idx, Address(y, 0)); |

7354 jmp(L_multiply); |

7355 |

// Single word of x: 32-bit load of x[0], then enter the loop.
7356 bind(L_one_x); |

7357 movl(x_xstart, Address(x, 0)); |

7358 jmp(L_first_loop); |

7359 |

7360 bind(L_first_loop_exit); |

7361 } |
|
7362 |
|
7363 /** |

7364 * Multiply 64 bit by 64 bit and add 128 bit. |
 *
 * Emits one multiply-accumulate step: loads a 64-bit chunk of y at
 * (idx * 4 + offset), multiplies it by x_xstart, adds the corresponding
 * 64-bit chunk of z and the incoming carry, and writes the low 64 bits of
 * the sum back to the same position in z.
 *
 * yz_idx is scratch (it first holds the y chunk, then the z chunk).
 * product is used as the implicit mulq operand (rax, per the comment on the
 * mulq below). On exit the upper 64 bits of the sum are left in rdx; the
 * carry register itself is not updated here.
 *
 * offset is the displacement applied to both the y and z addresses.

7365 */ |

7366 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z, |

7367 Register yz_idx, Register idx, |

7368 Register carry, Register product, int offset) { |

7369 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry; |

7370 // z[kdx] = (jlong)product; |

7371 |

7372 movq(yz_idx, Address(y, idx, Address::times_4, offset)); |

7373 rorq(yz_idx, 32); // convert big-endian to little-endian |

7374 movq(product, x_xstart); |

7375 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax) |

// yz_idx is free again after the multiply; reuse it for the z chunk.
7376 movq(yz_idx, Address(z, idx, Address::times_4, offset)); |

7377 rorq(yz_idx, 32); // convert big-endian to little-endian |

7378 |

// rdx:product += carry + z chunk
7379 add2_with_carry(rdx, product, carry, yz_idx); |

7380 |

// Write the low 64 bits back as two 32-bit words: low half at offset+4, high half at offset.
7381 movl(Address(z, idx, Address::times_4, offset+4), product); |

7382 shrq(product, 32); |

7383 movl(Address(z, idx, Address::times_4, offset), product); |

7384 |

7385 } |
|
7386 |
|
7387 /** |

7388 * Multiply 128 bit by 128 bit. Unrolled inner loop. |
 *
 * Emits the inner ("third") loop for the non-BMI2 path: multiplies x_xstart
 * by every word of y and accumulates into z, consuming four 32-bit words
 * (two 64-bit chunks) per unrolled iteration, followed by tail code for a
 * leftover 64-bit chunk and/or a single leftover 32-bit word.
 *
 * idx holds the number of 32-bit words still to process and is consumed.
 * jdx, yz_idx and carry2 are scratch. product is used as the implicit mulq
 * operand (rax, per the comments on mulq) and rdx is clobbered.
 * carry is both input and output.

7389 */ |

7390 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z, |

7391 Register yz_idx, Register idx, Register jdx, |

7392 Register carry, Register product, |

7393 Register carry2) { |

7394 // jlong carry, x[], y[], z[]; |

7395 // int kdx = ystart+1; |

7396 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop |

7397 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; |

7398 // z[kdx+idx+1] = (jlong)product; |

7399 // jlong carry2 = (jlong)(product >>> 64); |

7400 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; |

7401 // z[kdx+idx] = (jlong)product; |

7402 // carry = (jlong)(product >>> 64); |

7403 // } |

7404 // idx += 2; |

7405 // if (idx > 0) { |

7406 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; |

7407 // z[kdx+idx] = (jlong)product; |

7408 // carry = (jlong)(product >>> 64); |

7409 // } |

7410 // |

7411 |

7412 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; |

7413 |

// jdx = idx / 4: number of unrolled iterations (four 32-bit words each).
7414 movl(jdx, idx); |

7415 andl(jdx, 0xFFFFFFFC); |

7416 shrl(jdx, 2); |

7417 |

7418 bind(L_third_loop); |

7419 subl(jdx, 1); |

7420 jcc(Assembler::negative, L_third_loop_exit); |

7421 subl(idx, 4); |

7422 |

// Chunk at displacement 8 first; its high half (rdx) feeds the chunk at displacement 0 via carry2.
7423 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8); |

7424 movq(carry2, rdx); |

7425 |

7426 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0); |

7427 movq(carry, rdx); |

7428 jmp(L_third_loop); |

7429 |

7430 bind (L_third_loop_exit); |

7431 |

// Tail: idx & 3 words (0..3) remain.
7432 andl (idx, 0x3); |

7433 jcc(Assembler::zero, L_post_third_loop_done); |

7434 |

7435 Label L_check_1; |

// At least two words left? If so, process them as one 64-bit chunk.
7436 subl(idx, 2); |

7437 jcc(Assembler::negative, L_check_1); |

7438 |

7439 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0); |

7440 movq(carry, rdx); |

7441 |

7442 bind (L_check_1); |

// Undo the subtraction and test the low bit of the remainder:
// idx ends up 0 if one 32-bit word is still pending, -1 (negative) otherwise.
7443 addl (idx, 0x2); |

7444 andl (idx, 0x1); |

7445 subl(idx, 1); |

7446 jcc(Assembler::negative, L_post_third_loop_done); |

7447 |

// Last single word: 32-bit loads of y[idx] and z[idx], multiplied by the 64-bit x_xstart.
7448 movl(yz_idx, Address(y, idx, Address::times_4, 0)); |

7449 movq(product, x_xstart); |

7450 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax) |

7451 movl(yz_idx, Address(z, idx, Address::times_4, 0)); |

7452 |

7453 add2_with_carry(rdx, product, yz_idx, carry); |

7454 |

// Only the low 32 bits are stored; the remaining bits of rdx:product,
// shifted right by 32, form the outgoing carry.
7455 movl(Address(z, idx, Address::times_4, 0), product); |

7456 shrq(product, 32); |

7457 |

7458 shlq(rdx, 32); |

7459 orq(product, rdx); |

7460 movq(carry, product); |

7461 |

7462 bind(L_post_third_loop_done); |

7463 } |
|
7464 |
|
7465 /** |

7466 * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop. |

7467 * |
 * Same structure as multiply_128_x_128_loop(), but the multiplicand is taken
 * implicitly from rdx by mulxq (see the "* rdx" comments below), and rorxq is
 * used for the half swap. Four 32-bit words (two 64-bit chunks) of y and z
 * are processed per unrolled iteration, followed by tail code for a leftover
 * 64-bit chunk and/or a single leftover 32-bit word.
 *
 * idx holds the number of 32-bit words still to process and is consumed.
 * carry is both input and output. carry2, jdx, yz_idx1, yz_idx2, tmp, tmp3
 * and tmp4 are all overwritten.

7468 */ |

7469 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z, |

7470 Register carry, Register carry2, |

7471 Register idx, Register jdx, |

7472 Register yz_idx1, Register yz_idx2, |

7473 Register tmp, Register tmp3, Register tmp4) { |

7474 assert(UseBMI2Instructions, "should be used only when BMI2 is available"); |

7475 |

7476 // jlong carry, x[], y[], z[]; |

7477 // int kdx = ystart+1; |

7478 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop |

7479 // huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry; |

7480 // jlong carry2 = (jlong)(tmp3 >>> 64); |

7481 // huge_128 tmp4 = (y[idx] * rdx) + z[kdx+idx] + carry2; |

7482 // carry = (jlong)(tmp4 >>> 64); |

7483 // z[kdx+idx+1] = (jlong)tmp3; |

7484 // z[kdx+idx] = (jlong)tmp4; |

7485 // } |

7486 // idx += 2; |

7487 // if (idx > 0) { |

7488 // yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry; |

7489 // z[kdx+idx] = (jlong)yz_idx1; |

7490 // carry = (jlong)(yz_idx1 >>> 64); |

7491 // } |

7492 // |

7493 |

7494 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; |

7495 |

// jdx = idx / 4: number of unrolled iterations (four 32-bit words each).
7496 movl(jdx, idx); |

7497 andl(jdx, 0xFFFFFFFC); |

7498 shrl(jdx, 2); |

7499 |

7500 bind(L_third_loop); |

7501 subl(jdx, 1); |

7502 jcc(Assembler::negative, L_third_loop_exit); |

7503 subl(idx, 4); |

7504 |

// Load both 64-bit chunks of y for this iteration (displacements 8 and 0).
7505 movq(yz_idx1, Address(y, idx, Address::times_4, 8)); |

7506 rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian |

7507 movq(yz_idx2, Address(y, idx, Address::times_4, 0)); |

7508 rorxq(yz_idx2, yz_idx2, 32); |

7509 |

7510 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3 |

7511 mulxq(carry2, tmp, yz_idx2); // yz_idx2 * rdx -> carry2:tmp |

7512 |

// The y values are consumed; reuse the same registers for the matching z chunks.
7513 movq(yz_idx1, Address(z, idx, Address::times_4, 8)); |

7514 rorxq(yz_idx1, yz_idx1, 32); |

7515 movq(yz_idx2, Address(z, idx, Address::times_4, 0)); |

7516 rorxq(yz_idx2, yz_idx2, 32); |

7517 |

7518 if (VM_Version::supports_adx()) { |

// NOTE(review): adcxq chains through the carry flag and adoxq through the
// overflow flag, so the two additions per word run as independent carry chains.
// Both flags appear to be assumed clear on entry, presumably left that way by
// subl(idx, 4) above with no flag-writing instruction in between - confirm.
7519 adcxq(tmp3, carry); |

7520 adoxq(tmp3, yz_idx1); |

7521 |

7522 adcxq(tmp4, tmp); |

7523 adoxq(tmp4, yz_idx2); |

7524 |

// Zero carry with a mov so the pending flags survive, then fold both chains into carry2.
7525 movl(carry, 0); // does not affect flags |

7526 adcxq(carry2, carry); |

7527 adoxq(carry2, carry); |

7528 } else { |

// tmp4:tmp3 += carry + z chunk at displacement 8; then carry2:tmp4 += tmp + z chunk at displacement 0.
7529 add2_with_carry(tmp4, tmp3, carry, yz_idx1); |

7530 add2_with_carry(carry2, tmp4, tmp, yz_idx2); |

7531 } |

7532 movq(carry, carry2); |

7533 |

// Store tmp3 and tmp4 back to z, each as two 32-bit words (low half at the higher address).
7534 movl(Address(z, idx, Address::times_4, 12), tmp3); |

7535 shrq(tmp3, 32); |

7536 movl(Address(z, idx, Address::times_4, 8), tmp3); |

7537 |

7538 movl(Address(z, idx, Address::times_4, 4), tmp4); |

7539 shrq(tmp4, 32); |

7540 movl(Address(z, idx, Address::times_4, 0), tmp4); |

7541 |

7542 jmp(L_third_loop); |

7543 |

7544 bind (L_third_loop_exit); |

7545 |

// Tail: idx & 3 words (0..3) remain.
7546 andl (idx, 0x3); |

7547 jcc(Assembler::zero, L_post_third_loop_done); |

7548 |

7549 Label L_check_1; |

// At least two words left? If so, process them as one 64-bit chunk.
7550 subl(idx, 2); |

7551 jcc(Assembler::negative, L_check_1); |

7552 |

7553 movq(yz_idx1, Address(y, idx, Address::times_4, 0)); |

7554 rorxq(yz_idx1, yz_idx1, 32); |

7555 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3 |

7556 movq(yz_idx2, Address(z, idx, Address::times_4, 0)); |

7557 rorxq(yz_idx2, yz_idx2, 32); |

7558 |

7559 add2_with_carry(tmp4, tmp3, carry, yz_idx2); |

7560 |

7561 movl(Address(z, idx, Address::times_4, 4), tmp3); |

7562 shrq(tmp3, 32); |

7563 movl(Address(z, idx, Address::times_4, 0), tmp3); |

7564 movq(carry, tmp4); |

7565 |

7566 bind (L_check_1); |

// Undo the subtraction and test the low bit of the remainder:
// idx ends up 0 if one 32-bit word is still pending, -1 (negative) otherwise.
7567 addl (idx, 0x2); |

7568 andl (idx, 0x1); |

7569 subl(idx, 1); |

7570 jcc(Assembler::negative, L_post_third_loop_done); |

// Last single word: 32-bit loads of y[idx] and z[idx]; tmp4 is reused as scratch for both.
7571 movl(tmp4, Address(y, idx, Address::times_4, 0)); |

7572 mulxq(carry2, tmp3, tmp4); // tmp4 * rdx -> carry2:tmp3 |

7573 movl(tmp4, Address(z, idx, Address::times_4, 0)); |

7574 |

7575 add2_with_carry(carry2, tmp3, tmp4, carry); |

7576 |

// Only the low 32 bits are stored; the remaining bits of carry2:tmp3,
// shifted right by 32, form the outgoing carry.
7577 movl(Address(z, idx, Address::times_4, 0), tmp3); |

7578 shrq(tmp3, 32); |

7579 |

7580 shlq(carry2, 32); |

7581 orq(tmp3, carry2); |

7582 movq(carry, tmp3); |

7583 |

7584 bind(L_post_third_loop_done); |

7585 } |
|
7586 |
|
7587 /** |

7588 * Code for BigInteger::multiplyToLen() intrinsic. |

7589 * |
 * Emits code that multiplies the word arrays x and y into z, following the
 * Java-style pseudocode quoted in the comments below but processing two
 * 32-bit words at a time where possible. tmp1..tmp5, xlen and zlen are
 * saved on the stack on entry and restored before the end; rdx is used
 * without being saved. xlen and zlen are reused as scratch (product and
 * x_xstart) once their values have been copied.
 *

7590 * rdi: x |

7591 * rax: xlen |

7592 * rsi: y |

7593 * rcx: ylen |

7594 * r8: z |

7595 * r11: zlen |

7596 * r12: tmp1 |

7597 * r13: tmp2 |

7598 * r14: tmp3 |

7599 * r15: tmp4 |

7600 * rbx: tmp5 |

7601 * |

7602 */ |

7603 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen, |

7604 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) { |

7605 ShortBranchVerifier sbv(this); |

7606 assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx); |

7607 |

// Save every register this routine overwrites; they are popped in reverse order at L_done.
7608 push(tmp1); |

7609 push(tmp2); |

7610 push(tmp3); |

7611 push(tmp4); |

7612 push(tmp5); |

7613 |

7614 push(xlen); |

7615 push(zlen); |

7616 |

7617 const Register idx = tmp1; |

7618 const Register kdx = tmp2; |

7619 const Register xstart = tmp3; |

7620 |

7621 const Register y_idx = tmp4; |

7622 const Register carry = tmp5; |

7623 const Register product = xlen; |

7624 const Register x_xstart = zlen; // reuse register |

7625 |

7626 // First Loop. |

7627 // |

7628 // final static long LONG_MASK = 0xffffffffL; |

7629 // int xstart = xlen - 1; |

7630 // int ystart = ylen - 1; |

7631 // long carry = 0; |

7632 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { |

7633 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; |

7634 // z[kdx] = (int)product; |

7635 // carry = product >>> 32; |

7636 // } |

7637 // z[xstart] = (int)carry; |

7638 // |

7639 |

7640 movl(idx, ylen); // idx = ylen; |

7641 movl(kdx, zlen); // kdx = xlen+ylen; |

7642 xorq(carry, carry); // carry = 0; |

7643 |

7644 Label L_done; |

7645 |

// xstart = xlen - 1; a negative result (xlen == 0) skips all the work.
7646 movl(xstart, xlen); |

7647 decrementl(xstart); |

7648 jcc(Assembler::negative, L_done); |

7649 |

7650 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx); |

7651 |

// Store the carry left over by the first loop into the z words below kdx:
// nothing if kdx == 0, one 32-bit word if kdx == 1, otherwise two words
// (low half at kdx-1, high half at kdx-2).
7652 Label L_second_loop; |

7653 testl(kdx, kdx); |

7654 jcc(Assembler::zero, L_second_loop); |

7655 |

7656 Label L_carry; |

7657 subl(kdx, 1); |

7658 jcc(Assembler::zero, L_carry); |

7659 |

7660 movl(Address(z, kdx, Address::times_4, 0), carry); |

7661 shrq(carry, 32); |

7662 subl(kdx, 1); |

7663 |

7664 bind(L_carry); |

7665 movl(Address(z, kdx, Address::times_4, 0), carry); |

7666 |

7667 // Second and third (nested) loops. |

7668 // |

7669 // for (int i = xstart-1; i >= 0; i--) { // Second loop |

7670 // carry = 0; |

7671 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop |

7672 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + |

7673 // (z[k] & LONG_MASK) + carry; |

7674 // z[k] = (int)product; |

7675 // carry = product >>> 32; |

7676 // } |

7677 // z[i] = (int)carry; |

7678 // } |

7679 // |

7680 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx |

7681 |

7682 const Register jdx = tmp1; |

7683 |

7684 bind(L_second_loop); |

7685 xorl(carry, carry); // carry = 0; |

7686 movl(jdx, ylen); // j = ystart+1 |

7687 |

7688 subl(xstart, 1); // i = xstart-1; |

7689 jcc(Assembler::negative, L_done); |

7690 |

// z is temporarily advanced for the inner loop; the original pointer is restored by pop(z) below.
7691 push (z); |

7692 |

7693 Label L_last_x; |

7694 lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j |

// Step back a second word so that two 32-bit words of x can be loaded at once;
// a negative result means only x[0] is left (handled at L_last_x).
7695 subl(xstart, 1); // i = xstart-1; |

7696 jcc(Assembler::negative, L_last_x); |

7697 |

// The BMI2 inner loop takes its multiplicand from rdx; the other one takes it in x_xstart.
7698 if (UseBMI2Instructions) { |

7699 movq(rdx, Address(x, xstart, Address::times_4, 0)); |

7700 rorxq(rdx, rdx, 32); // convert big-endian to little-endian |

7701 } else { |

7702 movq(x_xstart, Address(x, xstart, Address::times_4, 0)); |

7703 rorq(x_xstart, 32); // convert big-endian to little-endian |

7704 } |

7705 |

7706 Label L_third_loop_prologue; |

7707 bind(L_third_loop_prologue); |

7708 |

// x and ylen are handed to the inner loops as scratch registers (carry2 and jdx),
// and the BMI2 loop also receives tmp3 (xstart) as scratch, so all three are saved here.
7709 push (x); |

7710 push (xstart); |

7711 push (ylen); |

7712 |

7713 |

7714 if (UseBMI2Instructions) { |

7715 multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4); |

7716 } else { // !UseBMI2Instructions |

7717 multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x); |

7718 } |

7719 |

// Note the asymmetry: the value pushed from xstart is popped into xlen
// and copied back into tmp3 (xstart) just below.
7720 pop(ylen); |

7721 pop(xlen); |

7722 pop(x); |

7723 pop(z); |

7724 |

// Store the carry of this pass: low 32 bits at z[xstart+1], then (if xstart
// is not negative) the high 32 bits at z[xstart].
7725 movl(tmp3, xlen); |

7726 addl(tmp3, 1); |

7727 movl(Address(z, tmp3, Address::times_4, 0), carry); |

7728 subl(tmp3, 1); |

7729 jccb(Assembler::negative, L_done); |

7730 |

7731 shrq(carry, 32); |

7732 movl(Address(z, tmp3, Address::times_4, 0), carry); |

7733 jmp(L_second_loop); |

7734 |

7735 // Next infrequent code is moved outside loops. |

// Only one 32-bit word of x remains: load x[0] alone as the multiplicand.
7736 bind(L_last_x); |

7737 if (UseBMI2Instructions) { |

7738 movl(rdx, Address(x, 0)); |

7739 } else { |

7740 movl(x_xstart, Address(x, 0)); |

7741 } |

7742 jmp(L_third_loop_prologue); |

7743 |

7744 bind(L_done); |

7745 |

// Restore the registers saved on entry, in reverse push order.
7746 pop(zlen); |

7747 pop(xlen); |

7748 |

7749 pop(tmp5); |

7750 pop(tmp4); |

7751 pop(tmp3); |

7752 pop(tmp2); |

7753 pop(tmp1); |

7754 } |
|
7755 #endif |
|
7756 |
7296 /** |
7757 /** |
7297 * Emits code to update CRC-32 with a byte value according to constants in table |
7758 * Emits code to update CRC-32 with a byte value according to constants in table |
7298 * |
7759 * |
7299 * @param [in,out]crc Register containing the crc. |
7760 * @param [in,out]crc Register containing the crc. |
7300 * @param [in]val Register containing the byte to fold into the CRC. |
7761 * @param [in]val Register containing the byte to fold into the CRC. |