627 // |
627 // |
628 // Side Effects: |
628 // Side Effects: |
629 // disjoint_byte_copy_entry is set to the no-overlap entry point |
629 // disjoint_byte_copy_entry is set to the no-overlap entry point |
630 // used by generate_conjoint_byte_copy(). |
630 // used by generate_conjoint_byte_copy(). |
631 // |
631 // |
632 address generate_disjoint_byte_copy(bool aligned, const char *name) { |
632 address generate_disjoint_byte_copy(bool aligned, const char * name) { |
633 StubCodeMark mark(this, "StubRoutines", name); |
633 StubCodeMark mark(this, "StubRoutines", name); |
634 __ align(CodeEntryAlignment); |
634 __ align(CodeEntryAlignment); |
635 address start = __ pc(); |
635 |
636 Label l_0, l_1, l_2, l_3, l_4, l_5, l_6; |
636 |
637 |
637 Register tmp1 = T0; |
638 __ push(T3); |
638 Register tmp2 = T1; |
639 __ push(T0); |
639 Register tmp3 = T3; |
640 __ push(T1); |
640 |
641 __ push(T8); |
641 address start = __ pc(); |
642 __ move(T3, A0); |
642 |
643 __ move(T0, A1); |
643 __ push(tmp1); |
644 __ move(T1, A2); |
644 __ push(tmp2); |
645 __ move(T8, T1); // original count in T1 |
645 __ push(tmp3); |
646 __ daddi(AT, T1, -3); |
646 __ move(tmp1, A0); |
647 __ blez(AT, l_4); |
647 __ move(tmp2, A1); |
648 __ delayed()->nop(); |
648 __ move(tmp3, A2); |
649 if (!aligned) { |
649 |
650 //TODO: copy 8 bytes at one time |
650 |
651 // 2016/5/8 Jin: lw/sw can be used only when src and dest have the same alignment |
651 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10, l_11; |
652 __ andi(AT, T3, 3); |
652 Label l_debug; |
653 __ andi(T9, T0, 3); |
653 |
654 __ bne(AT, T9, l_5); |
654 __ daddi(AT, tmp3, -9); //why the number is 9 ? |
655 __ delayed()->nop(); |
655 __ blez(AT, l_9); |
656 |
656 __ delayed()->nop(); |
657 // align source address at dword address boundary |
657 |
658 __ move(T1, 4); |
658 if (!aligned) { |
659 __ sub(T1, T1, T3); |
659 __ xorr(AT, tmp1, tmp2); |
660 __ andi(T1, T1, 3); |
660 __ andi(AT, AT, 1); |
661 __ beq(T1, R0, l_1); |
661 __ bne(AT, R0, l_9); // if arrays don't have the same alignment mod 2, do 1 element copy |
662 __ delayed()->nop(); |
662 __ delayed()->nop(); |
663 __ sub(T8,T8,T1); |
663 |
664 __ bind(l_0); |
664 __ andi(AT, tmp1, 1); |
665 __ lb(AT, T3, 0); |
665 __ beq(AT, R0, l_10); //copy 1 enlement if necessary to aligh to 2 bytes |
666 __ sb(AT, T0, 0); |
666 __ delayed()->nop(); |
667 __ addi(T3, T3, 1); |
667 |
668 __ addi(T0, T0, 1); |
668 __ lb(AT, tmp1, 0); |
669 __ addi(T1 ,T1, -1); |
669 __ daddi(tmp1, tmp1, 1); |
670 __ bne(T1, R0, l_0); |
670 __ sb(AT, tmp2, 0); |
671 __ delayed()->nop(); |
671 __ daddi(tmp2, tmp2, 1); |
672 __ bind(l_1); |
672 __ daddi(tmp3, tmp3, -1); |
673 __ move(T1, T8); |
673 __ bind(l_10); |
674 } |
674 |
675 __ shr(T1, 2); |
675 __ xorr(AT, tmp1, tmp2); |
676 __ beq(T1, R0, l_4); // no dwords to move |
676 __ andi(AT, AT, 3); |
677 __ delayed()->nop(); |
677 __ bne(AT, R0, l_1); // if arrays don't have the same alignment mod 4, do 2 elements copy |
678 // copy aligned dwords |
678 __ delayed()->nop(); |
679 __ bind(l_2); |
679 |
680 __ align(16); |
680 // At this point it is guaranteed that both, from and to have the same alignment mod 4. |
681 __ bind(l_3); |
681 |
682 __ lw(AT, T3, 0); |
682 // Copy 2 elements if necessary to align to 4 bytes. |
683 __ sw(AT, T0, 0 ); |
683 __ andi(AT, tmp1, 3); |
684 __ addi(T3, T3, 4); |
684 __ beq(AT, R0, l_2); |
685 __ addi(T0, T0, 4); |
685 __ delayed()->nop(); |
686 __ addi(T1, T1, -1); |
686 |
687 __ bne(T1, R0, l_3); |
687 __ lhu(AT, tmp1, 0); |
688 __ delayed()->nop(); |
688 __ daddi(tmp1, tmp1, 2); |
689 __ bind(l_4); |
689 __ sh(AT, tmp2, 0); |
690 __ move(T1, T8); |
690 __ daddi(tmp2, tmp2, 2); |
691 __ andi(T1, T1, 3); |
691 __ daddi(tmp3, tmp3, -2); |
692 __ beq(T1, R0, l_6); |
692 __ bind(l_2); |
693 __ delayed()->nop(); |
693 |
694 // copy suffix |
694 // At this point the positions of both, from and to, are at least 4 byte aligned. |
695 __ bind(l_5); |
695 |
696 __ lb(AT, T3, 0); |
696 // Copy 4 elements at a time. |
697 __ sb(AT, T0, 0); |
697 // Align to 8 bytes, but only if both, from and to, have same alignment mod 8. |
698 __ addi(T3, T3, 1); |
698 __ xorr(AT, tmp1, tmp2); |
699 __ addi(T0, T0, 1); |
699 __ andi(AT, AT, 7); |
700 __ addi(T1, T1, -1); |
700 __ bne(AT, R0, l_6); // not same alignment mod 8 -> copy 2, either from or to will be unaligned |
701 __ bne(T1, R0, l_5 ); |
701 __ delayed()->nop(); |
702 __ delayed()->nop(); |
702 |
703 __ bind(l_6); |
703 // Copy 4 elements if necessary to align to 8 bytes. |
704 __ pop(T8); |
704 __ andi(AT, tmp1, 7); |
705 __ pop(T1); |
705 __ beq(AT, R0, l_7); |
706 __ pop(T0); |
706 __ delayed()->nop(); |
707 __ pop(T3); |
707 |
708 __ jr(RA); |
708 __ lw(AT, tmp1, 0); |
709 __ delayed()->nop(); |
709 __ daddi(tmp3, tmp3, -4); |
710 return start; |
710 __ sw(AT, tmp2, 0); |
|
711 { // FasterArrayCopy |
|
712 __ daddi(tmp1, tmp1, 4); |
|
713 __ daddi(tmp2, tmp2, 4); |
|
714 } |
|
715 } |
|
716 |
|
717 __ bind(l_7); |
|
718 |
|
719 // Copy 4 elements at a time; either the loads or the stores can |
|
720 // be unaligned if aligned == false. |
|
721 |
|
722 { // FasterArrayCopy |
|
723 __ daddi(AT, tmp3, -7); |
|
724 __ blez(AT, l_6); // copy 4 at a time if less than 4 elements remain |
|
725 __ delayed()->nop(); |
|
726 |
|
727 __ bind(l_8); |
|
728 // For Loongson, there is 128-bit memory access. TODO |
|
729 __ ld(AT, tmp1, 0); |
|
730 __ sd(AT, tmp2, 0); |
|
731 __ daddi(tmp1, tmp1, 8); |
|
732 __ daddi(tmp2, tmp2, 8); |
|
733 __ daddi(tmp3, tmp3, -8); |
|
734 __ daddi(AT, tmp3, -8); |
|
735 __ bgez(AT, l_8); |
|
736 __ delayed()->nop(); |
|
737 } |
|
738 __ bind(l_6); |
|
739 |
|
740 // copy 4 bytes at a time |
|
741 { // FasterArrayCopy |
|
742 __ daddi(AT, tmp3, -3); |
|
743 __ blez(AT, l_1); |
|
744 __ delayed()->nop(); |
|
745 |
|
746 __ bind(l_3); |
|
747 __ lw(AT, tmp1, 0); |
|
748 __ sw(AT, tmp2, 0); |
|
749 __ daddi(tmp1, tmp1, 4); |
|
750 __ daddi(tmp2, tmp2, 4); |
|
751 __ daddi(tmp3, tmp3, -4); |
|
752 __ daddi(AT, tmp3, -4); |
|
753 __ bgez(AT, l_3); |
|
754 __ delayed()->nop(); |
|
755 |
|
756 } |
|
757 |
|
758 // do 2-byte copy
|
759 __ bind(l_1); |
|
760 { |
|
761 __ daddi(AT, tmp3, -1); |
|
762 __ blez(AT, l_9); |
|
763 __ delayed()->nop(); |
|
764 |
|
765 __ bind(l_5); |
|
766 __ lhu(AT, tmp1, 0); |
|
767 __ daddi(tmp3, tmp3, -2); |
|
768 __ sh(AT, tmp2, 0); |
|
769 __ daddi(tmp1, tmp1, 2); |
|
770 __ daddi(tmp2, tmp2, 2); |
|
771 __ daddi(AT, tmp3, -2); |
|
772 __ bgez(AT, l_5); |
|
773 __ delayed()->nop(); |
|
774 } |
|
775 |
|
776 // do 1-element (byte) copy
|
777 __ bind(l_9); |
|
778 __ beq(R0, tmp3, l_4); |
|
779 __ delayed()->nop(); |
|
780 |
|
781 { |
|
782 __ bind(l_11); |
|
783 __ lb(AT, tmp1, 0); |
|
784 __ daddi(tmp3, tmp3, -1); |
|
785 __ sb(AT, tmp2, 0); |
|
786 __ daddi(tmp1, tmp1, 1); |
|
787 __ daddi(tmp2, tmp2, 1); |
|
788 __ daddi(AT, tmp3, -1); |
|
789 __ bgez(AT, l_11); |
|
790 __ delayed()->nop(); |
|
791 } |
|
792 |
|
793 __ bind(l_4); |
|
794 __ pop(tmp3); |
|
795 __ pop(tmp2); |
|
796 __ pop(tmp1); |
|
797 |
|
798 __ jr(RA); |
|
799 __ delayed()->nop(); |
|
800 |
|
801 return start; |
711 } |
802 } |
712 |
803 |
713 // Arguments: |
804 // Arguments: |
714 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary |
805 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary |
715 // ignored |
806 // ignored |