704 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary |
713 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary |
705 // ignored |
714 // ignored |
706 // name - stub name string |
715 // name - stub name string |
707 // |
716 // |
708 // Inputs: |
717 // Inputs: |
709 // c_rarg0 - source array address |
718 // A0 - source array address |
710 // c_rarg1 - destination array address |
719 // A1 - destination array address |
711 // c_rarg2 - element count, treated as ssize_t, can be zero |
720 // A2 - element count, treated as ssize_t, can be zero |
712 // |
721 // |
713 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, |
722 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, |
714 // we let the hardware handle it. The one to eight bytes within words, |
723 // we let the hardware handle it. The one to eight bytes within words, |
715 // dwords or qwords that span cache line boundaries will still be loaded |
724 // dwords or qwords that span cache line boundaries will still be loaded |
716 // and stored atomically. |
725 // and stored atomically. |
717 // |
726 // |
718 address generate_conjoint_byte_copy(bool aligned, const char *name) { |
727 address generate_conjoint_byte_copy(bool aligned, const char *name) { |
719 Label l_1, l_2, l_3, l_4, l_5; |
728 __ align(CodeEntryAlignment); |
720 Label l_unaligned, l_aligned; |
729 StubCodeMark mark(this, "StubRoutines", name); |
721 StubCodeMark mark(this, "StubRoutines", name); |
730 address start = __ pc(); |
722 __ align(CodeEntryAlignment); |
731 |
723 address start = __ pc(); |
732 Label l_copy_4_bytes_loop, l_copy_suffix, l_copy_suffix_loop, l_exit; |
724 address nooverlap_target = aligned ? |
733 Label l_copy_byte, l_from_unaligned, l_unaligned, l_4_bytes_aligned; |
725 StubRoutines::arrayof_jbyte_disjoint_arraycopy() : |
734 |
726 StubRoutines::jbyte_disjoint_arraycopy(); |
735 address nooverlap_target = aligned ? |
727 |
736 StubRoutines::arrayof_jbyte_disjoint_arraycopy() : |
728 array_overlap_test(nooverlap_target, 0); |
737 StubRoutines::jbyte_disjoint_arraycopy(); |
729 |
738 |
730 __ push(T3); |
739 array_overlap_test(nooverlap_target, 0); |
731 __ push(T0); |
740 |
732 __ push(T1); |
741 const Register from = A0; // source array address |
733 __ push(T8); |
742 const Register to = A1; // destination array address |
734 |
743 const Register count = A2; // elements count |
735 // copy from high to low |
744 const Register end_from = T3; // source array end address |
736 __ move(T3, A0); |
745 const Register end_to = T0; // destination array end address |
737 __ move(T0, A1); |
746 const Register end_count = T1; // count of bytes not yet copied
738 __ move(T1, A2); |
747 |
739 __ dadd(T3, T3, T1); |
748 __ push(end_from); |
740 __ dadd(T0, T0, T1); |
749 __ push(end_to); |
741 |
750 __ push(end_count); |
742 // 2016/5/8 Jin: copy the starting unaligned bytes
751 __ push(T8); |
743 __ bind(l_unaligned); |
752 |
744 __ beq(T1, R0, l_5); |
753 // copy from high to low |
745 __ delayed()->nop(); |
754 __ move(end_count, count); |
746 |
755 __ dadd(end_from, from, end_count); |
747 __ andi(AT, T3, 3); |
756 __ dadd(end_to, to, end_count); |
748 __ beq(AT, R0, l_aligned); |
757 |
749 __ delayed()->nop(); |
758 // 2016/05/08 aoqi: If end_from and end_to have different alignments, a byte-by-byte (unaligned) copy is performed.
750 __ lb(AT, T3, -1); |
759 __ andi(AT, end_from, 3); |
751 __ sb(AT, T0, -1); |
760 __ andi(T8, end_to, 3); |
752 __ daddi(AT, T1, -1); |
761 __ bne(AT, T8, l_copy_byte); |
753 __ daddi(AT, T3, -1); |
762 __ delayed()->nop(); |
754 __ daddi(AT, T0, -1); |
763 |
755 __ b(l_unaligned); |
764 // First deal with the unaligned data at the top. |
756 __ delayed()->nop(); |
765 __ bind(l_unaligned); |
757 |
766 __ beq(end_count, R0, l_exit); |
758 // now T0, T3 point to 4-byte aligned high-ends |
767 __ delayed()->nop(); |
759 // T1 contains byte count that is not copied. |
768 |
760 __ bind(l_aligned); |
769 __ andi(AT, end_from, 3); |
761 |
770 __ bne(AT, R0, l_from_unaligned); |
762 __ move(T8, T1); |
771 __ delayed()->nop(); |
763 __ daddi(AT, T1, -3); |
772 |
764 __ blez(AT, l_3); |
773 __ andi(AT, end_to, 3); |
765 __ delayed()->nop(); |
774 __ beq(AT, R0, l_4_bytes_aligned); |
766 |
775 __ delayed()->nop(); |
767 __ andi(T8, T8, 3); |
776 |
768 __ lea(T3, Address(T3, -4)); |
777 __ bind(l_from_unaligned); |
769 __ lea(T0, Address(T0, -4)); |
778 __ lb(AT, end_from, -1); |
770 |
779 __ sb(AT, end_to, -1); |
771 __ dsrl(T1, T1, 2); |
780 __ daddi(end_from, end_from, -1); |
772 __ align(16); |
781 __ daddi(end_to, end_to, -1); |
773 __ bind(l_1); |
782 __ daddi(end_count, end_count, -1); |
774 __ lw(AT, T3, 0); |
783 __ b(l_unaligned); |
775 __ sw(AT, T0, 0); |
784 __ delayed()->nop(); |
776 __ addi(T3, T3, -4); |
785 |
777 __ addi(T0, T0, -4); |
786 // now end_to, end_from point to 4-byte aligned high-ends |
778 __ addi(T1, T1, -1); |
787 // end_count contains byte count that is not copied. |
779 __ bne(T1, R0, l_1); |
788 // copy 4 bytes at a time |
780 __ delayed()->nop(); |
789 __ bind(l_4_bytes_aligned); |
781 __ b(l_3); |
790 |
782 __ delayed()->nop(); |
791 __ move(T8, end_count); |
783 // copy dwords aligned or not with repeat move |
792 __ daddi(AT, end_count, -3); |
784 __ bind(l_2); |
793 __ blez(AT, l_copy_suffix); |
785 __ bind(l_3); |
794 __ delayed()->nop(); |
786 // copy suffix (0-3 bytes) |
795 |
787 __ andi(T8, T8, 3); |
796 //__ andi(T8, T8, 3); |
788 __ beq(T8, R0, l_5); |
797 __ lea(end_from, Address(end_from, -4)); |
789 __ delayed()->nop(); |
798 __ lea(end_to, Address(end_to, -4)); |
790 __ addi(T3, T3, 3); |
799 |
791 __ addi(T0, T0, 3); |
800 __ dsrl(end_count, end_count, 2); |
792 __ bind(l_4); |
801 __ align(16); |
793 __ lb(AT, T3, 0); |
802 __ bind(l_copy_4_bytes_loop); //l_copy_4_bytes |
794 __ sb(AT, T0, 0); |
803 __ lw(AT, end_from, 0); |
795 __ addi(T3, T3, -1); |
804 __ sw(AT, end_to, 0); |
796 __ addi(T0, T0, -1); |
805 __ addi(end_from, end_from, -4); |
797 __ addi(T8, T8, -1); |
806 __ addi(end_to, end_to, -4); |
798 __ bne(T8, R0, l_4); |
807 __ addi(end_count, end_count, -1); |
799 __ delayed()->nop(); |
808 __ bne(end_count, R0, l_copy_4_bytes_loop); |
800 __ bind(l_5); |
809 __ delayed()->nop(); |
801 __ pop(T8); |
810 |
802 __ pop(T1); |
811 __ b(l_copy_suffix); |
803 __ pop(T0); |
812 __ delayed()->nop(); |
804 __ pop(T3); |
813 // copy dwords aligned or not with repeat move |
805 __ jr(RA); |
814 // l_copy_suffix |
806 __ delayed()->nop(); |
815 // copy suffix (0-3 bytes) |
807 return start; |
816 __ bind(l_copy_suffix); |
|
817 __ andi(T8, T8, 3); |
|
818 __ beq(T8, R0, l_exit); |
|
819 __ delayed()->nop(); |
|
820 __ addi(end_from, end_from, 3); |
|
821 __ addi(end_to, end_to, 3); |
|
822 __ bind(l_copy_suffix_loop); |
|
823 __ lb(AT, end_from, 0); |
|
824 __ sb(AT, end_to, 0); |
|
825 __ addi(end_from, end_from, -1); |
|
826 __ addi(end_to, end_to, -1); |
|
827 __ addi(T8, T8, -1); |
|
828 __ bne(T8, R0, l_copy_suffix_loop); |
|
829 __ delayed()->nop(); |
|
830 |
|
831 __ bind(l_copy_byte); |
|
832 __ beq(end_count, R0, l_exit); |
|
833 __ delayed()->nop(); |
|
834 __ lb(AT, end_from, -1); |
|
835 __ sb(AT, end_to, -1); |
|
836 __ daddi(end_from, end_from, -1); |
|
837 __ daddi(end_to, end_to, -1); |
|
838 __ daddi(end_count, end_count, -1); |
|
839 __ b(l_copy_byte); |
|
840 __ delayed()->nop(); |
|
841 |
|
842 __ bind(l_exit); |
|
843 __ pop(T8); |
|
844 __ pop(end_count); |
|
845 __ pop(end_to); |
|
846 __ pop(end_from); |
|
847 __ jr(RA); |
|
848 __ delayed()->nop(); |
|
849 return start; |
808 } |
850 } |
809 |
851 |
810 // Arguments: |
852 // Arguments: |
811 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary |
853 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary |
812 // ignored |
854 // ignored |