1226 __ align(32);
1227
1228 __ bind(l_10);
1229 // Use loop with VSX load/store instructions to
1230 // copy 32 elements at a time.
1231 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
1232 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
1233 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
1234 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1235 __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
1236 __ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32
1237 __ bdnz(l_10); // Dec CTR and loop if not zero.
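At the C level this loop simply moves 32 bytes per iteration: two 16-byte vector loads followed by two 16-byte vector stores, with CTR holding the precomputed trip count. A minimal sketch under those assumptions (the helper name is invented and is not part of the stub generator):

    #include <cstring>
    #include <cstdint>
    #include <cstddef>

    // Illustrative only: mirrors the loop body above, 32 bytes per trip.
    static void copy_forward_32_per_iteration(const uint8_t* src, uint8_t* dst,
                                              size_t iterations) {
      for (size_t i = 0; i < iterations; ++i) {  // CTR supplies 'iterations'
        std::memcpy(dst,      src,      16);     // lxvd2x/stxvd2x pair, offset 0
        std::memcpy(dst + 16, src + 16, 16);     // second pair, offset tmp1 = 16
        src += 32;                               // addi src, 32
        dst += 32;                               // addi dst, 32
      }
    }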
1494 __ align(32);
1495
1496 __ bind(l_9);
1497 // Use loop with VSX load/store instructions to
1498 // copy 16 elements at a time.
1499 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load from src.
1500 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst.
1501 __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1); // Load from src + 16.
1502 __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
1503 __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32.
1504 __ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32.
1505 __ bdnz(l_9); // Dec CTR and loop if not zero.
1688 __ align(32);
1689
1690 __ bind(l_7);
1691 // Use loop with VSX load/store instructions to
1692 // copy 8 elements at a time.
1693 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
1694 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
1695 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
1696 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1697 __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
1698 __ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32
1699 __ bdnz(l_7); // Dec CTR and loop if not zero.
1754 //
1755 void generate_conjoint_int_copy_core(bool aligned) {
1756 // Do reverse copy. We assume the case of actual overlap is rare enough
1757 // that we don't have to optimize it.
1758
1759 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
1760
1761 Register tmp1 = R6_ARG4;
1762 Register tmp2 = R7_ARG5;
1763 Register tmp3 = R8_ARG6;
1764 Register tmp4 = R0;
1765
1766 VectorSRegister tmp_vsr1 = VSR1;
1767 VectorSRegister tmp_vsr2 = VSR2;
1768
1769 { // FasterArrayCopy
1770 __ cmpwi(CCR0, R5_ARG3, 0);
1771 __ beq(CCR0, l_6);
1772
1773 __ sldi(R5_ARG3, R5_ARG3, 2); // Convert element count to byte count.
1774 __ add(R3_ARG1, R3_ARG1, R5_ARG3); // Point src one past the last element.
1775 __ add(R4_ARG2, R4_ARG2, R5_ARG3); // Point dst one past the last element.
1776 __ srdi(R5_ARG3, R5_ARG3, 2); // Convert the byte count back to an element count.
1777
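The sequence above scales the element count to bytes, advances both pointers one element past the end of their regions, and then restores the element count so the loop can walk backwards. A minimal C++ sketch of the same setup, assuming raw int pointers (the helper name is invented):

    #include <cstdint>
    #include <cstddef>

    // Illustrative only: position src/dst for a backward int copy.
    static void point_past_end(const int32_t*& src, int32_t*& dst, size_t element_count) {
      src += element_count;  // sldi + add: advance by element_count * 4 bytes
      dst += element_count;  // same for the destination
      // srdi undoes the scaling; in C++ the count was never scaled to begin with.
    }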
1778 if (!aligned) {
1779 // Check whether the arrays have the same alignment mod 8.
1780 __ xorr(tmp1, R3_ARG1, R4_ARG2);
1781 __ andi_(R0, tmp1, 7);
1782 // Not the same alignment, but ld and std only need 4-byte alignment.
1783 __ bne(CCR0, l_7); // Alignments differ -> skip the 8-byte alignment step below.
1784
1785 // Copy 1 element to align 'to' and 'from' on an 8-byte boundary.
1786 __ andi_(R0, R3_ARG1, 7);
1787 __ beq(CCR0, l_7);
1788
1789 __ addi(R3_ARG1, R3_ARG1, -4);
1790 __ addi(R4_ARG2, R4_ARG2, -4);
1791 __ addi(R5_ARG3, R5_ARG3, -1);
1792 __ lwzx(tmp2, R3_ARG1);
1793 __ stwx(tmp2, R4_ARG2);
1794 __ bind(l_7);
1795 }
1796
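The xorr/andi_ pair is the usual trick for "do these two pointers agree modulo 8?": only then can both be brought to an 8-byte boundary by advancing them in lockstep. A standalone sketch of the same test (the function name is illustrative, not HotSpot code):

    #include <cstdint>

    // Illustrative only: true if both pointers can reach a common
    // 8-byte boundary by moving in lockstep.
    static bool same_alignment_mod8(const void* from, const void* to) {
      uintptr_t f = reinterpret_cast<uintptr_t>(from);
      uintptr_t t = reinterpret_cast<uintptr_t>(to);
      return ((f ^ t) & 7) == 0;  // xorr + andi_ against 7
    }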
1797 __ cmpwi(CCR0, R5_ARG3, 7);
1798 __ ble(CCR0, l_5); // Copy 1 at a time if fewer than 8 elements remain.
1799
1800 __ srdi(tmp1, R5_ARG3, 3); // Number of 8-element (32-byte) iterations.
1801 __ andi(R5_ARG3, R5_ARG3, 7); // Elements left over for the tail loop.
1802 __ mtctr(tmp1);
1803
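In other words, the element count is split into a CTR-driven trip count and a small remainder that is copied one element at a time afterwards. A hedged C++ sketch of the same split (the struct and function are invented for illustration):

    #include <cstddef>

    // Illustrative only: how the value fed to mtctr and the tail count relate.
    struct IntCopyPlan {
      size_t iterations;     // goes to CTR via mtctr
      size_t tail_elements;  // copied one at a time after the main loop
    };

    static IntCopyPlan plan_int_copy(size_t element_count) {
      IntCopyPlan p;
      p.iterations    = element_count >> 3;  // srdi tmp1, R5_ARG3, 3
      p.tail_elements = element_count & 7;   // andi  R5_ARG3, R5_ARG3, 7
      return p;
    }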
1804 if (!VM_Version::has_vsx()) {
1805 __ bind(l_4);
1806 // Use unrolled version for mass copying (copy 4 elements at a time).
1807 // A load feeding a store gets zero latency on Power6, but not on Power5.
1808 // Therefore, the following sequence is made for the good of both.
1809 __ addi(R3_ARG1, R3_ARG1, -32);
1815 __ std(tmp4, 24, R4_ARG2);
1816 __ std(tmp3, 16, R4_ARG2);
1817 __ std(tmp2, 8, R4_ARG2);
1818 __ std(tmp1, 0, R4_ARG2);
1819 __ bdnz(l_4);
1820 } else { // Processor supports VSX, so use it to mass copy.
1821 // Prefetch the data into the L2 cache.
1822 __ dcbt(R3_ARG1, 0);
1823
1824 // If supported, set the DSCR prefetch depth to deepest.
1825 if (VM_Version::has_mfdscr()) {
1826 __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1827 __ mtdscr(tmp2);
1828 }
1829
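ORing 7 into the saved DSCR value sets its three least-significant bits, which (as I understand the DSCR layout) form the default prefetch depth field, to the deepest setting; the saved value is written back after the loop. A trivial sketch of the value manipulation only (the helper is not HotSpot code):

    #include <cstdint>

    // Illustrative only: the value loaded into tmp2 before mtdscr above.
    static uint64_t dscr_with_deepest_prefetch(uint64_t saved_dscr_val) {
      return saved_dscr_val | 7;  // set the low 3 bits (prefetch depth) to maximum
    }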
1830 __ li(tmp1, 16);
1831
1832 // Back-branch target aligned to 32 bytes rather than 16, since the loop
1833 // contains fewer than 8 instructions and therefore fits inside a single
1834 // i-cache sector.
1835 __ align(32);
1836
1837 __ bind(l_4);
1838 // Use loop with VSX load/store instructions to
1839 // copy 8 elements at a time.
1840 __ addi(R3_ARG1, R3_ARG1, -32); // Update src-=32
1841 __ addi(R4_ARG2, R4_ARG2, -32); // Update dst-=32
1842 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src+16
1843 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
1844 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
1845 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
1846 __ bdnz(l_4); // Dec CTR and loop if not zero.
1847
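Compared with the forward loops earlier, both pointers are pre-decremented and the whole 32-byte chunk is loaded before anything is stored, so a destination that overlaps the source from above is never clobbered before it has been read. A rough C++ equivalent (names invented; it assumes the destination starts at a higher address, which is the case this reverse copy exists for):

    #include <cstring>
    #include <cstdint>
    #include <cstddef>

    // Illustrative only: backward copy, 32 bytes per iteration.
    static void copy_backward_32_per_iteration(const uint8_t* src_end, uint8_t* dst_end,
                                               size_t iterations) {
      const uint8_t* src = src_end;
      uint8_t* dst = dst_end;
      for (size_t i = 0; i < iterations; ++i) {
        src -= 32; dst -= 32;          // addi -32 on both pointers first
        uint8_t chunk[32];
        std::memcpy(chunk, src, 32);   // both lxvd2x loads happen before any store
        std::memcpy(dst, chunk, 32);   // then both stxvd2x stores
      }
    }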
1848 // Restore the DSCR prefetch value.
1849 if (VM_Version::has_mfdscr()) {
1850 __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1851 __ mtdscr(tmp2);
1852 }
1853 }
1854
1855 __ cmpwi(CCR0, R5_ARG3, 0);
1856 __ beq(CCR0, l_6);
1857
1858 __ bind(l_5);
1963 __ align(32);
1964
1965 __ bind(l_5);
1966 // Use loop with VSX load/store instructions to
1967 // copy 4 elements at a time.
1968 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
1969 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
1970 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
1971 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1972 __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
1973 __ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32
1974 __ bdnz(l_5); // Dec CTR and loop if not zero.
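Across the different copy stubs the same 32-byte loop body reappears; only the number of elements consumed per iteration changes with the element size (32 bytes, 16 shorts, 8 ints, 4 longs). A small illustrative template, not taken from HotSpot, that captures that relationship:

    #include <cstddef>

    // Illustrative only: elements consumed per 32-byte VSX loop iteration.
    template <typename T>
    constexpr size_t elements_per_32_byte_iteration() {
      static_assert(sizeof(T) <= 32 && 32 % sizeof(T) == 0, "element must tile 32 bytes");
      return 32 / sizeof(T);  // 32 for bytes, 16 for shorts, 8 for ints, 4 for longs
    }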
2052
2053 __ srdi(tmp1, R5_ARG3, 2); // Number of 4-element (32-byte) iterations.
2054 __ andi(R5_ARG3, R5_ARG3, 3); // Elements left over for the tail loop.
2055 __ mtctr(tmp1);
2056
2057 if (!VM_Version::has_vsx()) {
2058 __ bind(l_4);
2059 // Use unrolled version for mass copying (copy 4 elements at a time).
2060 // A load feeding a store gets zero latency on Power6, but not on Power5.
2061 // Therefore, the following sequence is made for the good of both.
2062 __ addi(R3_ARG1, R3_ARG1, -32);
2068 __ std(tmp4, 24, R4_ARG2);
2069 __ std(tmp3, 16, R4_ARG2);
2070 __ std(tmp2, 8, R4_ARG2);
2071 __ std(tmp1, 0, R4_ARG2);
2072 __ bdnz(l_4);
2073 } else { // Processor supports VSX, so use it to mass copy.
2074 // Prefetch the data into the L2 cache.
2075 __ dcbt(R3_ARG1, 0);
2076
2077 // If supported, set the DSCR prefetch depth to deepest.
2078 if (VM_Version::has_mfdscr()) {
2079 __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
2080 __ mtdscr(tmp2);
2081 }
2082
2083 __ li(tmp1, 16);
2084
2085 // Back-branch target aligned to 32 bytes rather than 16, since the loop
2086 // contains fewer than 8 instructions and therefore fits inside a single
2087 // i-cache sector.
2088 __ align(32);
2089
2090 __ bind(l_4);
2091 // Use loop with VSX load/store instructions to
2092 // copy 4 elements at a time.
2093 __ addi(R3_ARG1, R3_ARG1, -32); // Update src-=32
2094 __ addi(R4_ARG2, R4_ARG2, -32); // Update dst-=32
2095 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src+16
2096 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
2097 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
2098 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
2099 __ bdnz(l_4); // Dec CTR and loop if not zero.
2100
2101 // Restore the DSCR prefetch value.
2102 if (VM_Version::has_mfdscr()) {
2103 __ load_const_optimized(tmp2, VM_Version::_dscr_val);
2104 __ mtdscr(tmp2);
2105 }
2106 }
2107
2108 __ cmpwi(CCR0, R5_ARG3, 0);
2109 __ beq(CCR0, l_1);
2110
2111 __ bind(l_5);