src/cpu/ppc/vm/stubGenerator_ppc.cpp

changeset 9684:69f33959c27f
parent    9680:39678a65a0e8
child     9703:2fdf635bcf28
child     9713:c4567d28f31f
comparing 9682:9905a72841d7 with 9684:69f33959c27f
@@ -1226,12 +1226,12 @@
       __ align(32);

       __ bind(l_10);
       // Use loop with VSX load/store instructions to
       // copy 32 elements a time.
-      __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load src
-      __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst
+      __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
+      __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
       __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
       __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32
       __ bdnz(l_10);                       // Dec CTR and loop if not zero.
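
The only change in this hunk (and in the matching hunks below) is the spelling of the 16-byte loads and stores: the explicit zero index in lxvd2x(tmp_vsr1, 0, R3_ARG1) and stxvd2x(tmp_vsr1, 0, R4_ARG2) is dropped in favor of a two-operand form that takes only the base register, while the indexed forms using tmp1 are left as they were. The bytes moved are the same either way. As a rough, non-authoritative sketch of the two addressing forms, assuming the effective address is base plus an optional index (the helper names below are invented for illustration and are not HotSpot code):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Hypothetical stand-ins for a 16-byte indexed vector load/store.
    // Effective address = base + index; the new two-operand form above
    // corresponds to calling these with the default index of zero.
    static void load16(uint8_t vsr[16], const uint8_t* base, std::ptrdiff_t index = 0) {
      std::memcpy(vsr, base + index, 16);   // lxvd2x(vsr, base) / lxvd2x(vsr, index, base)
    }
    static void store16(const uint8_t vsr[16], uint8_t* base, std::ptrdiff_t index = 0) {
      std::memcpy(base + index, vsr, 16);   // stxvd2x(vsr, base) / stxvd2x(vsr, index, base)
    }

With that reading, each iteration of l_10 is a load16/store16 pair at offset 0 and another at offset 16, followed by the two pointer bumps of 32.
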
@@ -1494,12 +1494,12 @@
       __ align(32);

       __ bind(l_9);
       // Use loop with VSX load/store instructions to
       // copy 16 elements a time.
-      __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load from src.
-      __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst.
+      __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load from src.
+      __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst.
       __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1);  // Load from src + 16.
       __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
       __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32.
       __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32.
       __ bdnz(l_9);                        // Dec CTR and loop if not zero.
@@ -1688,12 +1688,12 @@
       __ align(32);

       __ bind(l_7);
       // Use loop with VSX load/store instructions to
       // copy 8 elements a time.
-      __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load src
-      __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst
+      __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
+      __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
       __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
       __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32
       __ bdnz(l_7);                        // Dec CTR and loop if not zero.
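
The three hunks above all touch loops with the same body: whatever the element width, each CTR iteration moves 32 bytes as two 16-byte vector transfers and then advances both pointers. A minimal sketch of that per-iteration movement, assuming a flat iteration count and non-overlapping ranges (the function name is made up; this is an illustration, not the stub's code):

    #include <cstddef>
    #include <cstring>

    // Sketch of one pass over the forward VSX loop above: 'iterations' plays
    // the role of the CTR value, each pass moves 32 bytes as two 16-byte
    // chunks, and memcpy implies the ranges are assumed not to overlap.
    static void forward_copy_32(char* dst, const char* src, std::size_t iterations) {
      while (iterations--) {                   // bdnz: decrement and loop if not zero
        std::memcpy(dst,      src,      16);   // lxvd2x/stxvd2x on src, dst
        std::memcpy(dst + 16, src + 16, 16);   // lxvd2x/stxvd2x on src + 16, dst + 16
        src += 32;                             // addi(R3_ARG1, R3_ARG1, 32)
        dst += 32;                             // addi(R4_ARG2, R4_ARG2, 32)
      }
    }
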
@@ -1754,33 +1754,56 @@
   //
   void generate_conjoint_int_copy_core(bool aligned) {
     // Do reverse copy. We assume the case of actual overlap is rare enough
     // that we don't have to optimize it.

-    Label l_1, l_2, l_3, l_4, l_5, l_6;
+    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;

     Register tmp1 = R6_ARG4;
     Register tmp2 = R7_ARG5;
     Register tmp3 = R8_ARG6;
     Register tmp4 = R0;

+    VectorSRegister tmp_vsr1 = VSR1;
+    VectorSRegister tmp_vsr2 = VSR2;
+
     { // FasterArrayCopy
       __ cmpwi(CCR0, R5_ARG3, 0);
       __ beq(CCR0, l_6);

       __ sldi(R5_ARG3, R5_ARG3, 2);
       __ add(R3_ARG1, R3_ARG1, R5_ARG3);
       __ add(R4_ARG2, R4_ARG2, R5_ARG3);
       __ srdi(R5_ARG3, R5_ARG3, 2);

+      if (!aligned) {
+        // check if arrays have same alignment mod 8.
+        __ xorr(tmp1, R3_ARG1, R4_ARG2);
+        __ andi_(R0, tmp1, 7);
+        // Not the same alignment, but ld and std just need to be 4 byte aligned.
+        __ bne(CCR0, l_7); // to OR from is 8 byte aligned -> copy 2 at a time
+
+        // copy 1 element to align to and from on an 8 byte boundary
+        __ andi_(R0, R3_ARG1, 7);
+        __ beq(CCR0, l_7);
+
+        __ addi(R3_ARG1, R3_ARG1, -4);
+        __ addi(R4_ARG2, R4_ARG2, -4);
+        __ addi(R5_ARG3, R5_ARG3, -1);
+        __ lwzx(tmp2, R3_ARG1);
+        __ stwx(tmp2, R4_ARG2);
+        __ bind(l_7);
+      }
+
       __ cmpwi(CCR0, R5_ARG3, 7);
       __ ble(CCR0, l_5); // copy 1 at a time if less than 8 elements remain

       __ srdi(tmp1, R5_ARG3, 3);
       __ andi(R5_ARG3, R5_ARG3, 7);
       __ mtctr(tmp1);

+      if (!VM_Version::has_vsx()) {
       __ bind(l_4);
       // Use unrolled version for mass copying (copy 4 elements a time).
       // Load feeding store gets zero latency on Power6, however not on Power5.
       // Therefore, the following sequence is made for the good of both.
       __ addi(R3_ARG1, R3_ARG1, -32);
@@ -1792,10 +1815,44 @@
       __ std(tmp4, 24, R4_ARG2);
       __ std(tmp3, 16, R4_ARG2);
       __ std(tmp2, 8, R4_ARG2);
       __ std(tmp1, 0, R4_ARG2);
       __ bdnz(l_4);
+      } else { // Processor supports VSX, so use it to mass copy.
+        // Prefetch the data into the L2 cache.
+        __ dcbt(R3_ARG1, 0);
+
+        // If supported set DSCR pre-fetch to deepest.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
+          __ mtdscr(tmp2);
+        }
+
+        __ li(tmp1, 16);
+
+        // Backbranch target aligned to 32-byte. Not 16-byte align as
+        // loop contains < 8 instructions that fit inside a single
+        // i-cache sector.
+        __ align(32);
+
+        __ bind(l_4);
+        // Use loop with VSX load/store instructions to
+        // copy 8 elements a time.
+        __ addi(R3_ARG1, R3_ARG1, -32);      // Update src-=32
+        __ addi(R4_ARG2, R4_ARG2, -32);      // Update dsc-=32
+        __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src+16
+        __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
+        __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
+        __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
+        __ bdnz(l_4);
+
+        // Restore DSCR pre-fetch value.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val);
+          __ mtdscr(tmp2);
+        }
+      }

       __ cmpwi(CCR0, R5_ARG3, 0);
       __ beq(CCR0, l_6);

       __ bind(l_5);
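
This hunk makes two additions to generate_conjoint_int_copy_core. For the unaligned entry, when src and dst share the same alignment mod 8 but are not yet 8-byte aligned, it first copies a single 4-byte element so the wide loop starts on an 8-byte boundary. It then adds a VSX bulk loop, selected at stub-generation time by VM_Version::has_vsx(), in place of the ld/std unrolled loop. The copy still runs backwards from the end of both arrays, which keeps it correct when dst starts inside src. A rough sketch of that backward structure, assuming exactly that overlap case (the function name is invented and memmove stands in for the 32-byte block of vector loads and stores; this is not HotSpot code):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Sketch of the reverse copy: walk from the end so that, when dst starts
    // inside src, no source element is overwritten before it has been read.
    // 'count >> 3' and 'count & 7' mirror the srdi/andi pair feeding mtctr.
    static void conjoint_int_copy_sketch(int32_t* dst, const int32_t* src, std::size_t count) {
      const int32_t* s = src + count;     // sldi/add: point past the last element
      int32_t*       d = dst + count;

      std::size_t blocks = count >> 3;    // 8-element (32-byte) blocks for the CTR
      std::size_t tail   = count & 7;     // 0..7 leftover elements

      while (blocks--) {                  // l_4: bulk loop (ld/std or VSX above)
        s -= 8;
        d -= 8;
        std::memmove(d, s, 32);           // two 16-byte transfers; memmove keeps
      }                                   // the sketch overlap-safe within a block
      while (tail--) {                    // l_5: one element at a time
        *--d = *--s;
      }
    }
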
@@ -1906,12 +1963,12 @@
       __ align(32);

       __ bind(l_5);
       // Use loop with VSX load/store instructions to
       // copy 4 elements a time.
-      __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load src
-      __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst
+      __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
+      __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
       __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
       __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32
       __ bdnz(l_5);                        // Dec CTR and loop if not zero.
@@ -1974,10 +2031,13 @@
     Register tmp1 = R6_ARG4;
     Register tmp2 = R7_ARG5;
     Register tmp3 = R8_ARG6;
     Register tmp4 = R0;

+    VectorSRegister tmp_vsr1 = VSR1;
+    VectorSRegister tmp_vsr2 = VSR2;
+
     Label l_1, l_2, l_3, l_4, l_5;

     __ cmpwi(CCR0, R5_ARG3, 0);
     __ beq(CCR0, l_1);

@@ -1992,10 +2052,11 @@

       __ srdi(tmp1, R5_ARG3, 2);
       __ andi(R5_ARG3, R5_ARG3, 3);
       __ mtctr(tmp1);

+      if (!VM_Version::has_vsx()) {
       __ bind(l_4);
       // Use unrolled version for mass copying (copy 4 elements a time).
       // Load feeding store gets zero latency on Power6, however not on Power5.
       // Therefore, the following sequence is made for the good of both.
       __ addi(R3_ARG1, R3_ARG1, -32);
@@ -2007,10 +2068,44 @@
       __ std(tmp4, 24, R4_ARG2);
       __ std(tmp3, 16, R4_ARG2);
       __ std(tmp2, 8, R4_ARG2);
       __ std(tmp1, 0, R4_ARG2);
       __ bdnz(l_4);
+      } else { // Processor supports VSX, so use it to mass copy.
+        // Prefetch the data into the L2 cache.
+        __ dcbt(R3_ARG1, 0);
+
+        // If supported set DSCR pre-fetch to deepest.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
+          __ mtdscr(tmp2);
+        }
+
+        __ li(tmp1, 16);
+
+        // Backbranch target aligned to 32-byte. Not 16-byte align as
+        // loop contains < 8 instructions that fit inside a single
+        // i-cache sector.
+        __ align(32);
+
+        __ bind(l_4);
+        // Use loop with VSX load/store instructions to
+        // copy 4 elements a time.
+        __ addi(R3_ARG1, R3_ARG1, -32);      // Update src-=32
+        __ addi(R4_ARG2, R4_ARG2, -32);      // Update dsc-=32
+        __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src+16
+        __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
+        __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
+        __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
+        __ bdnz(l_4);
+
+        // Restore DSCR pre-fetch value.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val);
+          __ mtdscr(tmp2);
+        }
+      }

       __ cmpwi(CCR0, R5_ARG3, 0);
       __ beq(CCR0, l_1);

       __ bind(l_5);
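
Both VSX branches bracket their copy loop with the same DSCR handling: when VM_Version::has_mfdscr() reports the register is available, the stub writes VM_Version::_dscr_val | 7 before the loop (together with a dcbt touch of the source) and writes the unmodified _dscr_val back afterwards. Presumably, going by the Power ISA rather than anything stated in this file, the low three DSCR bits are the default prefetch depth field, so ORing in 7 requests the deepest hardware prefetch for the duration of the loop and the restore puts the field back. A small self-contained illustration of that bit manipulation, with plain integers standing in for the special-purpose register (the saved value is hypothetical):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t dscr_saved = 0x40;             // hypothetical saved DSCR value
      const uint64_t dscr_loop  = dscr_saved | 7;   // mtdscr(_dscr_val | 7)

      assert((dscr_loop & 7) == 7);                 // prefetch depth forced to deepest
      assert((dscr_loop & ~UINT64_C(7)) == (dscr_saved & ~UINT64_C(7))); // other fields kept

      // ... the lxvd2x/stxvd2x loop would run here ...

      const uint64_t dscr_restored = dscr_saved;    // mtdscr(_dscr_val) on the way out
      assert(dscr_restored == dscr_saved);
      return 0;
    }
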
