Mon, 10 Oct 2016 14:44:09 +0800
#4535 Loop unrolling for disjoint_short_copy
Reviewed-by: aoqi
The performance improvement on SPECjvm2008 is not significant.
src/cpu/mips/vm/stubGenerator_mips_64.cpp | file | annotate | diff | comparison | revisions |
1.1 --- a/src/cpu/mips/vm/stubGenerator_mips_64.cpp Sun Oct 09 15:36:29 2016 +0800 1.2 +++ b/src/cpu/mips/vm/stubGenerator_mips_64.cpp Mon Oct 10 14:44:09 2016 +0800 1.3 @@ -971,27 +971,7 @@ 1.4 // left. (l_6) 1.5 // 7. copy 1 element at a time. (l_5) 1.6 // 8. copy last element if one was left in step 6. (l_1) 1.7 - // 1.8 - // TODO: 1.9 - // 1.10 - // 1. use loongson 128-bit load/store 1.11 - // 2. use loop unrolling optimization when len is big enough, for example if len > 0x2000: 1.12 - // __ bind(l_x); 1.13 - // __ ld(AT, tmp1, 0); 1.14 - // __ ld(tmp, tmp1, 8); 1.15 - // __ sd(AT, tmp2, 0); 1.16 - // __ sd(tmp, tmp2, 8); 1.17 - // __ ld(AT, tmp1, 16); 1.18 - // __ ld(tmp, tmp1, 24); 1.19 - // __ sd(AT, tmp2, 16); 1.20 - // __ sd(tmp, tmp2, 24); 1.21 - // __ daddi(tmp1, tmp1, 32); 1.22 - // __ daddi(tmp2, tmp2, 32); 1.23 - // __ daddi(tmp3, tmp3, -16); 1.24 - // __ daddi(AT, tmp3, -16); 1.25 - // __ bgez(AT, l_x); 1.26 - // __ delayed()->nop(); 1.27 - // 1.28 + 1.29 address generate_disjoint_short_copy(bool aligned, const char * name) { 1.30 StubCodeMark mark(this, "StubRoutines", name); 1.31 __ align(CodeEntryAlignment); 1.32 @@ -1000,23 +980,28 @@ 1.33 Register tmp2 = T1; 1.34 Register tmp3 = T3; 1.35 Register tmp4 = T8; 1.36 + Register tmp5 = T9; 1.37 + Register tmp6 = T2; 1.38 1.39 address start = __ pc(); 1.40 1.41 __ push(tmp1); 1.42 __ push(tmp2); 1.43 __ push(tmp3); 1.44 - __ push(tmp4); 1.45 __ move(tmp1, A0); 1.46 __ move(tmp2, A1); 1.47 __ move(tmp3, A2); 1.48 1.49 - Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10, l_11; 1.50 + Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10, l_11, l_12, l_13, l_14; 1.51 Label l_debug; 1.52 // don't try anything fancy if arrays don't have many elements 1.53 - __ daddi(AT, tmp3, -9); 1.54 - __ blez(AT, l_1); 1.55 + __ daddi(AT, tmp3, -23); 1.56 + __ blez(AT, l_14); 1.57 __ delayed()->nop(); 1.58 + // move push here 1.59 + __ push(tmp4); 1.60 + __ push(tmp5); 1.61 + __ push(tmp6); 1.62 1.63 if 
(!aligned) { 1.64 __ xorr(AT, A0, A1); 1.65 @@ -1060,115 +1045,143 @@ 1.66 __ lw(AT, tmp1, 0); 1.67 __ daddi(tmp3, tmp3, -2); 1.68 __ sw(AT, tmp2, 0); 1.69 - { // FasterArrayCopy 1.70 - __ daddi(tmp1, tmp1, 4); 1.71 - __ daddi(tmp2, tmp2, 4); 1.72 - } 1.73 - }//end of if 1.74 + __ daddi(tmp1, tmp1, 4); 1.75 + __ daddi(tmp2, tmp2, 4); 1.76 + }// end of if (!aligned) 1.77 1.78 - __ bind(l_7); 1.79 + __ bind(l_7); 1.80 + // At this time the position of both, from and to, are at least 8 byte aligned. 1.81 + // Copy 8 elemnets at a time. 1.82 + // Align to 16 bytes, but only if both from and to have same alignment mod 8. 1.83 + __ xorr(AT, tmp1, tmp2); 1.84 + __ andi(AT, AT, 15); 1.85 + __ bne(AT, R0, l_9); 1.86 + __ delayed()->nop(); 1.87 1.88 - // At this time the position of both, from and to, are at least 8 byte aligned. 1.89 + // Copy 4-element word if necessary to align to 16 bytes, 1.90 + __ andi(AT, tmp1, 15); 1.91 + __ beq(AT, R0, l_10); 1.92 + __ delayed()->nop(); 1.93 1.94 - // Copy 8 elemnets at a time. 1.95 - // Align to 16 bytes, but only if both from and to have same alignment mod 8. 
1.96 - __ xorr(AT, tmp1, tmp2); 1.97 - __ andi(AT, AT, 15); 1.98 - __ bne(AT, R0, l_9); 1.99 - __ delayed()->nop(); 1.100 + __ ld(AT, tmp1, 0); 1.101 + __ daddi(tmp3, tmp3, -4); 1.102 + __ sd(AT, tmp2, 0); 1.103 + __ daddi(tmp1, tmp1, 8); 1.104 + __ daddi(tmp2, tmp2, 8); 1.105 1.106 - // Copy 4-element word if necessary to align to 16 bytes, 1.107 - __ andi(AT, tmp1, 15); 1.108 - __ beq(AT, R0, l_10); 1.109 - __ delayed()->nop(); 1.110 - 1.111 - __ ld(AT, tmp1, 0); 1.112 - __ daddi(tmp3, tmp3, -4); 1.113 - __ sd(AT, tmp2, 0); 1.114 - { // FasterArrayCopy 1.115 - __ daddi(tmp1, tmp1, 8); 1.116 - __ daddi(tmp2, tmp2, 8); 1.117 - } 1.118 - 1.119 - __ bind(l_10); 1.120 + __ bind(l_10); 1.121 1.122 // Copy 8 elements at a time; either the loads or the stores can 1.123 // be unalligned if aligned == false 1.124 1.125 { // FasterArrayCopy 1.126 - __ daddi(AT, tmp3, -15); 1.127 - __ blez(AT, l_9); 1.128 - __ delayed()->nop(); 1.129 - 1.130 __ bind(l_11); 1.131 // For loongson the 128-bit memory access instruction is gslq/gssq 1.132 __ gslq(AT, tmp4, tmp1, 0); 1.133 - __ daddi(tmp1, tmp1, 16); 1.134 - __ daddi(tmp3, tmp3, -8); 1.135 - __ daddi(tmp2, tmp2, 16); 1.136 - __ gssq(AT, tmp4, tmp2, -16); 1.137 - __ daddi(AT, tmp3, -8); 1.138 + __ gslq(tmp5, tmp6, tmp1, 16); 1.139 + __ daddi(tmp1, tmp1, 32); 1.140 + __ daddi(tmp2, tmp2, 32); 1.141 + __ gssq(AT, tmp4, tmp2, -32); 1.142 + __ gssq(tmp5, tmp6, tmp2, -16); 1.143 + __ daddi(tmp3, tmp3, -16); 1.144 + __ daddi(AT, tmp3, -16); 1.145 __ bgez(AT, l_11); 1.146 __ delayed()->nop(); 1.147 } 1.148 __ bind(l_9); 1.149 + 1.150 // Copy 4 elements at a time; either the loads or the stores can 1.151 // be unaligned if aligned == false. 
1.152 - 1.153 { // FasterArrayCopy 1.154 - __ daddi(AT, tmp3, -3); 1.155 - __ blez(AT, l_6); // copy 2 at a time if less than 16 elements remain 1.156 + __ daddi(AT, tmp3, -15);// loop unrolling 4 times, so if the elements should not be less than 16 1.157 + __ blez(AT, l_4); // copy 2 at a time if less than 16 elements remain 1.158 __ delayed()->nop(); 1.159 1.160 __ bind(l_8); 1.161 __ ld(AT, tmp1, 0); 1.162 + __ ld(tmp4, tmp1, 8); 1.163 + __ ld(tmp5, tmp1, 16); 1.164 + __ ld(tmp6, tmp1, 24); 1.165 __ sd(AT, tmp2, 0); 1.166 - __ daddi(tmp1, tmp1, 8); 1.167 - __ daddi(tmp2, tmp2, 8); 1.168 - __ daddi(tmp3, tmp3, -4); 1.169 - __ daddi(AT, tmp3, -4); 1.170 + __ sd(tmp4, tmp2, 8); 1.171 + __ sd(tmp5, tmp2,16); 1.172 + __ daddi(tmp1, tmp1, 32); 1.173 + __ daddi(tmp2, tmp2, 32); 1.174 + __ daddi(tmp3, tmp3, -16); 1.175 + __ daddi(AT, tmp3, -16); 1.176 __ bgez(AT, l_8); 1.177 - __ delayed()->nop(); 1.178 + __ sd(tmp6, tmp2, -8); 1.179 } 1.180 __ bind(l_6); 1.181 1.182 // copy 2 element at a time 1.183 { // FasterArrayCopy 1.184 - __ daddi(AT, tmp3, -1); 1.185 - __ blez(AT, l_1); 1.186 + __ daddi(AT, tmp3, -7); 1.187 + __ blez(AT, l_4); 1.188 __ delayed()->nop(); 1.189 1.190 __ bind(l_3); 1.191 __ lw(AT, tmp1, 0); 1.192 + __ lw(tmp4, tmp1, 4); 1.193 + __ lw(tmp5, tmp1, 8); 1.194 + __ lw(tmp6, tmp1, 12); 1.195 __ sw(AT, tmp2, 0); 1.196 - __ daddi(tmp1, tmp1, 4); 1.197 - __ daddi(tmp2, tmp2, 4); 1.198 - __ daddi(tmp3, tmp3, -2); 1.199 - __ daddi(AT, tmp3, -2); 1.200 + __ sw(tmp4, tmp2, 4); 1.201 + __ sw(tmp5, tmp2, 8); 1.202 + __ daddi(tmp1, tmp1, 16); 1.203 + __ daddi(tmp2, tmp2, 16); 1.204 + __ daddi(tmp3, tmp3, -8); 1.205 + __ daddi(AT, tmp3, -8); 1.206 __ bgez(AT, l_3); 1.207 - __ delayed()->nop(); 1.208 - 1.209 + __ sw(tmp6, tmp2, -4); 1.210 } 1.211 1.212 + __ bind(l_1); 1.213 // do single element copy (8 bit), can this happen? 
1.214 - __ bind(l_1); 1.215 - __ beq(R0, tmp3, l_4); 1.216 - __ delayed()->nop(); 1.217 - 1.218 { // FasterArrayCopy 1.219 + __ daddi(AT, tmp3, -3); 1.220 + __ blez(AT, l_4); 1.221 + __ delayed()->nop(); 1.222 1.223 __ bind(l_5); 1.224 __ lhu(AT, tmp1, 0); 1.225 - __ daddi(tmp3, tmp3, -1); 1.226 + __ lhu(tmp4, tmp1, 2); 1.227 + __ lhu(tmp5, tmp1, 4); 1.228 + __ lhu(tmp6, tmp1, 6); 1.229 + __ sh(AT, tmp2, 0); 1.230 + __ sh(tmp4, tmp2, 2); 1.231 + __ sh(tmp5, tmp2, 4); 1.232 + __ daddi(tmp1, tmp1, 8); 1.233 + __ daddi(tmp2, tmp2, 8); 1.234 + __ daddi(tmp3, tmp3, -4); 1.235 + __ daddi(AT, tmp3, -4); 1.236 + __ bgez(AT, l_5); 1.237 + __ sh(tmp6, tmp2, -2); 1.238 + } 1.239 + // single element 1.240 + __ bind(l_4); 1.241 + 1.242 + __ pop(tmp6); 1.243 + __ pop(tmp5); 1.244 + __ pop(tmp4); 1.245 + 1.246 + __ bind(l_14); 1.247 + { // FasterArrayCopy 1.248 + __ beq(R0, tmp3, l_13); 1.249 + __ delayed()->nop(); 1.250 + 1.251 + __ bind(l_12); 1.252 + __ lhu(AT, tmp1, 0); 1.253 __ sh(AT, tmp2, 0); 1.254 __ daddi(tmp1, tmp1, 2); 1.255 __ daddi(tmp2, tmp2, 2); 1.256 + __ daddi(tmp3, tmp3, -1); 1.257 __ daddi(AT, tmp3, -1); 1.258 - __ bgez(AT, l_5); 1.259 + __ bgez(AT, l_12); 1.260 __ delayed()->nop(); 1.261 } 1.262 - __ bind(l_4); 1.263 - __ pop(tmp4); 1.264 + 1.265 + __ bind(l_13); 1.266 __ pop(tmp3); 1.267 __ pop(tmp2); 1.268 __ pop(tmp1);