#4535 Loop unrolling for disjoint_short_copy

author		chenhaoxuan
date		Mon, 10 Oct 2016 14:44:09 +0800
changeset	126:23a51da62a65
parent		125:14abbb45a7dd
child		127:62d2feb3da1e

#4535 Loop unrolling for disjoint_short_copy
Reviewed-by: aoqi

The improvement on SPECjvm2008 is not obvious.
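For context, the transformation is classic 4x loop unrolling: the main copy loop now moves four 64-bit chunks (16 short elements) per iteration instead of one, cutting the per-iteration loop overhead (pointer updates, count update, branch, delay-slot filler) to roughly a quarter. A minimal C++ sketch of the idea, assuming a hypothetical helper name and using memcpy for alignment-safe 64-bit moves -- this models the stub, it is not the stub code:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Illustrative model of the unrolled disjoint short copy (name assumed).
    static void short_copy_sketch(const uint16_t* from, uint16_t* to, size_t count) {
      // Main loop: 4x unrolled, four 8-byte moves = 16 elements per iteration.
      while (count >= 16) {
        uint64_t a, b, c, d;
        std::memcpy(&a, from + 0, 8);   // loads (memcpy keeps them alignment-safe)
        std::memcpy(&b, from + 4, 8);
        std::memcpy(&c, from + 8, 8);
        std::memcpy(&d, from + 12, 8);
        std::memcpy(to + 0, &a, 8);     // stores
        std::memcpy(to + 4, &b, 8);
        std::memcpy(to + 8, &c, 8);
        std::memcpy(to + 12, &d, 8);
        from += 16; to += 16; count -= 16;
      }
      // Tail: one element at a time, like the stub's l_12 loop.
      for (; count > 0; count--)
        *to++ = *from++;
    }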

src/cpu/mips/vm/stubGenerator_mips_64.cpp
--- a/src/cpu/mips/vm/stubGenerator_mips_64.cpp	Sun Oct 09 15:36:29 2016 +0800
+++ b/src/cpu/mips/vm/stubGenerator_mips_64.cpp	Mon Oct 10 14:44:09 2016 +0800
@@ -971,27 +971,7 @@
   //        left. (l_6)
   //     7. copy 1 element at a time. (l_5)
   //     8. copy last element if one was left in step 6. (l_1)
-  //
-  //  TODO:
-  //
-  //  1. use loongson 128-bit load/store
-  //  2. use loop unrolling optimization when len is big enough, for example if len > 0x2000:
-  //    __ bind(l_x);
-  //    __ ld(AT, tmp1, 0);
-  //    __ ld(tmp, tmp1, 8);
-  //    __ sd(AT, tmp2, 0);
-  //    __ sd(tmp, tmp2, 8);
-  //    __ ld(AT, tmp1, 16);
-  //    __ ld(tmp, tmp1, 24);
-  //    __ sd(AT, tmp2, 16);
-  //    __ sd(tmp, tmp2, 24);
-  //    __ daddi(tmp1, tmp1, 32);
-  //    __ daddi(tmp2, tmp2, 32);
-  //    __ daddi(tmp3, tmp3, -16);
-  //    __ daddi(AT, tmp3, -16);
-  //    __ bgez(AT, l_x);
-  //    __ delayed()->nop();
-  //
+
   address generate_disjoint_short_copy(bool aligned, const char * name) {
     StubCodeMark mark(this, "StubRoutines", name);
     __ align(CodeEntryAlignment);
@@ -1000,23 +980,28 @@
     Register tmp2 = T1;
     Register tmp3 = T3;
     Register tmp4 = T8;
+    Register tmp5 = T9;
+    Register tmp6 = T2;
 
     address start = __ pc();
 
     __ push(tmp1);
     __ push(tmp2);
     __ push(tmp3);
-    __ push(tmp4);
     __ move(tmp1, A0);
     __ move(tmp2, A1);
     __ move(tmp3, A2);
 
-    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10, l_11;
+    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10, l_11, l_12, l_13, l_14;
    Label l_debug;
     // don't try anything fancy if arrays don't have many elements
-    __ daddi(AT, tmp3, -9);
-    __ blez(AT, l_1);
+    __ daddi(AT, tmp3, -23);
+    __ blez(AT, l_14);
     __ delayed()->nop();
+    // tmp4-tmp6 are pushed here, after the size check, so that short arrays
+    // do not pay for the extra saves and restores
+    __ push(tmp4);
+    __ push(tmp5);
+    __ push(tmp6);
 
     if (!aligned) {
       __ xorr(AT, A0, A1);
@@ -1060,115 +1045,143 @@
       __ lw(AT, tmp1, 0);
       __ daddi(tmp3, tmp3, -2);
       __ sw(AT, tmp2, 0);
-      { // FasterArrayCopy
-        __ daddi(tmp1, tmp1, 4);
-        __ daddi(tmp2, tmp2, 4);
-      }
-    }//end of if
+      __ daddi(tmp1, tmp1, 4);
+      __ daddi(tmp2, tmp2, 4);
+    } // end of if (!aligned)
 
-      __ bind(l_7);
+    __ bind(l_7);
+    // At this point both from and to are at least 8-byte aligned.
+    // Copy 16 elements (32 bytes) at a time in the main loop below.
+    // Align to 16 bytes first, but only if from and to have the same alignment mod 16.
+    __ xorr(AT, tmp1, tmp2);
+    __ andi(AT, AT, 15);
+    __ bne(AT, R0, l_9);
+    __ delayed()->nop();
 
-      // At this time the position of both, from and to, are at least 8 byte aligned.
+    // Copy 4 elements (one doubleword) if necessary to align to 16 bytes.
+    __ andi(AT, tmp1, 15);
+    __ beq(AT, R0, l_10);
+    __ delayed()->nop();
 
-      // Copy 8 elemnets at a time.
-      // Align to 16 bytes, but only if both from and to have same alignment mod 8.
-      __ xorr(AT, tmp1, tmp2);
-      __ andi(AT, AT, 15);
-      __ bne(AT, R0, l_9);
-      __ delayed()->nop();
+    __ ld(AT, tmp1, 0);
+    __ daddi(tmp3, tmp3, -4);
+    __ sd(AT, tmp2, 0);
+    __ daddi(tmp1, tmp1, 8);
+    __ daddi(tmp2, tmp2, 8);
 
-      // Copy 4-element word if necessary to align to 16 bytes,
-      __ andi(AT, tmp1, 15);
-      __ beq(AT, R0, l_10);
-      __ delayed()->nop();
-
-      __ ld(AT, tmp1, 0);
-      __ daddi(tmp3, tmp3, -4);
-      __ sd(AT, tmp2, 0);
-      { // FasterArrayCopy
-        __ daddi(tmp1, tmp1, 8);
-        __ daddi(tmp2, tmp2, 8);
-      }
-
-      __ bind(l_10);
+    __ bind(l_10);
 
     // Copy 8 elements at a time; either the loads or the stores can
     // be unaligned if aligned == false
 
     { // FasterArrayCopy
-      __ daddi(AT, tmp3, -15);
-      __ blez(AT, l_9);
-      __ delayed()->nop();
-
       __ bind(l_11);
       // For Loongson, the 128-bit memory access instructions are gslq/gssq
       __ gslq(AT, tmp4, tmp1, 0);
-      __ daddi(tmp1, tmp1, 16);
-      __ daddi(tmp3, tmp3, -8);
-      __ daddi(tmp2, tmp2, 16);
-      __ gssq(AT, tmp4, tmp2, -16);
-      __ daddi(AT, tmp3, -8);
+      __ gslq(tmp5, tmp6, tmp1, 16);
+      __ daddi(tmp1, tmp1, 32);
+      __ daddi(tmp2, tmp2, 32);
+      __ gssq(AT, tmp4, tmp2, -32);
+      __ gssq(tmp5, tmp6, tmp2, -16);
+      __ daddi(tmp3, tmp3, -16);
+      __ daddi(AT, tmp3, -16);
       __ bgez(AT, l_11);
       __ delayed()->nop();
     }
     __ bind(l_9);
+
     // Copy 4 elements at a time; either the loads or the stores can
     // be unaligned if aligned == false.
-
     { // FasterArrayCopy
-      __ daddi(AT, tmp3, -3);
-      __ blez(AT, l_6); // copy 2 at a time if less than 16 elements remain
+      __ daddi(AT, tmp3, -15); // the loop is unrolled 4 times, so at least 16 elements are needed
+      __ blez(AT, l_4); // copy 1 element at a time if fewer than 16 remain
       __ delayed()->nop();
 
       __ bind(l_8);
       __ ld(AT, tmp1, 0);
+      __ ld(tmp4, tmp1, 8);
+      __ ld(tmp5, tmp1, 16);
+      __ ld(tmp6, tmp1, 24);
       __ sd(AT, tmp2, 0);
-      __ daddi(tmp1, tmp1, 8);
-      __ daddi(tmp2, tmp2, 8);
-      __ daddi(tmp3, tmp3, -4);
-      __ daddi(AT, tmp3, -4);
+      __ sd(tmp4, tmp2, 8);
+      __ sd(tmp5, tmp2, 16);
+      __ daddi(tmp1, tmp1, 32);
+      __ daddi(tmp2, tmp2, 32);
+      __ daddi(tmp3, tmp3, -16);
+      __ daddi(AT, tmp3, -16);
       __ bgez(AT, l_8);
-      __ delayed()->nop();
+      __ sd(tmp6, tmp2, -8); // final store sits in the branch delay slot, replacing the nop
    }
     __ bind(l_6);
 
     // copy 2 elements at a time
     { // FasterArrayCopy
-      __ daddi(AT, tmp3, -1);
-      __ blez(AT, l_1);
+      __ daddi(AT, tmp3, -7);
+      __ blez(AT, l_4);
       __ delayed()->nop();
 
       __ bind(l_3);
       __ lw(AT, tmp1, 0);
+      __ lw(tmp4, tmp1, 4);
+      __ lw(tmp5, tmp1, 8);
+      __ lw(tmp6, tmp1, 12);
       __ sw(AT, tmp2, 0);
-      __ daddi(tmp1, tmp1, 4);
-      __ daddi(tmp2, tmp2, 4);
-      __ daddi(tmp3, tmp3, -2);
-      __ daddi(AT, tmp3, -2);
+      __ sw(tmp4, tmp2, 4);
+      __ sw(tmp5, tmp2, 8);
+      __ daddi(tmp1, tmp1, 16);
+      __ daddi(tmp2, tmp2, 16);
+      __ daddi(tmp3, tmp3, -8);
+      __ daddi(AT, tmp3, -8);
       __ bgez(AT, l_3);
-      __ delayed()->nop();
-
+      __ sw(tmp6, tmp2, -4);
     }
 
+    __ bind(l_1);
     // do single element copy (16 bit), can this happen?
-    __ bind(l_1);
-    __ beq(R0, tmp3, l_4);
-    __ delayed()->nop();
-
     { // FasterArrayCopy
+      __ daddi(AT, tmp3, -3);
+      __ blez(AT, l_4);
+      __ delayed()->nop();
 
       __ bind(l_5);
       __ lhu(AT, tmp1, 0);
-      __ daddi(tmp3, tmp3, -1);
+      __ lhu(tmp4, tmp1, 2);
+      __ lhu(tmp5, tmp1, 4);
+      __ lhu(tmp6, tmp1, 6);
+      __ sh(AT, tmp2, 0);
+      __ sh(tmp4, tmp2, 2);
+      __ sh(tmp5, tmp2, 4);
+      __ daddi(tmp1, tmp1, 8);
+      __ daddi(tmp2, tmp2, 8);
+      __ daddi(tmp3, tmp3, -4);
+      __ daddi(AT, tmp3, -4);
+      __ bgez(AT, l_5);
+      __ sh(tmp6, tmp2, -2);
+    }
+    // copy any remaining elements one at a time
+    __ bind(l_4);
+
+    __ pop(tmp6);
+    __ pop(tmp5);
+    __ pop(tmp4);
+
+    __ bind(l_14);
+    { // FasterArrayCopy
+      __ beq(R0, tmp3, l_13);
+      __ delayed()->nop();
+
+      __ bind(l_12);
+      __ lhu(AT, tmp1, 0);
       __ sh(AT, tmp2, 0);
       __ daddi(tmp1, tmp1, 2);
       __ daddi(tmp2, tmp2, 2);
+      __ daddi(tmp3, tmp3, -1);
       __ daddi(AT, tmp3, -1);
-      __ bgez(AT, l_5);
+      __ bgez(AT, l_12);
       __ delayed()->nop();
     }
-    __ bind(l_4);
-    __ pop(tmp4);
+
+    __ bind(l_13);
     __ pop(tmp3);
     __ pop(tmp2);
     __ pop(tmp1);
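Two details of the new fast path are worth spelling out. First, the gslq/gssq loop needs 16-byte-aligned addresses; since both pointers are already 8-byte aligned at l_7, the stub only attempts the extra alignment step when from and to agree mod 16, because otherwise no single doubleword copy can align both. A minimal C++ sketch of that test, with an assumed function name:

    #include <cstdint>

    // Mirrors "xorr AT, tmp1, tmp2; andi AT, AT, 15": given two pointers that
    // are already 8-byte aligned, a zero result means one 8-byte copy can bring
    // both to a 16-byte boundary for the 128-bit gslq/gssq loop.
    static bool same_alignment_mod_16(const void* from, const void* to) {
      uintptr_t f = reinterpret_cast<uintptr_t>(from);
      uintptr_t t = reinterpret_cast<uintptr_t>(to);
      return ((f ^ t) & 15) == 0;
    }

Second, the unrolled loops retire their final store in the branch delay slot (e.g. __ bgez(AT, l_8); followed by __ sd(tmp6, tmp2, -8);): on MIPS the instruction after a branch executes regardless of whether the branch is taken, so the store replaces the delayed()->nop() filler and saves one instruction per iteration.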
