src/cpu/mips/vm/stubGenerator_mips_64.cpp

changeset 13:bc227c49eaae
parent 8:cf5765c81f87
child 32:3b95e10c12fa
     1.1 --- a/src/cpu/mips/vm/stubGenerator_mips_64.cpp	Mon May 30 01:30:23 2016 -0400
     1.2 +++ b/src/cpu/mips/vm/stubGenerator_mips_64.cpp	Mon May 30 02:01:38 2016 -0400
     1.3 @@ -849,85 +849,208 @@
     1.4      return start;
     1.5    }
     1.6  
     1.7 -  // Arguments:
     1.8 -  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
     1.9 -  //             ignored
    1.10 -  //   name    - stub name string
    1.11 +  // Generate stub for disjoint short copy.  If "aligned" is true, the
    1.12 +  // "from" and "to" addresses are assumed to be heapword aligned.
    1.13    //
    1.14 -  // Inputs:
    1.15 -  //   c_rarg0   - source array address
    1.16 -  //   c_rarg1   - destination array address
    1.17 -  //   c_rarg2   - element count, treated as ssize_t, can be zero
    1.18 +  // Arguments for generated stub:
     1.19 +  //      from:        A0
     1.20 +  //      to:          A1
     1.21 +  //      count:       A2, treated as signed (element count)
     1.22 +  //      one element: 2 bytes
    1.23    //
    1.24 -  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
    1.25 -  // let the hardware handle it.  The two or four words within dwords
    1.26 -  // or qwords that span cache line boundaries will still be loaded
    1.27 -  // and stored atomically.
    1.28 +  // Strategy for aligned==true:
    1.29    //
    1.30 -  // Side Effects:
    1.31 -  //   disjoint_short_copy_entry is set to the no-overlap entry point
    1.32 -  //   used by generate_conjoint_short_copy().
    1.33 +  //  If length <= 9:
     1.34 +  //     1. copy 1 element at a time (l_5)
    1.35    //
    1.36 -  address generate_disjoint_short_copy(bool aligned, const char *name) {
    1.37 -		Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
    1.38 -		StubCodeMark mark(this, "StubRoutines", name);
    1.39 -		__ align(CodeEntryAlignment);
    1.40 -		address start = __ pc();
    1.41 +  //  If length > 9:
    1.42 +  //     1. copy 4 elements at a time until less than 4 elements are left (l_7)
    1.43 +  //     2. copy 2 elements at a time until less than 2 elements are left (l_6)
    1.44 +  //     3. copy last element if one was left in step 2. (l_1)
    1.45 +  //
    1.46 +  //
     1.47 +  // Strategy for aligned==false:
     1.48 +  //
     1.49 +  //  If length <= 9: same as the aligned==true case
     1.50 +  //
     1.51 +  //  If length > 9:
     1.52 +  //     1. continue with step 7 if from and to have different
     1.53 +  //        alignment mod 4.
     1.54 +  //     2. align from and to to 4 bytes by copying 1 element if necessary.
     1.55 +  //     3. at l_2 from and to are 4 byte aligned; continue with
     1.56 +  //        step 6 if they cannot be aligned to 8 bytes because they
     1.57 +  //        have different alignment mod 8.
     1.58 +  //     4. at this point we know that from and to have the same
     1.59 +  //        alignment mod 8; now copy one element if necessary to get
     1.60 +  //        8 byte alignment of from and to.
     1.61 +  //     5. copy 4 elements at a time until less than 4 elements are
     1.62 +  //        left; after steps 3 and 4 all loads/stores are aligned.
     1.63 +  //     6. copy 2 elements at a time until less than 2 elements are
     1.64 +  //        left. (l_6)
     1.65 +  //     7. copy 1 element at a time. (l_5)
     1.66 +  //     8. copy the last element if one was left in step 6. (l_1)
    1.67 +  //
    1.68 +  //  TODO:
    1.69 +  //
     1.70 +  //  1. use Loongson 128-bit loads/stores
     1.71 +  //  2. use loop unrolling when len is large enough, for example if len > 0x2000:
    1.72 +  //    __ bind(l_x);
    1.73 +  //    __ ld(AT, tmp1, 0);
    1.74 +  //    __ ld(tmp, tmp1, 8);
    1.75 +  //    __ sd(AT, tmp2, 0);
    1.76 +  //    __ sd(tmp, tmp2, 8);
    1.77 +  //    __ ld(AT, tmp1, 16);
    1.78 +  //    __ ld(tmp, tmp1, 24);
    1.79 +  //    __ sd(AT, tmp2, 16);
    1.80 +  //    __ sd(tmp, tmp2, 24);
    1.81 +  //    __ daddi(tmp1, tmp1, 32);
    1.82 +  //    __ daddi(tmp2, tmp2, 32);
    1.83 +  //    __ daddi(tmp3, tmp3, -16);
    1.84 +  //    __ daddi(AT, tmp3, -16);
    1.85 +  //    __ bgez(AT, l_x);
    1.86 +  //    __ delayed()->nop();
    1.87 +  //
     1.88 +  address generate_disjoint_short_copy(bool aligned, const char *name) {
    1.89 +    StubCodeMark mark(this, "StubRoutines", name);
    1.90 +    __ align(CodeEntryAlignment);
    1.91  
    1.92 -		__ push(T3);	
    1.93 -		__ push(T0);	
    1.94 -		__ push(T1);	
    1.95 -		__ push(T8);	
    1.96 -		__ move(T1, A2);  
    1.97 -		__ move(T3, A0); 
    1.98 -		__ move(T0, A1);
    1.99 +    Register tmp1 = T0;
   1.100 +    Register tmp2 = T1;
   1.101 +    Register tmp3 = T3;
   1.102  
   1.103 -		if (!aligned) {
   1.104 -			__ beq(T1, R0, l_5);
   1.105 -			__ delayed()->nop(); 
   1.106 -			// align source address at dword address boundary
   1.107 -			__ move(T8, T3); // original from
   1.108 -			__ andi(T8, T8, 3); // either 0 or 2
   1.109 -			__ beq(T8, R0, l_1); // no prefix
   1.110 -			__ delayed()->nop();
   1.111 -			// copy prefix
   1.112 -			__ lh(AT, T3, 0);
   1.113 -			__ sh(AT, T0, 0); 
   1.114 -			__ add(T3, T3, T8); 
   1.115 -			__ add(T0, T0, T8);
   1.116 -			__ addi(T1, T1, -1); 
   1.117 -			__ bind(l_1);
   1.118 -		}
   1.119 -		__ move(T8, T1);            // word count less prefix
   1.120 -		__ sra(T1, T1, 1); 
   1.121 -		__ beq(T1, R0, l_4); 
   1.122 -		__ delayed()->nop(); 
   1.123 -    // copy aligned dwords
   1.124 -		__ bind(l_2);
   1.125 -		__ align(16);
   1.126 -		__ bind(l_3);
   1.127 -		__ lw(AT, T3, 0);   
   1.128 -		__ sw(AT, T0, 0 ); 
   1.129 -		__ addi(T3, T3, 4); 
   1.130 -		__ addi(T0, T0, 4); 
   1.131 -		__ addi(T1, T1, -1); 
   1.132 -		__ bne(T1, R0, l_3); 
   1.133 -		__ delayed()->nop(); 
   1.134 -		__ bind(l_4);
   1.135 -		__ andi(T8, T8, 1); 
   1.136 -		__ beq(T8, R0, l_5);  
   1.137 -		__ delayed()->nop(); 
   1.138 -		// copy suffix
   1.139 -		__ lh(AT, T3, 0); 
   1.140 -		__ sh(AT, T0, 0); 
   1.141 -		__ bind(l_5);
   1.142 -		__ pop(T8);	
   1.143 -		__ pop(T1);	
   1.144 -		__ pop(T0);	
   1.145 -		__ pop(T3);	
   1.146 -		__ jr(RA); 
   1.147 -		__ delayed()->nop();  
   1.148 -		return start;
   1.149 +    address start = __ pc();
   1.150 +
   1.151 +    __ push(tmp1);
   1.152 +    __ push(tmp2);
   1.153 +    __ push(tmp3);
   1.154 +    __ move(tmp1, A0);
   1.155 +    __ move(tmp2, A1);
   1.156 +    __ move(tmp3, A2);
   1.157 +
   1.158 +    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
   1.159 +    Label l_debug;
   1.160 +    // don't try anything fancy if arrays don't have many elements
   1.161 +    __ daddi(AT, tmp3, -9);
   1.162 +    __ blez(AT, l_1);
   1.163 +    __ delayed()->nop();
   1.164 +
   1.165 +    if (!aligned) {
   1.166 +      __ xorr(AT, A0, A1);
   1.167 +      __ andi(AT, AT, 1);
    1.168 +      __ bne(AT, R0, l_debug); // if arrays don't have the same alignment mod 2, bail out (can this happen?)
   1.169 +      __ delayed()->nop();
   1.170 +
   1.171 +      __ xorr(AT, A0, A1);
   1.172 +      __ andi(AT, AT, 3);
   1.173 +      __ bne(AT, R0, l_1); // if arrays don't have the same alignment mod 4, do 1 element copy
   1.174 +      __ delayed()->nop();
   1.175 +
    1.176 +      // At this point it is guaranteed that both from and to have the same alignment mod 4.
   1.177 +
   1.178 +      // Copy 1 element if necessary to align to 4 bytes.
   1.179 +      __ andi(AT, A0, 3);
   1.180 +      __ beq(AT, R0, l_2);
   1.181 +      __ delayed()->nop();
   1.182 +
   1.183 +      __ lhu(AT, tmp1, 0);
   1.184 +      __ daddi(tmp1, tmp1, 2);
   1.185 +      __ sh(AT, tmp2, 0);
   1.186 +      __ daddi(tmp2, tmp2, 2);
   1.187 +      __ daddi(tmp3, tmp3, -1);
   1.188 +      __ bind(l_2);
   1.189 +
    1.190 +      // At this point both from and to are at least 4 byte aligned.
   1.191 +
    1.192 +      // Prepare to copy 4 elements at a time: align to 8 bytes, but only
    1.193 +      // if from and to have the same alignment mod 8.
   1.194 +      __ xorr(AT, tmp1, tmp2);
   1.195 +      __ andi(AT, AT, 7);
    1.196 +      __ bne(AT, R0, l_6); // not same alignment mod 8 -> copy 2 at a time; 8-byte accesses would be unaligned
   1.197 +      __ delayed()->nop();
   1.198 +
   1.199 +      // Copy a 2-element word if necessary to align to 8 bytes.
   1.200 +      __ andi(AT, tmp1, 7);
   1.201 +      __ beq(AT, R0, l_7);
   1.202 +      __ delayed()->nop();
   1.203 +
   1.204 +      __ lw(AT, tmp1, 0);
   1.205 +      __ daddi(tmp3, tmp3, -2);
   1.206 +      __ sw(AT, tmp2, 0);
   1.207 +      { // FasterArrayCopy
   1.208 +        __ daddi(tmp1, tmp1, 4);
   1.209 +        __ daddi(tmp2, tmp2, 4);
   1.210 +      }
   1.211 +    }
   1.212 +
   1.213 +    __ bind(l_7);
   1.214 +
    1.215 +    // Copy 4 elements (8 bytes) at a time.  Both from and to are 8-byte
    1.216 +    // aligned here: assumed for aligned == true, established above otherwise.
   1.217 +
   1.218 +    { // FasterArrayCopy
   1.219 +      __ daddi(AT, tmp3, -15);
   1.220 +      __ blez(AT, l_6); // copy 2 at a time if less than 16 elements remain
   1.221 +      __ delayed()->nop();
   1.222 +
   1.223 +      __ bind(l_8);
    1.224 +      // TODO: Loongson provides 128-bit memory accesses; use them here.
   1.225 +      __ ld(AT, tmp1, 0);
   1.226 +      __ sd(AT, tmp2, 0);
   1.227 +      __ daddi(tmp1, tmp1, 8);
   1.228 +      __ daddi(tmp2, tmp2, 8);
   1.229 +      __ daddi(tmp3, tmp3, -4);
   1.230 +      __ daddi(AT, tmp3, -4);
   1.231 +      __ bgez(AT, l_8);
   1.232 +      __ delayed()->nop();
   1.233 +    }
   1.234 +    __ bind(l_6);
   1.235 +
    1.236 +    // copy 2 elements at a time
   1.237 +    { // FasterArrayCopy
   1.238 +      __ daddi(AT, tmp3, -1);
   1.239 +      __ blez(AT, l_1);
   1.240 +      __ delayed()->nop();
   1.241 +
   1.242 +      __ bind(l_3);
   1.243 +      __ lw(AT, tmp1, 0);
   1.244 +      __ sw(AT, tmp2, 0);
   1.245 +      __ daddi(tmp1, tmp1, 4);
   1.246 +      __ daddi(tmp2, tmp2, 4);
   1.247 +      __ daddi(tmp3, tmp3, -2);
   1.248 +      __ daddi(AT, tmp3, -2);
   1.249 +      __ bgez(AT, l_3);
   1.250 +      __ delayed()->nop();
   1.251 +
   1.252 +    }
   1.253 +
    1.254 +    // do single element copy (16 bit), can this happen?
   1.255 +    __ bind(l_1);
   1.256 +    __ beq(R0, tmp3, l_4);
   1.257 +    __ delayed()->nop();
   1.258 +
   1.259 +    { // FasterArrayCopy
   1.260 +
   1.261 +      __ bind(l_5);
   1.262 +      __ lhu(AT, tmp1, 0);
   1.263 +      __ daddi(tmp3, tmp3, -1);
   1.264 +      __ sh(AT, tmp2, 0);
   1.265 +      __ daddi(tmp1, tmp1, 2);
   1.266 +      __ daddi(tmp2, tmp2, 2);
   1.267 +      __ daddi(AT, tmp3, -1);
   1.268 +      __ bgez(AT, l_5);
   1.269 +      __ delayed()->nop();
   1.270 +    }
   1.271 +    __ bind(l_4);
   1.272 +    __ pop(tmp3);
   1.273 +    __ pop(tmp2);
   1.274 +    __ pop(tmp1);
   1.275 +
   1.276 +    __ jr(RA);
   1.277 +    __ delayed()->nop();
   1.278 +
   1.279 +    __ bind(l_debug);
   1.280 +    __ stop("generate_disjoint_short_copy should not reach here");
   1.281 +    return start;
   1.282    }
   1.283  
   1.284    // Arguments:
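A plain C++ sketch of the staged strategy implemented above, for readers who would rather not trace the MIPS assembly. It is illustrative only and not part of the changeset: the function name is hypothetical, memcpy stands in for the lw/sw and ld/sd pairs, and from/to/count mirror A0/A1/A2. The aligned==true stub simply skips the alignment checks, which pass trivially for heapword-aligned inputs:

    #include <cstdint>
    #include <cstring>

    // Scalar model of the generated stub; one element is a 2-byte short.
    static void disjoint_short_copy_model(const uint16_t *from, uint16_t *to,
                                          long count) {
      // Arrays whose addresses differ in alignment mod 2 hit l_debug in the
      // stub; the model does not handle that case.
      if (count > 9 && ((((uintptr_t)from ^ (uintptr_t)to) & 3) == 0)) {
        if ((uintptr_t)from & 3) {      // copy 1 element to reach 4-byte alignment
          *to++ = *from++; count--;
        }
        if ((((uintptr_t)from ^ (uintptr_t)to) & 7) == 0) {
          if ((uintptr_t)from & 7) {    // copy 2 elements to reach 8-byte alignment
            memcpy(to, from, 4); from += 2; to += 2; count -= 2;
          }
          if (count >= 16) {            // l_8: 8-byte loop, entered only when at
            do {                        // least 16 elements remain
              memcpy(to, from, 8); from += 4; to += 4; count -= 4;
            } while (count >= 4);
          }
        }
        while (count >= 2) {            // l_6/l_3: 4-byte (2-element) loop
          memcpy(to, from, 4); from += 2; to += 2; count -= 2;
        }
      }
      while (count > 0) {               // l_1/l_5: element-wise tail
        *to++ = *from++; count--;
      }
    }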

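A note on the loop idiom used at l_8, l_3 and l_5: classic MIPS has no branch-if-greater-or-equal against an immediate, so each loop decrements the remaining count, pre-computes count minus the stride into AT, and loops on bgez, with a nop filling the branch delay slot. The l_8 loop in equivalent C++ (count standing for tmp3):

    do {
      // ld/sd pair: copy 8 bytes, i.e. 4 short elements
      count -= 4;               // daddi(tmp3, tmp3, -4)
    } while (count - 4 >= 0);   // daddi(AT, tmp3, -4); bgez(AT, l_8); delay-slot nop

That is, the loop body runs again only while at least 4 elements remain.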