1.1 --- a/src/share/vm/opto/superword.cpp Thu Jun 14 14:59:52 2012 -0700 1.2 +++ b/src/share/vm/opto/superword.cpp Fri Jun 15 01:25:19 2012 -0700 1.3 @@ -1,5 +1,5 @@ 1.4 /* 1.5 - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. 1.6 + * Copyright (c) 2007, 2012, Oracle and/or its affiliates. All rights reserved. 1.7 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 1.8 * 1.9 * This code is free software; you can redistribute it and/or modify it 1.10 @@ -67,6 +67,10 @@ 1.11 1.12 //------------------------------transform_loop--------------------------- 1.13 void SuperWord::transform_loop(IdealLoopTree* lpt) { 1.14 + assert(UseSuperWord, "should be"); 1.15 + // Do vectors exist on this architecture? 1.16 + if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return; 1.17 + 1.18 assert(lpt->_head->is_CountedLoop(), "must be"); 1.19 CountedLoopNode *cl = lpt->_head->as_CountedLoop(); 1.20 1.21 @@ -89,15 +93,12 @@ 1.22 Node *pre_opaq1 = pre_end->limit(); 1.23 if (pre_opaq1->Opcode() != Op_Opaque1) return; 1.24 1.25 - // Do vectors exist on this architecture? 1.26 - if (vector_width_in_bytes() == 0) return; 1.27 - 1.28 init(); // initialize data structures 1.29 1.30 set_lpt(lpt); 1.31 set_lp(cl); 1.32 1.33 - // For now, define one block which is the entire loop body 1.34 + // For now, define one block which is the entire loop body 1.35 set_bb(cl); 1.36 1.37 assert(_packset.length() == 0, "packset must be empty"); 1.38 @@ -177,7 +178,7 @@ 1.39 Node_List memops; 1.40 for (int i = 0; i < _block.length(); i++) { 1.41 Node* n = _block.at(i); 1.42 - if (n->is_Mem() && in_bb(n) && 1.43 + if (n->is_Mem() && !n->is_LoadStore() && in_bb(n) && 1.44 is_java_primitive(n->as_Mem()->memory_type())) { 1.45 int align = memory_alignment(n->as_Mem(), 0); 1.46 if (align != bottom_align) { 1.47 @@ -185,54 +186,130 @@ 1.48 } 1.49 } 1.50 } 1.51 - if (memops.size() == 0) return; 1.52 1.53 - // Find a memory reference to align to. 
The pre-loop trip count 1.54 - // is modified to align this reference to a vector-aligned address 1.55 - find_align_to_ref(memops); 1.56 - if (align_to_ref() == NULL) return; 1.57 + Node_List align_to_refs; 1.58 + int best_iv_adjustment = 0; 1.59 + MemNode* best_align_to_mem_ref = NULL; 1.60 1.61 - SWPointer align_to_ref_p(align_to_ref(), this); 1.62 - int offset = align_to_ref_p.offset_in_bytes(); 1.63 - int scale = align_to_ref_p.scale_in_bytes(); 1.64 - int vw = vector_width_in_bytes(); 1.65 - int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1; 1.66 - int iv_adjustment = (stride_sign * vw - (offset % vw)) % vw; 1.67 + while (memops.size() != 0) { 1.68 + // Find a memory reference to align to. 1.69 + MemNode* mem_ref = find_align_to_ref(memops); 1.70 + if (mem_ref == NULL) break; 1.71 + align_to_refs.push(mem_ref); 1.72 + int iv_adjustment = get_iv_adjustment(mem_ref); 1.73 1.74 -#ifndef PRODUCT 1.75 - if (TraceSuperWord) 1.76 - tty->print_cr("\noffset = %d iv_adjustment = %d elt_align = %d scale = %d iv_stride = %d", 1.77 - offset, iv_adjustment, align_to_ref_p.memory_size(), align_to_ref_p.scale_in_bytes(), iv_stride()); 1.78 -#endif 1.79 + if (best_align_to_mem_ref == NULL) { 1.80 + // Set memory reference which is the best from all memory operations 1.81 + // to be used for alignment. The pre-loop trip count is modified to align 1.82 + // this reference to a vector-aligned address. 
1.83 + best_align_to_mem_ref = mem_ref; 1.84 + best_iv_adjustment = iv_adjustment; 1.85 + } 1.86 1.87 - // Set alignment relative to "align_to_ref" 1.88 - for (int i = memops.size() - 1; i >= 0; i--) { 1.89 - MemNode* s = memops.at(i)->as_Mem(); 1.90 - SWPointer p2(s, this); 1.91 - if (p2.comparable(align_to_ref_p)) { 1.92 - int align = memory_alignment(s, iv_adjustment); 1.93 - set_alignment(s, align); 1.94 - } else { 1.95 - memops.remove(i); 1.96 - } 1.97 - } 1.98 - 1.99 - // Create initial pack pairs of memory operations 1.100 - for (uint i = 0; i < memops.size(); i++) { 1.101 - Node* s1 = memops.at(i); 1.102 - for (uint j = 0; j < memops.size(); j++) { 1.103 - Node* s2 = memops.at(j); 1.104 - if (s1 != s2 && are_adjacent_refs(s1, s2)) { 1.105 - int align = alignment(s1); 1.106 - if (stmts_can_pack(s1, s2, align)) { 1.107 - Node_List* pair = new Node_List(); 1.108 - pair->push(s1); 1.109 - pair->push(s2); 1.110 - _packset.append(pair); 1.111 + SWPointer align_to_ref_p(mem_ref, this); 1.112 + // Set alignment relative to "align_to_ref" for all related memory operations. 1.113 + for (int i = memops.size() - 1; i >= 0; i--) { 1.114 + MemNode* s = memops.at(i)->as_Mem(); 1.115 + if (isomorphic(s, mem_ref)) { 1.116 + SWPointer p2(s, this); 1.117 + if (p2.comparable(align_to_ref_p)) { 1.118 + int align = memory_alignment(s, iv_adjustment); 1.119 + set_alignment(s, align); 1.120 } 1.121 } 1.122 } 1.123 - } 1.124 + 1.125 + // Create initial pack pairs of memory operations for which 1.126 + // alignment is set and vectors will be aligned. 1.127 + bool create_pack = true; 1.128 + if (memory_alignment(mem_ref, best_iv_adjustment) != 0) { 1.129 + if (same_velt_type(mem_ref, best_align_to_mem_ref)) { 1.130 + // Can't allow vectorization of unaligned memory accesses with the 1.131 + // same type since it could be overlapped accesses to the same array. 
1.132 + create_pack = false; 1.133 + } else { 1.134 + // Allow independent (different type) unaligned memory operations 1.135 + // if HW supports them. 1.136 + if (!Matcher::misaligned_vectors_ok()) { 1.137 + create_pack = false; 1.138 + } else { 1.139 + // Check if packs of the same memory type but 1.140 + // with a different alignment were created before. 1.141 + for (uint i = 0; i < align_to_refs.size(); i++) { 1.142 + MemNode* mr = align_to_refs.at(i)->as_Mem(); 1.143 + if (same_velt_type(mr, mem_ref) && 1.144 + memory_alignment(mr, iv_adjustment) != 0) 1.145 + create_pack = false; 1.146 + } 1.147 + } 1.148 + } 1.149 + } 1.150 + if (create_pack) { 1.151 + for (uint i = 0; i < memops.size(); i++) { 1.152 + Node* s1 = memops.at(i); 1.153 + int align = alignment(s1); 1.154 + if (align == top_align) continue; 1.155 + for (uint j = 0; j < memops.size(); j++) { 1.156 + Node* s2 = memops.at(j); 1.157 + if (alignment(s2) == top_align) continue; 1.158 + if (s1 != s2 && are_adjacent_refs(s1, s2)) { 1.159 + if (stmts_can_pack(s1, s2, align)) { 1.160 + Node_List* pair = new Node_List(); 1.161 + pair->push(s1); 1.162 + pair->push(s2); 1.163 + _packset.append(pair); 1.164 + } 1.165 + } 1.166 + } 1.167 + } 1.168 + } else { // Don't create unaligned pack 1.169 + // First, remove remaining memory ops of the same type from the list. 1.170 + for (int i = memops.size() - 1; i >= 0; i--) { 1.171 + MemNode* s = memops.at(i)->as_Mem(); 1.172 + if (same_velt_type(s, mem_ref)) { 1.173 + memops.remove(i); 1.174 + } 1.175 + } 1.176 + 1.177 + // Second, remove already constructed packs of the same type. 1.178 + for (int i = _packset.length() - 1; i >= 0; i--) { 1.179 + Node_List* p = _packset.at(i); 1.180 + MemNode* s = p->at(0)->as_Mem(); 1.181 + if (same_velt_type(s, mem_ref)) { 1.182 + remove_pack_at(i); 1.183 + } 1.184 + } 1.185 + 1.186 + // If needed find the best memory reference for loop alignment again. 
1.187 + if (same_velt_type(mem_ref, best_align_to_mem_ref)) { 1.188 + // Put memory ops from remaining packs back on memops list for 1.189 + // the best alignment search. 1.190 + uint orig_msize = memops.size(); 1.191 + for (int i = 0; i < _packset.length(); i++) { 1.192 + Node_List* p = _packset.at(i); 1.193 + MemNode* s = p->at(0)->as_Mem(); 1.194 + assert(!same_velt_type(s, mem_ref), "sanity"); 1.195 + memops.push(s); 1.196 + } 1.197 + best_align_to_mem_ref = find_align_to_ref(memops); 1.198 + if (best_align_to_mem_ref == NULL) break; 1.199 + best_iv_adjustment = get_iv_adjustment(best_align_to_mem_ref); 1.200 + // Restore list. 1.201 + while (memops.size() > orig_msize) 1.202 + (void)memops.pop(); 1.203 + } 1.204 + } // unaligned memory accesses 1.205 + 1.206 + // Remove used mem nodes. 1.207 + for (int i = memops.size() - 1; i >= 0; i--) { 1.208 + MemNode* m = memops.at(i)->as_Mem(); 1.209 + if (alignment(m) != top_align) { 1.210 + memops.remove(i); 1.211 + } 1.212 + } 1.213 + 1.214 + } // while (memops.size() != 0) 1.215 + set_align_to_ref(best_align_to_mem_ref); 1.216 1.217 #ifndef PRODUCT 1.218 if (TraceSuperWord) { 1.219 @@ -246,7 +323,7 @@ 1.220 // Find a memory reference to align the loop induction variable to. 1.221 // Looks first at stores then at loads, looking for a memory reference 1.222 // with the largest number of references similar to it. 1.223 -void SuperWord::find_align_to_ref(Node_List &memops) { 1.224 +MemNode* SuperWord::find_align_to_ref(Node_List &memops) { 1.225 GrowableArray<int> cmp_ct(arena(), memops.size(), memops.size(), 0); 1.226 1.227 // Count number of comparable memory ops 1.228 @@ -270,20 +347,28 @@ 1.229 } 1.230 } 1.231 1.232 - // Find Store (or Load) with the greatest number of "comparable" references 1.233 + // Find Store (or Load) with the greatest number of "comparable" references, 1.234 + // biggest vector size, smallest data size and smallest iv offset.
1.235 int max_ct = 0; 1.236 + int max_vw = 0; 1.237 int max_idx = -1; 1.238 int min_size = max_jint; 1.239 int min_iv_offset = max_jint; 1.240 for (uint j = 0; j < memops.size(); j++) { 1.241 MemNode* s = memops.at(j)->as_Mem(); 1.242 if (s->is_Store()) { 1.243 + int vw = vector_width_in_bytes(velt_basic_type(s)); 1.244 + assert(vw > 1, "sanity"); 1.245 SWPointer p(s, this); 1.246 - if (cmp_ct.at(j) > max_ct || 1.247 - cmp_ct.at(j) == max_ct && (data_size(s) < min_size || 1.248 - data_size(s) == min_size && 1.249 - p.offset_in_bytes() < min_iv_offset)) { 1.250 + if (cmp_ct.at(j) > max_ct || 1.251 + cmp_ct.at(j) == max_ct && 1.252 + (vw > max_vw || 1.253 + vw == max_vw && 1.254 + (data_size(s) < min_size || 1.255 + data_size(s) == min_size && 1.256 + (p.offset_in_bytes() < min_iv_offset)))) { 1.257 max_ct = cmp_ct.at(j); 1.258 + max_vw = vw; 1.259 max_idx = j; 1.260 min_size = data_size(s); 1.261 min_iv_offset = p.offset_in_bytes(); 1.262 @@ -295,12 +380,18 @@ 1.263 for (uint j = 0; j < memops.size(); j++) { 1.264 MemNode* s = memops.at(j)->as_Mem(); 1.265 if (s->is_Load()) { 1.266 + int vw = vector_width_in_bytes(velt_basic_type(s)); 1.267 + assert(vw > 1, "sanity"); 1.268 SWPointer p(s, this); 1.269 - if (cmp_ct.at(j) > max_ct || 1.270 - cmp_ct.at(j) == max_ct && (data_size(s) < min_size || 1.271 - data_size(s) == min_size && 1.272 - p.offset_in_bytes() < min_iv_offset)) { 1.273 + if (cmp_ct.at(j) > max_ct || 1.274 + cmp_ct.at(j) == max_ct && 1.275 + (vw > max_vw || 1.276 + vw == max_vw && 1.277 + (data_size(s) < min_size || 1.278 + data_size(s) == min_size && 1.279 + (p.offset_in_bytes() < min_iv_offset)))) { 1.280 max_ct = cmp_ct.at(j); 1.281 + max_vw = vw; 1.282 max_idx = j; 1.283 min_size = data_size(s); 1.284 min_iv_offset = p.offset_in_bytes(); 1.285 @@ -309,10 +400,7 @@ 1.286 } 1.287 } 1.288 1.289 - if (max_ct > 0) 1.290 - set_align_to_ref(memops.at(max_idx)->as_Mem()); 1.291 - 1.292 -#ifndef PRODUCT 1.293 +#ifdef ASSERT 1.294 if (TraceSuperWord && Verbose) 
{ 1.295 tty->print_cr("\nVector memops after find_align_to_refs"); 1.296 for (uint i = 0; i < memops.size(); i++) { 1.297 @@ -321,6 +409,17 @@ 1.298 } 1.299 } 1.300 #endif 1.301 + 1.302 + if (max_ct > 0) { 1.303 +#ifdef ASSERT 1.304 + if (TraceSuperWord) { 1.305 + tty->print("\nVector align to node: "); 1.306 + memops.at(max_idx)->as_Mem()->dump(); 1.307 + } 1.308 +#endif 1.309 + return memops.at(max_idx)->as_Mem(); 1.310 + } 1.311 + return NULL; 1.312 } 1.313 1.314 //------------------------------ref_is_alignable--------------------------- 1.315 @@ -341,7 +440,9 @@ 1.316 1.317 // If initial offset from start of object is computable, 1.318 // compute alignment within the vector. 1.319 - int vw = vector_width_in_bytes(); 1.320 + BasicType bt = velt_basic_type(p.mem()); 1.321 + int vw = vector_width_in_bytes(bt); 1.322 + assert(vw > 1, "sanity"); 1.323 if (vw % span == 0) { 1.324 Node* init_nd = pre_end->init_trip(); 1.325 if (init_nd->is_Con() && p.invar() == NULL) { 1.326 @@ -361,6 +462,26 @@ 1.327 return false; 1.328 } 1.329 1.330 +//---------------------------get_iv_adjustment--------------------------- 1.331 +// Calculate loop's iv adjustment for this memory ops. 1.332 +int SuperWord::get_iv_adjustment(MemNode* mem_ref) { 1.333 + SWPointer align_to_ref_p(mem_ref, this); 1.334 + int offset = align_to_ref_p.offset_in_bytes(); 1.335 + int scale = align_to_ref_p.scale_in_bytes(); 1.336 + BasicType bt = velt_basic_type(mem_ref); 1.337 + int vw = vector_width_in_bytes(bt); 1.338 + assert(vw > 1, "sanity"); 1.339 + int stride_sign = (scale * iv_stride()) > 0 ? 
1 : -1; 1.340 + int iv_adjustment = (stride_sign * vw - (offset % vw)) % vw; 1.341 + 1.342 +#ifndef PRODUCT 1.343 + if (TraceSuperWord) 1.344 + tty->print_cr("\noffset = %d iv_adjust = %d elt_size = %d scale = %d iv_stride = %d vect_size %d", 1.345 + offset, iv_adjustment, align_to_ref_p.memory_size(), scale, iv_stride(), vw); 1.346 +#endif 1.347 + return iv_adjustment; 1.348 +} 1.349 + 1.350 //---------------------------dependence_graph--------------------------- 1.351 // Construct dependency graph. 1.352 // Add dependence edges to load/store nodes for memory dependence 1.353 @@ -488,9 +609,13 @@ 1.354 bool SuperWord::stmts_can_pack(Node* s1, Node* s2, int align) { 1.355 1.356 // Do not use superword for non-primitives 1.357 - if((s1->is_Mem() && !is_java_primitive(s1->as_Mem()->memory_type())) || 1.358 - (s2->is_Mem() && !is_java_primitive(s2->as_Mem()->memory_type()))) 1.359 + BasicType bt1 = velt_basic_type(s1); 1.360 + BasicType bt2 = velt_basic_type(s2); 1.361 + if(!is_java_primitive(bt1) || !is_java_primitive(bt2)) 1.362 return false; 1.363 + if (Matcher::max_vector_size(bt1) < 2) { 1.364 + return false; // No vectors for this type 1.365 + } 1.366 1.367 if (isomorphic(s1, s2)) { 1.368 if (independent(s1, s2)) { 1.369 @@ -552,7 +677,7 @@ 1.370 if (s1->Opcode() != s2->Opcode()) return false; 1.371 if (s1->req() != s2->req()) return false; 1.372 if (s1->in(0) != s2->in(0)) return false; 1.373 - if (velt_type(s1) != velt_type(s2)) return false; 1.374 + if (!same_velt_type(s1, s2)) return false; 1.375 return true; 1.376 } 1.377 1.378 @@ -595,14 +720,16 @@ 1.379 //------------------------------set_alignment--------------------------- 1.380 void SuperWord::set_alignment(Node* s1, Node* s2, int align) { 1.381 set_alignment(s1, align); 1.382 - set_alignment(s2, align + data_size(s1)); 1.383 + if (align == top_align || align == bottom_align) { 1.384 + set_alignment(s2, align); 1.385 + } else { 1.386 + set_alignment(s2, align + data_size(s1)); 1.387 + } 1.388 } 1.389 
1.390 //------------------------------data_size--------------------------- 1.391 int SuperWord::data_size(Node* s) { 1.392 - const Type* t = velt_type(s); 1.393 - BasicType bt = t->array_element_basic_type(); 1.394 - int bsize = type2aelembytes(bt); 1.395 + int bsize = type2aelembytes(velt_basic_type(s)); 1.396 assert(bsize != 0, "valid size"); 1.397 return bsize; 1.398 } 1.399 @@ -631,9 +758,9 @@ 1.400 //------------------------------follow_use_defs--------------------------- 1.401 // Extend the packset by visiting operand definitions of nodes in pack p 1.402 bool SuperWord::follow_use_defs(Node_List* p) { 1.403 + assert(p->size() == 2, "just checking"); 1.404 Node* s1 = p->at(0); 1.405 Node* s2 = p->at(1); 1.406 - assert(p->size() == 2, "just checking"); 1.407 assert(s1->req() == s2->req(), "just checking"); 1.408 assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking"); 1.409 1.410 @@ -718,7 +845,12 @@ 1.411 for (i1++; i1 < ct; i1++) if (u1->in(i1) == d1) break; 1.412 for (i2++; i2 < ct; i2++) if (u2->in(i2) == d2) break; 1.413 if (i1 != i2) { 1.414 - return false; 1.415 + if ((i1 == (3-i2)) && (u2->is_Add() || u2->is_Mul())) { 1.416 + // Further analysis relies on operands position matching. 
1.417 + u2->swap_edges(i1, i2); 1.418 + } else { 1.419 + return false; 1.420 + } 1.421 } 1.422 } while (i1 < ct); 1.423 return true; 1.424 @@ -727,7 +859,7 @@ 1.425 //------------------------------est_savings--------------------------- 1.426 // Estimate the savings from executing s1 and s2 as a pack 1.427 int SuperWord::est_savings(Node* s1, Node* s2) { 1.428 - int save = 2 - 1; // 2 operations per instruction in packed form 1.429 + int save_in = 2 - 1; // 2 operations per instruction in packed form 1.430 1.431 // inputs 1.432 for (uint i = 1; i < s1->req(); i++) { 1.433 @@ -735,17 +867,18 @@ 1.434 Node* x2 = s2->in(i); 1.435 if (x1 != x2) { 1.436 if (are_adjacent_refs(x1, x2)) { 1.437 - save += adjacent_profit(x1, x2); 1.438 + save_in += adjacent_profit(x1, x2); 1.439 } else if (!in_packset(x1, x2)) { 1.440 - save -= pack_cost(2); 1.441 + save_in -= pack_cost(2); 1.442 } else { 1.443 - save += unpack_cost(2); 1.444 + save_in += unpack_cost(2); 1.445 } 1.446 } 1.447 } 1.448 1.449 // uses of result 1.450 uint ct = 0; 1.451 + int save_use = 0; 1.452 for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) { 1.453 Node* s1_use = s1->fast_out(i); 1.454 for (int j = 0; j < _packset.length(); j++) { 1.455 @@ -756,7 +889,7 @@ 1.456 if (p->at(p->size()-1) == s2_use) { 1.457 ct++; 1.458 if (are_adjacent_refs(s1_use, s2_use)) { 1.459 - save += adjacent_profit(s1_use, s2_use); 1.460 + save_use += adjacent_profit(s1_use, s2_use); 1.461 } 1.462 } 1.463 } 1.464 @@ -764,10 +897,10 @@ 1.465 } 1.466 } 1.467 1.468 - if (ct < s1->outcnt()) save += unpack_cost(1); 1.469 - if (ct < s2->outcnt()) save += unpack_cost(1); 1.470 + if (ct < s1->outcnt()) save_use += unpack_cost(1); 1.471 + if (ct < s2->outcnt()) save_use += unpack_cost(1); 1.472 1.473 - return save; 1.474 + return MAX2(save_in, save_use); 1.475 } 1.476 1.477 //------------------------------costs--------------------------- 1.478 @@ -778,8 +911,9 @@ 1.479 
//------------------------------combine_packs--------------------------- 1.480 // Combine packs A and B with A.last == B.first into A.first..,A.last,B.second,..B.last 1.481 void SuperWord::combine_packs() { 1.482 - bool changed; 1.483 - do { 1.484 + bool changed = true; 1.485 + // Combine packs regardless max vector size. 1.486 + while (changed) { 1.487 changed = false; 1.488 for (int i = 0; i < _packset.length(); i++) { 1.489 Node_List* p1 = _packset.at(i); 1.490 @@ -787,6 +921,7 @@ 1.491 for (int j = 0; j < _packset.length(); j++) { 1.492 Node_List* p2 = _packset.at(j); 1.493 if (p2 == NULL) continue; 1.494 + if (i == j) continue; 1.495 if (p1->at(p1->size()-1) == p2->at(0)) { 1.496 for (uint k = 1; k < p2->size(); k++) { 1.497 p1->push(p2->at(k)); 1.498 @@ -796,8 +931,39 @@ 1.499 } 1.500 } 1.501 } 1.502 - } while (changed); 1.503 + } 1.504 1.505 + // Split packs which have size greater then max vector size. 1.506 + for (int i = 0; i < _packset.length(); i++) { 1.507 + Node_List* p1 = _packset.at(i); 1.508 + if (p1 != NULL) { 1.509 + BasicType bt = velt_basic_type(p1->at(0)); 1.510 + uint max_vlen = Matcher::max_vector_size(bt); // Max elements in vector 1.511 + assert(is_power_of_2(max_vlen), "sanity"); 1.512 + uint psize = p1->size(); 1.513 + if (!is_power_of_2(psize)) { 1.514 + // Skip pack which can't be vector. 1.515 + // case1: for(...) { a[i] = i; } elements values are different (i+x) 1.516 + // case2: for(...) { a[i] = b[i+1]; } can't align both, load and store 1.517 + _packset.at_put(i, NULL); 1.518 + continue; 1.519 + } 1.520 + if (psize > max_vlen) { 1.521 + Node_List* pack = new Node_List(); 1.522 + for (uint j = 0; j < psize; j++) { 1.523 + pack->push(p1->at(j)); 1.524 + if (pack->size() >= max_vlen) { 1.525 + assert(is_power_of_2(pack->size()), "sanity"); 1.526 + _packset.append(pack); 1.527 + pack = new Node_List(); 1.528 + } 1.529 + } 1.530 + _packset.at_put(i, NULL); 1.531 + } 1.532 + } 1.533 + } 1.534 + 1.535 + // Compress list. 
1.536 for (int i = _packset.length() - 1; i >= 0; i--) { 1.537 Node_List* p1 = _packset.at(i); 1.538 if (p1 == NULL) { 1.539 @@ -880,8 +1046,7 @@ 1.540 // Can code be generated for pack p? 1.541 bool SuperWord::implemented(Node_List* p) { 1.542 Node* p0 = p->at(0); 1.543 - int vopc = VectorNode::opcode(p0->Opcode(), p->size(), velt_type(p0)); 1.544 - return vopc > 0 && Matcher::has_match_rule(vopc); 1.545 + return VectorNode::implemented(p0->Opcode(), p->size(), velt_basic_type(p0)); 1.546 } 1.547 1.548 //------------------------------profitable--------------------------- 1.549 @@ -939,36 +1104,36 @@ 1.550 } 1.551 1.552 //-------------------------------remove_and_insert------------------- 1.553 -//remove "current" from its current position in the memory graph and insert 1.554 -//it after the appropriate insertion point (lip or uip) 1.555 +// Remove "current" from its current position in the memory graph and insert 1.556 +// it after the appropriate insertion point (lip or uip). 1.557 void SuperWord::remove_and_insert(MemNode *current, MemNode *prev, MemNode *lip, 1.558 Node *uip, Unique_Node_List &sched_before) { 1.559 Node* my_mem = current->in(MemNode::Memory); 1.560 - _igvn.rehash_node_delayed(current); 1.561 - _igvn.hash_delete(my_mem); 1.562 + bool sched_up = sched_before.member(current); 1.563 1.564 - //remove current_store from its current position in the memmory graph 1.565 + // remove current_store from its current position in the memmory graph 1.566 for (DUIterator i = current->outs(); current->has_out(i); i++) { 1.567 Node* use = current->out(i); 1.568 if (use->is_Mem()) { 1.569 assert(use->in(MemNode::Memory) == current, "must be"); 1.570 - _igvn.rehash_node_delayed(use); 1.571 if (use == prev) { // connect prev to my_mem 1.572 - use->set_req(MemNode::Memory, my_mem); 1.573 + _igvn.replace_input_of(use, MemNode::Memory, my_mem); 1.574 + --i; //deleted this edge; rescan position 1.575 } else if (sched_before.member(use)) { 1.576 - _igvn.hash_delete(uip); 
1.577 - use->set_req(MemNode::Memory, uip); 1.578 + if (!sched_up) { // Will be moved together with current 1.579 + _igvn.replace_input_of(use, MemNode::Memory, uip); 1.580 + --i; //deleted this edge; rescan position 1.581 + } 1.582 } else { 1.583 - _igvn.hash_delete(lip); 1.584 - use->set_req(MemNode::Memory, lip); 1.585 + if (sched_up) { // Will be moved together with current 1.586 + _igvn.replace_input_of(use, MemNode::Memory, lip); 1.587 + --i; //deleted this edge; rescan position 1.588 + } 1.589 } 1.590 - --i; //deleted this edge; rescan position 1.591 } 1.592 } 1.593 1.594 - bool sched_up = sched_before.member(current); 1.595 Node *insert_pt = sched_up ? uip : lip; 1.596 - _igvn.hash_delete(insert_pt); 1.597 1.598 // all uses of insert_pt's memory state should use current's instead 1.599 for (DUIterator i = insert_pt->outs(); insert_pt->has_out(i); i++) { 1.600 @@ -988,7 +1153,7 @@ 1.601 } 1.602 1.603 //connect current to insert_pt 1.604 - current->set_req(MemNode::Memory, insert_pt); 1.605 + _igvn.replace_input_of(current, MemNode::Memory, insert_pt); 1.606 } 1.607 1.608 //------------------------------co_locate_pack---------------------------------- 1.609 @@ -1025,7 +1190,7 @@ 1.610 if (use->is_Mem() && use != previous) 1.611 memops.push(use); 1.612 } 1.613 - if(current == first) break; 1.614 + if (current == first) break; 1.615 previous = current; 1.616 current = current->in(MemNode::Memory)->as_Mem(); 1.617 } 1.618 @@ -1038,27 +1203,37 @@ 1.619 Node *s2 = memops.at(j); 1.620 if (!independent(s1, s2)) { 1.621 if (in_pack(s2, pk) || schedule_before_pack.member(s2)) { 1.622 - schedule_before_pack.push(s1); //s1 must be scheduled before 1.623 + schedule_before_pack.push(s1); // s1 must be scheduled before 1.624 Node_List* mem_pk = my_pack(s1); 1.625 if (mem_pk != NULL) { 1.626 for (uint ii = 0; ii < mem_pk->size(); ii++) { 1.627 - Node* s = mem_pk->at(ii); // follow partner 1.628 + Node* s = mem_pk->at(ii); // follow partner 1.629 if (memops.member(s) && 
!schedule_before_pack.member(s)) 1.630 schedule_before_pack.push(s); 1.631 } 1.632 } 1.633 + break; 1.634 } 1.635 } 1.636 } 1.637 } 1.638 } 1.639 1.640 + Node* upper_insert_pt = first->in(MemNode::Memory); 1.641 + // Following code moves loads connected to upper_insert_pt below aliased stores. 1.642 + // Collect such loads here and reconnect them back to upper_insert_pt later. 1.643 + memops.clear(); 1.644 + for (DUIterator i = upper_insert_pt->outs(); upper_insert_pt->has_out(i); i++) { 1.645 + Node* use = upper_insert_pt->out(i); 1.646 + if (!use->is_Store()) 1.647 + memops.push(use); 1.648 + } 1.649 + 1.650 MemNode* lower_insert_pt = last; 1.651 - Node* upper_insert_pt = first->in(MemNode::Memory); 1.652 previous = last; //previous store in pk 1.653 current = last->in(MemNode::Memory)->as_Mem(); 1.654 1.655 - //start scheduling from "last" to "first" 1.656 + // start scheduling from "last" to "first" 1.657 while (true) { 1.658 assert(in_bb(current), "stay in block"); 1.659 assert(in_pack(previous, pk), "previous stays in pack"); 1.660 @@ -1066,16 +1241,13 @@ 1.661 1.662 if (in_pack(current, pk)) { 1.663 // Forward users of my memory state (except "previous) to my input memory state 1.664 - _igvn.hash_delete(current); 1.665 for (DUIterator i = current->outs(); current->has_out(i); i++) { 1.666 Node* use = current->out(i); 1.667 if (use->is_Mem() && use != previous) { 1.668 assert(use->in(MemNode::Memory) == current, "must be"); 1.669 if (schedule_before_pack.member(use)) { 1.670 - _igvn.hash_delete(upper_insert_pt); 1.671 _igvn.replace_input_of(use, MemNode::Memory, upper_insert_pt); 1.672 } else { 1.673 - _igvn.hash_delete(lower_insert_pt); 1.674 _igvn.replace_input_of(use, MemNode::Memory, lower_insert_pt); 1.675 } 1.676 --i; // deleted this edge; rescan position 1.677 @@ -1089,6 +1261,14 @@ 1.678 if (current == first) break; 1.679 current = my_mem->as_Mem(); 1.680 } // end while 1.681 + 1.682 + // Reconnect loads back to upper_insert_pt. 
1.683 + for (uint i = 0; i < memops.size(); i++) { 1.684 + Node *ld = memops.at(i); 1.685 + if (ld->in(MemNode::Memory) != upper_insert_pt) { 1.686 + _igvn.replace_input_of(ld, MemNode::Memory, upper_insert_pt); 1.687 + } 1.688 + } 1.689 } else if (pk->at(0)->is_Load()) { //load 1.690 // all loads in the pack should have the same memory state. By default, 1.691 // we use the memory state of the last load. However, if any load could 1.692 @@ -1149,35 +1329,30 @@ 1.693 Node* vn = NULL; 1.694 Node* low_adr = p->at(0); 1.695 Node* first = executed_first(p); 1.696 + int opc = n->Opcode(); 1.697 if (n->is_Load()) { 1.698 - int opc = n->Opcode(); 1.699 Node* ctl = n->in(MemNode::Control); 1.700 Node* mem = first->in(MemNode::Memory); 1.701 Node* adr = low_adr->in(MemNode::Address); 1.702 const TypePtr* atyp = n->adr_type(); 1.703 - vn = VectorLoadNode::make(_phase->C, opc, ctl, mem, adr, atyp, vlen); 1.704 - 1.705 + vn = LoadVectorNode::make(_phase->C, opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n)); 1.706 } else if (n->is_Store()) { 1.707 // Promote value to be stored to vector 1.708 Node* val = vector_opd(p, MemNode::ValueIn); 1.709 - 1.710 - int opc = n->Opcode(); 1.711 Node* ctl = n->in(MemNode::Control); 1.712 Node* mem = first->in(MemNode::Memory); 1.713 Node* adr = low_adr->in(MemNode::Address); 1.714 const TypePtr* atyp = n->adr_type(); 1.715 - vn = VectorStoreNode::make(_phase->C, opc, ctl, mem, adr, atyp, val, vlen); 1.716 - 1.717 + vn = StoreVectorNode::make(_phase->C, opc, ctl, mem, adr, atyp, val, vlen); 1.718 } else if (n->req() == 3) { 1.719 // Promote operands to vector 1.720 Node* in1 = vector_opd(p, 1); 1.721 Node* in2 = vector_opd(p, 2); 1.722 - vn = VectorNode::make(_phase->C, n->Opcode(), in1, in2, vlen, velt_type(n)); 1.723 - 1.724 + vn = VectorNode::make(_phase->C, opc, in1, in2, vlen, velt_basic_type(n)); 1.725 } else { 1.726 ShouldNotReachHere(); 1.727 } 1.728 - 1.729 + assert(vn != NULL, "sanity"); 1.730 
_phase->_igvn.register_new_node_with_optimizer(vn); 1.731 _phase->set_ctrl(vn, _phase->get_ctrl(p->at(0))); 1.732 for (uint j = 0; j < p->size(); j++) { 1.733 @@ -1185,6 +1360,12 @@ 1.734 _igvn.replace_node(pm, vn); 1.735 } 1.736 _igvn._worklist.push(vn); 1.737 +#ifdef ASSERT 1.738 + if (TraceSuperWord) { 1.739 + tty->print("new Vector node: "); 1.740 + vn->dump(); 1.741 + } 1.742 +#endif 1.743 } 1.744 } 1.745 } 1.746 @@ -1207,10 +1388,10 @@ 1.747 } 1.748 1.749 if (same_opd) { 1.750 - if (opd->is_Vector() || opd->is_VectorLoad()) { 1.751 + if (opd->is_Vector() || opd->is_LoadVector()) { 1.752 return opd; // input is matching vector 1.753 } 1.754 - assert(!opd->is_VectorStore(), "such vector is not expected here"); 1.755 + assert(!opd->is_StoreVector(), "such vector is not expected here"); 1.756 // Convert scalar input to vector with the same number of elements as 1.757 // p0's vector. Use p0's type because size of operand's container in 1.758 // vector should match p0's size regardless operand's size. 
1.759 @@ -1219,12 +1400,18 @@ 1.760 1.761 _phase->_igvn.register_new_node_with_optimizer(vn); 1.762 _phase->set_ctrl(vn, _phase->get_ctrl(opd)); 1.763 +#ifdef ASSERT 1.764 + if (TraceSuperWord) { 1.765 + tty->print("new Vector node: "); 1.766 + vn->dump(); 1.767 + } 1.768 +#endif 1.769 return vn; 1.770 } 1.771 1.772 // Insert pack operation 1.773 - const Type* p0_t = velt_type(p0); 1.774 - PackNode* pk = PackNode::make(_phase->C, opd, p0_t); 1.775 + BasicType bt = velt_basic_type(p0); 1.776 + PackNode* pk = PackNode::make(_phase->C, opd, vlen, bt); 1.777 DEBUG_ONLY( const BasicType opd_bt = opd->bottom_type()->basic_type(); ) 1.778 1.779 for (uint i = 1; i < vlen; i++) { 1.780 @@ -1232,10 +1419,16 @@ 1.781 Node* in = pi->in(opd_idx); 1.782 assert(my_pack(in) == NULL, "Should already have been unpacked"); 1.783 assert(opd_bt == in->bottom_type()->basic_type(), "all same type"); 1.784 - pk->add_opd(in); 1.785 + pk->add_opd(i, in); 1.786 } 1.787 _phase->_igvn.register_new_node_with_optimizer(pk); 1.788 _phase->set_ctrl(pk, _phase->get_ctrl(opd)); 1.789 +#ifdef ASSERT 1.790 + if (TraceSuperWord) { 1.791 + tty->print("new Pack node: "); 1.792 + pk->dump(); 1.793 + } 1.794 +#endif 1.795 return pk; 1.796 } 1.797 1.798 @@ -1273,16 +1466,15 @@ 1.799 // Insert extract operation 1.800 _igvn.hash_delete(def); 1.801 int def_pos = alignment(def) / data_size(def); 1.802 - const Type* def_t = velt_type(def); 1.803 1.804 - Node* ex = ExtractNode::make(_phase->C, def, def_pos, def_t); 1.805 + Node* ex = ExtractNode::make(_phase->C, def, def_pos, velt_basic_type(def)); 1.806 _phase->_igvn.register_new_node_with_optimizer(ex); 1.807 _phase->set_ctrl(ex, _phase->get_ctrl(def)); 1.808 _igvn.replace_input_of(use, idx, ex); 1.809 _igvn._worklist.push(def); 1.810 1.811 bb_insert_after(ex, bb_idx(def)); 1.812 - set_velt_type(ex, def_t); 1.813 + set_velt_type(ex, velt_type(def)); 1.814 } 1.815 } 1.816 1.817 @@ -1509,10 +1701,7 @@ 1.818 // Initial type 1.819 for (int i = 0; i < 
_block.length(); i++) { 1.820 Node* n = _block.at(i); 1.821 - const Type* t = n->is_Mem() ? Type::get_const_basic_type(n->as_Mem()->memory_type()) 1.822 - : _igvn.type(n); 1.823 - const Type* vt = container_type(t); 1.824 - set_velt_type(n, vt); 1.825 + set_velt_type(n, container_type(n)); 1.826 } 1.827 1.828 // Propagate narrowed type backwards through operations 1.829 @@ -1543,7 +1732,7 @@ 1.830 bool same_type = true; 1.831 for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) { 1.832 Node *use = in->fast_out(k); 1.833 - if (!in_bb(use) || velt_type(use) != vt) { 1.834 + if (!in_bb(use) || !same_velt_type(use, n)) { 1.835 same_type = false; 1.836 break; 1.837 } 1.838 @@ -1575,20 +1764,24 @@ 1.839 if (!p.valid()) { 1.840 return bottom_align; 1.841 } 1.842 + int vw = vector_width_in_bytes(velt_basic_type(s)); 1.843 + if (vw < 2) { 1.844 + return bottom_align; // No vectors for this type 1.845 + } 1.846 int offset = p.offset_in_bytes(); 1.847 offset += iv_adjust_in_bytes; 1.848 - int off_rem = offset % vector_width_in_bytes(); 1.849 - int off_mod = off_rem >= 0 ? off_rem : off_rem + vector_width_in_bytes(); 1.850 + int off_rem = offset % vw; 1.851 + int off_mod = off_rem >= 0 ? 
off_rem : off_rem + vw; 1.852 return off_mod; 1.853 } 1.854 1.855 //---------------------------container_type--------------------------- 1.856 // Smallest type containing range of values 1.857 -const Type* SuperWord::container_type(const Type* t) { 1.858 - const Type* tp = t->make_ptr(); 1.859 - if (tp && tp->isa_aryptr()) { 1.860 - t = tp->is_aryptr()->elem(); 1.861 +const Type* SuperWord::container_type(Node* n) { 1.862 + if (n->is_Mem()) { 1.863 + return Type::get_const_basic_type(n->as_Mem()->memory_type()); 1.864 + } 1.865 + const Type* t = _igvn.type(n); 1.866 + if (t->basic_type() == T_INT) { 1.867 + if (t->higher_equal(TypeInt::BOOL)) return TypeInt::BOOL; 1.868 + if (t->higher_equal(TypeInt::BYTE)) return TypeInt::BYTE; 1.869 @@ -1599,11 +1792,22 @@ 1.870 return t; 1.871 } 1.872 1.873 +bool SuperWord::same_velt_type(Node* n1, Node* n2) { 1.874 + const Type* vt1 = velt_type(n1); 1.875 + const Type* vt2 = velt_type(n2); 1.876 + if (vt1->basic_type() == T_INT && vt2->basic_type() == T_INT) { 1.877 + // Compare vectors element sizes for integer types.
1.878 + return data_size(n1) == data_size(n2); 1.879 + } 1.880 + return vt1 == vt2; 1.881 +} 1.882 + 1.883 //-------------------------vector_opd_range----------------------- 1.884 // (Start, end] half-open range defining which operands are vector 1.885 void SuperWord::vector_opd_range(Node* n, uint* start, uint* end) { 1.886 switch (n->Opcode()) { 1.887 - case Op_LoadB: case Op_LoadUS: 1.888 + case Op_LoadB: case Op_LoadUB: 1.889 + case Op_LoadS: case Op_LoadUS: 1.890 case Op_LoadI: case Op_LoadL: 1.891 case Op_LoadF: case Op_LoadD: 1.892 case Op_LoadP: 1.893 @@ -1721,6 +1925,7 @@ 1.894 assert(orig_limit != NULL && _igvn.type(orig_limit) != Type::TOP, ""); 1.895 1.896 SWPointer align_to_ref_p(align_to_ref, this); 1.897 + assert(align_to_ref_p.valid(), "sanity"); 1.898 1.899 // Given: 1.900 // lim0 == original pre loop limit 1.901 @@ -1773,10 +1978,12 @@ 1.902 // N = (V - (e - lim0)) % V 1.903 // lim = lim0 - (V - (e - lim0)) % V 1.904 1.905 + int vw = vector_width_in_bytes(velt_basic_type(align_to_ref)); 1.906 + assert(vw > 1, "sanity"); 1.907 int stride = iv_stride(); 1.908 int scale = align_to_ref_p.scale_in_bytes(); 1.909 int elt_size = align_to_ref_p.memory_size(); 1.910 - int v_align = vector_width_in_bytes() / elt_size; 1.911 + int v_align = vw / elt_size; 1.912 int k = align_to_ref_p.offset_in_bytes() / elt_size; 1.913 1.914 Node *kn = _igvn.intcon(k); 1.915 @@ -1796,6 +2003,25 @@ 1.916 _phase->_igvn.register_new_node_with_optimizer(e); 1.917 _phase->set_ctrl(e, pre_ctrl); 1.918 } 1.919 + if (vw > ObjectAlignmentInBytes) { 1.920 + // incorporate base e +/- base && Mask >>> log2(elt) 1.921 + Node* mask = _igvn.MakeConX(~(-1 << exact_log2(vw))); 1.922 + Node* xbase = new(_phase->C, 2) CastP2XNode(NULL, align_to_ref_p.base()); 1.923 + _phase->_igvn.register_new_node_with_optimizer(xbase); 1.924 + Node* masked_xbase = new (_phase->C, 3) AndXNode(xbase, mask); 1.925 + _phase->_igvn.register_new_node_with_optimizer(masked_xbase); 1.926 +#ifdef _LP64 1.927 + 
masked_xbase = new (_phase->C, 2) ConvL2INode(masked_xbase); 1.928 + _phase->_igvn.register_new_node_with_optimizer(masked_xbase); 1.929 +#endif 1.930 + Node* log2_elt = _igvn.intcon(exact_log2(elt_size)); 1.931 + Node* bref = new (_phase->C, 3) URShiftINode(masked_xbase, log2_elt); 1.932 + _phase->_igvn.register_new_node_with_optimizer(bref); 1.933 + _phase->set_ctrl(bref, pre_ctrl); 1.934 + e = new (_phase->C, 3) AddINode(e, bref); 1.935 + _phase->_igvn.register_new_node_with_optimizer(e); 1.936 + _phase->set_ctrl(e, pre_ctrl); 1.937 + } 1.938 1.939 // compute e +/- lim0 1.940 if (scale < 0) {