--- a/src/share/vm/opto/loopTransform.cpp	Fri Aug 20 09:55:50 2010 -0700
+++ b/src/share/vm/opto/loopTransform.cpp	Fri Aug 27 17:33:49 2010 -0700
@@ -2049,11 +2049,18 @@
   if (cmp->Opcode() != Op_CmpU ) {
     return false;
   }
-  if (cmp->in(2)->Opcode() != Op_LoadRange) {
-    return false;
+  Node* range = cmp->in(2);
+  if (range->Opcode() != Op_LoadRange) {
+    const TypeInt* tint = phase->_igvn.type(range)->isa_int();
+    if (!OptimizeFill || tint == NULL || tint->empty() || tint->_lo < 0) {
+      // Allow predication on positive values that aren't LoadRanges.
+      // This allows optimization of loops where the length of the
+      // array is a known value and doesn't need to be loaded back
+      // from the array.
+      return false;
+    }
   }
-  LoadRangeNode* lr = (LoadRangeNode*)cmp->in(2);
-  if (!invar.is_invariant(lr)) { // loadRange must be invariant
+  if (!invar.is_invariant(range)) {
     return false;
   }
   Node *iv = _head->as_CountedLoop()->phi();
@@ -2248,9 +2255,9 @@
       const Node* cmp = bol->in(1)->as_Cmp();
       Node* idx = cmp->in(1);
       assert(!invar.is_invariant(idx), "index is variant");
-      assert(cmp->in(2)->Opcode() == Op_LoadRange, "must be");
-      Node* ld_rng = cmp->in(2); // LoadRangeNode
-      assert(invar.is_invariant(ld_rng), "load range must be invariant");
+      assert(cmp->in(2)->Opcode() == Op_LoadRange || OptimizeFill, "must be");
+      Node* rng = cmp->in(2);
+      assert(invar.is_invariant(rng), "range must be invariant");
       int scale = 1;
       Node* offset = zero;
       bool ok = is_scaled_iv_plus_offset(idx, cl->phi(), &scale, &offset);
@@ -2271,21 +2278,21 @@
 
       // Perform cloning to keep Invariance state correct since the
       // late schedule will place invariant things in the loop.
-      ld_rng = invar.clone(ld_rng, ctrl);
+      rng = invar.clone(rng, ctrl);
       if (offset && offset != zero) {
         assert(invar.is_invariant(offset), "offset must be loop invariant");
         offset = invar.clone(offset, ctrl);
       }
 
       // Test the lower bound
-      Node* lower_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, ld_rng, false);
+      Node* lower_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, rng, false);
       IfNode* lower_bound_iff = lower_bound_proj->in(0)->as_If();
       _igvn.hash_delete(lower_bound_iff);
       lower_bound_iff->set_req(1, lower_bound_bol);
       if (TraceLoopPredicate) tty->print_cr("lower bound check if: %d", lower_bound_iff->_idx);
 
       // Test the upper bound
-      Node* upper_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, ld_rng, true);
+      Node* upper_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, rng, true);
       IfNode* upper_bound_iff = upper_bound_proj->in(0)->as_If();
       _igvn.hash_delete(upper_bound_iff);
       upper_bound_iff->set_req(1, upper_bound_bol);
@@ -2366,3 +2373,348 @@
 
   return hoisted;
 }
+
+
+// Process all the loops in the loop tree and replace any fill
+// patterns with an intrinsic version.
+bool PhaseIdealLoop::do_intrinsify_fill() {
+  bool changed = false;
+  for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
+    IdealLoopTree* lpt = iter.current();
+    changed |= intrinsify_fill(lpt);
+  }
+  return changed;
+}
+
+
+// Examine an inner loop looking for a single store of an invariant
+// value in a unit stride loop.
+bool PhaseIdealLoop::match_fill_loop(IdealLoopTree* lpt, Node*& store, Node*& store_value,
+                                     Node*& shift, Node*& con) {
+  const char* msg = NULL;
+  Node* msg_node = NULL;
+
+  store_value = NULL;
+  con = NULL;
+  shift = NULL;
+
+  // Process the loop looking for stores.  If there are multiple
+  // stores or extra control flow, give up at this point.
+  CountedLoopNode* head = lpt->_head->as_CountedLoop();
+  for (uint i = 0; msg == NULL && i < lpt->_body.size(); i++) {
+    Node* n = lpt->_body.at(i);
+    if (n->outcnt() == 0) continue; // Ignore dead
+    if (n->is_Store()) {
+      if (store != NULL) {
+        msg = "multiple stores";
+        break;
+      }
+      int opc = n->Opcode();
+      if (opc == Op_StoreP || opc == Op_StoreN || opc == Op_StoreCM) {
+        msg = "oop fills not handled";
+        break;
+      }
+      Node* value = n->in(MemNode::ValueIn);
+      if (!lpt->is_invariant(value)) {
+        msg = "variant store value";
+      }
+      store = n;
+      store_value = value;
+    } else if (n->is_If() && n != head->loopexit()) {
+      msg = "extra control flow";
+      msg_node = n;
+    }
+  }
+
+  if (store == NULL) {
+    // No store in loop
+    return false;
+  }
+
+  if (msg == NULL && head->stride_con() != 1) {
+    // could handle negative strides too
+    if (head->stride_con() < 0) {
+      msg = "negative stride";
+    } else {
+      msg = "non-unit stride";
+    }
+  }
+
+  if (msg == NULL && !store->in(MemNode::Address)->is_AddP()) {
+    msg = "can't handle store address";
+    msg_node = store->in(MemNode::Address);
+  }
+
+  // Make sure there is an appropriate fill routine
+  BasicType t = store->as_Mem()->memory_type();
+  const char* fill_name;
+  if (msg == NULL &&
+      StubRoutines::select_fill_function(t, false, fill_name) == NULL) {
+    msg = "unsupported store";
+    msg_node = store;
+  }
+
+  if (msg != NULL) {
+#ifndef PRODUCT
+    if (TraceOptimizeFill) {
+      tty->print_cr("not fill intrinsic candidate: %s", msg);
+      if (msg_node != NULL) msg_node->dump();
+    }
+#endif
+    return false;
+  }
+
+  // Make sure the address expression can be handled.  It should be
+  // head->phi * elsize + con.  head->phi might have a ConvI2L.
+  Node* elements[4];
+  Node* conv = NULL;
+  int count = store->in(MemNode::Address)->as_AddP()->unpack_offsets(elements, ARRAY_SIZE(elements));
+  for (int e = 0; e < count; e++) {
+    Node* n = elements[e];
+    if (n->is_Con() && con == NULL) {
+      con = n;
+    } else if (n->Opcode() == Op_LShiftX && shift == NULL) {
+      Node* value = n->in(1);
+#ifdef _LP64
+      if (value->Opcode() == Op_ConvI2L) {
+        conv = value;
+        value = value->in(1);
+      }
+#endif
+      if (value != head->phi()) {
+        msg = "unhandled shift in address";
+      } else {
+        shift = n;
+        assert(type2aelembytes(store->as_Mem()->memory_type(), true) == 1 << shift->in(2)->get_int(), "scale should match");
+      }
+    } else if (n->Opcode() == Op_ConvI2L && conv == NULL) {
+      if (n->in(1) == head->phi()) {
+        conv = n;
+      } else {
+        msg = "unhandled input to ConvI2L";
+      }
+    } else if (n == head->phi()) {
+      // no shift, check below for allowed cases
+    } else {
+      msg = "unhandled node in address";
+      msg_node = n;
+    }
+  }
+
+  if (count == -1) {
+    msg = "malformed address expression";
+    msg_node = store;
+  }
+
+  // byte sized items won't have a shift
+  if (msg == NULL && shift == NULL && t != T_BYTE && t != T_BOOLEAN) {
+    msg = "can't find shift";
+    msg_node = store;
+  }
+
+  if (msg != NULL) {
+#ifndef PRODUCT
+    if (TraceOptimizeFill) {
+      tty->print_cr("not fill intrinsic: %s", msg);
+      if (msg_node != NULL) msg_node->dump();
+    }
+#endif
+    return false;
+  }
+
+  // Now make sure all the other nodes in the loop can be handled
+  VectorSet ok(Thread::current()->resource_area());
+
+  // store related values are ok
+  ok.set(store->_idx);
+  ok.set(store->in(MemNode::Memory)->_idx);
+
+  // Loop structure is ok
+  ok.set(head->_idx);
+  ok.set(head->loopexit()->_idx);
+  ok.set(head->phi()->_idx);
+  ok.set(head->incr()->_idx);
+  ok.set(head->loopexit()->cmp_node()->_idx);
+  ok.set(head->loopexit()->in(1)->_idx);
+
+  // Address elements are ok
+  if (con)   ok.set(con->_idx);
+  if (shift) ok.set(shift->_idx);
+  if (conv)  ok.set(conv->_idx);
+
+  for (uint i = 0; msg == NULL && i < lpt->_body.size(); i++) {
+    Node* n = lpt->_body.at(i);
+    if (n->outcnt() == 0) continue; // Ignore dead
+    if (ok.test(n->_idx)) continue;
+    // Backedge projection is ok
+    if (n->is_IfTrue() && n->in(0) == head->loopexit()) continue;
+    if (!n->is_AddP()) {
+      msg = "unhandled node";
+      msg_node = n;
+      break;
+    }
+  }
+
+  // Make sure no unexpected values are used outside the loop
+  for (uint i = 0; msg == NULL && i < lpt->_body.size(); i++) {
+    Node* n = lpt->_body.at(i);
+    // These values can be replaced with other nodes if they are used
+    // outside the loop.
+    if (n == store || n == head->loopexit() || n == head->incr()) continue;
+    for (SimpleDUIterator iter(n); iter.has_next(); iter.next()) {
+      Node* use = iter.get();
+      if (!lpt->_body.contains(use)) {
+        msg = "node is used outside loop";
+        // lpt->_body.dump();
+        msg_node = n;
+        break;
+      }
+    }
+  }
+
+#ifdef ASSERT
+  if (TraceOptimizeFill) {
+    if (msg != NULL) {
+      tty->print_cr("no fill intrinsic: %s", msg);
+      if (msg_node != NULL) msg_node->dump();
+    } else {
+      tty->print_cr("fill intrinsic for:");
+    }
+    store->dump();
+    if (Verbose) {
+      lpt->_body.dump();
+    }
+  }
+#endif
+
+  return msg == NULL;
+}
+
+
+
+bool PhaseIdealLoop::intrinsify_fill(IdealLoopTree* lpt) {
+  // Only for counted inner loops
+  if (!lpt->is_counted() || !lpt->is_inner()) {
+    return false;
+  }
+
+  // Must have constant stride
+  CountedLoopNode* head = lpt->_head->as_CountedLoop();
+  if (!head->stride_is_con() || !head->is_normal_loop()) {
+    return false;
+  }
+
+  // Check that the body only contains a store of a loop invariant
+  // value that is indexed by the loop phi.
+  Node* store = NULL;
+  Node* store_value = NULL;
+  Node* shift = NULL;
+  Node* offset = NULL;
+  if (!match_fill_loop(lpt, store, store_value, shift, offset)) {
+    return false;
+  }
+
+  // Now replace the whole loop body by a call to a fill routine that
+  // covers the same region as the loop.
+  Node* base = store->in(MemNode::Address)->as_AddP()->in(AddPNode::Base);
+
+  // Build an expression for the beginning of the copy region
+  Node* index = head->init_trip();
+#ifdef _LP64
+  index = new (C, 2) ConvI2LNode(index);
+  _igvn.register_new_node_with_optimizer(index);
+#endif
+  if (shift != NULL) {
+    // byte arrays don't require a shift but others do.
+    index = new (C, 3) LShiftXNode(index, shift->in(2));
+    _igvn.register_new_node_with_optimizer(index);
+  }
+  index = new (C, 4) AddPNode(base, base, index);
+  _igvn.register_new_node_with_optimizer(index);
+  Node* from = new (C, 4) AddPNode(base, index, offset);
+  _igvn.register_new_node_with_optimizer(from);
+  // Compute the number of elements to copy
+  Node* len = new (C, 3) SubINode(head->limit(), head->init_trip());
+  _igvn.register_new_node_with_optimizer(len);
+
+  BasicType t = store->as_Mem()->memory_type();
+  bool aligned = false;
+  if (offset != NULL && head->init_trip()->is_Con()) {
+    int element_size = type2aelembytes(t);
+    aligned = (offset->find_intptr_t_type()->get_con() + head->init_trip()->get_int() * element_size) % HeapWordSize == 0;
+  }
+
+  // Build a call to the fill routine
+  const char* fill_name;
+  address fill = StubRoutines::select_fill_function(t, aligned, fill_name);
+  assert(fill != NULL, "what?");
+
+  // Convert float/double to int/long for fill routines
+  if (t == T_FLOAT) {
+    store_value = new (C, 2) MoveF2INode(store_value);
+    _igvn.register_new_node_with_optimizer(store_value);
+  } else if (t == T_DOUBLE) {
+    store_value = new (C, 2) MoveD2LNode(store_value);
+    _igvn.register_new_node_with_optimizer(store_value);
+  }
+
+  Node* mem_phi = store->in(MemNode::Memory);
+  Node* result_ctrl;
+  Node* result_mem;
+  const TypeFunc* call_type = OptoRuntime::array_fill_Type();
+  int size = call_type->domain()->cnt();
+  CallLeafNode *call = new (C, size) CallLeafNoFPNode(call_type, fill,
+                                                      fill_name, TypeAryPtr::get_array_body_type(t));
+  call->init_req(TypeFunc::Parms+0, from);
+  call->init_req(TypeFunc::Parms+1, store_value);
+  call->init_req(TypeFunc::Parms+2, len);
+  call->init_req( TypeFunc::Control,   head->init_control());
+  call->init_req( TypeFunc::I_O,       C->top() );       // does no i/o
+  call->init_req( TypeFunc::Memory,    mem_phi->in(LoopNode::EntryControl) );
+  call->init_req( TypeFunc::ReturnAdr, C->start()->proj_out(TypeFunc::ReturnAdr) );
+  call->init_req( TypeFunc::FramePtr,  C->start()->proj_out(TypeFunc::FramePtr) );
+  _igvn.register_new_node_with_optimizer(call);
+  result_ctrl = new (C, 1) ProjNode(call,TypeFunc::Control);
+  _igvn.register_new_node_with_optimizer(result_ctrl);
+  result_mem = new (C, 1) ProjNode(call,TypeFunc::Memory);
+  _igvn.register_new_node_with_optimizer(result_mem);
+
+  // If this fill is tightly coupled to an allocation and overwrites
+  // the whole body, allow it to take over the zeroing.
+  AllocateNode* alloc = AllocateNode::Ideal_allocation(base, this);
+  if (alloc != NULL && alloc->is_AllocateArray()) {
+    Node* length = alloc->as_AllocateArray()->Ideal_length();
+    if (head->limit() == length &&
+        head->init_trip() == _igvn.intcon(0)) {
+      if (TraceOptimizeFill) {
+        tty->print_cr("Eliminated zeroing in allocation");
+      }
+      alloc->maybe_set_complete(&_igvn);
+    } else {
+#ifdef ASSERT
+      if (TraceOptimizeFill) {
+        tty->print_cr("filling array but bounds don't match");
+        alloc->dump();
+        head->init_trip()->dump();
+        head->limit()->dump();
+        length->dump();
+      }
+#endif
+    }
+  }
+
+  // Redirect the old control and memory edges that are outside the loop.
+  Node* exit = head->loopexit()->proj_out(0);
+  _igvn.replace_node(exit, result_ctrl);
+  _igvn.replace_node(store, result_mem);
+  // Any uses of the increment outside of the loop become the loop limit.
+  _igvn.replace_node(head->incr(), head->limit());
+
+  // Disconnect the head from the loop.
+  for (uint i = 0; i < lpt->_body.size(); i++) {
+    Node* n = lpt->_body.at(i);
+    _igvn.replace_node(n, C->top());
+  }
+
+  return true;
+}
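
Editor's note (not part of the changeset): the loop shape that match_fill_loop is written to accept is a counted inner loop with unit stride whose body is a single store of a loop-invariant value indexed by the loop phi. A minimal illustrative sketch in C++, with invented names, of the kind of loop body that -XX:+OptimizeFill can replace with one call to the stub returned by StubRoutines::select_fill_function:

    // Illustration only -- not HotSpot code.  A unit-stride counted loop
    // with a single store of a loop-invariant value; this is the pattern
    // intrinsify_fill converts into a single array-fill stub call.
    void fill(int* a, int init, int limit, int value) {
      for (int i = init; i < limit; i++) {
        a[i] = value;
      }
    }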
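
A second hedged sketch (again illustrative, with invented names): the "from" and "len" arguments that intrinsify_fill assembles in the ideal graph correspond to the pointer arithmetic below, where con is the constant offset unpacked from the AddP chain and shift is log2 of the element size (no shift for byte-sized elements):

    // Illustration only -- plain C++ restatement of the graph built above.
    char* fill_from(char* base, intptr_t con, int init, int shift) {
      return base + con + ((intptr_t)init << shift);  // from = base + con + (init << shift)
    }
    int fill_len(int init, int limit) {
      return limit - init;                            // element count passed to the stub
    }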