src/share/vm/opto/loopTransform.cpp

changeset 2118: d6f45b55c972
parent 1976:    6027dddc26c6
child 2140:     5e4f03302987
     1.1 --- a/src/share/vm/opto/loopTransform.cpp	Fri Aug 20 09:55:50 2010 -0700
     1.2 +++ b/src/share/vm/opto/loopTransform.cpp	Fri Aug 27 17:33:49 2010 -0700
     1.3 @@ -2049,11 +2049,18 @@
     1.4    if (cmp->Opcode() != Op_CmpU ) {
     1.5      return false;
     1.6    }
     1.7 -  if (cmp->in(2)->Opcode() != Op_LoadRange) {
     1.8 -    return false;
     1.9 +  Node* range = cmp->in(2);
    1.10 +  if (range->Opcode() != Op_LoadRange) {
    1.11 +    const TypeInt* tint = phase->_igvn.type(range)->isa_int();
    1.12 +    if (!OptimizeFill || tint == NULL || tint->empty() || tint->_lo < 0) {
    1.13 +      // Allow predication on positive values that aren't LoadRanges.
    1.14 +      // This allows optimization of loops where the length of the
    1.15 +      // array is a known value and doesn't need to be loaded back
    1.16 +      // from the array.
    1.17 +      return false;
    1.18 +    }
    1.19    }
    1.20 -  LoadRangeNode* lr = (LoadRangeNode*)cmp->in(2);
    1.21 -  if (!invar.is_invariant(lr)) { // loadRange must be invariant
    1.22 +  if (!invar.is_invariant(range)) {
    1.23      return false;
    1.24    }
    1.25    Node *iv     = _head->as_CountedLoop()->phi();
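
With OptimizeFill enabled, the hunk above no longer requires the range
side of the CmpU to be a LoadRange: any loop-invariant int whose type is
provably non-negative is accepted, so predication also covers loops whose
bound is a known value rather than a length loaded back from the array. A
minimal Java sketch of a loop shape this admits (illustrative only, not
part of the changeset):

    // Hypothetical example: the array length is the compile-time
    // constant 100, so the bound in the range check is a positive
    // TypeInt constant rather than a LoadRange node.
    static int[] makeTable() {
        int[] a = new int[100];
        for (int i = 0; i < 100; i++) {
            a[i] = i * i;   // this range check can now be predicated
        }
        return a;
    }
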
    1.26 @@ -2248,9 +2255,9 @@
    1.27        const Node*    cmp    = bol->in(1)->as_Cmp();
    1.28        Node*          idx    = cmp->in(1);
    1.29        assert(!invar.is_invariant(idx), "index is variant");
    1.30 -      assert(cmp->in(2)->Opcode() == Op_LoadRange, "must be");
    1.31 -      Node* ld_rng = cmp->in(2); // LoadRangeNode
    1.32 -      assert(invar.is_invariant(ld_rng), "load range must be invariant");
    1.33 +      assert(cmp->in(2)->Opcode() == Op_LoadRange || OptimizeFill, "must be");
    1.34 +      Node* rng = cmp->in(2);
    1.35 +      assert(invar.is_invariant(rng), "range must be invariant");
    1.36        int scale    = 1;
    1.37        Node* offset = zero;
    1.38        bool ok = is_scaled_iv_plus_offset(idx, cl->phi(), &scale, &offset);
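
For context, is_scaled_iv_plus_offset decomposes the index expression
into scale * phi + offset before the predicates are built. A hedged Java
illustration of an index shape it matches (names invented for the
example):

    // The access a[2 * i + off] decomposes to scale = 2 and
    // offset = off; both are loop invariant, i is the loop phi.
    static void strided(int[] a, int init, int limit, int off) {
        for (int i = init; i < limit; i++) {
            a[2 * i + off] = 0;
        }
    }
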
    1.39 @@ -2271,21 +2278,21 @@
    1.40  
    1.41        // Perform cloning to keep Invariance state correct since the
    1.42        // late schedule will place invariant things in the loop.
    1.43 -      ld_rng = invar.clone(ld_rng, ctrl);
    1.44 +      rng = invar.clone(rng, ctrl);
    1.45        if (offset && offset != zero) {
    1.46          assert(invar.is_invariant(offset), "offset must be loop invariant");
    1.47          offset = invar.clone(offset, ctrl);
    1.48        }
    1.49  
    1.50        // Test the lower bound
    1.51 -      Node*  lower_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, ld_rng, false);
    1.52 +      Node*  lower_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, rng, false);
    1.53        IfNode* lower_bound_iff = lower_bound_proj->in(0)->as_If();
    1.54        _igvn.hash_delete(lower_bound_iff);
    1.55        lower_bound_iff->set_req(1, lower_bound_bol);
    1.56        if (TraceLoopPredicate) tty->print_cr("lower bound check if: %d", lower_bound_iff->_idx);
    1.57  
    1.58        // Test the upper bound
    1.59 -      Node* upper_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, ld_rng, true);
    1.60 +      Node* upper_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, rng, true);
    1.61        IfNode* upper_bound_iff = upper_bound_proj->in(0)->as_If();
    1.62        _igvn.hash_delete(upper_bound_iff);
    1.63        upper_bound_iff->set_req(1, upper_bound_bol);
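
The two rc_predicate calls replace the per-iteration range check with two
loop-invariant tests hoisted above the loop. Roughly, for stride 1 and
accesses of the form a[scale*i + offset], the hoisted logic looks like
the following sketch (runnable Java standing in for the IR; the throw
stands in for the uncommon trap taken when a predicate fails):

    static void checkPredicates(int scale, int init, int limit,
                                int offset, int rng) {
        // Lower bound: the first access must not be negative.
        if (scale * init + offset < 0)
            throw new ArrayIndexOutOfBoundsException();
        // Upper bound: the last access must stay below rng.
        if (scale * (limit - 1) + offset >= rng)
            throw new ArrayIndexOutOfBoundsException();
    }
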
    1.64 @@ -2366,3 +2373,348 @@
    1.65  
    1.66    return hoisted;
    1.67  }
    1.68 +
    1.69 +
    1.70 +// Process all the loops in the loop tree and replace any fill
     1.71 +// patterns with an intrinsic version.
    1.72 +bool PhaseIdealLoop::do_intrinsify_fill() {
    1.73 +  bool changed = false;
    1.74 +  for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
    1.75 +    IdealLoopTree* lpt = iter.current();
    1.76 +    changed |= intrinsify_fill(lpt);
    1.77 +  }
    1.78 +  return changed;
    1.79 +}
    1.80 +
    1.81 +
     1.82 +// Examine an inner loop looking for a single store of an invariant
     1.83 +// value in a unit stride loop.
    1.84 +bool PhaseIdealLoop::match_fill_loop(IdealLoopTree* lpt, Node*& store, Node*& store_value,
    1.85 +                                     Node*& shift, Node*& con) {
    1.86 +  const char* msg = NULL;
    1.87 +  Node* msg_node = NULL;
    1.88 +
    1.89 +  store_value = NULL;
    1.90 +  con = NULL;
    1.91 +  shift = NULL;
    1.92 +
     1.93 +  // Process the loop looking for stores.  If there are multiple
     1.94 +  // stores or extra control flow, give up at this point.
    1.95 +  CountedLoopNode* head = lpt->_head->as_CountedLoop();
    1.96 +  for (uint i = 0; msg == NULL && i < lpt->_body.size(); i++) {
    1.97 +    Node* n = lpt->_body.at(i);
    1.98 +    if (n->outcnt() == 0) continue; // Ignore dead
    1.99 +    if (n->is_Store()) {
   1.100 +      if (store != NULL) {
   1.101 +        msg = "multiple stores";
   1.102 +        break;
   1.103 +      }
   1.104 +      int opc = n->Opcode();
   1.105 +      if (opc == Op_StoreP || opc == Op_StoreN || opc == Op_StoreCM) {
   1.106 +        msg = "oop fills not handled";
   1.107 +        break;
   1.108 +      }
   1.109 +      Node* value = n->in(MemNode::ValueIn);
   1.110 +      if (!lpt->is_invariant(value)) {
   1.111 +        msg  = "variant store value";
   1.112 +      }
   1.113 +      store = n;
   1.114 +      store_value = value;
   1.115 +    } else if (n->is_If() && n != head->loopexit()) {
   1.116 +      msg = "extra control flow";
   1.117 +      msg_node = n;
   1.118 +    }
   1.119 +  }
   1.120 +
   1.121 +  if (store == NULL) {
   1.122 +    // No store in loop
   1.123 +    return false;
   1.124 +  }
   1.125 +
   1.126 +  if (msg == NULL && head->stride_con() != 1) {
   1.127 +    // could handle negative strides too
   1.128 +    if (head->stride_con() < 0) {
   1.129 +      msg = "negative stride";
   1.130 +    } else {
   1.131 +      msg = "non-unit stride";
   1.132 +    }
   1.133 +  }
   1.134 +
   1.135 +  if (msg == NULL && !store->in(MemNode::Address)->is_AddP()) {
   1.136 +    msg = "can't handle store address";
   1.137 +    msg_node = store->in(MemNode::Address);
   1.138 +  }
   1.139 +
   1.140 +  // Make sure there is an appropriate fill routine
   1.141 +  BasicType t = store->as_Mem()->memory_type();
   1.142 +  const char* fill_name;
   1.143 +  if (msg == NULL &&
   1.144 +      StubRoutines::select_fill_function(t, false, fill_name) == NULL) {
   1.145 +    msg = "unsupported store";
   1.146 +    msg_node = store;
   1.147 +  }
   1.148 +
   1.149 +  if (msg != NULL) {
   1.150 +#ifndef PRODUCT
   1.151 +    if (TraceOptimizeFill) {
   1.152 +      tty->print_cr("not fill intrinsic candidate: %s", msg);
   1.153 +      if (msg_node != NULL) msg_node->dump();
   1.154 +    }
   1.155 +#endif
   1.156 +    return false;
   1.157 +  }
   1.158 +
   1.159 +  // Make sure the address expression can be handled.  It should be
   1.160 +  // head->phi * elsize + con.  head->phi might have a ConvI2L.
   1.161 +  Node* elements[4];
   1.162 +  Node* conv = NULL;
   1.163 +  int count = store->in(MemNode::Address)->as_AddP()->unpack_offsets(elements, ARRAY_SIZE(elements));
   1.164 +  for (int e = 0; e < count; e++) {
   1.165 +    Node* n = elements[e];
   1.166 +    if (n->is_Con() && con == NULL) {
   1.167 +      con = n;
   1.168 +    } else if (n->Opcode() == Op_LShiftX && shift == NULL) {
   1.169 +      Node* value = n->in(1);
   1.170 +#ifdef _LP64
   1.171 +      if (value->Opcode() == Op_ConvI2L) {
   1.172 +        conv = value;
   1.173 +        value = value->in(1);
   1.174 +      }
   1.175 +#endif
   1.176 +      if (value != head->phi()) {
   1.177 +        msg = "unhandled shift in address";
   1.178 +      } else {
   1.179 +        shift = n;
   1.180 +        assert(type2aelembytes(store->as_Mem()->memory_type(), true) == 1 << shift->in(2)->get_int(), "scale should match");
   1.181 +      }
   1.182 +    } else if (n->Opcode() == Op_ConvI2L && conv == NULL) {
   1.183 +      if (n->in(1) == head->phi()) {
   1.184 +        conv = n;
   1.185 +      } else {
   1.186 +        msg = "unhandled input to ConvI2L";
   1.187 +      }
   1.188 +    } else if (n == head->phi()) {
   1.189 +      // no shift, check below for allowed cases
   1.190 +    } else {
   1.191 +      msg = "unhandled node in address";
   1.192 +      msg_node = n;
   1.193 +    }
   1.194 +  }
   1.195 +
   1.196 +  if (count == -1) {
   1.197 +    msg = "malformed address expression";
   1.198 +    msg_node = store;
   1.199 +  }
   1.200 +
   1.201 +  // byte sized items won't have a shift
   1.202 +  if (msg == NULL && shift == NULL && t != T_BYTE && t != T_BOOLEAN) {
   1.203 +    msg = "can't find shift";
   1.204 +    msg_node = store;
   1.205 +  }
   1.206 +
   1.207 +  if (msg != NULL) {
   1.208 +#ifndef PRODUCT
   1.209 +    if (TraceOptimizeFill) {
   1.210 +      tty->print_cr("not fill intrinsic: %s", msg);
   1.211 +      if (msg_node != NULL) msg_node->dump();
   1.212 +    }
   1.213 +#endif
   1.214 +    return false;
   1.215 +  }
   1.216 +
    1.217 +  // Now make sure all the other nodes in the loop can be handled
   1.218 +  VectorSet ok(Thread::current()->resource_area());
   1.219 +
   1.220 +  // store related values are ok
   1.221 +  ok.set(store->_idx);
   1.222 +  ok.set(store->in(MemNode::Memory)->_idx);
   1.223 +
   1.224 +  // Loop structure is ok
   1.225 +  ok.set(head->_idx);
   1.226 +  ok.set(head->loopexit()->_idx);
   1.227 +  ok.set(head->phi()->_idx);
   1.228 +  ok.set(head->incr()->_idx);
   1.229 +  ok.set(head->loopexit()->cmp_node()->_idx);
   1.230 +  ok.set(head->loopexit()->in(1)->_idx);
   1.231 +
   1.232 +  // Address elements are ok
   1.233 +  if (con)   ok.set(con->_idx);
   1.234 +  if (shift) ok.set(shift->_idx);
   1.235 +  if (conv)  ok.set(conv->_idx);
   1.236 +
   1.237 +  for (uint i = 0; msg == NULL && i < lpt->_body.size(); i++) {
   1.238 +    Node* n = lpt->_body.at(i);
   1.239 +    if (n->outcnt() == 0) continue; // Ignore dead
   1.240 +    if (ok.test(n->_idx)) continue;
   1.241 +    // Backedge projection is ok
   1.242 +    if (n->is_IfTrue() && n->in(0) == head->loopexit()) continue;
   1.243 +    if (!n->is_AddP()) {
   1.244 +      msg = "unhandled node";
   1.245 +      msg_node = n;
   1.246 +      break;
   1.247 +    }
   1.248 +  }
   1.249 +
   1.250 +  // Make sure no unexpected values are used outside the loop
   1.251 +  for (uint i = 0; msg == NULL && i < lpt->_body.size(); i++) {
   1.252 +    Node* n = lpt->_body.at(i);
   1.253 +    // These values can be replaced with other nodes if they are used
   1.254 +    // outside the loop.
   1.255 +    if (n == store || n == head->loopexit() || n == head->incr()) continue;
   1.256 +    for (SimpleDUIterator iter(n); iter.has_next(); iter.next()) {
   1.257 +      Node* use = iter.get();
   1.258 +      if (!lpt->_body.contains(use)) {
   1.259 +        msg = "node is used outside loop";
   1.260 +        // lpt->_body.dump();
   1.261 +        msg_node = n;
   1.262 +        break;
   1.263 +      }
   1.264 +    }
   1.265 +  }
   1.266 +
   1.267 +#ifdef ASSERT
   1.268 +  if (TraceOptimizeFill) {
   1.269 +    if (msg != NULL) {
   1.270 +      tty->print_cr("no fill intrinsic: %s", msg);
   1.271 +      if (msg_node != NULL) msg_node->dump();
   1.272 +    } else {
   1.273 +      tty->print_cr("fill intrinsic for:");
   1.274 +    }
   1.275 +    store->dump();
   1.276 +    if (Verbose) {
   1.277 +      lpt->_body.dump();
   1.278 +    }
   1.279 +  }
   1.280 +#endif
   1.281 +
   1.282 +  return msg == NULL;
   1.283 +}
   1.284 +
   1.285 +
   1.286 +
   1.287 +bool PhaseIdealLoop::intrinsify_fill(IdealLoopTree* lpt) {
   1.288 +  // Only for counted inner loops
   1.289 +  if (!lpt->is_counted() || !lpt->is_inner()) {
   1.290 +    return false;
   1.291 +  }
   1.292 +
   1.293 +  // Must have constant stride
   1.294 +  CountedLoopNode* head = lpt->_head->as_CountedLoop();
   1.295 +  if (!head->stride_is_con() || !head->is_normal_loop()) {
   1.296 +    return false;
   1.297 +  }
   1.298 +
   1.299 +  // Check that the body only contains a store of a loop invariant
   1.300 +  // value that is indexed by the loop phi.
   1.301 +  Node* store = NULL;
   1.302 +  Node* store_value = NULL;
   1.303 +  Node* shift = NULL;
   1.304 +  Node* offset = NULL;
   1.305 +  if (!match_fill_loop(lpt, store, store_value, shift, offset)) {
   1.306 +    return false;
   1.307 +  }
   1.308 +
   1.309 +  // Now replace the whole loop body by a call to a fill routine that
   1.310 +  // covers the same region as the loop.
   1.311 +  Node* base = store->in(MemNode::Address)->as_AddP()->in(AddPNode::Base);
   1.312 +
   1.313 +  // Build an expression for the beginning of the copy region
   1.314 +  Node* index = head->init_trip();
   1.315 +#ifdef _LP64
   1.316 +  index = new (C, 2) ConvI2LNode(index);
   1.317 +  _igvn.register_new_node_with_optimizer(index);
   1.318 +#endif
   1.319 +  if (shift != NULL) {
   1.320 +    // byte arrays don't require a shift but others do.
   1.321 +    index = new (C, 3) LShiftXNode(index, shift->in(2));
   1.322 +    _igvn.register_new_node_with_optimizer(index);
   1.323 +  }
   1.324 +  index = new (C, 4) AddPNode(base, base, index);
   1.325 +  _igvn.register_new_node_with_optimizer(index);
   1.326 +  Node* from = new (C, 4) AddPNode(base, index, offset);
   1.327 +  _igvn.register_new_node_with_optimizer(from);
   1.328 +  // Compute the number of elements to copy
   1.329 +  Node* len = new (C, 3) SubINode(head->limit(), head->init_trip());
   1.330 +  _igvn.register_new_node_with_optimizer(len);
   1.331 +
   1.332 +  BasicType t = store->as_Mem()->memory_type();
   1.333 +  bool aligned = false;
   1.334 +  if (offset != NULL && head->init_trip()->is_Con()) {
   1.335 +    int element_size = type2aelembytes(t);
   1.336 +    aligned = (offset->find_intptr_t_type()->get_con() + head->init_trip()->get_int() * element_size) % HeapWordSize == 0;
   1.337 +  }
   1.338 +
   1.339 +  // Build a call to the fill routine
   1.340 +  const char* fill_name;
   1.341 +  address fill = StubRoutines::select_fill_function(t, aligned, fill_name);
   1.342 +  assert(fill != NULL, "what?");
   1.343 +
   1.344 +  // Convert float/double to int/long for fill routines
   1.345 +  if (t == T_FLOAT) {
   1.346 +    store_value = new (C, 2) MoveF2INode(store_value);
   1.347 +    _igvn.register_new_node_with_optimizer(store_value);
   1.348 +  } else if (t == T_DOUBLE) {
   1.349 +    store_value = new (C, 2) MoveD2LNode(store_value);
   1.350 +    _igvn.register_new_node_with_optimizer(store_value);
   1.351 +  }
   1.352 +
   1.353 +  Node* mem_phi = store->in(MemNode::Memory);
   1.354 +  Node* result_ctrl;
   1.355 +  Node* result_mem;
   1.356 +  const TypeFunc* call_type = OptoRuntime::array_fill_Type();
   1.357 +  int size = call_type->domain()->cnt();
   1.358 +  CallLeafNode *call = new (C, size) CallLeafNoFPNode(call_type, fill,
   1.359 +                                                      fill_name, TypeAryPtr::get_array_body_type(t));
   1.360 +  call->init_req(TypeFunc::Parms+0, from);
   1.361 +  call->init_req(TypeFunc::Parms+1, store_value);
   1.362 +  call->init_req(TypeFunc::Parms+2, len);
    1.363 +  call->init_req(TypeFunc::Control,   head->init_control());
    1.364 +  call->init_req(TypeFunc::I_O,       C->top());  // does no I/O
    1.365 +  call->init_req(TypeFunc::Memory,    mem_phi->in(LoopNode::EntryControl));
    1.366 +  call->init_req(TypeFunc::ReturnAdr, C->start()->proj_out(TypeFunc::ReturnAdr));
    1.367 +  call->init_req(TypeFunc::FramePtr,  C->start()->proj_out(TypeFunc::FramePtr));
   1.368 +  _igvn.register_new_node_with_optimizer(call);
   1.369 +  result_ctrl = new (C, 1) ProjNode(call,TypeFunc::Control);
   1.370 +  _igvn.register_new_node_with_optimizer(result_ctrl);
   1.371 +  result_mem = new (C, 1) ProjNode(call,TypeFunc::Memory);
   1.372 +  _igvn.register_new_node_with_optimizer(result_mem);
   1.373 +
   1.374 +  // If this fill is tightly coupled to an allocation and overwrites
   1.375 +  // the whole body, allow it to take over the zeroing.
   1.376 +  AllocateNode* alloc = AllocateNode::Ideal_allocation(base, this);
   1.377 +  if (alloc != NULL && alloc->is_AllocateArray()) {
   1.378 +    Node* length = alloc->as_AllocateArray()->Ideal_length();
   1.379 +    if (head->limit() == length &&
   1.380 +        head->init_trip() == _igvn.intcon(0)) {
   1.381 +      if (TraceOptimizeFill) {
   1.382 +        tty->print_cr("Eliminated zeroing in allocation");
   1.383 +      }
   1.384 +      alloc->maybe_set_complete(&_igvn);
   1.385 +    } else {
   1.386 +#ifdef ASSERT
   1.387 +      if (TraceOptimizeFill) {
   1.388 +        tty->print_cr("filling array but bounds don't match");
   1.389 +        alloc->dump();
   1.390 +        head->init_trip()->dump();
   1.391 +        head->limit()->dump();
   1.392 +        length->dump();
   1.393 +      }
   1.394 +#endif
   1.395 +    }
   1.396 +  }
   1.397 +
   1.398 +  // Redirect the old control and memory edges that are outside the loop.
   1.399 +  Node* exit = head->loopexit()->proj_out(0);
   1.400 +  _igvn.replace_node(exit, result_ctrl);
   1.401 +  _igvn.replace_node(store, result_mem);
    1.402 +  // Any uses of the increment outside of the loop become the loop limit.
   1.403 +  _igvn.replace_node(head->incr(), head->limit());
   1.404 +
   1.405 +  // Disconnect the head from the loop.
   1.406 +  for (uint i = 0; i < lpt->_body.size(); i++) {
   1.407 +    Node* n = lpt->_body.at(i);
   1.408 +    _igvn.replace_node(n, C->top());
   1.409 +  }
   1.410 +
   1.411 +  return true;
   1.412 +}
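
Taken together, match_fill_loop and intrinsify_fill recognize a
unit-stride counted inner loop whose body is a single store of a
loop-invariant, non-oop value indexed by the loop phi, and replace the
whole loop with one call to a fill stub. A hedged Java sketch of the
pattern and its net effect (the rewrite happens in the IR; Arrays.fill is
only an analogy):

    // The loop shape match_fill_loop accepts: unit stride, a single
    // store whose value v is loop invariant, index is the loop phi.
    static void fill(int[] a, int v) {
        for (int i = 0; i < a.length; i++) {
            a[i] = v;
        }
        // Conceptually rewritten into one stub call, similar in
        // spirit to: java.util.Arrays.fill(a, 0, a.length, v);
    }

If the loop exactly covers a freshly allocated array (initial trip 0 and
limit equal to the allocation length), the fill also takes over the
allocation's zeroing, as handled at the end of intrinsify_fill above.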
