src/share/vm/opto/superword.cpp

changeset 3882
8c92982cbbc4
parent 3847
5e990493719e
child 3886
6f8f439e247d
     1.1 --- a/src/share/vm/opto/superword.cpp	Thu Jun 14 14:59:52 2012 -0700
     1.2 +++ b/src/share/vm/opto/superword.cpp	Fri Jun 15 01:25:19 2012 -0700
     1.3 @@ -1,5 +1,5 @@
     1.4  /*
     1.5 - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
     1.6 + * Copyright (c) 2007, 2012, Oracle and/or its affiliates. All rights reserved.
     1.7   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     1.8   *
     1.9   * This code is free software; you can redistribute it and/or modify it
    1.10 @@ -67,6 +67,10 @@
    1.11  
    1.12  //------------------------------transform_loop---------------------------
    1.13  void SuperWord::transform_loop(IdealLoopTree* lpt) {
    1.14 +  assert(UseSuperWord, "should be");
    1.15 +  // Do vectors exist on this architecture?
    1.16 +  if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return;
    1.17 +
    1.18    assert(lpt->_head->is_CountedLoop(), "must be");
    1.19    CountedLoopNode *cl = lpt->_head->as_CountedLoop();
    1.20  
    1.21 @@ -89,15 +93,12 @@
    1.22    Node *pre_opaq1 = pre_end->limit();
    1.23    if (pre_opaq1->Opcode() != Op_Opaque1) return;
    1.24  
    1.25 -  // Do vectors exist on this architecture?
    1.26 -  if (vector_width_in_bytes() == 0) return;
    1.27 -
    1.28    init(); // initialize data structures
    1.29  
    1.30    set_lpt(lpt);
    1.31    set_lp(cl);
    1.32  
    1.33 - // For now, define one block which is the entire loop body
    1.34 +  // For now, define one block which is the entire loop body
    1.35    set_bb(cl);
    1.36  
    1.37    assert(_packset.length() == 0, "packset must be empty");
    1.38 @@ -177,7 +178,7 @@
    1.39    Node_List memops;
    1.40    for (int i = 0; i < _block.length(); i++) {
    1.41      Node* n = _block.at(i);
    1.42 -    if (n->is_Mem() && in_bb(n) &&
    1.43 +    if (n->is_Mem() && !n->is_LoadStore() && in_bb(n) &&
    1.44          is_java_primitive(n->as_Mem()->memory_type())) {
    1.45        int align = memory_alignment(n->as_Mem(), 0);
    1.46        if (align != bottom_align) {
    1.47 @@ -185,54 +186,130 @@
    1.48        }
    1.49      }
    1.50    }
    1.51 -  if (memops.size() == 0) return;
    1.52  
    1.53 -  // Find a memory reference to align to.  The pre-loop trip count
    1.54 -  // is modified to align this reference to a vector-aligned address
    1.55 -  find_align_to_ref(memops);
    1.56 -  if (align_to_ref() == NULL) return;
    1.57 +  Node_List align_to_refs;
    1.58 +  int best_iv_adjustment = 0;
    1.59 +  MemNode* best_align_to_mem_ref = NULL;
    1.60  
    1.61 -  SWPointer align_to_ref_p(align_to_ref(), this);
    1.62 -  int offset = align_to_ref_p.offset_in_bytes();
    1.63 -  int scale  = align_to_ref_p.scale_in_bytes();
    1.64 -  int vw              = vector_width_in_bytes();
    1.65 -  int stride_sign     = (scale * iv_stride()) > 0 ? 1 : -1;
    1.66 -  int iv_adjustment   = (stride_sign * vw - (offset % vw)) % vw;
    1.67 +  while (memops.size() != 0) {
    1.68 +    // Find a memory reference to align to.
    1.69 +    MemNode* mem_ref = find_align_to_ref(memops);
    1.70 +    if (mem_ref == NULL) break;
    1.71 +    align_to_refs.push(mem_ref);
    1.72 +    int iv_adjustment = get_iv_adjustment(mem_ref);
    1.73  
    1.74 -#ifndef PRODUCT
    1.75 -  if (TraceSuperWord)
    1.76 -    tty->print_cr("\noffset = %d iv_adjustment = %d  elt_align = %d scale = %d iv_stride = %d",
    1.77 -                  offset, iv_adjustment, align_to_ref_p.memory_size(), align_to_ref_p.scale_in_bytes(), iv_stride());
    1.78 -#endif
    1.79 +    if (best_align_to_mem_ref == NULL) {
    1.80 +      // Set memory reference which is the best from all memory operations
    1.81 +      // to be used for alignment. The pre-loop trip count is modified to align
    1.82 +      // this reference to a vector-aligned address.
    1.83 +      best_align_to_mem_ref = mem_ref;
    1.84 +      best_iv_adjustment = iv_adjustment;
    1.85 +    }
    1.86  
    1.87 -  // Set alignment relative to "align_to_ref"
    1.88 -  for (int i = memops.size() - 1; i >= 0; i--) {
    1.89 -    MemNode* s = memops.at(i)->as_Mem();
    1.90 -    SWPointer p2(s, this);
    1.91 -    if (p2.comparable(align_to_ref_p)) {
    1.92 -      int align = memory_alignment(s, iv_adjustment);
    1.93 -      set_alignment(s, align);
    1.94 -    } else {
    1.95 -      memops.remove(i);
    1.96 -    }
    1.97 -  }
    1.98 -
    1.99 -  // Create initial pack pairs of memory operations
   1.100 -  for (uint i = 0; i < memops.size(); i++) {
   1.101 -    Node* s1 = memops.at(i);
   1.102 -    for (uint j = 0; j < memops.size(); j++) {
   1.103 -      Node* s2 = memops.at(j);
   1.104 -      if (s1 != s2 && are_adjacent_refs(s1, s2)) {
   1.105 -        int align = alignment(s1);
   1.106 -        if (stmts_can_pack(s1, s2, align)) {
   1.107 -          Node_List* pair = new Node_List();
   1.108 -          pair->push(s1);
   1.109 -          pair->push(s2);
   1.110 -          _packset.append(pair);
   1.111 +    SWPointer align_to_ref_p(mem_ref, this);
   1.112 +    // Set alignment relative to "align_to_ref" for all related memory operations.
   1.113 +    for (int i = memops.size() - 1; i >= 0; i--) {
   1.114 +      MemNode* s = memops.at(i)->as_Mem();
   1.115 +      if (isomorphic(s, mem_ref)) {
   1.116 +        SWPointer p2(s, this);
   1.117 +        if (p2.comparable(align_to_ref_p)) {
   1.118 +          int align = memory_alignment(s, iv_adjustment);
   1.119 +          set_alignment(s, align);
   1.120          }
   1.121        }
   1.122      }
   1.123 -  }
   1.124 +
   1.125 +    // Create initial pack pairs of memory operations for which
   1.126 +    // alignment is set and vectors will be aligned.
   1.127 +    bool create_pack = true;
   1.128 +    if (memory_alignment(mem_ref, best_iv_adjustment) != 0) {
   1.129 +      if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
   1.130 +        // Can't allow vectorization of unaligned memory accesses with the
   1.131 +        // same type since it could be overlapped accesses to the same array.
   1.132 +        create_pack = false;
   1.133 +      } else {
   1.134 +        // Allow independent (different type) unaligned memory operations
   1.135 +        // if HW supports them.
   1.136 +        if (!Matcher::misaligned_vectors_ok()) {
   1.137 +          create_pack = false;
   1.138 +        } else {
   1.139 +          // Check if packs of the same memory type but
   1.140 +          // with a different alignment were created before.
   1.141 +          for (uint i = 0; i < align_to_refs.size(); i++) {
   1.142 +            MemNode* mr = align_to_refs.at(i)->as_Mem();
   1.143 +            if (same_velt_type(mr, mem_ref) &&
   1.144 +                memory_alignment(mr, iv_adjustment) != 0)
   1.145 +              create_pack = false;
   1.146 +          }
   1.147 +        }
   1.148 +      }
   1.149 +    }
   1.150 +    if (create_pack) {
   1.151 +      for (uint i = 0; i < memops.size(); i++) {
   1.152 +        Node* s1 = memops.at(i);
   1.153 +        int align = alignment(s1);
   1.154 +        if (align == top_align) continue;
   1.155 +        for (uint j = 0; j < memops.size(); j++) {
   1.156 +          Node* s2 = memops.at(j);
   1.157 +          if (alignment(s2) == top_align) continue;
   1.158 +          if (s1 != s2 && are_adjacent_refs(s1, s2)) {
   1.159 +            if (stmts_can_pack(s1, s2, align)) {
   1.160 +              Node_List* pair = new Node_List();
   1.161 +              pair->push(s1);
   1.162 +              pair->push(s2);
   1.163 +              _packset.append(pair);
   1.164 +            }
   1.165 +          }
   1.166 +        }
   1.167 +      }
   1.168 +    } else { // Don't create unaligned pack
   1.169 +      // First, remove remaining memory ops of the same type from the list.
   1.170 +      for (int i = memops.size() - 1; i >= 0; i--) {
   1.171 +        MemNode* s = memops.at(i)->as_Mem();
   1.172 +        if (same_velt_type(s, mem_ref)) {
   1.173 +          memops.remove(i);
   1.174 +        }
   1.175 +      }
   1.176 +
   1.177 +      // Second, remove already constructed packs of the same type.
   1.178 +      for (int i = _packset.length() - 1; i >= 0; i--) {
   1.179 +        Node_List* p = _packset.at(i);
   1.180 +        MemNode* s = p->at(0)->as_Mem();
   1.181 +        if (same_velt_type(s, mem_ref)) {
   1.182 +          remove_pack_at(i);
   1.183 +        }
   1.184 +      }
   1.185 +
   1.186 +      // If needed find the best memory reference for loop alignment again.
   1.187 +      if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
   1.188 +        // Put memory ops from remaining packs back on memops list for
   1.189 +        // the best alignment search.
   1.190 +        uint orig_msize = memops.size();
   1.191 +        for (int i = 0; i < _packset.length(); i++) {
   1.192 +          Node_List* p = _packset.at(i);
   1.193 +          MemNode* s = p->at(0)->as_Mem();
   1.194 +          assert(!same_velt_type(s, mem_ref), "sanity");
   1.195 +          memops.push(s);
   1.196 +        }
   1.197 +        MemNode* best_align_to_mem_ref = find_align_to_ref(memops);
   1.198 +        if (best_align_to_mem_ref == NULL) break;
   1.199 +        best_iv_adjustment = get_iv_adjustment(best_align_to_mem_ref);
   1.200 +        // Restore list.
   1.201 +        while (memops.size() > orig_msize)
   1.202 +          (void)memops.pop();
   1.203 +      }
   1.204 +    } // unaligned memory accesses
   1.205 +
   1.206 +    // Remove used mem nodes.
   1.207 +    for (int i = memops.size() - 1; i >= 0; i--) {
   1.208 +      MemNode* m = memops.at(i)->as_Mem();
   1.209 +      if (alignment(m) != top_align) {
   1.210 +        memops.remove(i);
   1.211 +      }
   1.212 +    }
   1.213 +
   1.214 +  } // while (memops.size() != 0
   1.215 +  set_align_to_ref(best_align_to_mem_ref);
   1.216  
   1.217  #ifndef PRODUCT
   1.218    if (TraceSuperWord) {
   1.219 @@ -246,7 +323,7 @@
   1.220  // Find a memory reference to align the loop induction variable to.
   1.221  // Looks first at stores then at loads, looking for a memory reference
   1.222  // with the largest number of references similar to it.
   1.223 -void SuperWord::find_align_to_ref(Node_List &memops) {
   1.224 +MemNode* SuperWord::find_align_to_ref(Node_List &memops) {
   1.225    GrowableArray<int> cmp_ct(arena(), memops.size(), memops.size(), 0);
   1.226  
   1.227    // Count number of comparable memory ops
   1.228 @@ -270,20 +347,28 @@
   1.229      }
   1.230    }
   1.231  
   1.232 -  // Find Store (or Load) with the greatest number of "comparable" references
   1.233 +  // Find Store (or Load) with the greatest number of "comparable" references,
   1.234 +  // biggest vector size, smallest data size and smallest iv offset.
   1.235    int max_ct        = 0;
   1.236 +  int max_vw        = 0;
   1.237    int max_idx       = -1;
   1.238    int min_size      = max_jint;
   1.239    int min_iv_offset = max_jint;
   1.240    for (uint j = 0; j < memops.size(); j++) {
   1.241      MemNode* s = memops.at(j)->as_Mem();
   1.242      if (s->is_Store()) {
   1.243 +      int vw = vector_width_in_bytes(velt_basic_type(s));
   1.244 +      assert(vw > 1, "sanity");
   1.245        SWPointer p(s, this);
   1.246 -      if (cmp_ct.at(j) > max_ct ||
   1.247 -          cmp_ct.at(j) == max_ct && (data_size(s) < min_size ||
   1.248 -                                     data_size(s) == min_size &&
   1.249 -                                        p.offset_in_bytes() < min_iv_offset)) {
   1.250 +      if (cmp_ct.at(j) >  max_ct ||
   1.251 +          cmp_ct.at(j) == max_ct &&
   1.252 +            (vw >  max_vw ||
   1.253 +             vw == max_vw &&
   1.254 +              (data_size(s) <  min_size ||
   1.255 +               data_size(s) == min_size &&
   1.256 +                 (p.offset_in_bytes() < min_iv_offset)))) {
   1.257          max_ct = cmp_ct.at(j);
   1.258 +        max_vw = vw;
   1.259          max_idx = j;
   1.260          min_size = data_size(s);
   1.261          min_iv_offset = p.offset_in_bytes();
   1.262 @@ -295,12 +380,18 @@
   1.263      for (uint j = 0; j < memops.size(); j++) {
   1.264        MemNode* s = memops.at(j)->as_Mem();
   1.265        if (s->is_Load()) {
   1.266 +        int vw = vector_width_in_bytes(velt_basic_type(s));
   1.267 +        assert(vw > 1, "sanity");
   1.268          SWPointer p(s, this);
   1.269 -        if (cmp_ct.at(j) > max_ct ||
   1.270 -            cmp_ct.at(j) == max_ct && (data_size(s) < min_size ||
   1.271 -                                       data_size(s) == min_size &&
   1.272 -                                          p.offset_in_bytes() < min_iv_offset)) {
   1.273 +        if (cmp_ct.at(j) >  max_ct ||
   1.274 +            cmp_ct.at(j) == max_ct &&
   1.275 +              (vw >  max_vw ||
   1.276 +               vw == max_vw &&
   1.277 +                (data_size(s) <  min_size ||
   1.278 +                 data_size(s) == min_size &&
   1.279 +                   (p.offset_in_bytes() < min_iv_offset)))) {
   1.280            max_ct = cmp_ct.at(j);
   1.281 +          max_vw = vw;
   1.282            max_idx = j;
   1.283            min_size = data_size(s);
   1.284            min_iv_offset = p.offset_in_bytes();
   1.285 @@ -309,10 +400,7 @@
   1.286      }
   1.287    }
   1.288  
   1.289 -  if (max_ct > 0)
   1.290 -    set_align_to_ref(memops.at(max_idx)->as_Mem());
   1.291 -
   1.292 -#ifndef PRODUCT
   1.293 +#ifdef ASSERT
   1.294    if (TraceSuperWord && Verbose) {
   1.295      tty->print_cr("\nVector memops after find_align_to_refs");
   1.296      for (uint i = 0; i < memops.size(); i++) {
   1.297 @@ -321,6 +409,17 @@
   1.298      }
   1.299    }
   1.300  #endif
   1.301 +
   1.302 +  if (max_ct > 0) {
   1.303 +#ifdef ASSERT
   1.304 +    if (TraceSuperWord) {
   1.305 +      tty->print("\nVector align to node: ");
   1.306 +      memops.at(max_idx)->as_Mem()->dump();
   1.307 +    }
   1.308 +#endif
   1.309 +    return memops.at(max_idx)->as_Mem();
   1.310 +  }
   1.311 +  return NULL;
   1.312  }
   1.313  
   1.314  //------------------------------ref_is_alignable---------------------------
   1.315 @@ -341,7 +440,9 @@
   1.316  
   1.317    // If initial offset from start of object is computable,
   1.318    // compute alignment within the vector.
   1.319 -  int vw = vector_width_in_bytes();
   1.320 +  BasicType bt = velt_basic_type(p.mem());
   1.321 +  int vw = vector_width_in_bytes(bt);
   1.322 +  assert(vw > 1, "sanity");
   1.323    if (vw % span == 0) {
   1.324      Node* init_nd = pre_end->init_trip();
   1.325      if (init_nd->is_Con() && p.invar() == NULL) {
   1.326 @@ -361,6 +462,26 @@
   1.327    return false;
   1.328  }
   1.329  
   1.330 +//---------------------------get_iv_adjustment---------------------------
   1.331 +// Calculate loop's iv adjustment for this memory ops.
   1.332 +int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
   1.333 +  SWPointer align_to_ref_p(mem_ref, this);
   1.334 +  int offset = align_to_ref_p.offset_in_bytes();
   1.335 +  int scale  = align_to_ref_p.scale_in_bytes();
   1.336 +  BasicType bt = velt_basic_type(mem_ref);
   1.337 +  int vw       = vector_width_in_bytes(bt);
   1.338 +  assert(vw > 1, "sanity");
   1.339 +  int stride_sign   = (scale * iv_stride()) > 0 ? 1 : -1;
   1.340 +  int iv_adjustment = (stride_sign * vw - (offset % vw)) % vw;
   1.341 +
   1.342 +#ifndef PRODUCT
   1.343 +  if (TraceSuperWord)
   1.344 +    tty->print_cr("\noffset = %d iv_adjust = %d elt_size = %d scale = %d iv_stride = %d vect_size %d",
   1.345 +                  offset, iv_adjustment, align_to_ref_p.memory_size(), scale, iv_stride(), vw);
   1.346 +#endif
   1.347 +  return iv_adjustment;
   1.348 +}
   1.349 +
   1.350  //---------------------------dependence_graph---------------------------
   1.351  // Construct dependency graph.
   1.352  // Add dependence edges to load/store nodes for memory dependence
   1.353 @@ -488,9 +609,13 @@
   1.354  bool SuperWord::stmts_can_pack(Node* s1, Node* s2, int align) {
   1.355  
   1.356    // Do not use superword for non-primitives
   1.357 -  if((s1->is_Mem() && !is_java_primitive(s1->as_Mem()->memory_type())) ||
   1.358 -     (s2->is_Mem() && !is_java_primitive(s2->as_Mem()->memory_type())))
   1.359 +  BasicType bt1 = velt_basic_type(s1);
   1.360 +  BasicType bt2 = velt_basic_type(s2);
   1.361 +  if(!is_java_primitive(bt1) || !is_java_primitive(bt2))
   1.362      return false;
   1.363 +  if (Matcher::max_vector_size(bt1) < 2) {
   1.364 +    return false; // No vectors for this type
   1.365 +  }
   1.366  
   1.367    if (isomorphic(s1, s2)) {
   1.368      if (independent(s1, s2)) {
   1.369 @@ -552,7 +677,7 @@
   1.370    if (s1->Opcode() != s2->Opcode()) return false;
   1.371    if (s1->req() != s2->req()) return false;
   1.372    if (s1->in(0) != s2->in(0)) return false;
   1.373 -  if (velt_type(s1) != velt_type(s2)) return false;
   1.374 +  if (!same_velt_type(s1, s2)) return false;
   1.375    return true;
   1.376  }
   1.377  
   1.378 @@ -595,14 +720,16 @@
   1.379  //------------------------------set_alignment---------------------------
   1.380  void SuperWord::set_alignment(Node* s1, Node* s2, int align) {
   1.381    set_alignment(s1, align);
   1.382 -  set_alignment(s2, align + data_size(s1));
   1.383 +  if (align == top_align || align == bottom_align) {
   1.384 +    set_alignment(s2, align);
   1.385 +  } else {
   1.386 +    set_alignment(s2, align + data_size(s1));
   1.387 +  }
   1.388  }
   1.389  
   1.390  //------------------------------data_size---------------------------
   1.391  int SuperWord::data_size(Node* s) {
   1.392 -  const Type* t = velt_type(s);
   1.393 -  BasicType  bt = t->array_element_basic_type();
   1.394 -  int bsize = type2aelembytes(bt);
   1.395 +  int bsize = type2aelembytes(velt_basic_type(s));
   1.396    assert(bsize != 0, "valid size");
   1.397    return bsize;
   1.398  }
   1.399 @@ -631,9 +758,9 @@
   1.400  //------------------------------follow_use_defs---------------------------
   1.401  // Extend the packset by visiting operand definitions of nodes in pack p
   1.402  bool SuperWord::follow_use_defs(Node_List* p) {
   1.403 +  assert(p->size() == 2, "just checking");
   1.404    Node* s1 = p->at(0);
   1.405    Node* s2 = p->at(1);
   1.406 -  assert(p->size() == 2, "just checking");
   1.407    assert(s1->req() == s2->req(), "just checking");
   1.408    assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking");
   1.409  
   1.410 @@ -718,7 +845,12 @@
   1.411      for (i1++; i1 < ct; i1++) if (u1->in(i1) == d1) break;
   1.412      for (i2++; i2 < ct; i2++) if (u2->in(i2) == d2) break;
   1.413      if (i1 != i2) {
   1.414 -      return false;
   1.415 +      if ((i1 == (3-i2)) && (u2->is_Add() || u2->is_Mul())) {
   1.416 +        // Further analysis relies on operands position matching.
   1.417 +        u2->swap_edges(i1, i2);
   1.418 +      } else {
   1.419 +        return false;
   1.420 +      }
   1.421      }
   1.422    } while (i1 < ct);
   1.423    return true;
   1.424 @@ -727,7 +859,7 @@
   1.425  //------------------------------est_savings---------------------------
   1.426  // Estimate the savings from executing s1 and s2 as a pack
   1.427  int SuperWord::est_savings(Node* s1, Node* s2) {
   1.428 -  int save = 2 - 1; // 2 operations per instruction in packed form
   1.429 +  int save_in = 2 - 1; // 2 operations per instruction in packed form
   1.430  
   1.431    // inputs
   1.432    for (uint i = 1; i < s1->req(); i++) {
   1.433 @@ -735,17 +867,18 @@
   1.434      Node* x2 = s2->in(i);
   1.435      if (x1 != x2) {
   1.436        if (are_adjacent_refs(x1, x2)) {
   1.437 -        save += adjacent_profit(x1, x2);
   1.438 +        save_in += adjacent_profit(x1, x2);
   1.439        } else if (!in_packset(x1, x2)) {
   1.440 -        save -= pack_cost(2);
   1.441 +        save_in -= pack_cost(2);
   1.442        } else {
   1.443 -        save += unpack_cost(2);
   1.444 +        save_in += unpack_cost(2);
   1.445        }
   1.446      }
   1.447    }
   1.448  
   1.449    // uses of result
   1.450    uint ct = 0;
   1.451 +  int save_use = 0;
   1.452    for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
   1.453      Node* s1_use = s1->fast_out(i);
   1.454      for (int j = 0; j < _packset.length(); j++) {
   1.455 @@ -756,7 +889,7 @@
   1.456            if (p->at(p->size()-1) == s2_use) {
   1.457              ct++;
   1.458              if (are_adjacent_refs(s1_use, s2_use)) {
   1.459 -              save += adjacent_profit(s1_use, s2_use);
   1.460 +              save_use += adjacent_profit(s1_use, s2_use);
   1.461              }
   1.462            }
   1.463          }
   1.464 @@ -764,10 +897,10 @@
   1.465      }
   1.466    }
   1.467  
   1.468 -  if (ct < s1->outcnt()) save += unpack_cost(1);
   1.469 -  if (ct < s2->outcnt()) save += unpack_cost(1);
   1.470 +  if (ct < s1->outcnt()) save_use += unpack_cost(1);
   1.471 +  if (ct < s2->outcnt()) save_use += unpack_cost(1);
   1.472  
   1.473 -  return save;
   1.474 +  return MAX2(save_in, save_use);
   1.475  }
   1.476  
   1.477  //------------------------------costs---------------------------
   1.478 @@ -778,8 +911,9 @@
   1.479  //------------------------------combine_packs---------------------------
   1.480  // Combine packs A and B with A.last == B.first into A.first..,A.last,B.second,..B.last
   1.481  void SuperWord::combine_packs() {
   1.482 -  bool changed;
   1.483 -  do {
   1.484 +  bool changed = true;
   1.485 +  // Combine packs regardless max vector size.
   1.486 +  while (changed) {
   1.487      changed = false;
   1.488      for (int i = 0; i < _packset.length(); i++) {
   1.489        Node_List* p1 = _packset.at(i);
   1.490 @@ -787,6 +921,7 @@
   1.491        for (int j = 0; j < _packset.length(); j++) {
   1.492          Node_List* p2 = _packset.at(j);
   1.493          if (p2 == NULL) continue;
   1.494 +        if (i == j) continue;
   1.495          if (p1->at(p1->size()-1) == p2->at(0)) {
   1.496            for (uint k = 1; k < p2->size(); k++) {
   1.497              p1->push(p2->at(k));
   1.498 @@ -796,8 +931,39 @@
   1.499          }
   1.500        }
   1.501      }
   1.502 -  } while (changed);
   1.503 +  }
   1.504  
   1.505 +  // Split packs which have size greater then max vector size.
   1.506 +  for (int i = 0; i < _packset.length(); i++) {
   1.507 +    Node_List* p1 = _packset.at(i);
   1.508 +    if (p1 != NULL) {
   1.509 +      BasicType bt = velt_basic_type(p1->at(0));
   1.510 +      uint max_vlen = Matcher::max_vector_size(bt); // Max elements in vector
   1.511 +      assert(is_power_of_2(max_vlen), "sanity");
   1.512 +      uint psize = p1->size();
   1.513 +      if (!is_power_of_2(psize)) {
   1.514 +        // Skip pack which can't be vector.
   1.515 +        // case1: for(...) { a[i] = i; }    elements values are different (i+x)
   1.516 +        // case2: for(...) { a[i] = b[i+1]; }  can't align both, load and store
   1.517 +        _packset.at_put(i, NULL);
   1.518 +        continue;
   1.519 +      }
   1.520 +      if (psize > max_vlen) {
   1.521 +        Node_List* pack = new Node_List();
   1.522 +        for (uint j = 0; j < psize; j++) {
   1.523 +          pack->push(p1->at(j));
   1.524 +          if (pack->size() >= max_vlen) {
   1.525 +            assert(is_power_of_2(pack->size()), "sanity");
   1.526 +            _packset.append(pack);
   1.527 +            pack = new Node_List();
   1.528 +          }
   1.529 +        }
   1.530 +        _packset.at_put(i, NULL);
   1.531 +      }
   1.532 +    }
   1.533 +  }
   1.534 +
   1.535 +  // Compress list.
   1.536    for (int i = _packset.length() - 1; i >= 0; i--) {
   1.537      Node_List* p1 = _packset.at(i);
   1.538      if (p1 == NULL) {
   1.539 @@ -880,8 +1046,7 @@
   1.540  // Can code be generated for pack p?
   1.541  bool SuperWord::implemented(Node_List* p) {
   1.542    Node* p0 = p->at(0);
   1.543 -  int vopc = VectorNode::opcode(p0->Opcode(), p->size(), velt_type(p0));
   1.544 -  return vopc > 0 && Matcher::has_match_rule(vopc);
   1.545 +  return VectorNode::implemented(p0->Opcode(), p->size(), velt_basic_type(p0));
   1.546  }
   1.547  
   1.548  //------------------------------profitable---------------------------
   1.549 @@ -939,36 +1104,36 @@
   1.550  }
   1.551  
   1.552  //-------------------------------remove_and_insert-------------------
   1.553 -//remove "current" from its current position in the memory graph and insert
   1.554 -//it after the appropriate insertion point (lip or uip)
   1.555 +// Remove "current" from its current position in the memory graph and insert
   1.556 +// it after the appropriate insertion point (lip or uip).
   1.557  void SuperWord::remove_and_insert(MemNode *current, MemNode *prev, MemNode *lip,
   1.558                                    Node *uip, Unique_Node_List &sched_before) {
   1.559    Node* my_mem = current->in(MemNode::Memory);
   1.560 -  _igvn.rehash_node_delayed(current);
   1.561 -  _igvn.hash_delete(my_mem);
   1.562 +  bool sched_up = sched_before.member(current);
   1.563  
   1.564 -  //remove current_store from its current position in the memmory graph
   1.565 +  // remove current_store from its current position in the memmory graph
   1.566    for (DUIterator i = current->outs(); current->has_out(i); i++) {
   1.567      Node* use = current->out(i);
   1.568      if (use->is_Mem()) {
   1.569        assert(use->in(MemNode::Memory) == current, "must be");
   1.570 -      _igvn.rehash_node_delayed(use);
   1.571        if (use == prev) { // connect prev to my_mem
   1.572 -        use->set_req(MemNode::Memory, my_mem);
   1.573 +          _igvn.replace_input_of(use, MemNode::Memory, my_mem);
   1.574 +          --i; //deleted this edge; rescan position
   1.575        } else if (sched_before.member(use)) {
   1.576 -        _igvn.hash_delete(uip);
   1.577 -        use->set_req(MemNode::Memory, uip);
   1.578 +        if (!sched_up) { // Will be moved together with current
   1.579 +          _igvn.replace_input_of(use, MemNode::Memory, uip);
   1.580 +          --i; //deleted this edge; rescan position
   1.581 +        }
   1.582        } else {
   1.583 -        _igvn.hash_delete(lip);
   1.584 -        use->set_req(MemNode::Memory, lip);
   1.585 +        if (sched_up) { // Will be moved together with current
   1.586 +          _igvn.replace_input_of(use, MemNode::Memory, lip);
   1.587 +          --i; //deleted this edge; rescan position
   1.588 +        }
   1.589        }
   1.590 -      --i; //deleted this edge; rescan position
   1.591      }
   1.592    }
   1.593  
   1.594 -  bool sched_up = sched_before.member(current);
   1.595    Node *insert_pt =  sched_up ?  uip : lip;
   1.596 -  _igvn.hash_delete(insert_pt);
   1.597  
   1.598    // all uses of insert_pt's memory state should use current's instead
   1.599    for (DUIterator i = insert_pt->outs(); insert_pt->has_out(i); i++) {
   1.600 @@ -988,7 +1153,7 @@
   1.601    }
   1.602  
   1.603    //connect current to insert_pt
   1.604 -  current->set_req(MemNode::Memory, insert_pt);
   1.605 +  _igvn.replace_input_of(current, MemNode::Memory, insert_pt);
   1.606  }
   1.607  
   1.608  //------------------------------co_locate_pack----------------------------------
   1.609 @@ -1025,7 +1190,7 @@
   1.610          if (use->is_Mem() && use != previous)
   1.611            memops.push(use);
   1.612        }
   1.613 -      if(current == first) break;
   1.614 +      if (current == first) break;
   1.615        previous = current;
   1.616        current  = current->in(MemNode::Memory)->as_Mem();
   1.617      }
   1.618 @@ -1038,27 +1203,37 @@
   1.619            Node *s2 = memops.at(j);
   1.620            if (!independent(s1, s2)) {
   1.621              if (in_pack(s2, pk) || schedule_before_pack.member(s2)) {
   1.622 -              schedule_before_pack.push(s1); //s1 must be scheduled before
   1.623 +              schedule_before_pack.push(s1); // s1 must be scheduled before
   1.624                Node_List* mem_pk = my_pack(s1);
   1.625                if (mem_pk != NULL) {
   1.626                  for (uint ii = 0; ii < mem_pk->size(); ii++) {
   1.627 -                  Node* s = mem_pk->at(ii); // follow partner
   1.628 +                  Node* s = mem_pk->at(ii);  // follow partner
   1.629                    if (memops.member(s) && !schedule_before_pack.member(s))
   1.630                      schedule_before_pack.push(s);
   1.631                  }
   1.632                }
   1.633 +              break;
   1.634              }
   1.635            }
   1.636          }
   1.637        }
   1.638      }
   1.639  
   1.640 +    Node*    upper_insert_pt = first->in(MemNode::Memory);
   1.641 +    // Following code moves loads connected to upper_insert_pt below aliased stores.
   1.642 +    // Collect such loads here and reconnect them back to upper_insert_pt later.
   1.643 +    memops.clear();
   1.644 +    for (DUIterator i = upper_insert_pt->outs(); upper_insert_pt->has_out(i); i++) {
   1.645 +      Node* use = upper_insert_pt->out(i);
   1.646 +      if (!use->is_Store())
   1.647 +        memops.push(use);
   1.648 +    }
   1.649 +
   1.650      MemNode* lower_insert_pt = last;
   1.651 -    Node*    upper_insert_pt = first->in(MemNode::Memory);
   1.652      previous                 = last; //previous store in pk
   1.653      current                  = last->in(MemNode::Memory)->as_Mem();
   1.654  
   1.655 -    //start scheduling from "last" to "first"
   1.656 +    // start scheduling from "last" to "first"
   1.657      while (true) {
   1.658        assert(in_bb(current), "stay in block");
   1.659        assert(in_pack(previous, pk), "previous stays in pack");
   1.660 @@ -1066,16 +1241,13 @@
   1.661  
   1.662        if (in_pack(current, pk)) {
   1.663          // Forward users of my memory state (except "previous) to my input memory state
   1.664 -        _igvn.hash_delete(current);
   1.665          for (DUIterator i = current->outs(); current->has_out(i); i++) {
   1.666            Node* use = current->out(i);
   1.667            if (use->is_Mem() && use != previous) {
   1.668              assert(use->in(MemNode::Memory) == current, "must be");
   1.669              if (schedule_before_pack.member(use)) {
   1.670 -              _igvn.hash_delete(upper_insert_pt);
   1.671                _igvn.replace_input_of(use, MemNode::Memory, upper_insert_pt);
   1.672              } else {
   1.673 -              _igvn.hash_delete(lower_insert_pt);
   1.674                _igvn.replace_input_of(use, MemNode::Memory, lower_insert_pt);
   1.675              }
   1.676              --i; // deleted this edge; rescan position
   1.677 @@ -1089,6 +1261,14 @@
   1.678        if (current == first) break;
   1.679        current = my_mem->as_Mem();
   1.680      } // end while
   1.681 +
   1.682 +    // Reconnect loads back to upper_insert_pt.
   1.683 +    for (uint i = 0; i < memops.size(); i++) {
   1.684 +      Node *ld = memops.at(i);
   1.685 +      if (ld->in(MemNode::Memory) != upper_insert_pt) {
   1.686 +        _igvn.replace_input_of(ld, MemNode::Memory, upper_insert_pt);
   1.687 +      }
   1.688 +    }
   1.689    } else if (pk->at(0)->is_Load()) { //load
   1.690      // all loads in the pack should have the same memory state. By default,
   1.691      // we use the memory state of the last load. However, if any load could
   1.692 @@ -1149,35 +1329,30 @@
   1.693        Node* vn = NULL;
   1.694        Node* low_adr = p->at(0);
   1.695        Node* first   = executed_first(p);
   1.696 +      int   opc = n->Opcode();
   1.697        if (n->is_Load()) {
   1.698 -        int   opc = n->Opcode();
   1.699          Node* ctl = n->in(MemNode::Control);
   1.700          Node* mem = first->in(MemNode::Memory);
   1.701          Node* adr = low_adr->in(MemNode::Address);
   1.702          const TypePtr* atyp = n->adr_type();
   1.703 -        vn = VectorLoadNode::make(_phase->C, opc, ctl, mem, adr, atyp, vlen);
   1.704 -
   1.705 +        vn = LoadVectorNode::make(_phase->C, opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n));
   1.706        } else if (n->is_Store()) {
   1.707          // Promote value to be stored to vector
   1.708          Node* val = vector_opd(p, MemNode::ValueIn);
   1.709 -
   1.710 -        int   opc = n->Opcode();
   1.711          Node* ctl = n->in(MemNode::Control);
   1.712          Node* mem = first->in(MemNode::Memory);
   1.713          Node* adr = low_adr->in(MemNode::Address);
   1.714          const TypePtr* atyp = n->adr_type();
   1.715 -        vn = VectorStoreNode::make(_phase->C, opc, ctl, mem, adr, atyp, val, vlen);
   1.716 -
   1.717 +        vn = StoreVectorNode::make(_phase->C, opc, ctl, mem, adr, atyp, val, vlen);
   1.718        } else if (n->req() == 3) {
   1.719          // Promote operands to vector
   1.720          Node* in1 = vector_opd(p, 1);
   1.721          Node* in2 = vector_opd(p, 2);
   1.722 -        vn = VectorNode::make(_phase->C, n->Opcode(), in1, in2, vlen, velt_type(n));
   1.723 -
   1.724 +        vn = VectorNode::make(_phase->C, opc, in1, in2, vlen, velt_basic_type(n));
   1.725        } else {
   1.726          ShouldNotReachHere();
   1.727        }
   1.728 -
   1.729 +      assert(vn != NULL, "sanity");
   1.730        _phase->_igvn.register_new_node_with_optimizer(vn);
   1.731        _phase->set_ctrl(vn, _phase->get_ctrl(p->at(0)));
   1.732        for (uint j = 0; j < p->size(); j++) {
   1.733 @@ -1185,6 +1360,12 @@
   1.734          _igvn.replace_node(pm, vn);
   1.735        }
   1.736        _igvn._worklist.push(vn);
   1.737 +#ifdef ASSERT
   1.738 +      if (TraceSuperWord) {
   1.739 +        tty->print("new Vector node: ");
   1.740 +        vn->dump();
   1.741 +      }
   1.742 +#endif
   1.743      }
   1.744    }
   1.745  }
   1.746 @@ -1207,10 +1388,10 @@
   1.747    }
   1.748  
   1.749    if (same_opd) {
   1.750 -    if (opd->is_Vector() || opd->is_VectorLoad()) {
   1.751 +    if (opd->is_Vector() || opd->is_LoadVector()) {
   1.752        return opd; // input is matching vector
   1.753      }
   1.754 -    assert(!opd->is_VectorStore(), "such vector is not expected here");
   1.755 +    assert(!opd->is_StoreVector(), "such vector is not expected here");
   1.756      // Convert scalar input to vector with the same number of elements as
   1.757      // p0's vector. Use p0's type because size of operand's container in
   1.758      // vector should match p0's size regardless operand's size.
   1.759 @@ -1219,12 +1400,18 @@
   1.760  
   1.761      _phase->_igvn.register_new_node_with_optimizer(vn);
   1.762      _phase->set_ctrl(vn, _phase->get_ctrl(opd));
   1.763 +#ifdef ASSERT
   1.764 +    if (TraceSuperWord) {
   1.765 +      tty->print("new Vector node: ");
   1.766 +      vn->dump();
   1.767 +    }
   1.768 +#endif
   1.769      return vn;
   1.770    }
   1.771  
   1.772    // Insert pack operation
   1.773 -  const Type* p0_t = velt_type(p0);
   1.774 -  PackNode* pk = PackNode::make(_phase->C, opd, p0_t);
   1.775 +  BasicType bt = velt_basic_type(p0);
   1.776 +  PackNode* pk = PackNode::make(_phase->C, opd, vlen, bt);
   1.777    DEBUG_ONLY( const BasicType opd_bt = opd->bottom_type()->basic_type(); )
   1.778  
   1.779    for (uint i = 1; i < vlen; i++) {
   1.780 @@ -1232,10 +1419,16 @@
   1.781      Node* in = pi->in(opd_idx);
   1.782      assert(my_pack(in) == NULL, "Should already have been unpacked");
   1.783      assert(opd_bt == in->bottom_type()->basic_type(), "all same type");
   1.784 -    pk->add_opd(in);
   1.785 +    pk->add_opd(i, in);
   1.786    }
   1.787    _phase->_igvn.register_new_node_with_optimizer(pk);
   1.788    _phase->set_ctrl(pk, _phase->get_ctrl(opd));
   1.789 +#ifdef ASSERT
   1.790 +    if (TraceSuperWord) {
   1.791 +      tty->print("new Pack node: ");
   1.792 +      pk->dump();
   1.793 +    }
   1.794 +#endif
   1.795    return pk;
   1.796  }
   1.797  
   1.798 @@ -1273,16 +1466,15 @@
   1.799      // Insert extract operation
   1.800      _igvn.hash_delete(def);
   1.801      int def_pos = alignment(def) / data_size(def);
   1.802 -    const Type* def_t = velt_type(def);
   1.803  
   1.804 -    Node* ex = ExtractNode::make(_phase->C, def, def_pos, def_t);
   1.805 +    Node* ex = ExtractNode::make(_phase->C, def, def_pos, velt_basic_type(def));
   1.806      _phase->_igvn.register_new_node_with_optimizer(ex);
   1.807      _phase->set_ctrl(ex, _phase->get_ctrl(def));
   1.808      _igvn.replace_input_of(use, idx, ex);
   1.809      _igvn._worklist.push(def);
   1.810  
   1.811      bb_insert_after(ex, bb_idx(def));
   1.812 -    set_velt_type(ex, def_t);
   1.813 +    set_velt_type(ex, velt_type(def));
   1.814    }
   1.815  }
   1.816  
   1.817 @@ -1509,10 +1701,7 @@
   1.818    // Initial type
   1.819    for (int i = 0; i < _block.length(); i++) {
   1.820      Node* n = _block.at(i);
   1.821 -    const Type* t  = n->is_Mem() ? Type::get_const_basic_type(n->as_Mem()->memory_type())
   1.822 -                                 : _igvn.type(n);
   1.823 -    const Type* vt = container_type(t);
   1.824 -    set_velt_type(n, vt);
   1.825 +    set_velt_type(n, container_type(n));
   1.826    }
   1.827  
   1.828    // Propagate narrowed type backwards through operations
   1.829 @@ -1543,7 +1732,7 @@
   1.830              bool same_type = true;
   1.831              for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
   1.832                Node *use = in->fast_out(k);
   1.833 -              if (!in_bb(use) || velt_type(use) != vt) {
   1.834 +              if (!in_bb(use) || !same_velt_type(use, n)) {
   1.835                  same_type = false;
   1.836                  break;
   1.837                }
   1.838 @@ -1575,20 +1764,24 @@
   1.839    if (!p.valid()) {
   1.840      return bottom_align;
   1.841    }
   1.842 +  int vw = vector_width_in_bytes(velt_basic_type(s));
   1.843 +  if (vw < 2) {
   1.844 +    return bottom_align; // No vectors for this type
   1.845 +  }
   1.846    int offset  = p.offset_in_bytes();
   1.847    offset     += iv_adjust_in_bytes;
   1.848 -  int off_rem = offset % vector_width_in_bytes();
   1.849 -  int off_mod = off_rem >= 0 ? off_rem : off_rem + vector_width_in_bytes();
   1.850 +  int off_rem = offset % vw;
   1.851 +  int off_mod = off_rem >= 0 ? off_rem : off_rem + vw;
   1.852    return off_mod;
   1.853  }
   1.854  
   1.855  //---------------------------container_type---------------------------
   1.856  // Smallest type containing range of values
   1.857 -const Type* SuperWord::container_type(const Type* t) {
   1.858 -  const Type* tp = t->make_ptr();
   1.859 -  if (tp && tp->isa_aryptr()) {
   1.860 -    t = tp->is_aryptr()->elem();
   1.861 +const Type* SuperWord::container_type(Node* n) {
   1.862 +  if (n->is_Mem()) {
   1.863 +    return Type::get_const_basic_type(n->as_Mem()->memory_type());
   1.864    }
   1.865 +  const Type* t = _igvn.type(n);
   1.866    if (t->basic_type() == T_INT) {
   1.867      if (t->higher_equal(TypeInt::BOOL))  return TypeInt::BOOL;
   1.868      if (t->higher_equal(TypeInt::BYTE))  return TypeInt::BYTE;
   1.869 @@ -1599,11 +1792,22 @@
   1.870    return t;
   1.871  }
   1.872  
   1.873 +bool SuperWord::same_velt_type(Node* n1, Node* n2) {
   1.874 +  const Type* vt1 = velt_type(n1);
   1.875 +  const Type* vt2 = velt_type(n1);
   1.876 +  if (vt1->basic_type() == T_INT && vt2->basic_type() == T_INT) {
   1.877 +    // Compare vectors element sizes for integer types.
   1.878 +    return data_size(n1) == data_size(n2);
   1.879 +  }
   1.880 +  return vt1 == vt2;
   1.881 +}
   1.882 +
   1.883  //-------------------------vector_opd_range-----------------------
   1.884  // (Start, end] half-open range defining which operands are vector
   1.885  void SuperWord::vector_opd_range(Node* n, uint* start, uint* end) {
   1.886    switch (n->Opcode()) {
   1.887 -  case Op_LoadB:   case Op_LoadUS:
   1.888 +  case Op_LoadB:   case Op_LoadUB:
   1.889 +  case Op_LoadS:   case Op_LoadUS:
   1.890    case Op_LoadI:   case Op_LoadL:
   1.891    case Op_LoadF:   case Op_LoadD:
   1.892    case Op_LoadP:
   1.893 @@ -1721,6 +1925,7 @@
   1.894    assert(orig_limit != NULL && _igvn.type(orig_limit) != Type::TOP, "");
   1.895  
   1.896    SWPointer align_to_ref_p(align_to_ref, this);
   1.897 +  assert(align_to_ref_p.valid(), "sanity");
   1.898  
   1.899    // Given:
   1.900    //     lim0 == original pre loop limit
   1.901 @@ -1773,10 +1978,12 @@
   1.902    //     N = (V - (e - lim0)) % V
   1.903    //     lim = lim0 - (V - (e - lim0)) % V
   1.904  
   1.905 +  int vw = vector_width_in_bytes(velt_basic_type(align_to_ref));
   1.906 +  assert(vw > 1, "sanity");
   1.907    int stride   = iv_stride();
   1.908    int scale    = align_to_ref_p.scale_in_bytes();
   1.909    int elt_size = align_to_ref_p.memory_size();
   1.910 -  int v_align  = vector_width_in_bytes() / elt_size;
   1.911 +  int v_align  = vw / elt_size;
   1.912    int k        = align_to_ref_p.offset_in_bytes() / elt_size;
   1.913  
   1.914    Node *kn   = _igvn.intcon(k);
   1.915 @@ -1796,6 +2003,25 @@
   1.916      _phase->_igvn.register_new_node_with_optimizer(e);
   1.917      _phase->set_ctrl(e, pre_ctrl);
   1.918    }
   1.919 +  if (vw > ObjectAlignmentInBytes) {
   1.920 +    // incorporate base e +/- base && Mask >>> log2(elt)
   1.921 +    Node* mask = _igvn.MakeConX(~(-1 << exact_log2(vw)));
   1.922 +    Node* xbase = new(_phase->C, 2) CastP2XNode(NULL, align_to_ref_p.base());
   1.923 +    _phase->_igvn.register_new_node_with_optimizer(xbase);
   1.924 +    Node* masked_xbase  = new (_phase->C, 3) AndXNode(xbase, mask);
   1.925 +    _phase->_igvn.register_new_node_with_optimizer(masked_xbase);
   1.926 +#ifdef _LP64
   1.927 +    masked_xbase  = new (_phase->C, 2) ConvL2INode(masked_xbase);
   1.928 +    _phase->_igvn.register_new_node_with_optimizer(masked_xbase);
   1.929 +#endif
   1.930 +    Node* log2_elt = _igvn.intcon(exact_log2(elt_size));
   1.931 +    Node* bref     = new (_phase->C, 3) URShiftINode(masked_xbase, log2_elt);
   1.932 +    _phase->_igvn.register_new_node_with_optimizer(bref);
   1.933 +    _phase->set_ctrl(bref, pre_ctrl);
   1.934 +    e = new (_phase->C, 3) AddINode(e, bref);
   1.935 +    _phase->_igvn.register_new_node_with_optimizer(e);
   1.936 +    _phase->set_ctrl(e, pre_ctrl);
   1.937 +  }
   1.938  
   1.939    // compute e +/- lim0
   1.940    if (scale < 0) {

mercurial