/*
 * Copyright 1997-2007 Sun Microsystems, Inc.  All Rights Reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 */

// Portions of code courtesy of Clifford Click

// Optimization - Graph Style

#include "incls/_precompiled.incl"
#include "incls/_gcm.cpp.incl"

//----------------------------schedule_node_into_block-------------------------
// Insert node n into block b. Look for projections of n and make sure they
// are in b also.
void PhaseCFG::schedule_node_into_block( Node *n, Block *b ) {
  // Set basic block of n; add n to b.
  _bbs.map(n->_idx, b);
  b->add_inst(n);

  // After Matching, nearly any old Node may have projections trailing it.
  // These are usually machine-dependent flags.  In any case, they might
  // float to another block below this one.  Move them up.
  for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
    Node* use = n->fast_out(i);
    if (use->is_Proj()) {
      Block* buse = _bbs[use->_idx];
      if (buse != b) {              // In wrong block?
        if (buse != NULL)
          buse->find_remove(use);   // Remove from wrong block
        _bbs.map(use->_idx, b);     // Re-insert in this block
        b->add_inst(use);
      }
    }
  }
}


//------------------------------schedule_pinned_nodes--------------------------
// Set the basic block for Nodes pinned into blocks
void PhaseCFG::schedule_pinned_nodes( VectorSet &visited ) {
  // Allocate node stack of size C->unique()+8 to avoid frequent realloc
  GrowableArray <Node *> spstack(C->unique()+8);
  spstack.push(_root);
  while ( spstack.is_nonempty() ) {
    Node *n = spstack.pop();
    if( !visited.test_set(n->_idx) ) {             // Test node and flag it as visited
      if( n->pinned() && !_bbs.lookup(n->_idx) ) { // Pinned?  Nail it down!
        Node *input = n->in(0);
        assert( input, "pinned Node must have Control" );
        while( !input->is_block_start() )
          input = input->in(0);
        Block *b = _bbs[input->_idx];  // Basic block of controlling input
        schedule_node_into_block(n, b);
      }
      for( int i = n->req() - 1; i >= 0; --i ) {   // For all inputs
        if( n->in(i) != NULL )
          spstack.push(n->in(i));
      }
    }
  }
}

#ifdef ASSERT
// Assert that new input b2 is dominated by all previous inputs.
// Check this by seeing that it is dominated by b1, the deepest
// input observed until b2.
static void assert_dom(Block* b1, Block* b2, Node* n, Block_Array &bbs) {
  if (b1 == NULL)  return;
  assert(b1->_dom_depth < b2->_dom_depth, "sanity");
  Block* tmp = b2;
  while (tmp != b1 && tmp != NULL) {
    tmp = tmp->_idom;
  }
  if (tmp != b1) {
    // Detected an unschedulable graph.  Print some nice stuff and die.
    tty->print_cr("!!! Unschedulable graph !!!");
    for (uint j=0; j<n->len(); j++) { // For all inputs
      Node* inn = n->in(j);           // Get input
      if (inn == NULL)  continue;     // Ignore NULL, missing inputs
      Block* inb = bbs[inn->_idx];
      tty->print("B%d idom=B%d depth=%2d ",inb->_pre_order,
                 inb->_idom ? inb->_idom->_pre_order : 0, inb->_dom_depth);
      inn->dump();
    }
    tty->print("Failing node: ");
    n->dump();
    assert(false, "unschedulable graph");
  }
}
#endif

static Block* find_deepest_input(Node* n, Block_Array &bbs) {
  // Find the last input dominated by all other inputs.
  Block* deepb           = NULL;        // Deepest block so far
  int    deepb_dom_depth = 0;
  for (uint k = 0; k < n->len(); k++) { // For all inputs
    Node* inn = n->in(k);               // Get input
    if (inn == NULL)  continue;         // Ignore NULL, missing inputs
    Block* inb = bbs[inn->_idx];
    assert(inb != NULL, "must already have scheduled this input");
    if (deepb_dom_depth < (int) inb->_dom_depth) {
      // The new inb must be dominated by the previous deepb.
      // The various inputs must be linearly ordered in the dom
      // tree, or else there will not be a unique deepest block.
      DEBUG_ONLY(assert_dom(deepb, inb, n, bbs));
      deepb = inb;                      // Save deepest block
      deepb_dom_depth = deepb->_dom_depth;
    }
  }
  assert(deepb != NULL, "must be at least one input to n");
  return deepb;
}


//------------------------------schedule_early---------------------------------
// Find the earliest Block any instruction can be placed in.  Some instructions
// are pinned into Blocks.  Unpinned instructions can appear in the last block
// in which all their inputs occur.
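// Illustrative sketch of the "earliest legal block" rule described above.  It
// is not part of this file and is not compiled; SimpleBlock and SimpleNode are
// hypothetical, simplified stand-ins for the real Block and Node classes.  The
// earliest block for an unpinned node is the deepest block, by dominator-tree
// depth, among the blocks of its inputs: every input then dominates it.
#if 0
#include <cstddef>
#include <cassert>
#include <vector>

struct SimpleBlock {
  int dom_depth;                            // depth in the dominator tree
};

struct SimpleNode {
  std::vector<SimpleBlock*> input_blocks;   // block of each already-placed input
};

// Mirrors find_deepest_input: pick the input block with the greatest dominator
// depth.  Assumes the input blocks are totally ordered by dominance, which the
// debug-only assert_dom check above verifies for the real implementation.
static SimpleBlock* earliest_block(const SimpleNode& n) {
  SimpleBlock* deepest = NULL;
  for (size_t i = 0; i < n.input_blocks.size(); i++) {
    SimpleBlock* b = n.input_blocks[i];
    if (b == NULL) continue;                // missing inputs are ignored
    if (deepest == NULL || b->dom_depth > deepest->dom_depth)
      deepest = b;
  }
  assert(deepest != NULL && "node must have at least one placed input");
  return deepest;
}
#endif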
duke@435: bool PhaseCFG::schedule_early(VectorSet &visited, Node_List &roots) { duke@435: // Allocate stack with enough space to avoid frequent realloc duke@435: Node_Stack nstack(roots.Size() + 8); // (unique >> 1) + 24 from Java2D stats duke@435: // roots.push(_root); _root will be processed among C->top() inputs duke@435: roots.push(C->top()); duke@435: visited.set(C->top()->_idx); duke@435: duke@435: while (roots.size() != 0) { duke@435: // Use local variables nstack_top_n & nstack_top_i to cache values duke@435: // on stack's top. duke@435: Node *nstack_top_n = roots.pop(); duke@435: uint nstack_top_i = 0; duke@435: //while_nstack_nonempty: duke@435: while (true) { duke@435: // Get parent node and next input's index from stack's top. duke@435: Node *n = nstack_top_n; duke@435: uint i = nstack_top_i; duke@435: duke@435: if (i == 0) { duke@435: // Special control input processing. duke@435: // While I am here, go ahead and look for Nodes which are taking control duke@435: // from a is_block_proj Node. After I inserted RegionNodes to make proper duke@435: // blocks, the control at a is_block_proj more properly comes from the duke@435: // Region being controlled by the block_proj Node. duke@435: const Node *in0 = n->in(0); duke@435: if (in0 != NULL) { // Control-dependent? duke@435: const Node *p = in0->is_block_proj(); duke@435: if (p != NULL && p != n) { // Control from a block projection? duke@435: // Find trailing Region duke@435: Block *pb = _bbs[in0->_idx]; // Block-projection already has basic block duke@435: uint j = 0; duke@435: if (pb->_num_succs != 1) { // More then 1 successor? duke@435: // Search for successor duke@435: uint max = pb->_nodes.size(); duke@435: assert( max > 1, "" ); duke@435: uint start = max - pb->_num_succs; duke@435: // Find which output path belongs to projection duke@435: for (j = start; j < max; j++) { duke@435: if( pb->_nodes[j] == in0 ) duke@435: break; duke@435: } duke@435: assert( j < max, "must find" ); duke@435: // Change control to match head of successor basic block duke@435: j -= start; duke@435: } duke@435: n->set_req(0, pb->_succs[j]->head()); duke@435: } duke@435: } else { // n->in(0) == NULL duke@435: if (n->req() == 1) { // This guy is a constant with NO inputs? duke@435: n->set_req(0, _root); duke@435: } duke@435: } duke@435: } duke@435: duke@435: // First, visit all inputs and force them to get a block. If an duke@435: // input is already in a block we quit following inputs (to avoid duke@435: // cycles). Instead we put that Node on a worklist to be handled duke@435: // later (since IT'S inputs may not have a block yet). duke@435: bool done = true; // Assume all n's inputs will be processed duke@435: while (i < n->len()) { // For all inputs duke@435: Node *in = n->in(i); // Get input duke@435: ++i; duke@435: if (in == NULL) continue; // Ignore NULL, missing inputs duke@435: int is_visited = visited.test_set(in->_idx); duke@435: if (!_bbs.lookup(in->_idx)) { // Missing block selection? duke@435: if (is_visited) { duke@435: // assert( !visited.test(in->_idx), "did not schedule early" ); duke@435: return false; duke@435: } duke@435: nstack.push(n, i); // Save parent node and next input's index. duke@435: nstack_top_n = in; // Process current input now. duke@435: nstack_top_i = 0; duke@435: done = false; // Not all n's inputs processed. duke@435: break; // continue while_nstack_nonempty; duke@435: } else if (!is_visited) { // Input not yet visited? 
          roots.push(in);            // Visit this guy later, using worklist
        }
      }
      if (done) {
        // All of n's inputs have been processed, complete post-processing.

        // Some instructions are pinned into a block.  These include Region,
        // Phi, Start, Return, and other control-dependent instructions and
        // any projections which depend on them.
        if (!n->pinned()) {
          // Set earliest legal block.
          _bbs.map(n->_idx, find_deepest_input(n, _bbs));
        }

        if (nstack.is_empty()) {
          // Finished all nodes on stack.
          // Process next node on the worklist 'roots'.
          break;
        }
        // Get saved parent node and next input's index.
        nstack_top_n = nstack.node();
        nstack_top_i = nstack.index();
        nstack.pop();
      } // if (done)
    }   // while (true)
  }     // while (roots.size() != 0)
  return true;
}

//------------------------------dom_lca----------------------------------------
// Find least common ancestor in dominator tree.
// LCA is a current notion of LCA, to be raised above 'this'.
// As a convenient boundary condition, return 'this' if LCA is NULL.
// Find the LCA of those two nodes.
Block* Block::dom_lca(Block* LCA) {
  if (LCA == NULL || LCA == this)  return this;

  Block* anc = this;
  while (anc->_dom_depth > LCA->_dom_depth)
    anc = anc->_idom;           // Walk up till anc is as high as LCA

  while (LCA->_dom_depth > anc->_dom_depth)
    LCA = LCA->_idom;           // Walk up till LCA is as high as anc

  while (LCA != anc) {          // Walk both up till they are the same
    LCA = LCA->_idom;
    anc = anc->_idom;
  }

  return LCA;
}

//--------------------------raise_LCA_above_use--------------------------------
// We are placing a definition, and have been given a def->use edge.
// The definition must dominate the use, so move the LCA upward in the
// dominator tree to dominate the use.  If the use is a phi, adjust
// the LCA only with the phi input paths which actually use this def.
static Block* raise_LCA_above_use(Block* LCA, Node* use, Node* def, Block_Array &bbs) {
  Block* buse = bbs[use->_idx];
  if (buse == NULL)    return LCA;   // Unused killing Projs have no use block
  if (!use->is_Phi())  return buse->dom_lca(LCA);
  uint pmax = use->req();       // Number of Phi inputs
  // Why doesn't this loop just break after finding the matching input to
  // the Phi?  Well...it's like this.  I do not have true def-use/use-def
  // chains.  Means I cannot distinguish, from the def-use direction, which
  // of many use-defs lead from the same use to the same def.  That is, this
  // Phi might have several uses of the same def.  Each use appears in a
  // different predecessor block.  But when I enter here, I cannot distinguish
  // which use-def edge I should find the predecessor block for.  So I find
  // them all.  Means I do a little extra work if a Phi uses the same value
  // more than once.
  for (uint j=1; j<pmax; j++) { // For all inputs
    if (use->in(j) == def) {    // Found matching input?
duke@435: Block* pred = bbs[buse->pred(j)->_idx]; duke@435: LCA = pred->dom_lca(LCA); duke@435: } duke@435: } duke@435: return LCA; duke@435: } duke@435: duke@435: //----------------------------raise_LCA_above_marks---------------------------- duke@435: // Return a new LCA that dominates LCA and any of its marked predecessors. duke@435: // Search all my parents up to 'early' (exclusive), looking for predecessors duke@435: // which are marked with the given index. Return the LCA (in the dom tree) duke@435: // of all marked blocks. If there are none marked, return the original duke@435: // LCA. duke@435: static Block* raise_LCA_above_marks(Block* LCA, node_idx_t mark, duke@435: Block* early, Block_Array &bbs) { duke@435: Block_List worklist; duke@435: worklist.push(LCA); duke@435: while (worklist.size() > 0) { duke@435: Block* mid = worklist.pop(); duke@435: if (mid == early) continue; // stop searching here duke@435: duke@435: // Test and set the visited bit. duke@435: if (mid->raise_LCA_visited() == mark) continue; // already visited duke@435: mid->set_raise_LCA_visited(mark); duke@435: duke@435: // Don't process the current LCA, otherwise the search may terminate early duke@435: if (mid != LCA && mid->raise_LCA_mark() == mark) { duke@435: // Raise the LCA. duke@435: LCA = mid->dom_lca(LCA); duke@435: if (LCA == early) break; // stop searching everywhere duke@435: assert(early->dominates(LCA), "early is high enough"); duke@435: // Resume searching at that point, skipping intermediate levels. duke@435: worklist.push(LCA); duke@435: } else { duke@435: // Keep searching through this block's predecessors. duke@435: for (uint j = 1, jmax = mid->num_preds(); j < jmax; j++) { duke@435: Block* mid_parent = bbs[ mid->pred(j)->_idx ]; duke@435: worklist.push(mid_parent); duke@435: } duke@435: } duke@435: } duke@435: return LCA; duke@435: } duke@435: duke@435: //--------------------------memory_early_block-------------------------------- duke@435: // This is a variation of find_deepest_input, the heart of schedule_early. duke@435: // Find the "early" block for a load, if we considered only memory and duke@435: // address inputs, that is, if other data inputs were ignored. duke@435: // duke@435: // Because a subset of edges are considered, the resulting block will duke@435: // be earlier (at a shallower dom_depth) than the true schedule_early duke@435: // point of the node. We compute this earlier block as a more permissive duke@435: // site for anti-dependency insertion, but only if subsume_loads is enabled. duke@435: static Block* memory_early_block(Node* load, Block* early, Block_Array &bbs) { duke@435: Node* base; duke@435: Node* index; duke@435: Node* store = load->in(MemNode::Memory); duke@435: load->as_Mach()->memory_inputs(base, index); duke@435: duke@435: assert(base != NodeSentinel && index != NodeSentinel, duke@435: "unexpected base/index inputs"); duke@435: duke@435: Node* mem_inputs[4]; duke@435: int mem_inputs_length = 0; duke@435: if (base != NULL) mem_inputs[mem_inputs_length++] = base; duke@435: if (index != NULL) mem_inputs[mem_inputs_length++] = index; duke@435: if (store != NULL) mem_inputs[mem_inputs_length++] = store; duke@435: duke@435: // In the comparision below, add one to account for the control input, duke@435: // which may be null, but always takes up a spot in the in array. duke@435: if (mem_inputs_length + 1 < (int) load->req()) { duke@435: // This "load" has more inputs than just the memory, base and index inputs. 
duke@435: // For purposes of checking anti-dependences, we need to start duke@435: // from the early block of only the address portion of the instruction, duke@435: // and ignore other blocks that may have factored into the wider duke@435: // schedule_early calculation. duke@435: if (load->in(0) != NULL) mem_inputs[mem_inputs_length++] = load->in(0); duke@435: duke@435: Block* deepb = NULL; // Deepest block so far duke@435: int deepb_dom_depth = 0; duke@435: for (int i = 0; i < mem_inputs_length; i++) { duke@435: Block* inb = bbs[mem_inputs[i]->_idx]; duke@435: if (deepb_dom_depth < (int) inb->_dom_depth) { duke@435: // The new inb must be dominated by the previous deepb. duke@435: // The various inputs must be linearly ordered in the dom duke@435: // tree, or else there will not be a unique deepest block. duke@435: DEBUG_ONLY(assert_dom(deepb, inb, load, bbs)); duke@435: deepb = inb; // Save deepest block duke@435: deepb_dom_depth = deepb->_dom_depth; duke@435: } duke@435: } duke@435: early = deepb; duke@435: } duke@435: duke@435: return early; duke@435: } duke@435: duke@435: //--------------------------insert_anti_dependences--------------------------- duke@435: // A load may need to witness memory that nearby stores can overwrite. duke@435: // For each nearby store, either insert an "anti-dependence" edge duke@435: // from the load to the store, or else move LCA upward to force the duke@435: // load to (eventually) be scheduled in a block above the store. duke@435: // duke@435: // Do not add edges to stores on distinct control-flow paths; duke@435: // only add edges to stores which might interfere. duke@435: // duke@435: // Return the (updated) LCA. There will not be any possibly interfering duke@435: // store between the load's "early block" and the updated LCA. duke@435: // Any stores in the updated LCA will have new precedence edges duke@435: // back to the load. The caller is expected to schedule the load duke@435: // in the LCA, in which case the precedence edges will make LCM duke@435: // preserve anti-dependences. The caller may also hoist the load duke@435: // above the LCA, if it is not the early block. duke@435: Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) { duke@435: assert(load->needs_anti_dependence_check(), "must be a load of some sort"); duke@435: assert(LCA != NULL, ""); duke@435: DEBUG_ONLY(Block* LCA_orig = LCA); duke@435: duke@435: // Compute the alias index. Loads and stores with different alias indices duke@435: // do not need anti-dependence edges. duke@435: uint load_alias_idx = C->get_alias_index(load->adr_type()); duke@435: #ifdef ASSERT duke@435: if (load_alias_idx == Compile::AliasIdxBot && C->AliasLevel() > 0 && duke@435: (PrintOpto || VerifyAliases || duke@435: PrintMiscellaneous && (WizardMode || Verbose))) { duke@435: // Load nodes should not consume all of memory. duke@435: // Reporting a bottom type indicates a bug in adlc. duke@435: // If some particular type of node validly consumes all of memory, duke@435: // sharpen the preceding "if" to exclude it, so we can catch bugs here. 
      tty->print_cr("*** Possible Anti-Dependence Bug:  Load consumes all of memory.");
      load->dump(2);
      if (VerifyAliases)  assert(load_alias_idx != Compile::AliasIdxBot, "");
    }
#endif
  assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_StrComp),
         "String compare is only known 'load' that does not conflict with any stores");

  if (!C->alias_type(load_alias_idx)->is_rewritable()) {
    // It is impossible to spoil this load by putting stores before it,
    // because we know that the stores will never update the value
    // which 'load' must witness.
    return LCA;
  }

  node_idx_t load_index = load->_idx;

  // Note the earliest legal placement of 'load', as determined by
  // the unique point in the dom tree where all memory effects
  // and other inputs are first available.  (Computed by schedule_early.)
  // For normal loads, 'early' is the shallowest place (dom graph wise)
  // to look for anti-deps between this load and any store.
  Block* early = _bbs[load_index];

  // If we are subsuming loads, compute an "early" block that only considers
  // memory or address inputs. This block may be different than the
  // schedule_early block in that it could be at an even shallower depth in the
  // dominator tree, and allow for a broader discovery of anti-dependences.
  if (C->subsume_loads()) {
    early = memory_early_block(load, early, _bbs);
  }

  ResourceArea *area = Thread::current()->resource_area();
  Node_List worklist_mem(area);     // prior memory state to store
  Node_List worklist_store(area);   // possible-def to explore
  Node_List worklist_visited(area); // visited mergemem nodes
  Node_List non_early_stores(area); // all relevant stores outside of early
  bool must_raise_LCA = false;

#ifdef TRACK_PHI_INPUTS
  // %%% This extra checking fails because MergeMem nodes are not GVNed.
  // Provide "phi_inputs" to check if every input to a PhiNode is from the
  // original memory state.  Such a PhiNode should not prevent the load
  // from sinking.  For such a block, set_raise_LCA_mark may be overly
  // conservative.
  // Mechanism: count inputs seen for each Phi encountered in worklist_store.
  DEBUG_ONLY(GrowableArray <uint> phi_inputs(area, C->unique(),0,0));
#endif

  // 'load' uses some memory state; look for users of the same state.
  // Recurse through MergeMem nodes to the stores that use them.

  // Each of these stores is a possible definition of memory
  // that 'load' needs to use.  We need to force 'load'
  // to occur before each such store.  When the store is in
  // the same block as 'load', we insert an anti-dependence
  // edge load->store.

  // The relevant stores "nearby" the load consist of a tree rooted
  // at initial_mem, with internal nodes of type MergeMem.
  // Therefore, the branches visited by the worklist are of this form:
  //    initial_mem -> (MergeMem ->)* store
  // The anti-dependence constraints apply only to the fringe of this tree.
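// Illustrative sketch of the traversal just described.  It is not part of this
// file and is not compiled; MemTreeNode is a hypothetical, simplified stand-in
// (a real Node carries the alias and control information checked below).  The
// walk starts at the load's memory input and follows def->use edges, treating
// MergeMem-like nodes as internal tree nodes visited at most once; only the
// leaves, the possibly interfering stores, matter for anti-dependence edges.
#if 0
#include <cstddef>
#include <vector>

struct MemTreeNode {
  bool is_merge;                      // internal node (MergeMem) vs. leaf (store)
  std::vector<MemTreeNode*> users;    // forward edges to users of this memory state
};

// Collect the store leaves reachable from 'initial_mem' through MergeMem-like
// internal nodes.
static void collect_possible_defs(MemTreeNode* initial_mem,
                                  std::vector<MemTreeNode*>& out_stores) {
  std::vector<MemTreeNode*> worklist;
  std::vector<MemTreeNode*> visited;
  worklist.push_back(initial_mem);
  visited.push_back(initial_mem);
  while (!worklist.empty()) {
    MemTreeNode* m = worklist.back();
    worklist.pop_back();
    for (size_t i = 0; i < m->users.size(); i++) {
      MemTreeNode* u = m->users[i];
      if (u->is_merge) {
        // Internal node: descend, but only once, to avoid revisiting subtrees.
        bool seen = false;
        for (size_t j = 0; j < visited.size(); j++)
          if (visited[j] == u) { seen = true; break; }
        if (!seen) {
          visited.push_back(u);
          worklist.push_back(u);
        }
      } else {
        out_stores.push_back(u);      // leaf: a possibly interfering store
      }
    }
  }
}
#endif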
duke@435: duke@435: Node* initial_mem = load->in(MemNode::Memory); duke@435: worklist_store.push(initial_mem); kvn@466: worklist_visited.push(initial_mem); duke@435: worklist_mem.push(NULL); duke@435: while (worklist_store.size() > 0) { duke@435: // Examine a nearby store to see if it might interfere with our load. duke@435: Node* mem = worklist_mem.pop(); duke@435: Node* store = worklist_store.pop(); duke@435: uint op = store->Opcode(); duke@435: duke@435: // MergeMems do not directly have anti-deps. duke@435: // Treat them as internal nodes in a forward tree of memory states, duke@435: // the leaves of which are each a 'possible-def'. duke@435: if (store == initial_mem // root (exclusive) of tree we are searching duke@435: || op == Op_MergeMem // internal node of tree we are searching duke@435: ) { duke@435: mem = store; // It's not a possibly interfering store. kvn@466: if (store == initial_mem) kvn@466: initial_mem = NULL; // only process initial memory once kvn@466: duke@435: for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) { duke@435: store = mem->fast_out(i); duke@435: if (store->is_MergeMem()) { duke@435: // Be sure we don't get into combinatorial problems. duke@435: // (Allow phis to be repeated; they can merge two relevant states.) kvn@466: uint j = worklist_visited.size(); kvn@466: for (; j > 0; j--) { kvn@466: if (worklist_visited.at(j-1) == store) break; duke@435: } kvn@466: if (j > 0) continue; // already on work list; do not repeat kvn@466: worklist_visited.push(store); duke@435: } duke@435: worklist_mem.push(mem); duke@435: worklist_store.push(store); duke@435: } duke@435: continue; duke@435: } duke@435: duke@435: if (op == Op_MachProj || op == Op_Catch) continue; duke@435: if (store->needs_anti_dependence_check()) continue; // not really a store duke@435: duke@435: // Compute the alias index. Loads and stores with different alias duke@435: // indices do not need anti-dependence edges. Wide MemBar's are duke@435: // anti-dependent on everything (except immutable memories). duke@435: const TypePtr* adr_type = store->adr_type(); duke@435: if (!C->can_alias(adr_type, load_alias_idx)) continue; duke@435: duke@435: // Most slow-path runtime calls do NOT modify Java memory, but duke@435: // they can block and so write Raw memory. duke@435: if (store->is_Mach()) { duke@435: MachNode* mstore = store->as_Mach(); duke@435: if (load_alias_idx != Compile::AliasIdxRaw) { duke@435: // Check for call into the runtime using the Java calling duke@435: // convention (and from there into a wrapper); it has no duke@435: // _method. Can't do this optimization for Native calls because duke@435: // they CAN write to Java memory. duke@435: if (mstore->ideal_Opcode() == Op_CallStaticJava) { duke@435: assert(mstore->is_MachSafePoint(), ""); duke@435: MachSafePointNode* ms = (MachSafePointNode*) mstore; duke@435: assert(ms->is_MachCallJava(), ""); duke@435: MachCallJavaNode* mcj = (MachCallJavaNode*) ms; duke@435: if (mcj->_method == NULL) { duke@435: // These runtime calls do not write to Java visible memory duke@435: // (other than Raw) and so do not require anti-dependence edges. duke@435: continue; duke@435: } duke@435: } duke@435: // Same for SafePoints: they read/write Raw but only read otherwise. duke@435: // This is basically a workaround for SafePoints only defining control duke@435: // instead of control + memory. 
duke@435: if (mstore->ideal_Opcode() == Op_SafePoint) duke@435: continue; duke@435: } else { duke@435: // Some raw memory, such as the load of "top" at an allocation, duke@435: // can be control dependent on the previous safepoint. See duke@435: // comments in GraphKit::allocate_heap() about control input. duke@435: // Inserting an anti-dep between such a safepoint and a use duke@435: // creates a cycle, and will cause a subsequent failure in duke@435: // local scheduling. (BugId 4919904) duke@435: // (%%% How can a control input be a safepoint and not a projection??) duke@435: if (mstore->ideal_Opcode() == Op_SafePoint && load->in(0) == mstore) duke@435: continue; duke@435: } duke@435: } duke@435: duke@435: // Identify a block that the current load must be above, duke@435: // or else observe that 'store' is all the way up in the duke@435: // earliest legal block for 'load'. In the latter case, duke@435: // immediately insert an anti-dependence edge. duke@435: Block* store_block = _bbs[store->_idx]; duke@435: assert(store_block != NULL, "unused killing projections skipped above"); duke@435: duke@435: if (store->is_Phi()) { duke@435: // 'load' uses memory which is one (or more) of the Phi's inputs. duke@435: // It must be scheduled not before the Phi, but rather before duke@435: // each of the relevant Phi inputs. duke@435: // duke@435: // Instead of finding the LCA of all inputs to a Phi that match 'mem', duke@435: // we mark each corresponding predecessor block and do a combined duke@435: // hoisting operation later (raise_LCA_above_marks). duke@435: // duke@435: // Do not assert(store_block != early, "Phi merging memory after access") duke@435: // PhiNode may be at start of block 'early' with backedge to 'early' duke@435: DEBUG_ONLY(bool found_match = false); duke@435: for (uint j = PhiNode::Input, jmax = store->req(); j < jmax; j++) { duke@435: if (store->in(j) == mem) { // Found matching input? duke@435: DEBUG_ONLY(found_match = true); duke@435: Block* pred_block = _bbs[store_block->pred(j)->_idx]; duke@435: if (pred_block != early) { duke@435: // If any predecessor of the Phi matches the load's "early block", duke@435: // we do not need a precedence edge between the Phi and 'load' duke@435: // since the load will be forced into a block preceeding the Phi. duke@435: pred_block->set_raise_LCA_mark(load_index); duke@435: assert(!LCA_orig->dominates(pred_block) || duke@435: early->dominates(pred_block), "early is high enough"); duke@435: must_raise_LCA = true; duke@435: } duke@435: } duke@435: } duke@435: assert(found_match, "no worklist bug"); duke@435: #ifdef TRACK_PHI_INPUTS duke@435: #ifdef ASSERT duke@435: // This assert asks about correct handling of PhiNodes, which may not duke@435: // have all input edges directly from 'mem'. See BugId 4621264 duke@435: int num_mem_inputs = phi_inputs.at_grow(store->_idx,0) + 1; duke@435: // Increment by exactly one even if there are multiple copies of 'mem' duke@435: // coming into the phi, because we will run this block several times duke@435: // if there are several copies of 'mem'. (That's how DU iterators work.) duke@435: phi_inputs.at_put(store->_idx, num_mem_inputs); duke@435: assert(PhiNode::Input + num_mem_inputs < store->req(), duke@435: "Expect at least one phi input will not be from original memory state"); duke@435: #endif //ASSERT duke@435: #endif //TRACK_PHI_INPUTS duke@435: } else if (store_block != early) { duke@435: // 'store' is between the current LCA and earliest possible block. 
duke@435: // Label its block, and decide later on how to raise the LCA duke@435: // to include the effect on LCA of this store. duke@435: // If this store's block gets chosen as the raised LCA, we duke@435: // will find him on the non_early_stores list and stick him duke@435: // with a precedence edge. duke@435: // (But, don't bother if LCA is already raised all the way.) duke@435: if (LCA != early) { duke@435: store_block->set_raise_LCA_mark(load_index); duke@435: must_raise_LCA = true; duke@435: non_early_stores.push(store); duke@435: } duke@435: } else { duke@435: // Found a possibly-interfering store in the load's 'early' block. duke@435: // This means 'load' cannot sink at all in the dominator tree. duke@435: // Add an anti-dep edge, and squeeze 'load' into the highest block. duke@435: assert(store != load->in(0), "dependence cycle found"); duke@435: if (verify) { duke@435: assert(store->find_edge(load) != -1, "missing precedence edge"); duke@435: } else { duke@435: store->add_prec(load); duke@435: } duke@435: LCA = early; duke@435: // This turns off the process of gathering non_early_stores. duke@435: } duke@435: } duke@435: // (Worklist is now empty; all nearby stores have been visited.) duke@435: duke@435: // Finished if 'load' must be scheduled in its 'early' block. duke@435: // If we found any stores there, they have already been given duke@435: // precedence edges. duke@435: if (LCA == early) return LCA; duke@435: duke@435: // We get here only if there are no possibly-interfering stores duke@435: // in the load's 'early' block. Move LCA up above all predecessors duke@435: // which contain stores we have noted. duke@435: // duke@435: // The raised LCA block can be a home to such interfering stores, duke@435: // but its predecessors must not contain any such stores. duke@435: // duke@435: // The raised LCA will be a lower bound for placing the load, duke@435: // preventing the load from sinking past any block containing duke@435: // a store that may invalidate the memory state required by 'load'. duke@435: if (must_raise_LCA) duke@435: LCA = raise_LCA_above_marks(LCA, load->_idx, early, _bbs); duke@435: if (LCA == early) return LCA; duke@435: duke@435: // Insert anti-dependence edges from 'load' to each store duke@435: // in the non-early LCA block. duke@435: // Mine the non_early_stores list for such stores. duke@435: if (LCA->raise_LCA_mark() == load_index) { duke@435: while (non_early_stores.size() > 0) { duke@435: Node* store = non_early_stores.pop(); duke@435: Block* store_block = _bbs[store->_idx]; duke@435: if (store_block == LCA) { duke@435: // add anti_dependence from store to load in its own block duke@435: assert(store != load->in(0), "dependence cycle found"); duke@435: if (verify) { duke@435: assert(store->find_edge(load) != -1, "missing precedence edge"); duke@435: } else { duke@435: store->add_prec(load); duke@435: } duke@435: } else { duke@435: assert(store_block->raise_LCA_mark() == load_index, "block was marked"); duke@435: // Any other stores we found must be either inside the new LCA duke@435: // or else outside the original LCA. In the latter case, they duke@435: // did not interfere with any use of 'load'. duke@435: assert(LCA->dominates(store_block) duke@435: || !LCA_orig->dominates(store_block), "no stray stores"); duke@435: } duke@435: } duke@435: } duke@435: duke@435: // Return the highest block containing stores; any stores duke@435: // within that block have been given anti-dependence edges. 
duke@435: return LCA; duke@435: } duke@435: duke@435: // This class is used to iterate backwards over the nodes in the graph. duke@435: duke@435: class Node_Backward_Iterator { duke@435: duke@435: private: duke@435: Node_Backward_Iterator(); duke@435: duke@435: public: duke@435: // Constructor for the iterator duke@435: Node_Backward_Iterator(Node *root, VectorSet &visited, Node_List &stack, Block_Array &bbs); duke@435: duke@435: // Postincrement operator to iterate over the nodes duke@435: Node *next(); duke@435: duke@435: private: duke@435: VectorSet &_visited; duke@435: Node_List &_stack; duke@435: Block_Array &_bbs; duke@435: }; duke@435: duke@435: // Constructor for the Node_Backward_Iterator duke@435: Node_Backward_Iterator::Node_Backward_Iterator( Node *root, VectorSet &visited, Node_List &stack, Block_Array &bbs ) duke@435: : _visited(visited), _stack(stack), _bbs(bbs) { duke@435: // The stack should contain exactly the root duke@435: stack.clear(); duke@435: stack.push(root); duke@435: duke@435: // Clear the visited bits duke@435: visited.Clear(); duke@435: } duke@435: duke@435: // Iterator for the Node_Backward_Iterator duke@435: Node *Node_Backward_Iterator::next() { duke@435: duke@435: // If the _stack is empty, then just return NULL: finished. duke@435: if ( !_stack.size() ) duke@435: return NULL; duke@435: duke@435: // '_stack' is emulating a real _stack. The 'visit-all-users' loop has been duke@435: // made stateless, so I do not need to record the index 'i' on my _stack. duke@435: // Instead I visit all users each time, scanning for unvisited users. duke@435: // I visit unvisited not-anti-dependence users first, then anti-dependent duke@435: // children next. duke@435: Node *self = _stack.pop(); duke@435: duke@435: // I cycle here when I am entering a deeper level of recursion. duke@435: // The key variable 'self' was set prior to jumping here. duke@435: while( 1 ) { duke@435: duke@435: _visited.set(self->_idx); duke@435: duke@435: // Now schedule all uses as late as possible. duke@435: uint src = self->is_Proj() ? self->in(0)->_idx : self->_idx; duke@435: uint src_rpo = _bbs[src]->_rpo; duke@435: duke@435: // Schedule all nodes in a post-order visit duke@435: Node *unvisited = NULL; // Unvisited anti-dependent Node, if any duke@435: duke@435: // Scan for unvisited nodes duke@435: for (DUIterator_Fast imax, i = self->fast_outs(imax); i < imax; i++) { duke@435: // For all uses, schedule late duke@435: Node* n = self->fast_out(i); // Use duke@435: duke@435: // Skip already visited children duke@435: if ( _visited.test(n->_idx) ) duke@435: continue; duke@435: duke@435: // do not traverse backward control edges duke@435: Node *use = n->is_Proj() ? n->in(0) : n; duke@435: uint use_rpo = _bbs[use->_idx]->_rpo; duke@435: duke@435: if ( use_rpo < src_rpo ) duke@435: continue; duke@435: duke@435: // Phi nodes always precede uses in a basic block duke@435: if ( use_rpo == src_rpo && use->is_Phi() ) duke@435: continue; duke@435: duke@435: unvisited = n; // Found unvisited duke@435: duke@435: // Check for possible-anti-dependent duke@435: if( !n->needs_anti_dependence_check() ) duke@435: break; // Not visited, not anti-dep; schedule it NOW duke@435: } duke@435: duke@435: // Did I find an unvisited not-anti-dependent Node? duke@435: if ( !unvisited ) duke@435: break; // All done with children; post-visit 'self' duke@435: duke@435: // Visit the unvisited Node. Contains the obvious push to duke@435: // indicate I'm entering a deeper level of recursion. 
    // I push the old state onto the _stack and set a new state and loop (recurse).
    _stack.push(self);
    self = unvisited;
  } // End recursion loop

  return self;
}

//------------------------------ComputeLatenciesBackwards----------------------
// Compute the latency of all the instructions.
void PhaseCFG::ComputeLatenciesBackwards(VectorSet &visited, Node_List &stack) {
#ifndef PRODUCT
  if (trace_opto_pipelining())
    tty->print("\n#---- ComputeLatenciesBackwards ----\n");
#endif

  Node_Backward_Iterator iter((Node *)_root, visited, stack, _bbs);
  Node *n;

  // Walk over all the nodes from last to first
  while (n = iter.next()) {
    // Set the latency for the definitions of this instruction
    partial_latency_of_defs(n);
  }
} // end ComputeLatenciesBackwards

//------------------------------partial_latency_of_defs------------------------
// Compute the latency impact of this node on all defs.  This computes
// a number that increases as we approach the beginning of the routine.
void PhaseCFG::partial_latency_of_defs(Node *n) {
  // Set the latency for this instruction
#ifndef PRODUCT
  if (trace_opto_pipelining()) {
    tty->print("# latency_to_inputs: node_latency[%d] = %d for node",
               n->_idx, _node_latency.at_grow(n->_idx));
    dump();
  }
#endif

  if (n->is_Proj())
    n = n->in(0);

  if (n->is_Root())
    return;

  uint nlen = n->len();
  uint use_latency = _node_latency.at_grow(n->_idx);
  uint use_pre_order = _bbs[n->_idx]->_pre_order;

  for ( uint j=0; j<nlen; j++ ) {
    Node *def = n->in(j);

    if (!def || def == n)
      continue;

    // Walk backwards thru projections
    if (def->is_Proj())
      def = def->in(0);

#ifndef PRODUCT
    if (trace_opto_pipelining()) {
      tty->print("# in(%2d): ", j);
      def->dump();
    }
#endif

    // If the defining block is not known, assume it is ok
    Block *def_block = _bbs[def->_idx];
    uint def_pre_order = def_block ? def_block->_pre_order : 0;

    if ( (use_pre_order < def_pre_order) ||
         (use_pre_order == def_pre_order && n->is_Phi()) )
      continue;

    uint delta_latency = n->latency(j);
    uint current_latency = delta_latency + use_latency;

    if (_node_latency.at_grow(def->_idx) < current_latency) {
      _node_latency.at_put_grow(def->_idx, current_latency);
    }

#ifndef PRODUCT
    if (trace_opto_pipelining()) {
      tty->print_cr("# %d + edge_latency(%d) == %d -> %d, node_latency[%d] = %d",
                    use_latency, j, delta_latency, current_latency, def->_idx,
                    _node_latency.at_grow(def->_idx));
    }
#endif
  }
}

//------------------------------latency_from_use-------------------------------
// Compute the latency of a specific use
int PhaseCFG::latency_from_use(Node *n, const Node *def, Node *use) {
  // If self-reference, return no latency
  if (use == n || use->is_Root())
    return 0;

  uint def_pre_order = _bbs[def->_idx]->_pre_order;
  uint latency = 0;

  // If the use is not a projection, then it is simple...
  if (!use->is_Proj()) {
#ifndef PRODUCT
    if (trace_opto_pipelining()) {
      tty->print("# out(): ");
      use->dump();
    }
#endif

    uint use_pre_order = _bbs[use->_idx]->_pre_order;

    if (use_pre_order < def_pre_order)
      return 0;

    if (use_pre_order == def_pre_order && use->is_Phi())
      return 0;

    uint nlen = use->len();
    uint nl = _node_latency.at_grow(use->_idx);

    for ( uint j=0; j<nlen; j++ ) {
      if (use->in(j) == n) {
        // Change this if we want local latencies
        uint ul = use->latency(j);
        uint  l = ul + nl;
        if (latency < l) latency = l;
#ifndef PRODUCT
        if (trace_opto_pipelining()) {
          tty->print_cr("# %d + edge_latency(%d) == %d -> %d, latency = %d",
                        nl, j, ul, l, latency);
        }
#endif
      }
    }
  } else {
    // This is a projection, just grab the latency of the use(s)
    for (DUIterator_Fast jmax, j = use->fast_outs(jmax); j < jmax; j++) {
      uint l = latency_from_use(use, def, use->fast_out(j));
      if (latency < l) latency = l;
    }
  }

  return latency;
}

//------------------------------latency_from_uses------------------------------
// Compute the latency of this instruction relative to all of its uses.
// This computes a number that increases as we approach the beginning of the
// routine.
void PhaseCFG::latency_from_uses(Node *n) {
  // Set the latency for this instruction
#ifndef PRODUCT
  if (trace_opto_pipelining()) {
    tty->print("# latency_from_outputs: node_latency[%d] = %d for node",
               n->_idx, _node_latency.at_grow(n->_idx));
    dump();
  }
#endif
  uint latency=0;
  const Node *def = n->is_Proj() ?
n->in(0): n; duke@435: duke@435: for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) { duke@435: uint l = latency_from_use(n, def, n->fast_out(i)); duke@435: duke@435: if (latency < l) latency = l; duke@435: } duke@435: duke@435: _node_latency.at_put_grow(n->_idx, latency); duke@435: } duke@435: duke@435: //------------------------------hoist_to_cheaper_block------------------------- duke@435: // Pick a block for node self, between early and LCA, that is a cheaper duke@435: // alternative to LCA. duke@435: Block* PhaseCFG::hoist_to_cheaper_block(Block* LCA, Block* early, Node* self) { duke@435: const double delta = 1+PROB_UNLIKELY_MAG(4); duke@435: Block* least = LCA; duke@435: double least_freq = least->_freq; duke@435: uint target = _node_latency.at_grow(self->_idx); duke@435: uint start_latency = _node_latency.at_grow(LCA->_nodes[0]->_idx); duke@435: uint end_latency = _node_latency.at_grow(LCA->_nodes[LCA->end_idx()]->_idx); duke@435: bool in_latency = (target <= start_latency); duke@435: const Block* root_block = _bbs[_root->_idx]; duke@435: duke@435: // Turn off latency scheduling if scheduling is just plain off duke@435: if (!C->do_scheduling()) duke@435: in_latency = true; duke@435: duke@435: // Do not hoist (to cover latency) instructions which target a duke@435: // single register. Hoisting stretches the live range of the duke@435: // single register and may force spilling. duke@435: MachNode* mach = self->is_Mach() ? self->as_Mach() : NULL; duke@435: if (mach && mach->out_RegMask().is_bound1() && mach->out_RegMask().is_NotEmpty()) duke@435: in_latency = true; duke@435: duke@435: #ifndef PRODUCT duke@435: if (trace_opto_pipelining()) { duke@435: tty->print("# Find cheaper block for latency %d: ", duke@435: _node_latency.at_grow(self->_idx)); duke@435: self->dump(); duke@435: tty->print_cr("# B%d: start latency for [%4d]=%d, end latency for [%4d]=%d, freq=%g", duke@435: LCA->_pre_order, duke@435: LCA->_nodes[0]->_idx, duke@435: start_latency, duke@435: LCA->_nodes[LCA->end_idx()]->_idx, duke@435: end_latency, duke@435: least_freq); duke@435: } duke@435: #endif duke@435: duke@435: // Walk up the dominator tree from LCA (Lowest common ancestor) to duke@435: // the earliest legal location. Capture the least execution frequency. 
duke@435: while (LCA != early) { duke@435: LCA = LCA->_idom; // Follow up the dominator tree duke@435: duke@435: if (LCA == NULL) { duke@435: // Bailout without retry duke@435: C->record_method_not_compilable("late schedule failed: LCA == NULL"); duke@435: return least; duke@435: } duke@435: duke@435: // Don't hoist machine instructions to the root basic block duke@435: if (mach && LCA == root_block) duke@435: break; duke@435: duke@435: uint start_lat = _node_latency.at_grow(LCA->_nodes[0]->_idx); duke@435: uint end_idx = LCA->end_idx(); duke@435: uint end_lat = _node_latency.at_grow(LCA->_nodes[end_idx]->_idx); duke@435: double LCA_freq = LCA->_freq; duke@435: #ifndef PRODUCT duke@435: if (trace_opto_pipelining()) { duke@435: tty->print_cr("# B%d: start latency for [%4d]=%d, end latency for [%4d]=%d, freq=%g", duke@435: LCA->_pre_order, LCA->_nodes[0]->_idx, start_lat, end_idx, end_lat, LCA_freq); duke@435: } duke@435: #endif duke@435: if (LCA_freq < least_freq || // Better Frequency duke@435: ( !in_latency && // No block containing latency duke@435: LCA_freq < least_freq * delta && // No worse frequency duke@435: target >= end_lat && // within latency range duke@435: !self->is_iteratively_computed() ) // But don't hoist IV increments duke@435: // because they may end up above other uses of their phi forcing duke@435: // their result register to be different from their input. duke@435: ) { duke@435: least = LCA; // Found cheaper block duke@435: least_freq = LCA_freq; duke@435: start_latency = start_lat; duke@435: end_latency = end_lat; duke@435: if (target <= start_lat) duke@435: in_latency = true; duke@435: } duke@435: } duke@435: duke@435: #ifndef PRODUCT duke@435: if (trace_opto_pipelining()) { duke@435: tty->print_cr("# Choose block B%d with start latency=%d and freq=%g", duke@435: least->_pre_order, start_latency, least_freq); duke@435: } duke@435: #endif duke@435: duke@435: // See if the latency needs to be updated duke@435: if (target < end_latency) { duke@435: #ifndef PRODUCT duke@435: if (trace_opto_pipelining()) { duke@435: tty->print_cr("# Change latency for [%4d] from %d to %d", self->_idx, target, end_latency); duke@435: } duke@435: #endif duke@435: _node_latency.at_put_grow(self->_idx, end_latency); duke@435: partial_latency_of_defs(self); duke@435: } duke@435: duke@435: return least; duke@435: } duke@435: duke@435: duke@435: //------------------------------schedule_late----------------------------------- duke@435: // Now schedule all codes as LATE as possible. This is the LCA in the duke@435: // dominator tree of all USES of a value. Pick the block with the least duke@435: // loop nesting depth that is lowest in the dominator tree. duke@435: extern const char must_clone[]; duke@435: void PhaseCFG::schedule_late(VectorSet &visited, Node_List &stack) { duke@435: #ifndef PRODUCT duke@435: if (trace_opto_pipelining()) duke@435: tty->print("\n#---- schedule_late ----\n"); duke@435: #endif duke@435: duke@435: Node_Backward_Iterator iter((Node *)_root, visited, stack, _bbs); duke@435: Node *self; duke@435: duke@435: // Walk over all the nodes from last to first duke@435: while (self = iter.next()) { duke@435: Block* early = _bbs[self->_idx]; // Earliest legal placement duke@435: duke@435: if (self->is_top()) { duke@435: // Top node goes in bb #2 with other constants. duke@435: // It must be special-cased, because it has no out edges. 
duke@435: early->add_inst(self); duke@435: continue; duke@435: } duke@435: duke@435: // No uses, just terminate duke@435: if (self->outcnt() == 0) { duke@435: assert(self->Opcode() == Op_MachProj, "sanity"); duke@435: continue; // Must be a dead machine projection duke@435: } duke@435: duke@435: // If node is pinned in the block, then no scheduling can be done. duke@435: if( self->pinned() ) // Pinned in block? duke@435: continue; duke@435: duke@435: MachNode* mach = self->is_Mach() ? self->as_Mach() : NULL; duke@435: if (mach) { duke@435: switch (mach->ideal_Opcode()) { duke@435: case Op_CreateEx: duke@435: // Don't move exception creation duke@435: early->add_inst(self); duke@435: continue; duke@435: break; duke@435: case Op_CheckCastPP: duke@435: // Don't move CheckCastPP nodes away from their input, if the input duke@435: // is a rawptr (5071820). duke@435: Node *def = self->in(1); duke@435: if (def != NULL && def->bottom_type()->base() == Type::RawPtr) { duke@435: early->add_inst(self); duke@435: continue; duke@435: } duke@435: break; duke@435: } duke@435: } duke@435: duke@435: // Gather LCA of all uses duke@435: Block *LCA = NULL; duke@435: { duke@435: for (DUIterator_Fast imax, i = self->fast_outs(imax); i < imax; i++) { duke@435: // For all uses, find LCA duke@435: Node* use = self->fast_out(i); duke@435: LCA = raise_LCA_above_use(LCA, use, self, _bbs); duke@435: } duke@435: } // (Hide defs of imax, i from rest of block.) duke@435: duke@435: // Place temps in the block of their use. This isn't a duke@435: // requirement for correctness but it reduces useless duke@435: // interference between temps and other nodes. duke@435: if (mach != NULL && mach->is_MachTemp()) { duke@435: _bbs.map(self->_idx, LCA); duke@435: LCA->add_inst(self); duke@435: continue; duke@435: } duke@435: duke@435: // Check if 'self' could be anti-dependent on memory duke@435: if (self->needs_anti_dependence_check()) { duke@435: // Hoist LCA above possible-defs and insert anti-dependences to duke@435: // defs in new LCA block. duke@435: LCA = insert_anti_dependences(LCA, self); duke@435: } duke@435: duke@435: if (early->_dom_depth > LCA->_dom_depth) { duke@435: // Somehow the LCA has moved above the earliest legal point. duke@435: // (One way this can happen is via memory_early_block.) duke@435: if (C->subsume_loads() == true && !C->failing()) { duke@435: // Retry with subsume_loads == false duke@435: // If this is the first failure, the sentinel string will "stick" duke@435: // to the Compile object, and the C2Compiler will see it and retry. duke@435: C->record_failure(C2Compiler::retry_no_subsuming_loads()); duke@435: } else { duke@435: // Bailout without retry when (early->_dom_depth > LCA->_dom_depth) duke@435: C->record_method_not_compilable("late schedule failed: incorrect graph"); duke@435: } duke@435: return; duke@435: } duke@435: duke@435: // If there is no opportunity to hoist, then we're done. duke@435: bool try_to_hoist = (LCA != early); duke@435: duke@435: // Must clone guys stay next to use; no hoisting allowed. duke@435: // Also cannot hoist guys that alter memory or are otherwise not duke@435: // allocatable (hoisting can make a value live longer, leading to duke@435: // anti and output dependency problems which are normally resolved duke@435: // by the register allocator giving everyone a different register). 
    if (mach != NULL && must_clone[mach->ideal_Opcode()])
      try_to_hoist = false;

    Block* late = NULL;
    if (try_to_hoist) {
      // Now find the block with the least execution frequency.
      // Start at the latest schedule and work up to the earliest schedule
      // in the dominator tree.  Thus the Node will dominate all its uses.
      late = hoist_to_cheaper_block(LCA, early, self);
    } else {
      // Just use the LCA of the uses.
      late = LCA;
    }

    // Put the node into target block
    schedule_node_into_block(self, late);

#ifdef ASSERT
    if (self->needs_anti_dependence_check()) {
      // since precedence edges are only inserted when we're sure they
      // are needed make sure that after placement in a block we don't
      // need any new precedence edges.
      verify_anti_dependences(late, self);
    }
#endif
  } // Loop until all nodes have been visited

} // end ScheduleLate

//------------------------------GlobalCodeMotion-------------------------------
void PhaseCFG::GlobalCodeMotion( Matcher &matcher, uint unique, Node_List &proj_list ) {
  ResourceMark rm;

#ifndef PRODUCT
  if (trace_opto_pipelining()) {
    tty->print("\n---- Start GlobalCodeMotion ----\n");
  }
#endif

  // Initialize the bbs.map for things on the proj_list
  uint i;
  for( i=0; i < proj_list.size(); i++ )
    _bbs.map(proj_list[i]->_idx, NULL);

  // Set the basic block for Nodes pinned into blocks
  Arena *a = Thread::current()->resource_area();
  VectorSet visited(a);
  schedule_pinned_nodes( visited );

  // Find the earliest Block any instruction can be placed in.  Some
  // instructions are pinned into Blocks.  Unpinned instructions can
  // appear in the last block in which all their inputs occur.
  visited.Clear();
  Node_List stack(a);
  stack.map( (unique >> 1) + 16, NULL); // Pre-grow the list
  if (!schedule_early(visited, stack)) {
    // Bailout without retry
    C->record_method_not_compilable("early schedule failed");
    return;
  }

  // Build Def-Use edges.
  proj_list.push(_root);        // Add real root as another root
  proj_list.pop();

  // Compute the latency information (via backwards walk) for all the
  // instructions in the graph
  GrowableArray <uint> node_latency;
  _node_latency = node_latency;

  if( C->do_scheduling() )
    ComputeLatenciesBackwards(visited, stack);

  // Now schedule all codes as LATE as possible.  This is the LCA in the
  // dominator tree of all USES of a value.  Pick the block with the least
  // loop nesting depth that is lowest in the dominator tree.
  // ( visited.Clear() called in schedule_late()->Node_Backward_Iterator() )
  schedule_late(visited, stack);
  if( C->failing() ) {
    // schedule_late fails only when graph is incorrect.
duke@435: assert(!VerifyGraphEdges, "verification should have failed"); duke@435: return; duke@435: } duke@435: duke@435: unique = C->unique(); duke@435: duke@435: #ifndef PRODUCT duke@435: if (trace_opto_pipelining()) { duke@435: tty->print("\n---- Detect implicit null checks ----\n"); duke@435: } duke@435: #endif duke@435: duke@435: // Detect implicit-null-check opportunities. Basically, find NULL checks duke@435: // with suitable memory ops nearby. Use the memory op to do the NULL check. duke@435: // I can generate a memory op if there is not one nearby. duke@435: if (C->is_method_compilation()) { duke@435: // Don't do it for natives, adapters, or runtime stubs duke@435: int allowed_reasons = 0; duke@435: // ...and don't do it when there have been too many traps, globally. duke@435: for (int reason = (int)Deoptimization::Reason_none+1; duke@435: reason < Compile::trapHistLength; reason++) { duke@435: assert(reason < BitsPerInt, "recode bit map"); duke@435: if (!C->too_many_traps((Deoptimization::DeoptReason) reason)) duke@435: allowed_reasons |= nth_bit(reason); duke@435: } duke@435: // By reversing the loop direction we get a very minor gain on mpegaudio. duke@435: // Feel free to revert to a forward loop for clarity. duke@435: // for( int i=0; i < (int)matcher._null_check_tests.size(); i+=2 ) { duke@435: for( int i= matcher._null_check_tests.size()-2; i>=0; i-=2 ) { duke@435: Node *proj = matcher._null_check_tests[i ]; duke@435: Node *val = matcher._null_check_tests[i+1]; duke@435: _bbs[proj->_idx]->implicit_null_check(this, proj, val, allowed_reasons); duke@435: // The implicit_null_check will only perform the transformation duke@435: // if the null branch is truly uncommon, *and* it leads to an duke@435: // uncommon trap. Combined with the too_many_traps guards duke@435: // above, this prevents SEGV storms reported in 6366351, duke@435: // by recompiling offending methods without this optimization. duke@435: } duke@435: } duke@435: duke@435: #ifndef PRODUCT duke@435: if (trace_opto_pipelining()) { duke@435: tty->print("\n---- Start Local Scheduling ----\n"); duke@435: } duke@435: #endif duke@435: duke@435: // Schedule locally. Right now a simple topological sort. duke@435: // Later, do a real latency aware scheduler. duke@435: int *ready_cnt = NEW_RESOURCE_ARRAY(int,C->unique()); duke@435: memset( ready_cnt, -1, C->unique() * sizeof(int) ); duke@435: visited.Clear(); duke@435: for (i = 0; i < _num_blocks; i++) { duke@435: if (!_blocks[i]->schedule_local(this, matcher, ready_cnt, visited)) { duke@435: if (!C->failure_reason_is(C2Compiler::retry_no_subsuming_loads())) { duke@435: C->record_method_not_compilable("local schedule failed"); duke@435: } duke@435: return; duke@435: } duke@435: } duke@435: duke@435: // If we inserted any instructions between a Call and his CatchNode, duke@435: // clone the instructions on all paths below the Catch. duke@435: for( i=0; i < _num_blocks; i++ ) duke@435: _blocks[i]->call_catch_cleanup(_bbs); duke@435: duke@435: #ifndef PRODUCT duke@435: if (trace_opto_pipelining()) { duke@435: tty->print("\n---- After GlobalCodeMotion ----\n"); duke@435: for (uint i = 0; i < _num_blocks; i++) { duke@435: _blocks[i]->dump(); duke@435: } duke@435: } duke@435: #endif duke@435: } duke@435: duke@435: duke@435: //------------------------------Estimate_Block_Frequency----------------------- duke@435: // Estimate block frequencies based on IfNode probabilities. duke@435: void PhaseCFG::Estimate_Block_Frequency() { duke@435: int cnts = C->method() ? 
duke@435:   int cnts = C->method() ? C->method()->interpreter_invocation_count() : 1;
duke@435:   // Most of our algorithms will die horribly if frequency can become
duke@435:   // negative so make sure cnts is a sane value.
duke@435:   if( cnts <= 0 ) cnts = 1;
duke@435:   float f = (float)cnts/(float)FreqCountInvocations;
duke@435:
duke@435:   // Create the loop tree and calculate loop depth.
duke@435:   _root_loop = create_loop_tree();
duke@435:   _root_loop->compute_loop_depth(0);
duke@435:
duke@435:   // Compute block frequency of each block, relative to a single loop entry.
duke@435:   _root_loop->compute_freq();
duke@435:
duke@435:   // Adjust all frequencies to be relative to a single method entry
duke@435:   _root_loop->_freq = f * 1.0;
duke@435:   _root_loop->scale_freq();
duke@435:
duke@435:   // force paths ending at uncommon traps to be infrequent
duke@435:   Block_List worklist;
duke@435:   Block* root_blk = _blocks[0];
duke@435:   for (uint i = 0; i < root_blk->num_preds(); i++) {
duke@435:     Block *pb = _bbs[root_blk->pred(i)->_idx];
duke@435:     if (pb->has_uncommon_code()) {
duke@435:       worklist.push(pb);
duke@435:     }
duke@435:   }
duke@435:   while (worklist.size() > 0) {
duke@435:     Block* uct = worklist.pop();
duke@435:     uct->_freq = PROB_MIN;
duke@435:     for (uint i = 0; i < uct->num_preds(); i++) {
duke@435:       Block *pb = _bbs[uct->pred(i)->_idx];
duke@435:       if (pb->_num_succs == 1 && pb->_freq > PROB_MIN) {
duke@435:         worklist.push(pb);
duke@435:       }
duke@435:     }
duke@435:   }
duke@435:
duke@435: #ifndef PRODUCT
duke@435:   if (PrintCFGBlockFreq) {
duke@435:     tty->print_cr("CFG Block Frequencies");
duke@435:     _root_loop->dump_tree();
duke@435:     if (Verbose) {
duke@435:       tty->print_cr("PhaseCFG dump");
duke@435:       dump();
duke@435:       tty->print_cr("Node dump");
duke@435:       _root->dump(99999);
duke@435:     }
duke@435:   }
duke@435: #endif
duke@435: }
duke@435:
duke@435: //----------------------------create_loop_tree--------------------------------
duke@435: // Create a loop tree from the CFG
duke@435: CFGLoop* PhaseCFG::create_loop_tree() {
duke@435:
duke@435: #ifdef ASSERT
duke@435:   assert( _blocks[0] == _broot, "" );
duke@435:   for (uint i = 0; i < _num_blocks; i++ ) {
duke@435:     Block *b = _blocks[i];
duke@435:     // Check that the _loop fields are clear...we could clear them if not.
duke@435:     assert(b->_loop == NULL, "clear _loop expected");
duke@435:     // Sanity check that the RPO numbering is reflected in the _blocks array.
duke@435:     // It doesn't have to be for the loop tree to be built, but if it is not,
duke@435:     // then the blocks have been reordered since dom graph building...which
duke@435:     // puts the RPO numbering in question.
duke@435:     assert(b->_rpo == i, "unexpected reverse post order number");
duke@435:   }
duke@435: #endif
duke@435:
duke@435:   int idct = 0;
duke@435:   CFGLoop* root_loop = new CFGLoop(idct++);
duke@435:
duke@435:   Block_List worklist;
duke@435:
duke@435:   // Assign blocks to loops
duke@435:   for(uint i = _num_blocks - 1; i > 0; i-- ) { // skip Root block
duke@435:     Block *b = _blocks[i];
duke@435:
duke@435:     if (b->head()->is_Loop()) {
duke@435:       Block* loop_head = b;
duke@435:       assert(loop_head->num_preds() - 1 == 2, "loop must have 2 predecessors");
duke@435:       Node* tail_n = loop_head->pred(LoopNode::LoopBackControl);
duke@435:       Block* tail = _bbs[tail_n->_idx];
duke@435:
duke@435:       // Defensively filter out Loop nodes for non-single-entry loops.
duke@435:       // For all reasonable loops, the head occurs before the tail in RPO.
duke@435:       if (i <= tail->_rpo) {
duke@435:
duke@435:         // The tail and (recursive) predecessors of the tail
duke@435:         // are made members of a new loop.
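duke@435:         // Illustrative sketch (block numbers are hypothetical): for a loop
duke@435:         //   B2 (head) -> B3 -> B4 -> B2 (backedge)
duke@435:         // the code below creates a new CFGLoop, adds the head B2 to it, then
duke@435:         // walks predecessors starting from the backedge block B4, pulling B4
duke@435:         // and B3 into the loop; the walk stops at B2 because the head is
duke@435:         // already assigned to the new loop.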
duke@435:
duke@435:         assert(worklist.size() == 0, "nonempty worklist");
duke@435:         CFGLoop* nloop = new CFGLoop(idct++);
duke@435:         assert(loop_head->_loop == NULL, "just checking");
duke@435:         loop_head->_loop = nloop;
duke@435:         // Add to nloop so push_pred() will skip over inner loops
duke@435:         nloop->add_member(loop_head);
duke@435:         nloop->push_pred(loop_head, LoopNode::LoopBackControl, worklist, _bbs);
duke@435:
duke@435:         while (worklist.size() > 0) {
duke@435:           Block* member = worklist.pop();
duke@435:           if (member != loop_head) {
duke@435:             for (uint j = 1; j < member->num_preds(); j++) {
duke@435:               nloop->push_pred(member, j, worklist, _bbs);
duke@435:             }
duke@435:           }
duke@435:         }
duke@435:       }
duke@435:     }
duke@435:   }
duke@435:
duke@435:   // Create a member list for each loop consisting
duke@435:   // of both blocks and (immediate child) loops.
duke@435:   for (uint i = 0; i < _num_blocks; i++) {
duke@435:     Block *b = _blocks[i];
duke@435:     CFGLoop* lp = b->_loop;
duke@435:     if (lp == NULL) {
duke@435:       // Not assigned to a loop. Add it to the method's pseudo loop.
duke@435:       b->_loop = root_loop;
duke@435:       lp = root_loop;
duke@435:     }
duke@435:     if (lp == root_loop || b != lp->head()) { // loop heads are already members
duke@435:       lp->add_member(b);
duke@435:     }
duke@435:     if (lp != root_loop) {
duke@435:       if (lp->parent() == NULL) {
duke@435:         // Not a nested loop. Make it a child of the method's pseudo loop.
duke@435:         root_loop->add_nested_loop(lp);
duke@435:       }
duke@435:       if (b == lp->head()) {
duke@435:         // Add nested loop to member list of parent loop.
duke@435:         lp->parent()->add_member(lp);
duke@435:       }
duke@435:     }
duke@435:   }
duke@435:
duke@435:   return root_loop;
duke@435: }
duke@435:
duke@435: //------------------------------push_pred--------------------------------------
duke@435: void CFGLoop::push_pred(Block* blk, int i, Block_List& worklist, Block_Array& node_to_blk) {
duke@435:   Node* pred_n = blk->pred(i);
duke@435:   Block* pred = node_to_blk[pred_n->_idx];
duke@435:   CFGLoop *pred_loop = pred->_loop;
duke@435:   if (pred_loop == NULL) {
duke@435:     // Filter out blocks for non-single-entry loops.
duke@435:     // For all reasonable loops, the head occurs before the tail in RPO.
duke@435:     if (pred->_rpo > head()->_rpo) {
duke@435:       pred->_loop = this;
duke@435:       worklist.push(pred);
duke@435:     }
duke@435:   } else if (pred_loop != this) {
duke@435:     // Nested loop.
duke@435:     while (pred_loop->_parent != NULL && pred_loop->_parent != this) {
duke@435:       pred_loop = pred_loop->_parent;
duke@435:     }
duke@435:     // Make pred's loop be a child
duke@435:     if (pred_loop->_parent == NULL) {
duke@435:       add_nested_loop(pred_loop);
duke@435:       // Continue with loop entry predecessor.
duke@435:       Block* pred_head = pred_loop->head();
duke@435:       assert(pred_head->num_preds() - 1 == 2, "loop must have 2 predecessors");
duke@435:       assert(pred_head != head(), "loop head in only one loop");
duke@435:       push_pred(pred_head, LoopNode::EntryControl, worklist, node_to_blk);
duke@435:     } else {
duke@435:       assert(pred_loop->_parent == this && _parent == NULL, "just checking");
duke@435:     }
duke@435:   }
duke@435: }
duke@435:
duke@435: //------------------------------add_nested_loop--------------------------------
duke@435: // Make cl a child of the current loop in the loop tree.
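duke@435: // Children are kept in a singly linked list threaded through _child and
duke@435: // _sibling: the first child hangs off _child and each later child is
duke@435: // appended at the end of the _sibling chain.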
duke@435: void CFGLoop::add_nested_loop(CFGLoop* cl) {
duke@435:   assert(_parent == NULL, "no parent yet");
duke@435:   assert(cl != this, "not my own parent");
duke@435:   cl->_parent = this;
duke@435:   CFGLoop* ch = _child;
duke@435:   if (ch == NULL) {
duke@435:     _child = cl;
duke@435:   } else {
duke@435:     while (ch->_sibling != NULL) { ch = ch->_sibling; }
duke@435:     ch->_sibling = cl;
duke@435:   }
duke@435: }
duke@435:
duke@435: //------------------------------compute_loop_depth-----------------------------
duke@435: // Store the loop depth in each CFGLoop object.
duke@435: // Recursively walk the children to do the same for them.
duke@435: void CFGLoop::compute_loop_depth(int depth) {
duke@435:   _depth = depth;
duke@435:   CFGLoop* ch = _child;
duke@435:   while (ch != NULL) {
duke@435:     ch->compute_loop_depth(depth + 1);
duke@435:     ch = ch->_sibling;
duke@435:   }
duke@435: }
duke@435:
duke@435: //------------------------------compute_freq-----------------------------------
duke@435: // Compute the frequency of each block and loop, relative to a single entry
duke@435: // into the dominating loop head.
duke@435: void CFGLoop::compute_freq() {
duke@435:   // Bottom up traversal of loop tree (visit inner loops first.)
duke@435:   // Set loop head frequency to 1.0, then transitively
duke@435:   // compute frequency for all successors in the loop,
duke@435:   // as well as for each exit edge. Inner loops are
duke@435:   // treated as single blocks with loop exit targets
duke@435:   // as the successor blocks.
duke@435:
duke@435:   // Nested loops first
duke@435:   CFGLoop* ch = _child;
duke@435:   while (ch != NULL) {
duke@435:     ch->compute_freq();
duke@435:     ch = ch->_sibling;
duke@435:   }
duke@435:   assert (_members.length() > 0, "no empty loops");
duke@435:   Block* hd = head();
duke@435:   hd->_freq = 1.0f;
duke@435:   for (int i = 0; i < _members.length(); i++) {
duke@435:     CFGElement* s = _members.at(i);
duke@435:     float freq = s->_freq;
duke@435:     if (s->is_block()) {
duke@435:       Block* b = s->as_Block();
duke@435:       for (uint j = 0; j < b->_num_succs; j++) {
duke@435:         Block* sb = b->_succs[j];
duke@435:         update_succ_freq(sb, freq * b->succ_prob(j));
duke@435:       }
duke@435:     } else {
duke@435:       CFGLoop* lp = s->as_CFGLoop();
duke@435:       assert(lp->_parent == this, "immediate child");
duke@435:       for (int k = 0; k < lp->_exits.length(); k++) {
duke@435:         Block* eb = lp->_exits.at(k).get_target();
duke@435:         float prob = lp->_exits.at(k).get_prob();
duke@435:         update_succ_freq(eb, freq * prob);
duke@435:       }
duke@435:     }
duke@435:   }
duke@435:
duke@435: #if 0
duke@435:   // Raise frequency of the loop backedge block, in an effort
duke@435:   // to keep it empty. Skip the method level "loop".
duke@435:   if (_parent != NULL) {
duke@435:     CFGElement* s = _members.at(_members.length() - 1);
duke@435:     if (s->is_block()) {
duke@435:       Block* bk = s->as_Block();
duke@435:       if (bk->_num_succs == 1 && bk->_succs[0] == hd) {
duke@435:         // almost any value >= 1.0f works
duke@435:         // FIXME: raw constant
duke@435:         bk->_freq = 1.05f;
duke@435:       }
duke@435:     }
duke@435:   }
duke@435: #endif
duke@435:
duke@435:   // For all loops other than the outer, "method" loop,
duke@435:   // sum and normalize the exit probability. The "method" loop
duke@435:   // should keep the initial exit probability of 1, so that
duke@435:   // inner blocks do not get erroneously scaled.
duke@435:   if (_depth != 0) {
duke@435:     // Total the exit probabilities for this loop.
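duke@435:     // Worked example (numbers are hypothetical): a loop with two exits of
duke@435:     // per-iteration probability 0.05 and 0.15 has exits_sum == 0.2, so the
duke@435:     // exits are renormalized to 0.25 and 0.75 of the per-entry exit
duke@435:     // frequency, and the saved _exit_prob of 0.2 later yields a trip count
duke@435:     // estimate on the order of 1/0.2 == 5 iterations per entry.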
duke@435:     float exits_sum = 0.0f;
duke@435:     for (int i = 0; i < _exits.length(); i++) {
duke@435:       exits_sum += _exits.at(i).get_prob();
duke@435:     }
duke@435:
duke@435:     // Normalize the exit probabilities. Until now, the
duke@435:     // probabilities estimate the chance of exit per
duke@435:     // single loop iteration; afterward, they estimate
duke@435:     // the probability of exit per loop entry.
duke@435:     for (int i = 0; i < _exits.length(); i++) {
duke@435:       Block* et = _exits.at(i).get_target();
duke@435:       float new_prob = _exits.at(i).get_prob() / exits_sum;
duke@435:       BlockProbPair bpp(et, new_prob);
duke@435:       _exits.at_put(i, bpp);
duke@435:     }
duke@435:
duke@435:     // Save the total, but guard against an unreasonable probability,
duke@435:     // as the value is used to estimate the loop trip count.
duke@435:     // An infinite trip count would blur relative block
duke@435:     // frequencies.
duke@435:     if (exits_sum > 1.0f) exits_sum = 1.0f;
duke@435:     if (exits_sum < PROB_MIN) exits_sum = PROB_MIN;
duke@435:     _exit_prob = exits_sum;
duke@435:   }
duke@435: }
duke@435:
duke@435: //------------------------------succ_prob-------------------------------------
duke@435: // Determine the probability of reaching successor 'i' from the receiver block.
duke@435: float Block::succ_prob(uint i) {
duke@435:   int eidx = end_idx();
duke@435:   Node *n = _nodes[eidx]; // Get ending Node
duke@435:   int op = n->is_Mach() ? n->as_Mach()->ideal_Opcode() : n->Opcode();
duke@435:
duke@435:   // Switch on branch type
duke@435:   switch( op ) {
duke@435:   case Op_CountedLoopEnd:
duke@435:   case Op_If: {
duke@435:     assert (i < 2, "just checking");
duke@435:     // Conditionals pass on only part of their frequency
duke@435:     float prob = n->as_MachIf()->_prob;
duke@435:     assert(prob >= 0.0 && prob <= 1.0, "out of range probability");
duke@435:     // If succ[i] is the FALSE branch, invert path info
duke@435:     if( _nodes[i + eidx + 1]->Opcode() == Op_IfFalse ) {
duke@435:       return 1.0f - prob; // not taken
duke@435:     } else {
duke@435:       return prob; // taken
duke@435:     }
duke@435:   }
duke@435:
duke@435:   case Op_Jump:
duke@435:     // Divide the frequency between all successors evenly
duke@435:     return 1.0f/_num_succs;
duke@435:
duke@435:   case Op_Catch: {
duke@435:     const CatchProjNode *ci = _nodes[i + eidx + 1]->as_CatchProj();
duke@435:     if (ci->_con == CatchProjNode::fall_through_index) {
duke@435:       // Fall-thru path gets the lion's share.
duke@435:       return 1.0f - PROB_UNLIKELY_MAG(5)*_num_succs;
duke@435:     } else {
duke@435:       // Presume exceptional paths are equally unlikely
duke@435:       return PROB_UNLIKELY_MAG(5);
duke@435:     }
duke@435:   }
duke@435:
duke@435:   case Op_Root:
duke@435:   case Op_Goto:
duke@435:     // Pass frequency straight thru to target
duke@435:     return 1.0f;
duke@435:
duke@435:   case Op_NeverBranch:
duke@435:     return 0.0f;
duke@435:
duke@435:   case Op_TailCall:
duke@435:   case Op_TailJump:
duke@435:   case Op_Return:
duke@435:   case Op_Halt:
duke@435:   case Op_Rethrow:
duke@435:     // Do not push out freq to root block
duke@435:     return 0.0f;
duke@435:
duke@435:   default:
duke@435:     ShouldNotReachHere();
duke@435:   }
duke@435:
duke@435:   return 0.0f;
duke@435: }
duke@435:
duke@435: //------------------------------update_succ_freq-------------------------------
duke@435: // Update the appropriate frequency associated with block 'b', a successor of
duke@435: // a block in this loop.
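duke@435: // Four cases are distinguished below: a back branch to the loop head
duke@435: // (accounted for later by scale_freq), a forward branch to another block
duke@435: // in this loop (accumulate into that block), an exit from this loop nest
duke@435: // (record the edge in _exits), and a branch into a nested loop (accumulate
duke@435: // into the child loop's entry frequency).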
duke@435: void CFGLoop::update_succ_freq(Block* b, float freq) {
duke@435:   if (b->_loop == this) {
duke@435:     if (b == head()) {
duke@435:       // back branch within the loop
duke@435:       // Do nothing now; the loop-carried frequency will be
duke@435:       // adjusted later in scale_freq().
duke@435:     } else {
duke@435:       // simple branch within the loop
duke@435:       b->_freq += freq;
duke@435:     }
duke@435:   } else if (!in_loop_nest(b)) {
duke@435:     // branch is exit from this loop
duke@435:     BlockProbPair bpp(b, freq);
duke@435:     _exits.append(bpp);
duke@435:   } else {
duke@435:     // branch into nested loop
duke@435:     CFGLoop* ch = b->_loop;
duke@435:     ch->_freq += freq;
duke@435:   }
duke@435: }
duke@435:
duke@435: //------------------------------in_loop_nest-----------------------------------
duke@435: // Determine if block b is in the receiver's loop nest.
duke@435: bool CFGLoop::in_loop_nest(Block* b) {
duke@435:   int depth = _depth;
duke@435:   CFGLoop* b_loop = b->_loop;
duke@435:   int b_depth = b_loop->_depth;
duke@435:   if (depth == b_depth) {
duke@435:     return true;
duke@435:   }
duke@435:   while (b_depth > depth) {
duke@435:     b_loop = b_loop->_parent;
duke@435:     b_depth = b_loop->_depth;
duke@435:   }
duke@435:   return b_loop == this;
duke@435: }
duke@435:
duke@435: //------------------------------scale_freq-------------------------------------
duke@435: // Scale frequency of loops and blocks by trip counts from outer loops
duke@435: // Do a top down traversal of loop tree (visit outer loops first.)
duke@435: void CFGLoop::scale_freq() {
duke@435:   float loop_freq = _freq * trip_count();
duke@435:   for (int i = 0; i < _members.length(); i++) {
duke@435:     CFGElement* s = _members.at(i);
duke@435:     s->_freq *= loop_freq;
duke@435:   }
duke@435:   CFGLoop* ch = _child;
duke@435:   while (ch != NULL) {
duke@435:     ch->scale_freq();
duke@435:     ch = ch->_sibling;
duke@435:   }
duke@435: }
duke@435:
duke@435: #ifndef PRODUCT
duke@435: //------------------------------dump_tree--------------------------------------
duke@435: void CFGLoop::dump_tree() const {
duke@435:   dump();
duke@435:   if (_child != NULL) _child->dump_tree();
duke@435:   if (_sibling != NULL) _sibling->dump_tree();
duke@435: }
duke@435:
duke@435: //------------------------------dump-------------------------------------------
duke@435: void CFGLoop::dump() const {
duke@435:   for (int i = 0; i < _depth; i++) tty->print(" ");
duke@435:   tty->print("%s: %d trip_count: %6.0f freq: %6.0f\n",
"Method" : "Loop", _id, trip_count(), _freq); duke@435: for (int i = 0; i < _depth; i++) tty->print(" "); duke@435: tty->print(" members:", _id); duke@435: int k = 0; duke@435: for (int i = 0; i < _members.length(); i++) { duke@435: if (k++ >= 6) { duke@435: tty->print("\n "); duke@435: for (int j = 0; j < _depth+1; j++) tty->print(" "); duke@435: k = 0; duke@435: } duke@435: CFGElement *s = _members.at(i); duke@435: if (s->is_block()) { duke@435: Block *b = s->as_Block(); duke@435: tty->print(" B%d(%6.3f)", b->_pre_order, b->_freq); duke@435: } else { duke@435: CFGLoop* lp = s->as_CFGLoop(); duke@435: tty->print(" L%d(%6.3f)", lp->_id, lp->_freq); duke@435: } duke@435: } duke@435: tty->print("\n"); duke@435: for (int i = 0; i < _depth; i++) tty->print(" "); duke@435: tty->print(" exits: "); duke@435: k = 0; duke@435: for (int i = 0; i < _exits.length(); i++) { duke@435: if (k++ >= 7) { duke@435: tty->print("\n "); duke@435: for (int j = 0; j < _depth+1; j++) tty->print(" "); duke@435: k = 0; duke@435: } duke@435: Block *blk = _exits.at(i).get_target(); duke@435: float prob = _exits.at(i).get_prob(); duke@435: tty->print(" ->%d@%d%%", blk->_pre_order, (int)(prob*100)); duke@435: } duke@435: tty->print("\n"); duke@435: } duke@435: #endif