1 /*
2 * Copyright 1997-2008 Sun Microsystems, Inc. All Rights Reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
20 * CA 95054 USA or visit www.sun.com if you need additional information or
21 * have any questions.
22 *
23 */
24
25 // Portions of code courtesy of Clifford Click
26
27 // Optimization - Graph Style
28
29 #include "incls/_precompiled.incl"
30 #include "incls/_gcm.cpp.incl"
31
32 // To avoid float value underflow
33 #define MIN_BLOCK_FREQUENCY 1.e-35f
34
35 //----------------------------schedule_node_into_block-------------------------
36 // Insert node n into block b. Look for projections of n and make sure they
37 // are in b also.
38 void PhaseCFG::schedule_node_into_block( Node *n, Block *b ) {
39 // Set basic block of n, Add n to b,
40 _bbs.map(n->_idx, b);
41 b->add_inst(n);
42
43 // After Matching, nearly any old Node may have projections trailing it.
44 // These are usually machine-dependent flags. In any case, they might
45 // float to another block below this one. Move them up.
46 for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
47 Node* use = n->fast_out(i);
48 if (use->is_Proj()) {
49 Block* buse = _bbs[use->_idx];
50 if (buse != b) { // In wrong block?
51 if (buse != NULL)
52 buse->find_remove(use); // Remove from wrong block
53 _bbs.map(use->_idx, b); // Re-insert in this block
54 b->add_inst(use);
55 }
56 }
57 }
58 }
59
60
61 //------------------------------schedule_pinned_nodes--------------------------
62 // Set the basic block for Nodes pinned into blocks
63 void PhaseCFG::schedule_pinned_nodes( VectorSet &visited ) {
64 // Allocate node stack of size C->unique()+8 to avoid frequent realloc
65 GrowableArray <Node *> spstack(C->unique()+8);
66 spstack.push(_root);
67 while ( spstack.is_nonempty() ) {
68 Node *n = spstack.pop();
69 if( !visited.test_set(n->_idx) ) { // Test node and flag it as visited
70 if( n->pinned() && !_bbs.lookup(n->_idx) ) { // Pinned? Nail it down!
71 Node *input = n->in(0);
72 assert( input, "pinned Node must have Control" );
73 while( !input->is_block_start() )
74 input = input->in(0);
75 Block *b = _bbs[input->_idx]; // Basic block of controlling input
76 schedule_node_into_block(n, b);
77 }
78 for( int i = n->req() - 1; i >= 0; --i ) { // For all inputs
79 if( n->in(i) != NULL )
80 spstack.push(n->in(i));
81 }
82 }
83 }
84 }
85
86 #ifdef ASSERT
87 // Assert that new input b2 is dominated by all previous inputs.
88 // Check this by by seeing that it is dominated by b1, the deepest
89 // input observed until b2.
90 static void assert_dom(Block* b1, Block* b2, Node* n, Block_Array &bbs) {
91 if (b1 == NULL) return;
92 assert(b1->_dom_depth < b2->_dom_depth, "sanity");
93 Block* tmp = b2;
94 while (tmp != b1 && tmp != NULL) {
95 tmp = tmp->_idom;
96 }
97 if (tmp != b1) {
98 // Detected an unschedulable graph. Print some nice stuff and die.
99 tty->print_cr("!!! Unschedulable graph !!!");
100 for (uint j=0; j<n->len(); j++) { // For all inputs
101 Node* inn = n->in(j); // Get input
102 if (inn == NULL) continue; // Ignore NULL, missing inputs
103 Block* inb = bbs[inn->_idx];
104 tty->print("B%d idom=B%d depth=%2d ",inb->_pre_order,
105 inb->_idom ? inb->_idom->_pre_order : 0, inb->_dom_depth);
106 inn->dump();
107 }
108 tty->print("Failing node: ");
109 n->dump();
110 assert(false, "unscheduable graph");
111 }
112 }
113 #endif
114
115 static Block* find_deepest_input(Node* n, Block_Array &bbs) {
116 // Find the last input dominated by all other inputs.
117 Block* deepb = NULL; // Deepest block so far
118 int deepb_dom_depth = 0;
119 for (uint k = 0; k < n->len(); k++) { // For all inputs
120 Node* inn = n->in(k); // Get input
121 if (inn == NULL) continue; // Ignore NULL, missing inputs
122 Block* inb = bbs[inn->_idx];
123 assert(inb != NULL, "must already have scheduled this input");
124 if (deepb_dom_depth < (int) inb->_dom_depth) {
125 // The new inb must be dominated by the previous deepb.
126 // The various inputs must be linearly ordered in the dom
127 // tree, or else there will not be a unique deepest block.
128 DEBUG_ONLY(assert_dom(deepb, inb, n, bbs));
129 deepb = inb; // Save deepest block
130 deepb_dom_depth = deepb->_dom_depth;
131 }
132 }
133 assert(deepb != NULL, "must be at least one input to n");
134 return deepb;
135 }
136
137
138 //------------------------------schedule_early---------------------------------
139 // Find the earliest Block any instruction can be placed in. Some instructions
140 // are pinned into Blocks. Unpinned instructions can appear in last block in
141 // which all their inputs occur.
142 bool PhaseCFG::schedule_early(VectorSet &visited, Node_List &roots) {
143 // Allocate stack with enough space to avoid frequent realloc
144 Node_Stack nstack(roots.Size() + 8); // (unique >> 1) + 24 from Java2D stats
145 // roots.push(_root); _root will be processed among C->top() inputs
146 roots.push(C->top());
147 visited.set(C->top()->_idx);
148
149 while (roots.size() != 0) {
150 // Use local variables nstack_top_n & nstack_top_i to cache values
151 // on stack's top.
152 Node *nstack_top_n = roots.pop();
153 uint nstack_top_i = 0;
154 //while_nstack_nonempty:
155 while (true) {
156 // Get parent node and next input's index from stack's top.
157 Node *n = nstack_top_n;
158 uint i = nstack_top_i;
159
160 if (i == 0) {
161 // Special control input processing.
162 // While I am here, go ahead and look for Nodes which are taking control
163 // from a is_block_proj Node. After I inserted RegionNodes to make proper
164 // blocks, the control at a is_block_proj more properly comes from the
165 // Region being controlled by the block_proj Node.
166 const Node *in0 = n->in(0);
167 if (in0 != NULL) { // Control-dependent?
168 const Node *p = in0->is_block_proj();
169 if (p != NULL && p != n) { // Control from a block projection?
170 // Find trailing Region
171 Block *pb = _bbs[in0->_idx]; // Block-projection already has basic block
172 uint j = 0;
173 if (pb->_num_succs != 1) { // More then 1 successor?
174 // Search for successor
175 uint max = pb->_nodes.size();
176 assert( max > 1, "" );
177 uint start = max - pb->_num_succs;
178 // Find which output path belongs to projection
179 for (j = start; j < max; j++) {
180 if( pb->_nodes[j] == in0 )
181 break;
182 }
183 assert( j < max, "must find" );
184 // Change control to match head of successor basic block
185 j -= start;
186 }
187 n->set_req(0, pb->_succs[j]->head());
188 }
189 } else { // n->in(0) == NULL
190 if (n->req() == 1) { // This guy is a constant with NO inputs?
191 n->set_req(0, _root);
192 }
193 }
194 }
195
196 // First, visit all inputs and force them to get a block. If an
197 // input is already in a block we quit following inputs (to avoid
198 // cycles). Instead we put that Node on a worklist to be handled
199 // later (since IT'S inputs may not have a block yet).
200 bool done = true; // Assume all n's inputs will be processed
201 while (i < n->len()) { // For all inputs
202 Node *in = n->in(i); // Get input
203 ++i;
204 if (in == NULL) continue; // Ignore NULL, missing inputs
205 int is_visited = visited.test_set(in->_idx);
206 if (!_bbs.lookup(in->_idx)) { // Missing block selection?
207 if (is_visited) {
208 // assert( !visited.test(in->_idx), "did not schedule early" );
209 return false;
210 }
211 nstack.push(n, i); // Save parent node and next input's index.
212 nstack_top_n = in; // Process current input now.
213 nstack_top_i = 0;
214 done = false; // Not all n's inputs processed.
215 break; // continue while_nstack_nonempty;
216 } else if (!is_visited) { // Input not yet visited?
217 roots.push(in); // Visit this guy later, using worklist
218 }
219 }
220 if (done) {
221 // All of n's inputs have been processed, complete post-processing.
222
223 // Some instructions are pinned into a block. These include Region,
224 // Phi, Start, Return, and other control-dependent instructions and
225 // any projections which depend on them.
226 if (!n->pinned()) {
227 // Set earliest legal block.
228 _bbs.map(n->_idx, find_deepest_input(n, _bbs));
229 }
230
231 if (nstack.is_empty()) {
232 // Finished all nodes on stack.
233 // Process next node on the worklist 'roots'.
234 break;
235 }
236 // Get saved parent node and next input's index.
237 nstack_top_n = nstack.node();
238 nstack_top_i = nstack.index();
239 nstack.pop();
240 } // if (done)
241 } // while (true)
242 } // while (roots.size() != 0)
243 return true;
244 }
245
246 //------------------------------dom_lca----------------------------------------
247 // Find least common ancestor in dominator tree
248 // LCA is a current notion of LCA, to be raised above 'this'.
249 // As a convenient boundary condition, return 'this' if LCA is NULL.
250 // Find the LCA of those two nodes.
251 Block* Block::dom_lca(Block* LCA) {
252 if (LCA == NULL || LCA == this) return this;
253
254 Block* anc = this;
255 while (anc->_dom_depth > LCA->_dom_depth)
256 anc = anc->_idom; // Walk up till anc is as high as LCA
257
258 while (LCA->_dom_depth > anc->_dom_depth)
259 LCA = LCA->_idom; // Walk up till LCA is as high as anc
260
261 while (LCA != anc) { // Walk both up till they are the same
262 LCA = LCA->_idom;
263 anc = anc->_idom;
264 }
265
266 return LCA;
267 }
268
269 //--------------------------raise_LCA_above_use--------------------------------
270 // We are placing a definition, and have been given a def->use edge.
271 // The definition must dominate the use, so move the LCA upward in the
272 // dominator tree to dominate the use. If the use is a phi, adjust
273 // the LCA only with the phi input paths which actually use this def.
274 static Block* raise_LCA_above_use(Block* LCA, Node* use, Node* def, Block_Array &bbs) {
275 Block* buse = bbs[use->_idx];
276 if (buse == NULL) return LCA; // Unused killing Projs have no use block
277 if (!use->is_Phi()) return buse->dom_lca(LCA);
278 uint pmax = use->req(); // Number of Phi inputs
279 // Why does not this loop just break after finding the matching input to
280 // the Phi? Well...it's like this. I do not have true def-use/use-def
281 // chains. Means I cannot distinguish, from the def-use direction, which
282 // of many use-defs lead from the same use to the same def. That is, this
283 // Phi might have several uses of the same def. Each use appears in a
284 // different predecessor block. But when I enter here, I cannot distinguish
285 // which use-def edge I should find the predecessor block for. So I find
286 // them all. Means I do a little extra work if a Phi uses the same value
287 // more than once.
288 for (uint j=1; j<pmax; j++) { // For all inputs
289 if (use->in(j) == def) { // Found matching input?
290 Block* pred = bbs[buse->pred(j)->_idx];
291 LCA = pred->dom_lca(LCA);
292 }
293 }
294 return LCA;
295 }
296
297 //----------------------------raise_LCA_above_marks----------------------------
298 // Return a new LCA that dominates LCA and any of its marked predecessors.
299 // Search all my parents up to 'early' (exclusive), looking for predecessors
300 // which are marked with the given index. Return the LCA (in the dom tree)
301 // of all marked blocks. If there are none marked, return the original
302 // LCA.
303 static Block* raise_LCA_above_marks(Block* LCA, node_idx_t mark,
304 Block* early, Block_Array &bbs) {
305 Block_List worklist;
306 worklist.push(LCA);
307 while (worklist.size() > 0) {
308 Block* mid = worklist.pop();
309 if (mid == early) continue; // stop searching here
310
311 // Test and set the visited bit.
312 if (mid->raise_LCA_visited() == mark) continue; // already visited
313
314 // Don't process the current LCA, otherwise the search may terminate early
315 if (mid != LCA && mid->raise_LCA_mark() == mark) {
316 // Raise the LCA.
317 LCA = mid->dom_lca(LCA);
318 if (LCA == early) break; // stop searching everywhere
319 assert(early->dominates(LCA), "early is high enough");
320 // Resume searching at that point, skipping intermediate levels.
321 worklist.push(LCA);
322 if (LCA == mid)
323 continue; // Don't mark as visited to avoid early termination.
324 } else {
325 // Keep searching through this block's predecessors.
326 for (uint j = 1, jmax = mid->num_preds(); j < jmax; j++) {
327 Block* mid_parent = bbs[ mid->pred(j)->_idx ];
328 worklist.push(mid_parent);
329 }
330 }
331 mid->set_raise_LCA_visited(mark);
332 }
333 return LCA;
334 }
335
336 //--------------------------memory_early_block--------------------------------
337 // This is a variation of find_deepest_input, the heart of schedule_early.
338 // Find the "early" block for a load, if we considered only memory and
339 // address inputs, that is, if other data inputs were ignored.
340 //
341 // Because a subset of edges are considered, the resulting block will
342 // be earlier (at a shallower dom_depth) than the true schedule_early
343 // point of the node. We compute this earlier block as a more permissive
344 // site for anti-dependency insertion, but only if subsume_loads is enabled.
345 static Block* memory_early_block(Node* load, Block* early, Block_Array &bbs) {
346 Node* base;
347 Node* index;
348 Node* store = load->in(MemNode::Memory);
349 load->as_Mach()->memory_inputs(base, index);
350
351 assert(base != NodeSentinel && index != NodeSentinel,
352 "unexpected base/index inputs");
353
354 Node* mem_inputs[4];
355 int mem_inputs_length = 0;
356 if (base != NULL) mem_inputs[mem_inputs_length++] = base;
357 if (index != NULL) mem_inputs[mem_inputs_length++] = index;
358 if (store != NULL) mem_inputs[mem_inputs_length++] = store;
359
360 // In the comparision below, add one to account for the control input,
361 // which may be null, but always takes up a spot in the in array.
362 if (mem_inputs_length + 1 < (int) load->req()) {
363 // This "load" has more inputs than just the memory, base and index inputs.
364 // For purposes of checking anti-dependences, we need to start
365 // from the early block of only the address portion of the instruction,
366 // and ignore other blocks that may have factored into the wider
367 // schedule_early calculation.
368 if (load->in(0) != NULL) mem_inputs[mem_inputs_length++] = load->in(0);
369
370 Block* deepb = NULL; // Deepest block so far
371 int deepb_dom_depth = 0;
372 for (int i = 0; i < mem_inputs_length; i++) {
373 Block* inb = bbs[mem_inputs[i]->_idx];
374 if (deepb_dom_depth < (int) inb->_dom_depth) {
375 // The new inb must be dominated by the previous deepb.
376 // The various inputs must be linearly ordered in the dom
377 // tree, or else there will not be a unique deepest block.
378 DEBUG_ONLY(assert_dom(deepb, inb, load, bbs));
379 deepb = inb; // Save deepest block
380 deepb_dom_depth = deepb->_dom_depth;
381 }
382 }
383 early = deepb;
384 }
385
386 return early;
387 }
388
389 //--------------------------insert_anti_dependences---------------------------
390 // A load may need to witness memory that nearby stores can overwrite.
391 // For each nearby store, either insert an "anti-dependence" edge
392 // from the load to the store, or else move LCA upward to force the
393 // load to (eventually) be scheduled in a block above the store.
394 //
395 // Do not add edges to stores on distinct control-flow paths;
396 // only add edges to stores which might interfere.
397 //
398 // Return the (updated) LCA. There will not be any possibly interfering
399 // store between the load's "early block" and the updated LCA.
400 // Any stores in the updated LCA will have new precedence edges
401 // back to the load. The caller is expected to schedule the load
402 // in the LCA, in which case the precedence edges will make LCM
403 // preserve anti-dependences. The caller may also hoist the load
404 // above the LCA, if it is not the early block.
405 Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) {
406 assert(load->needs_anti_dependence_check(), "must be a load of some sort");
407 assert(LCA != NULL, "");
408 DEBUG_ONLY(Block* LCA_orig = LCA);
409
410 // Compute the alias index. Loads and stores with different alias indices
411 // do not need anti-dependence edges.
412 uint load_alias_idx = C->get_alias_index(load->adr_type());
413 #ifdef ASSERT
414 if (load_alias_idx == Compile::AliasIdxBot && C->AliasLevel() > 0 &&
415 (PrintOpto || VerifyAliases ||
416 PrintMiscellaneous && (WizardMode || Verbose))) {
417 // Load nodes should not consume all of memory.
418 // Reporting a bottom type indicates a bug in adlc.
419 // If some particular type of node validly consumes all of memory,
420 // sharpen the preceding "if" to exclude it, so we can catch bugs here.
421 tty->print_cr("*** Possible Anti-Dependence Bug: Load consumes all of memory.");
422 load->dump(2);
423 if (VerifyAliases) assert(load_alias_idx != Compile::AliasIdxBot, "");
424 }
425 #endif
426 assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_StrComp),
427 "String compare is only known 'load' that does not conflict with any stores");
428
429 assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_StrEquals),
430 "String equals is a 'load' that does not conflict with any stores");
431
432 assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_StrIndexOf),
433 "String indexOf is a 'load' that does not conflict with any stores");
434
435 assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_AryEq),
436 "Arrays equals is a 'load' that do not conflict with any stores");
437
438 if (!C->alias_type(load_alias_idx)->is_rewritable()) {
439 // It is impossible to spoil this load by putting stores before it,
440 // because we know that the stores will never update the value
441 // which 'load' must witness.
442 return LCA;
443 }
444
445 node_idx_t load_index = load->_idx;
446
447 // Note the earliest legal placement of 'load', as determined by
448 // by the unique point in the dom tree where all memory effects
449 // and other inputs are first available. (Computed by schedule_early.)
450 // For normal loads, 'early' is the shallowest place (dom graph wise)
451 // to look for anti-deps between this load and any store.
452 Block* early = _bbs[load_index];
453
454 // If we are subsuming loads, compute an "early" block that only considers
455 // memory or address inputs. This block may be different than the
456 // schedule_early block in that it could be at an even shallower depth in the
457 // dominator tree, and allow for a broader discovery of anti-dependences.
458 if (C->subsume_loads()) {
459 early = memory_early_block(load, early, _bbs);
460 }
461
462 ResourceArea *area = Thread::current()->resource_area();
463 Node_List worklist_mem(area); // prior memory state to store
464 Node_List worklist_store(area); // possible-def to explore
465 Node_List worklist_visited(area); // visited mergemem nodes
466 Node_List non_early_stores(area); // all relevant stores outside of early
467 bool must_raise_LCA = false;
468
469 #ifdef TRACK_PHI_INPUTS
470 // %%% This extra checking fails because MergeMem nodes are not GVNed.
471 // Provide "phi_inputs" to check if every input to a PhiNode is from the
472 // original memory state. This indicates a PhiNode for which should not
473 // prevent the load from sinking. For such a block, set_raise_LCA_mark
474 // may be overly conservative.
475 // Mechanism: count inputs seen for each Phi encountered in worklist_store.
476 DEBUG_ONLY(GrowableArray<uint> phi_inputs(area, C->unique(),0,0));
477 #endif
478
479 // 'load' uses some memory state; look for users of the same state.
480 // Recurse through MergeMem nodes to the stores that use them.
481
482 // Each of these stores is a possible definition of memory
483 // that 'load' needs to use. We need to force 'load'
484 // to occur before each such store. When the store is in
485 // the same block as 'load', we insert an anti-dependence
486 // edge load->store.
487
488 // The relevant stores "nearby" the load consist of a tree rooted
489 // at initial_mem, with internal nodes of type MergeMem.
490 // Therefore, the branches visited by the worklist are of this form:
491 // initial_mem -> (MergeMem ->)* store
492 // The anti-dependence constraints apply only to the fringe of this tree.
493
494 Node* initial_mem = load->in(MemNode::Memory);
495 worklist_store.push(initial_mem);
496 worklist_visited.push(initial_mem);
497 worklist_mem.push(NULL);
498 while (worklist_store.size() > 0) {
499 // Examine a nearby store to see if it might interfere with our load.
500 Node* mem = worklist_mem.pop();
501 Node* store = worklist_store.pop();
502 uint op = store->Opcode();
503
504 // MergeMems do not directly have anti-deps.
505 // Treat them as internal nodes in a forward tree of memory states,
506 // the leaves of which are each a 'possible-def'.
507 if (store == initial_mem // root (exclusive) of tree we are searching
508 || op == Op_MergeMem // internal node of tree we are searching
509 ) {
510 mem = store; // It's not a possibly interfering store.
511 if (store == initial_mem)
512 initial_mem = NULL; // only process initial memory once
513
514 for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
515 store = mem->fast_out(i);
516 if (store->is_MergeMem()) {
517 // Be sure we don't get into combinatorial problems.
518 // (Allow phis to be repeated; they can merge two relevant states.)
519 uint j = worklist_visited.size();
520 for (; j > 0; j--) {
521 if (worklist_visited.at(j-1) == store) break;
522 }
523 if (j > 0) continue; // already on work list; do not repeat
524 worklist_visited.push(store);
525 }
526 worklist_mem.push(mem);
527 worklist_store.push(store);
528 }
529 continue;
530 }
531
532 if (op == Op_MachProj || op == Op_Catch) continue;
533 if (store->needs_anti_dependence_check()) continue; // not really a store
534
535 // Compute the alias index. Loads and stores with different alias
536 // indices do not need anti-dependence edges. Wide MemBar's are
537 // anti-dependent on everything (except immutable memories).
538 const TypePtr* adr_type = store->adr_type();
539 if (!C->can_alias(adr_type, load_alias_idx)) continue;
540
541 // Most slow-path runtime calls do NOT modify Java memory, but
542 // they can block and so write Raw memory.
543 if (store->is_Mach()) {
544 MachNode* mstore = store->as_Mach();
545 if (load_alias_idx != Compile::AliasIdxRaw) {
546 // Check for call into the runtime using the Java calling
547 // convention (and from there into a wrapper); it has no
548 // _method. Can't do this optimization for Native calls because
549 // they CAN write to Java memory.
550 if (mstore->ideal_Opcode() == Op_CallStaticJava) {
551 assert(mstore->is_MachSafePoint(), "");
552 MachSafePointNode* ms = (MachSafePointNode*) mstore;
553 assert(ms->is_MachCallJava(), "");
554 MachCallJavaNode* mcj = (MachCallJavaNode*) ms;
555 if (mcj->_method == NULL) {
556 // These runtime calls do not write to Java visible memory
557 // (other than Raw) and so do not require anti-dependence edges.
558 continue;
559 }
560 }
561 // Same for SafePoints: they read/write Raw but only read otherwise.
562 // This is basically a workaround for SafePoints only defining control
563 // instead of control + memory.
564 if (mstore->ideal_Opcode() == Op_SafePoint)
565 continue;
566 } else {
567 // Some raw memory, such as the load of "top" at an allocation,
568 // can be control dependent on the previous safepoint. See
569 // comments in GraphKit::allocate_heap() about control input.
570 // Inserting an anti-dep between such a safepoint and a use
571 // creates a cycle, and will cause a subsequent failure in
572 // local scheduling. (BugId 4919904)
573 // (%%% How can a control input be a safepoint and not a projection??)
574 if (mstore->ideal_Opcode() == Op_SafePoint && load->in(0) == mstore)
575 continue;
576 }
577 }
578
579 // Identify a block that the current load must be above,
580 // or else observe that 'store' is all the way up in the
581 // earliest legal block for 'load'. In the latter case,
582 // immediately insert an anti-dependence edge.
583 Block* store_block = _bbs[store->_idx];
584 assert(store_block != NULL, "unused killing projections skipped above");
585
586 if (store->is_Phi()) {
587 // 'load' uses memory which is one (or more) of the Phi's inputs.
588 // It must be scheduled not before the Phi, but rather before
589 // each of the relevant Phi inputs.
590 //
591 // Instead of finding the LCA of all inputs to a Phi that match 'mem',
592 // we mark each corresponding predecessor block and do a combined
593 // hoisting operation later (raise_LCA_above_marks).
594 //
595 // Do not assert(store_block != early, "Phi merging memory after access")
596 // PhiNode may be at start of block 'early' with backedge to 'early'
597 DEBUG_ONLY(bool found_match = false);
598 for (uint j = PhiNode::Input, jmax = store->req(); j < jmax; j++) {
599 if (store->in(j) == mem) { // Found matching input?
600 DEBUG_ONLY(found_match = true);
601 Block* pred_block = _bbs[store_block->pred(j)->_idx];
602 if (pred_block != early) {
603 // If any predecessor of the Phi matches the load's "early block",
604 // we do not need a precedence edge between the Phi and 'load'
605 // since the load will be forced into a block preceeding the Phi.
606 pred_block->set_raise_LCA_mark(load_index);
607 assert(!LCA_orig->dominates(pred_block) ||
608 early->dominates(pred_block), "early is high enough");
609 must_raise_LCA = true;
610 }
611 }
612 }
613 assert(found_match, "no worklist bug");
614 #ifdef TRACK_PHI_INPUTS
615 #ifdef ASSERT
616 // This assert asks about correct handling of PhiNodes, which may not
617 // have all input edges directly from 'mem'. See BugId 4621264
618 int num_mem_inputs = phi_inputs.at_grow(store->_idx,0) + 1;
619 // Increment by exactly one even if there are multiple copies of 'mem'
620 // coming into the phi, because we will run this block several times
621 // if there are several copies of 'mem'. (That's how DU iterators work.)
622 phi_inputs.at_put(store->_idx, num_mem_inputs);
623 assert(PhiNode::Input + num_mem_inputs < store->req(),
624 "Expect at least one phi input will not be from original memory state");
625 #endif //ASSERT
626 #endif //TRACK_PHI_INPUTS
627 } else if (store_block != early) {
628 // 'store' is between the current LCA and earliest possible block.
629 // Label its block, and decide later on how to raise the LCA
630 // to include the effect on LCA of this store.
631 // If this store's block gets chosen as the raised LCA, we
632 // will find him on the non_early_stores list and stick him
633 // with a precedence edge.
634 // (But, don't bother if LCA is already raised all the way.)
635 if (LCA != early) {
636 store_block->set_raise_LCA_mark(load_index);
637 must_raise_LCA = true;
638 non_early_stores.push(store);
639 }
640 } else {
641 // Found a possibly-interfering store in the load's 'early' block.
642 // This means 'load' cannot sink at all in the dominator tree.
643 // Add an anti-dep edge, and squeeze 'load' into the highest block.
644 assert(store != load->in(0), "dependence cycle found");
645 if (verify) {
646 assert(store->find_edge(load) != -1, "missing precedence edge");
647 } else {
648 store->add_prec(load);
649 }
650 LCA = early;
651 // This turns off the process of gathering non_early_stores.
652 }
653 }
654 // (Worklist is now empty; all nearby stores have been visited.)
655
656 // Finished if 'load' must be scheduled in its 'early' block.
657 // If we found any stores there, they have already been given
658 // precedence edges.
659 if (LCA == early) return LCA;
660
661 // We get here only if there are no possibly-interfering stores
662 // in the load's 'early' block. Move LCA up above all predecessors
663 // which contain stores we have noted.
664 //
665 // The raised LCA block can be a home to such interfering stores,
666 // but its predecessors must not contain any such stores.
667 //
668 // The raised LCA will be a lower bound for placing the load,
669 // preventing the load from sinking past any block containing
670 // a store that may invalidate the memory state required by 'load'.
671 if (must_raise_LCA)
672 LCA = raise_LCA_above_marks(LCA, load->_idx, early, _bbs);
673 if (LCA == early) return LCA;
674
675 // Insert anti-dependence edges from 'load' to each store
676 // in the non-early LCA block.
677 // Mine the non_early_stores list for such stores.
678 if (LCA->raise_LCA_mark() == load_index) {
679 while (non_early_stores.size() > 0) {
680 Node* store = non_early_stores.pop();
681 Block* store_block = _bbs[store->_idx];
682 if (store_block == LCA) {
683 // add anti_dependence from store to load in its own block
684 assert(store != load->in(0), "dependence cycle found");
685 if (verify) {
686 assert(store->find_edge(load) != -1, "missing precedence edge");
687 } else {
688 store->add_prec(load);
689 }
690 } else {
691 assert(store_block->raise_LCA_mark() == load_index, "block was marked");
692 // Any other stores we found must be either inside the new LCA
693 // or else outside the original LCA. In the latter case, they
694 // did not interfere with any use of 'load'.
695 assert(LCA->dominates(store_block)
696 || !LCA_orig->dominates(store_block), "no stray stores");
697 }
698 }
699 }
700
701 // Return the highest block containing stores; any stores
702 // within that block have been given anti-dependence edges.
703 return LCA;
704 }
705
706 // This class is used to iterate backwards over the nodes in the graph.
707
708 class Node_Backward_Iterator {
709
710 private:
711 Node_Backward_Iterator();
712
713 public:
714 // Constructor for the iterator
715 Node_Backward_Iterator(Node *root, VectorSet &visited, Node_List &stack, Block_Array &bbs);
716
717 // Postincrement operator to iterate over the nodes
718 Node *next();
719
720 private:
721 VectorSet &_visited;
722 Node_List &_stack;
723 Block_Array &_bbs;
724 };
725
726 // Constructor for the Node_Backward_Iterator
727 Node_Backward_Iterator::Node_Backward_Iterator( Node *root, VectorSet &visited, Node_List &stack, Block_Array &bbs )
728 : _visited(visited), _stack(stack), _bbs(bbs) {
729 // The stack should contain exactly the root
730 stack.clear();
731 stack.push(root);
732
733 // Clear the visited bits
734 visited.Clear();
735 }
736
737 // Iterator for the Node_Backward_Iterator
738 Node *Node_Backward_Iterator::next() {
739
740 // If the _stack is empty, then just return NULL: finished.
741 if ( !_stack.size() )
742 return NULL;
743
744 // '_stack' is emulating a real _stack. The 'visit-all-users' loop has been
745 // made stateless, so I do not need to record the index 'i' on my _stack.
746 // Instead I visit all users each time, scanning for unvisited users.
747 // I visit unvisited not-anti-dependence users first, then anti-dependent
748 // children next.
749 Node *self = _stack.pop();
750
751 // I cycle here when I am entering a deeper level of recursion.
752 // The key variable 'self' was set prior to jumping here.
753 while( 1 ) {
754
755 _visited.set(self->_idx);
756
757 // Now schedule all uses as late as possible.
758 uint src = self->is_Proj() ? self->in(0)->_idx : self->_idx;
759 uint src_rpo = _bbs[src]->_rpo;
760
761 // Schedule all nodes in a post-order visit
762 Node *unvisited = NULL; // Unvisited anti-dependent Node, if any
763
764 // Scan for unvisited nodes
765 for (DUIterator_Fast imax, i = self->fast_outs(imax); i < imax; i++) {
766 // For all uses, schedule late
767 Node* n = self->fast_out(i); // Use
768
769 // Skip already visited children
770 if ( _visited.test(n->_idx) )
771 continue;
772
773 // do not traverse backward control edges
774 Node *use = n->is_Proj() ? n->in(0) : n;
775 uint use_rpo = _bbs[use->_idx]->_rpo;
776
777 if ( use_rpo < src_rpo )
778 continue;
779
780 // Phi nodes always precede uses in a basic block
781 if ( use_rpo == src_rpo && use->is_Phi() )
782 continue;
783
784 unvisited = n; // Found unvisited
785
786 // Check for possible-anti-dependent
787 if( !n->needs_anti_dependence_check() )
788 break; // Not visited, not anti-dep; schedule it NOW
789 }
790
791 // Did I find an unvisited not-anti-dependent Node?
792 if ( !unvisited )
793 break; // All done with children; post-visit 'self'
794
795 // Visit the unvisited Node. Contains the obvious push to
796 // indicate I'm entering a deeper level of recursion. I push the
797 // old state onto the _stack and set a new state and loop (recurse).
798 _stack.push(self);
799 self = unvisited;
800 } // End recursion loop
801
802 return self;
803 }
804
805 //------------------------------ComputeLatenciesBackwards----------------------
806 // Compute the latency of all the instructions.
807 void PhaseCFG::ComputeLatenciesBackwards(VectorSet &visited, Node_List &stack) {
808 #ifndef PRODUCT
809 if (trace_opto_pipelining())
810 tty->print("\n#---- ComputeLatenciesBackwards ----\n");
811 #endif
812
813 Node_Backward_Iterator iter((Node *)_root, visited, stack, _bbs);
814 Node *n;
815
816 // Walk over all the nodes from last to first
817 while (n = iter.next()) {
818 // Set the latency for the definitions of this instruction
819 partial_latency_of_defs(n);
820 }
821 } // end ComputeLatenciesBackwards
822
823 //------------------------------partial_latency_of_defs------------------------
824 // Compute the latency impact of this node on all defs. This computes
825 // a number that increases as we approach the beginning of the routine.
826 void PhaseCFG::partial_latency_of_defs(Node *n) {
827 // Set the latency for this instruction
828 #ifndef PRODUCT
829 if (trace_opto_pipelining()) {
830 tty->print("# latency_to_inputs: node_latency[%d] = %d for node",
831 n->_idx, _node_latency.at_grow(n->_idx));
832 dump();
833 }
834 #endif
835
836 if (n->is_Proj())
837 n = n->in(0);
838
839 if (n->is_Root())
840 return;
841
842 uint nlen = n->len();
843 uint use_latency = _node_latency.at_grow(n->_idx);
844 uint use_pre_order = _bbs[n->_idx]->_pre_order;
845
846 for ( uint j=0; j<nlen; j++ ) {
847 Node *def = n->in(j);
848
849 if (!def || def == n)
850 continue;
851
852 // Walk backwards thru projections
853 if (def->is_Proj())
854 def = def->in(0);
855
856 #ifndef PRODUCT
857 if (trace_opto_pipelining()) {
858 tty->print("# in(%2d): ", j);
859 def->dump();
860 }
861 #endif
862
863 // If the defining block is not known, assume it is ok
864 Block *def_block = _bbs[def->_idx];
865 uint def_pre_order = def_block ? def_block->_pre_order : 0;
866
867 if ( (use_pre_order < def_pre_order) ||
868 (use_pre_order == def_pre_order && n->is_Phi()) )
869 continue;
870
871 uint delta_latency = n->latency(j);
872 uint current_latency = delta_latency + use_latency;
873
874 if (_node_latency.at_grow(def->_idx) < current_latency) {
875 _node_latency.at_put_grow(def->_idx, current_latency);
876 }
877
878 #ifndef PRODUCT
879 if (trace_opto_pipelining()) {
880 tty->print_cr("# %d + edge_latency(%d) == %d -> %d, node_latency[%d] = %d",
881 use_latency, j, delta_latency, current_latency, def->_idx,
882 _node_latency.at_grow(def->_idx));
883 }
884 #endif
885 }
886 }
887
888 //------------------------------latency_from_use-------------------------------
889 // Compute the latency of a specific use
890 int PhaseCFG::latency_from_use(Node *n, const Node *def, Node *use) {
891 // If self-reference, return no latency
892 if (use == n || use->is_Root())
893 return 0;
894
895 uint def_pre_order = _bbs[def->_idx]->_pre_order;
896 uint latency = 0;
897
898 // If the use is not a projection, then it is simple...
899 if (!use->is_Proj()) {
900 #ifndef PRODUCT
901 if (trace_opto_pipelining()) {
902 tty->print("# out(): ");
903 use->dump();
904 }
905 #endif
906
907 uint use_pre_order = _bbs[use->_idx]->_pre_order;
908
909 if (use_pre_order < def_pre_order)
910 return 0;
911
912 if (use_pre_order == def_pre_order && use->is_Phi())
913 return 0;
914
915 uint nlen = use->len();
916 uint nl = _node_latency.at_grow(use->_idx);
917
918 for ( uint j=0; j<nlen; j++ ) {
919 if (use->in(j) == n) {
920 // Change this if we want local latencies
921 uint ul = use->latency(j);
922 uint l = ul + nl;
923 if (latency < l) latency = l;
924 #ifndef PRODUCT
925 if (trace_opto_pipelining()) {
926 tty->print_cr("# %d + edge_latency(%d) == %d -> %d, latency = %d",
927 nl, j, ul, l, latency);
928 }
929 #endif
930 }
931 }
932 } else {
933 // This is a projection, just grab the latency of the use(s)
934 for (DUIterator_Fast jmax, j = use->fast_outs(jmax); j < jmax; j++) {
935 uint l = latency_from_use(use, def, use->fast_out(j));
936 if (latency < l) latency = l;
937 }
938 }
939
940 return latency;
941 }
942
943 //------------------------------latency_from_uses------------------------------
944 // Compute the latency of this instruction relative to all of it's uses.
945 // This computes a number that increases as we approach the beginning of the
946 // routine.
947 void PhaseCFG::latency_from_uses(Node *n) {
948 // Set the latency for this instruction
949 #ifndef PRODUCT
950 if (trace_opto_pipelining()) {
951 tty->print("# latency_from_outputs: node_latency[%d] = %d for node",
952 n->_idx, _node_latency.at_grow(n->_idx));
953 dump();
954 }
955 #endif
956 uint latency=0;
957 const Node *def = n->is_Proj() ? n->in(0): n;
958
959 for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
960 uint l = latency_from_use(n, def, n->fast_out(i));
961
962 if (latency < l) latency = l;
963 }
964
965 _node_latency.at_put_grow(n->_idx, latency);
966 }
967
968 //------------------------------hoist_to_cheaper_block-------------------------
969 // Pick a block for node self, between early and LCA, that is a cheaper
970 // alternative to LCA.
971 Block* PhaseCFG::hoist_to_cheaper_block(Block* LCA, Block* early, Node* self) {
972 const double delta = 1+PROB_UNLIKELY_MAG(4);
973 Block* least = LCA;
974 double least_freq = least->_freq;
975 uint target = _node_latency.at_grow(self->_idx);
976 uint start_latency = _node_latency.at_grow(LCA->_nodes[0]->_idx);
977 uint end_latency = _node_latency.at_grow(LCA->_nodes[LCA->end_idx()]->_idx);
978 bool in_latency = (target <= start_latency);
979 const Block* root_block = _bbs[_root->_idx];
980
981 // Turn off latency scheduling if scheduling is just plain off
982 if (!C->do_scheduling())
983 in_latency = true;
984
985 // Do not hoist (to cover latency) instructions which target a
986 // single register. Hoisting stretches the live range of the
987 // single register and may force spilling.
988 MachNode* mach = self->is_Mach() ? self->as_Mach() : NULL;
989 if (mach && mach->out_RegMask().is_bound1() && mach->out_RegMask().is_NotEmpty())
990 in_latency = true;
991
992 #ifndef PRODUCT
993 if (trace_opto_pipelining()) {
994 tty->print("# Find cheaper block for latency %d: ",
995 _node_latency.at_grow(self->_idx));
996 self->dump();
997 tty->print_cr("# B%d: start latency for [%4d]=%d, end latency for [%4d]=%d, freq=%g",
998 LCA->_pre_order,
999 LCA->_nodes[0]->_idx,
1000 start_latency,
1001 LCA->_nodes[LCA->end_idx()]->_idx,
1002 end_latency,
1003 least_freq);
1004 }
1005 #endif
1006
1007 // Walk up the dominator tree from LCA (Lowest common ancestor) to
1008 // the earliest legal location. Capture the least execution frequency.
1009 while (LCA != early) {
1010 LCA = LCA->_idom; // Follow up the dominator tree
1011
1012 if (LCA == NULL) {
1013 // Bailout without retry
1014 C->record_method_not_compilable("late schedule failed: LCA == NULL");
1015 return least;
1016 }
1017
1018 // Don't hoist machine instructions to the root basic block
1019 if (mach && LCA == root_block)
1020 break;
1021
1022 uint start_lat = _node_latency.at_grow(LCA->_nodes[0]->_idx);
1023 uint end_idx = LCA->end_idx();
1024 uint end_lat = _node_latency.at_grow(LCA->_nodes[end_idx]->_idx);
1025 double LCA_freq = LCA->_freq;
1026 #ifndef PRODUCT
1027 if (trace_opto_pipelining()) {
1028 tty->print_cr("# B%d: start latency for [%4d]=%d, end latency for [%4d]=%d, freq=%g",
1029 LCA->_pre_order, LCA->_nodes[0]->_idx, start_lat, end_idx, end_lat, LCA_freq);
1030 }
1031 #endif
1032 if (LCA_freq < least_freq || // Better Frequency
1033 ( !in_latency && // No block containing latency
1034 LCA_freq < least_freq * delta && // No worse frequency
1035 target >= end_lat && // within latency range
1036 !self->is_iteratively_computed() ) // But don't hoist IV increments
1037 // because they may end up above other uses of their phi forcing
1038 // their result register to be different from their input.
1039 ) {
1040 least = LCA; // Found cheaper block
1041 least_freq = LCA_freq;
1042 start_latency = start_lat;
1043 end_latency = end_lat;
1044 if (target <= start_lat)
1045 in_latency = true;
1046 }
1047 }
1048
1049 #ifndef PRODUCT
1050 if (trace_opto_pipelining()) {
1051 tty->print_cr("# Choose block B%d with start latency=%d and freq=%g",
1052 least->_pre_order, start_latency, least_freq);
1053 }
1054 #endif
1055
1056 // See if the latency needs to be updated
1057 if (target < end_latency) {
1058 #ifndef PRODUCT
1059 if (trace_opto_pipelining()) {
1060 tty->print_cr("# Change latency for [%4d] from %d to %d", self->_idx, target, end_latency);
1061 }
1062 #endif
1063 _node_latency.at_put_grow(self->_idx, end_latency);
1064 partial_latency_of_defs(self);
1065 }
1066
1067 return least;
1068 }
1069
1070
1071 //------------------------------schedule_late-----------------------------------
1072 // Now schedule all codes as LATE as possible. This is the LCA in the
1073 // dominator tree of all USES of a value. Pick the block with the least
1074 // loop nesting depth that is lowest in the dominator tree.
1075 extern const char must_clone[];
1076 void PhaseCFG::schedule_late(VectorSet &visited, Node_List &stack) {
1077 #ifndef PRODUCT
1078 if (trace_opto_pipelining())
1079 tty->print("\n#---- schedule_late ----\n");
1080 #endif
1081
1082 Node_Backward_Iterator iter((Node *)_root, visited, stack, _bbs);
1083 Node *self;
1084
1085 // Walk over all the nodes from last to first
1086 while (self = iter.next()) {
1087 Block* early = _bbs[self->_idx]; // Earliest legal placement
1088
1089 if (self->is_top()) {
1090 // Top node goes in bb #2 with other constants.
1091 // It must be special-cased, because it has no out edges.
1092 early->add_inst(self);
1093 continue;
1094 }
1095
1096 // No uses, just terminate
1097 if (self->outcnt() == 0) {
1098 assert(self->Opcode() == Op_MachProj, "sanity");
1099 continue; // Must be a dead machine projection
1100 }
1101
1102 // If node is pinned in the block, then no scheduling can be done.
1103 if( self->pinned() ) // Pinned in block?
1104 continue;
1105
1106 MachNode* mach = self->is_Mach() ? self->as_Mach() : NULL;
1107 if (mach) {
1108 switch (mach->ideal_Opcode()) {
1109 case Op_CreateEx:
1110 // Don't move exception creation
1111 early->add_inst(self);
1112 continue;
1113 break;
1114 case Op_CheckCastPP:
1115 // Don't move CheckCastPP nodes away from their input, if the input
1116 // is a rawptr (5071820).
1117 Node *def = self->in(1);
1118 if (def != NULL && def->bottom_type()->base() == Type::RawPtr) {
1119 early->add_inst(self);
1120 continue;
1121 }
1122 break;
1123 }
1124 }
1125
1126 // Gather LCA of all uses
1127 Block *LCA = NULL;
1128 {
1129 for (DUIterator_Fast imax, i = self->fast_outs(imax); i < imax; i++) {
1130 // For all uses, find LCA
1131 Node* use = self->fast_out(i);
1132 LCA = raise_LCA_above_use(LCA, use, self, _bbs);
1133 }
1134 } // (Hide defs of imax, i from rest of block.)
1135
1136 // Place temps in the block of their use. This isn't a
1137 // requirement for correctness but it reduces useless
1138 // interference between temps and other nodes.
1139 if (mach != NULL && mach->is_MachTemp()) {
1140 _bbs.map(self->_idx, LCA);
1141 LCA->add_inst(self);
1142 continue;
1143 }
1144
1145 // Check if 'self' could be anti-dependent on memory
1146 if (self->needs_anti_dependence_check()) {
1147 // Hoist LCA above possible-defs and insert anti-dependences to
1148 // defs in new LCA block.
1149 LCA = insert_anti_dependences(LCA, self);
1150 }
1151
1152 if (early->_dom_depth > LCA->_dom_depth) {
1153 // Somehow the LCA has moved above the earliest legal point.
1154 // (One way this can happen is via memory_early_block.)
1155 if (C->subsume_loads() == true && !C->failing()) {
1156 // Retry with subsume_loads == false
1157 // If this is the first failure, the sentinel string will "stick"
1158 // to the Compile object, and the C2Compiler will see it and retry.
1159 C->record_failure(C2Compiler::retry_no_subsuming_loads());
1160 } else {
1161 // Bailout without retry when (early->_dom_depth > LCA->_dom_depth)
1162 C->record_method_not_compilable("late schedule failed: incorrect graph");
1163 }
1164 return;
1165 }
1166
1167 // If there is no opportunity to hoist, then we're done.
1168 bool try_to_hoist = (LCA != early);
1169
1170 // Must clone guys stay next to use; no hoisting allowed.
1171 // Also cannot hoist guys that alter memory or are otherwise not
1172 // allocatable (hoisting can make a value live longer, leading to
1173 // anti and output dependency problems which are normally resolved
1174 // by the register allocator giving everyone a different register).
1175 if (mach != NULL && must_clone[mach->ideal_Opcode()])
1176 try_to_hoist = false;
1177
1178 Block* late = NULL;
1179 if (try_to_hoist) {
1180 // Now find the block with the least execution frequency.
1181 // Start at the latest schedule and work up to the earliest schedule
1182 // in the dominator tree. Thus the Node will dominate all its uses.
1183 late = hoist_to_cheaper_block(LCA, early, self);
1184 } else {
1185 // Just use the LCA of the uses.
1186 late = LCA;
1187 }
1188
1189 // Put the node into target block
1190 schedule_node_into_block(self, late);
1191
1192 #ifdef ASSERT
1193 if (self->needs_anti_dependence_check()) {
1194 // since precedence edges are only inserted when we're sure they
1195 // are needed make sure that after placement in a block we don't
1196 // need any new precedence edges.
1197 verify_anti_dependences(late, self);
1198 }
1199 #endif
1200 } // Loop until all nodes have been visited
1201
1202 } // end ScheduleLate
1203
1204 //------------------------------GlobalCodeMotion-------------------------------
1205 void PhaseCFG::GlobalCodeMotion( Matcher &matcher, uint unique, Node_List &proj_list ) {
1206 ResourceMark rm;
1207
1208 #ifndef PRODUCT
1209 if (trace_opto_pipelining()) {
1210 tty->print("\n---- Start GlobalCodeMotion ----\n");
1211 }
1212 #endif
1213
1214 // Initialize the bbs.map for things on the proj_list
1215 uint i;
1216 for( i=0; i < proj_list.size(); i++ )
1217 _bbs.map(proj_list[i]->_idx, NULL);
1218
1219 // Set the basic block for Nodes pinned into blocks
1220 Arena *a = Thread::current()->resource_area();
1221 VectorSet visited(a);
1222 schedule_pinned_nodes( visited );
1223
1224 // Find the earliest Block any instruction can be placed in. Some
1225 // instructions are pinned into Blocks. Unpinned instructions can
1226 // appear in last block in which all their inputs occur.
1227 visited.Clear();
1228 Node_List stack(a);
1229 stack.map( (unique >> 1) + 16, NULL); // Pre-grow the list
1230 if (!schedule_early(visited, stack)) {
1231 // Bailout without retry
1232 C->record_method_not_compilable("early schedule failed");
1233 return;
1234 }
1235
1236 // Build Def-Use edges.
1237 proj_list.push(_root); // Add real root as another root
1238 proj_list.pop();
1239
1240 // Compute the latency information (via backwards walk) for all the
1241 // instructions in the graph
1242 GrowableArray<uint> node_latency;
1243 _node_latency = node_latency;
1244
1245 if( C->do_scheduling() )
1246 ComputeLatenciesBackwards(visited, stack);
1247
1248 // Now schedule all codes as LATE as possible. This is the LCA in the
1249 // dominator tree of all USES of a value. Pick the block with the least
1250 // loop nesting depth that is lowest in the dominator tree.
1251 // ( visited.Clear() called in schedule_late()->Node_Backward_Iterator() )
1252 schedule_late(visited, stack);
1253 if( C->failing() ) {
1254 // schedule_late fails only when graph is incorrect.
1255 assert(!VerifyGraphEdges, "verification should have failed");
1256 return;
1257 }
1258
1259 unique = C->unique();
1260
1261 #ifndef PRODUCT
1262 if (trace_opto_pipelining()) {
1263 tty->print("\n---- Detect implicit null checks ----\n");
1264 }
1265 #endif
1266
1267 // Detect implicit-null-check opportunities. Basically, find NULL checks
1268 // with suitable memory ops nearby. Use the memory op to do the NULL check.
1269 // I can generate a memory op if there is not one nearby.
1270 if (C->is_method_compilation()) {
1271 // Don't do it for natives, adapters, or runtime stubs
1272 int allowed_reasons = 0;
1273 // ...and don't do it when there have been too many traps, globally.
1274 for (int reason = (int)Deoptimization::Reason_none+1;
1275 reason < Compile::trapHistLength; reason++) {
1276 assert(reason < BitsPerInt, "recode bit map");
1277 if (!C->too_many_traps((Deoptimization::DeoptReason) reason))
1278 allowed_reasons |= nth_bit(reason);
1279 }
1280 // By reversing the loop direction we get a very minor gain on mpegaudio.
1281 // Feel free to revert to a forward loop for clarity.
1282 // for( int i=0; i < (int)matcher._null_check_tests.size(); i+=2 ) {
1283 for( int i= matcher._null_check_tests.size()-2; i>=0; i-=2 ) {
1284 Node *proj = matcher._null_check_tests[i ];
1285 Node *val = matcher._null_check_tests[i+1];
1286 _bbs[proj->_idx]->implicit_null_check(this, proj, val, allowed_reasons);
1287 // The implicit_null_check will only perform the transformation
1288 // if the null branch is truly uncommon, *and* it leads to an
1289 // uncommon trap. Combined with the too_many_traps guards
1290 // above, this prevents SEGV storms reported in 6366351,
1291 // by recompiling offending methods without this optimization.
1292 }
1293 }
1294
1295 #ifndef PRODUCT
1296 if (trace_opto_pipelining()) {
1297 tty->print("\n---- Start Local Scheduling ----\n");
1298 }
1299 #endif
1300
1301 // Schedule locally. Right now a simple topological sort.
1302 // Later, do a real latency aware scheduler.
1303 int *ready_cnt = NEW_RESOURCE_ARRAY(int,C->unique());
1304 memset( ready_cnt, -1, C->unique() * sizeof(int) );
1305 visited.Clear();
1306 for (i = 0; i < _num_blocks; i++) {
1307 if (!_blocks[i]->schedule_local(this, matcher, ready_cnt, visited)) {
1308 if (!C->failure_reason_is(C2Compiler::retry_no_subsuming_loads())) {
1309 C->record_method_not_compilable("local schedule failed");
1310 }
1311 return;
1312 }
1313 }
1314
1315 // If we inserted any instructions between a Call and his CatchNode,
1316 // clone the instructions on all paths below the Catch.
1317 for( i=0; i < _num_blocks; i++ )
1318 _blocks[i]->call_catch_cleanup(_bbs);
1319
1320 #ifndef PRODUCT
1321 if (trace_opto_pipelining()) {
1322 tty->print("\n---- After GlobalCodeMotion ----\n");
1323 for (uint i = 0; i < _num_blocks; i++) {
1324 _blocks[i]->dump();
1325 }
1326 }
1327 #endif
1328 }
1329
1330
1331 //------------------------------Estimate_Block_Frequency-----------------------
1332 // Estimate block frequencies based on IfNode probabilities.
1333 void PhaseCFG::Estimate_Block_Frequency() {
1334
1335 // Force conditional branches leading to uncommon traps to be unlikely,
1336 // not because we get to the uncommon_trap with less relative frequency,
1337 // but because an uncommon_trap typically causes a deopt, so we only get
1338 // there once.
1339 if (C->do_freq_based_layout()) {
1340 Block_List worklist;
1341 Block* root_blk = _blocks[0];
1342 for (uint i = 1; i < root_blk->num_preds(); i++) {
1343 Block *pb = _bbs[root_blk->pred(i)->_idx];
1344 if (pb->has_uncommon_code()) {
1345 worklist.push(pb);
1346 }
1347 }
1348 while (worklist.size() > 0) {
1349 Block* uct = worklist.pop();
1350 if (uct == _broot) continue;
1351 for (uint i = 1; i < uct->num_preds(); i++) {
1352 Block *pb = _bbs[uct->pred(i)->_idx];
1353 if (pb->_num_succs == 1) {
1354 worklist.push(pb);
1355 } else if (pb->num_fall_throughs() == 2) {
1356 pb->update_uncommon_branch(uct);
1357 }
1358 }
1359 }
1360 }
1361
1362 // Create the loop tree and calculate loop depth.
1363 _root_loop = create_loop_tree();
1364 _root_loop->compute_loop_depth(0);
1365
1366 // Compute block frequency of each block, relative to a single loop entry.
1367 _root_loop->compute_freq();
1368
1369 // Adjust all frequencies to be relative to a single method entry
1370 _root_loop->_freq = 1.0;
1371 _root_loop->scale_freq();
1372
1373 // force paths ending at uncommon traps to be infrequent
1374 if (!C->do_freq_based_layout()) {
1375 Block_List worklist;
1376 Block* root_blk = _blocks[0];
1377 for (uint i = 1; i < root_blk->num_preds(); i++) {
1378 Block *pb = _bbs[root_blk->pred(i)->_idx];
1379 if (pb->has_uncommon_code()) {
1380 worklist.push(pb);
1381 }
1382 }
1383 while (worklist.size() > 0) {
1384 Block* uct = worklist.pop();
1385 uct->_freq = PROB_MIN;
1386 for (uint i = 1; i < uct->num_preds(); i++) {
1387 Block *pb = _bbs[uct->pred(i)->_idx];
1388 if (pb->_num_succs == 1 && pb->_freq > PROB_MIN) {
1389 worklist.push(pb);
1390 }
1391 }
1392 }
1393 }
1394
1395 #ifdef ASSERT
1396 for (uint i = 0; i < _num_blocks; i++ ) {
1397 Block *b = _blocks[i];
1398 assert(b->_freq >= MIN_BLOCK_FREQUENCY, "Register Allocator requiers meaningful block frequency");
1399 }
1400 #endif
1401
1402 #ifndef PRODUCT
1403 if (PrintCFGBlockFreq) {
1404 tty->print_cr("CFG Block Frequencies");
1405 _root_loop->dump_tree();
1406 if (Verbose) {
1407 tty->print_cr("PhaseCFG dump");
1408 dump();
1409 tty->print_cr("Node dump");
1410 _root->dump(99999);
1411 }
1412 }
1413 #endif
1414 }
1415
1416 //----------------------------create_loop_tree--------------------------------
1417 // Create a loop tree from the CFG
1418 CFGLoop* PhaseCFG::create_loop_tree() {
1419
1420 #ifdef ASSERT
1421 assert( _blocks[0] == _broot, "" );
1422 for (uint i = 0; i < _num_blocks; i++ ) {
1423 Block *b = _blocks[i];
1424 // Check that _loop field are clear...we could clear them if not.
1425 assert(b->_loop == NULL, "clear _loop expected");
1426 // Sanity check that the RPO numbering is reflected in the _blocks array.
1427 // It doesn't have to be for the loop tree to be built, but if it is not,
1428 // then the blocks have been reordered since dom graph building...which
1429 // may question the RPO numbering
1430 assert(b->_rpo == i, "unexpected reverse post order number");
1431 }
1432 #endif
1433
1434 int idct = 0;
1435 CFGLoop* root_loop = new CFGLoop(idct++);
1436
1437 Block_List worklist;
1438
1439 // Assign blocks to loops
1440 for(uint i = _num_blocks - 1; i > 0; i-- ) { // skip Root block
1441 Block *b = _blocks[i];
1442
1443 if (b->head()->is_Loop()) {
1444 Block* loop_head = b;
1445 assert(loop_head->num_preds() - 1 == 2, "loop must have 2 predecessors");
1446 Node* tail_n = loop_head->pred(LoopNode::LoopBackControl);
1447 Block* tail = _bbs[tail_n->_idx];
1448
1449 // Defensively filter out Loop nodes for non-single-entry loops.
1450 // For all reasonable loops, the head occurs before the tail in RPO.
1451 if (i <= tail->_rpo) {
1452
1453 // The tail and (recursive) predecessors of the tail
1454 // are made members of a new loop.
1455
1456 assert(worklist.size() == 0, "nonempty worklist");
1457 CFGLoop* nloop = new CFGLoop(idct++);
1458 assert(loop_head->_loop == NULL, "just checking");
1459 loop_head->_loop = nloop;
1460 // Add to nloop so push_pred() will skip over inner loops
1461 nloop->add_member(loop_head);
1462 nloop->push_pred(loop_head, LoopNode::LoopBackControl, worklist, _bbs);
1463
1464 while (worklist.size() > 0) {
1465 Block* member = worklist.pop();
1466 if (member != loop_head) {
1467 for (uint j = 1; j < member->num_preds(); j++) {
1468 nloop->push_pred(member, j, worklist, _bbs);
1469 }
1470 }
1471 }
1472 }
1473 }
1474 }
1475
1476 // Create a member list for each loop consisting
1477 // of both blocks and (immediate child) loops.
1478 for (uint i = 0; i < _num_blocks; i++) {
1479 Block *b = _blocks[i];
1480 CFGLoop* lp = b->_loop;
1481 if (lp == NULL) {
1482 // Not assigned to a loop. Add it to the method's pseudo loop.
1483 b->_loop = root_loop;
1484 lp = root_loop;
1485 }
1486 if (lp == root_loop || b != lp->head()) { // loop heads are already members
1487 lp->add_member(b);
1488 }
1489 if (lp != root_loop) {
1490 if (lp->parent() == NULL) {
1491 // Not a nested loop. Make it a child of the method's pseudo loop.
1492 root_loop->add_nested_loop(lp);
1493 }
1494 if (b == lp->head()) {
1495 // Add nested loop to member list of parent loop.
1496 lp->parent()->add_member(lp);
1497 }
1498 }
1499 }
1500
1501 return root_loop;
1502 }
1503
1504 //------------------------------push_pred--------------------------------------
1505 void CFGLoop::push_pred(Block* blk, int i, Block_List& worklist, Block_Array& node_to_blk) {
1506 Node* pred_n = blk->pred(i);
1507 Block* pred = node_to_blk[pred_n->_idx];
1508 CFGLoop *pred_loop = pred->_loop;
1509 if (pred_loop == NULL) {
1510 // Filter out blocks for non-single-entry loops.
1511 // For all reasonable loops, the head occurs before the tail in RPO.
1512 if (pred->_rpo > head()->_rpo) {
1513 pred->_loop = this;
1514 worklist.push(pred);
1515 }
1516 } else if (pred_loop != this) {
1517 // Nested loop.
1518 while (pred_loop->_parent != NULL && pred_loop->_parent != this) {
1519 pred_loop = pred_loop->_parent;
1520 }
1521 // Make pred's loop be a child
1522 if (pred_loop->_parent == NULL) {
1523 add_nested_loop(pred_loop);
1524 // Continue with loop entry predecessor.
1525 Block* pred_head = pred_loop->head();
1526 assert(pred_head->num_preds() - 1 == 2, "loop must have 2 predecessors");
1527 assert(pred_head != head(), "loop head in only one loop");
1528 push_pred(pred_head, LoopNode::EntryControl, worklist, node_to_blk);
1529 } else {
1530 assert(pred_loop->_parent == this && _parent == NULL, "just checking");
1531 }
1532 }
1533 }
1534
1535 //------------------------------add_nested_loop--------------------------------
1536 // Make cl a child of the current loop in the loop tree.
1537 void CFGLoop::add_nested_loop(CFGLoop* cl) {
1538 assert(_parent == NULL, "no parent yet");
1539 assert(cl != this, "not my own parent");
1540 cl->_parent = this;
1541 CFGLoop* ch = _child;
1542 if (ch == NULL) {
1543 _child = cl;
1544 } else {
1545 while (ch->_sibling != NULL) { ch = ch->_sibling; }
1546 ch->_sibling = cl;
1547 }
1548 }
1549
1550 //------------------------------compute_loop_depth-----------------------------
1551 // Store the loop depth in each CFGLoop object.
1552 // Recursively walk the children to do the same for them.
1553 void CFGLoop::compute_loop_depth(int depth) {
1554 _depth = depth;
1555 CFGLoop* ch = _child;
1556 while (ch != NULL) {
1557 ch->compute_loop_depth(depth + 1);
1558 ch = ch->_sibling;
1559 }
1560 }
1561
1562 //------------------------------compute_freq-----------------------------------
1563 // Compute the frequency of each block and loop, relative to a single entry
1564 // into the dominating loop head.
1565 void CFGLoop::compute_freq() {
1566 // Bottom up traversal of loop tree (visit inner loops first.)
1567 // Set loop head frequency to 1.0, then transitively
1568 // compute frequency for all successors in the loop,
1569 // as well as for each exit edge. Inner loops are
1570 // treated as single blocks with loop exit targets
1571 // as the successor blocks.
1572
1573 // Nested loops first
1574 CFGLoop* ch = _child;
1575 while (ch != NULL) {
1576 ch->compute_freq();
1577 ch = ch->_sibling;
1578 }
1579 assert (_members.length() > 0, "no empty loops");
1580 Block* hd = head();
1581 hd->_freq = 1.0f;
1582 for (int i = 0; i < _members.length(); i++) {
1583 CFGElement* s = _members.at(i);
1584 float freq = s->_freq;
1585 if (s->is_block()) {
1586 Block* b = s->as_Block();
1587 for (uint j = 0; j < b->_num_succs; j++) {
1588 Block* sb = b->_succs[j];
1589 update_succ_freq(sb, freq * b->succ_prob(j));
1590 }
1591 } else {
1592 CFGLoop* lp = s->as_CFGLoop();
1593 assert(lp->_parent == this, "immediate child");
1594 for (int k = 0; k < lp->_exits.length(); k++) {
1595 Block* eb = lp->_exits.at(k).get_target();
1596 float prob = lp->_exits.at(k).get_prob();
1597 update_succ_freq(eb, freq * prob);
1598 }
1599 }
1600 }
1601
1602 // For all loops other than the outer, "method" loop,
1603 // sum and normalize the exit probability. The "method" loop
1604 // should keep the initial exit probability of 1, so that
1605 // inner blocks do not get erroneously scaled.
1606 if (_depth != 0) {
1607 // Total the exit probabilities for this loop.
1608 float exits_sum = 0.0f;
1609 for (int i = 0; i < _exits.length(); i++) {
1610 exits_sum += _exits.at(i).get_prob();
1611 }
1612
1613 // Normalize the exit probabilities. Until now, the
1614 // probabilities estimate the possibility of exit per
1615 // a single loop iteration; afterward, they estimate
1616 // the probability of exit per loop entry.
1617 for (int i = 0; i < _exits.length(); i++) {
1618 Block* et = _exits.at(i).get_target();
1619 float new_prob = 0.0f;
1620 if (_exits.at(i).get_prob() > 0.0f) {
1621 new_prob = _exits.at(i).get_prob() / exits_sum;
1622 }
1623 BlockProbPair bpp(et, new_prob);
1624 _exits.at_put(i, bpp);
1625 }
1626
1627 // Save the total, but guard against unreasonable probability,
1628 // as the value is used to estimate the loop trip count.
1629 // An infinite trip count would blur relative block
1630 // frequencies.
1631 if (exits_sum > 1.0f) exits_sum = 1.0;
1632 if (exits_sum < PROB_MIN) exits_sum = PROB_MIN;
1633 _exit_prob = exits_sum;
1634 }
1635 }
1636
1637 //------------------------------succ_prob-------------------------------------
1638 // Determine the probability of reaching successor 'i' from the receiver block.
1639 float Block::succ_prob(uint i) {
1640 int eidx = end_idx();
1641 Node *n = _nodes[eidx]; // Get ending Node
1642
1643 int op = n->Opcode();
1644 if (n->is_Mach()) {
1645 if (n->is_MachNullCheck()) {
1646 // Can only reach here if called after lcm. The original Op_If is gone,
1647 // so we attempt to infer the probability from one or both of the
1648 // successor blocks.
1649 assert(_num_succs == 2, "expecting 2 successors of a null check");
1650 // If either successor has only one predecessor, then the
1651 // probabiltity estimate can be derived using the
1652 // relative frequency of the successor and this block.
1653 if (_succs[i]->num_preds() == 2) {
1654 return _succs[i]->_freq / _freq;
1655 } else if (_succs[1-i]->num_preds() == 2) {
1656 return 1 - (_succs[1-i]->_freq / _freq);
1657 } else {
1658 // Estimate using both successor frequencies
1659 float freq = _succs[i]->_freq;
1660 return freq / (freq + _succs[1-i]->_freq);
1661 }
1662 }
1663 op = n->as_Mach()->ideal_Opcode();
1664 }
1665
1666
1667 // Switch on branch type
1668 switch( op ) {
1669 case Op_CountedLoopEnd:
1670 case Op_If: {
1671 assert (i < 2, "just checking");
1672 // Conditionals pass on only part of their frequency
1673 float prob = n->as_MachIf()->_prob;
1674 assert(prob >= 0.0 && prob <= 1.0, "out of range probability");
1675 // If succ[i] is the FALSE branch, invert path info
1676 if( _nodes[i + eidx + 1]->Opcode() == Op_IfFalse ) {
1677 return 1.0f - prob; // not taken
1678 } else {
1679 return prob; // taken
1680 }
1681 }
1682
1683 case Op_Jump:
1684 // Divide the frequency between all successors evenly
1685 return 1.0f/_num_succs;
1686
1687 case Op_Catch: {
1688 const CatchProjNode *ci = _nodes[i + eidx + 1]->as_CatchProj();
1689 if (ci->_con == CatchProjNode::fall_through_index) {
1690 // Fall-thru path gets the lion's share.
1691 return 1.0f - PROB_UNLIKELY_MAG(5)*_num_succs;
1692 } else {
1693 // Presume exceptional paths are equally unlikely
1694 return PROB_UNLIKELY_MAG(5);
1695 }
1696 }
1697
1698 case Op_Root:
1699 case Op_Goto:
1700 // Pass frequency straight thru to target
1701 return 1.0f;
1702
1703 case Op_NeverBranch:
1704 return 0.0f;
1705
1706 case Op_TailCall:
1707 case Op_TailJump:
1708 case Op_Return:
1709 case Op_Halt:
1710 case Op_Rethrow:
1711 // Do not push out freq to root block
1712 return 0.0f;
1713
1714 default:
1715 ShouldNotReachHere();
1716 }
1717
1718 return 0.0f;
1719 }
1720
1721 //------------------------------num_fall_throughs-----------------------------
1722 // Return the number of fall-through candidates for a block
1723 int Block::num_fall_throughs() {
1724 int eidx = end_idx();
1725 Node *n = _nodes[eidx]; // Get ending Node
1726
1727 int op = n->Opcode();
1728 if (n->is_Mach()) {
1729 if (n->is_MachNullCheck()) {
1730 // In theory, either side can fall-thru, for simplicity sake,
1731 // let's say only the false branch can now.
1732 return 1;
1733 }
1734 op = n->as_Mach()->ideal_Opcode();
1735 }
1736
1737 // Switch on branch type
1738 switch( op ) {
1739 case Op_CountedLoopEnd:
1740 case Op_If:
1741 return 2;
1742
1743 case Op_Root:
1744 case Op_Goto:
1745 return 1;
1746
1747 case Op_Catch: {
1748 for (uint i = 0; i < _num_succs; i++) {
1749 const CatchProjNode *ci = _nodes[i + eidx + 1]->as_CatchProj();
1750 if (ci->_con == CatchProjNode::fall_through_index) {
1751 return 1;
1752 }
1753 }
1754 return 0;
1755 }
1756
1757 case Op_Jump:
1758 case Op_NeverBranch:
1759 case Op_TailCall:
1760 case Op_TailJump:
1761 case Op_Return:
1762 case Op_Halt:
1763 case Op_Rethrow:
1764 return 0;
1765
1766 default:
1767 ShouldNotReachHere();
1768 }
1769
1770 return 0;
1771 }
1772
1773 //------------------------------succ_fall_through-----------------------------
1774 // Return true if a specific successor could be fall-through target.
1775 bool Block::succ_fall_through(uint i) {
1776 int eidx = end_idx();
1777 Node *n = _nodes[eidx]; // Get ending Node
1778
1779 int op = n->Opcode();
1780 if (n->is_Mach()) {
1781 if (n->is_MachNullCheck()) {
1782 // In theory, either side can fall-thru, for simplicity sake,
1783 // let's say only the false branch can now.
1784 return _nodes[i + eidx + 1]->Opcode() == Op_IfFalse;
1785 }
1786 op = n->as_Mach()->ideal_Opcode();
1787 }
1788
1789 // Switch on branch type
1790 switch( op ) {
1791 case Op_CountedLoopEnd:
1792 case Op_If:
1793 case Op_Root:
1794 case Op_Goto:
1795 return true;
1796
1797 case Op_Catch: {
1798 const CatchProjNode *ci = _nodes[i + eidx + 1]->as_CatchProj();
1799 return ci->_con == CatchProjNode::fall_through_index;
1800 }
1801
1802 case Op_Jump:
1803 case Op_NeverBranch:
1804 case Op_TailCall:
1805 case Op_TailJump:
1806 case Op_Return:
1807 case Op_Halt:
1808 case Op_Rethrow:
1809 return false;
1810
1811 default:
1812 ShouldNotReachHere();
1813 }
1814
1815 return false;
1816 }
1817
1818 //------------------------------update_uncommon_branch------------------------
1819 // Update the probability of a two-branch to be uncommon
1820 void Block::update_uncommon_branch(Block* ub) {
1821 int eidx = end_idx();
1822 Node *n = _nodes[eidx]; // Get ending Node
1823
1824 int op = n->as_Mach()->ideal_Opcode();
1825
1826 assert(op == Op_CountedLoopEnd || op == Op_If, "must be a If");
1827 assert(num_fall_throughs() == 2, "must be a two way branch block");
1828
1829 // Which successor is ub?
1830 uint s;
1831 for (s = 0; s <_num_succs; s++) {
1832 if (_succs[s] == ub) break;
1833 }
1834 assert(s < 2, "uncommon successor must be found");
1835
1836 // If ub is the true path, make the proability small, else
1837 // ub is the false path, and make the probability large
1838 bool invert = (_nodes[s + eidx + 1]->Opcode() == Op_IfFalse);
1839
1840 // Get existing probability
1841 float p = n->as_MachIf()->_prob;
1842
1843 if (invert) p = 1.0 - p;
1844 if (p > PROB_MIN) {
1845 p = PROB_MIN;
1846 }
1847 if (invert) p = 1.0 - p;
1848
1849 n->as_MachIf()->_prob = p;
1850 }
1851
1852 //------------------------------update_succ_freq-------------------------------
1853 // Update the appropriate frequency associated with block 'b', a succesor of
1854 // a block in this loop.
1855 void CFGLoop::update_succ_freq(Block* b, float freq) {
1856 if (b->_loop == this) {
1857 if (b == head()) {
1858 // back branch within the loop
1859 // Do nothing now, the loop carried frequency will be
1860 // adjust later in scale_freq().
1861 } else {
1862 // simple branch within the loop
1863 b->_freq += freq;
1864 }
1865 } else if (!in_loop_nest(b)) {
1866 // branch is exit from this loop
1867 BlockProbPair bpp(b, freq);
1868 _exits.append(bpp);
1869 } else {
1870 // branch into nested loop
1871 CFGLoop* ch = b->_loop;
1872 ch->_freq += freq;
1873 }
1874 }
1875
1876 //------------------------------in_loop_nest-----------------------------------
1877 // Determine if block b is in the receiver's loop nest.
1878 bool CFGLoop::in_loop_nest(Block* b) {
1879 int depth = _depth;
1880 CFGLoop* b_loop = b->_loop;
1881 int b_depth = b_loop->_depth;
1882 if (depth == b_depth) {
1883 return true;
1884 }
1885 while (b_depth > depth) {
1886 b_loop = b_loop->_parent;
1887 b_depth = b_loop->_depth;
1888 }
1889 return b_loop == this;
1890 }
1891
1892 //------------------------------scale_freq-------------------------------------
1893 // Scale frequency of loops and blocks by trip counts from outer loops
1894 // Do a top down traversal of loop tree (visit outer loops first.)
1895 void CFGLoop::scale_freq() {
1896 float loop_freq = _freq * trip_count();
1897 for (int i = 0; i < _members.length(); i++) {
1898 CFGElement* s = _members.at(i);
1899 float block_freq = s->_freq * loop_freq;
1900 if (block_freq < MIN_BLOCK_FREQUENCY) block_freq = MIN_BLOCK_FREQUENCY;
1901 s->_freq = block_freq;
1902 }
1903 CFGLoop* ch = _child;
1904 while (ch != NULL) {
1905 ch->scale_freq();
1906 ch = ch->_sibling;
1907 }
1908 }
1909
1910 #ifndef PRODUCT
1911 //------------------------------dump_tree--------------------------------------
1912 void CFGLoop::dump_tree() const {
1913 dump();
1914 if (_child != NULL) _child->dump_tree();
1915 if (_sibling != NULL) _sibling->dump_tree();
1916 }
1917
1918 //------------------------------dump-------------------------------------------
1919 void CFGLoop::dump() const {
1920 for (int i = 0; i < _depth; i++) tty->print(" ");
1921 tty->print("%s: %d trip_count: %6.0f freq: %6.0f\n",
1922 _depth == 0 ? "Method" : "Loop", _id, trip_count(), _freq);
1923 for (int i = 0; i < _depth; i++) tty->print(" ");
1924 tty->print(" members:", _id);
1925 int k = 0;
1926 for (int i = 0; i < _members.length(); i++) {
1927 if (k++ >= 6) {
1928 tty->print("\n ");
1929 for (int j = 0; j < _depth+1; j++) tty->print(" ");
1930 k = 0;
1931 }
1932 CFGElement *s = _members.at(i);
1933 if (s->is_block()) {
1934 Block *b = s->as_Block();
1935 tty->print(" B%d(%6.3f)", b->_pre_order, b->_freq);
1936 } else {
1937 CFGLoop* lp = s->as_CFGLoop();
1938 tty->print(" L%d(%6.3f)", lp->_id, lp->_freq);
1939 }
1940 }
1941 tty->print("\n");
1942 for (int i = 0; i < _depth; i++) tty->print(" ");
1943 tty->print(" exits: ");
1944 k = 0;
1945 for (int i = 0; i < _exits.length(); i++) {
1946 if (k++ >= 7) {
1947 tty->print("\n ");
1948 for (int j = 0; j < _depth+1; j++) tty->print(" ");
1949 k = 0;
1950 }
1951 Block *blk = _exits.at(i).get_target();
1952 float prob = _exits.at(i).get_prob();
1953 tty->print(" ->%d@%d%%", blk->_pre_order, (int)(prob*100));
1954 }
1955 tty->print("\n");
1956 }
1957 #endif