3354 // objectmonitor pointer by masking off the "2" bit or we can just
3355 // use tmpReg as an objectmonitor pointer but bias the objectmonitor
3356 // field offsets with "-2" to compensate for and annul the low-order tag bit.
3357 //
3358 // I use the latter as it avoids AGI stalls.
3359 // As such, we write "mov r, [tmpReg+OFFSETOF(Owner)-2]"
3360 // instead of "mov r, [tmpReg+OFFSETOF(Owner)]".
3361 //
3362 #define OFFSET_SKEWED(f) ((ObjectMonitor::f ## _offset_in_bytes())-2)
3363
3364 // boxReg refers to the on-stack BasicLock in the current frame.
3365 // We'd like to write:
3366 // set box->_displaced_header = markOop::unused_mark(). Any non-0 value suffices.
3367 // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
3368 // additional latency as we have another ST in the store buffer that must drain.
3369
3370 if (EmitSync & 8192) {
3371 masm.movptr(Address(boxReg, 0), 3) ; // results in ST-before-CAS penalty
3372 masm.get_thread (scrReg) ;
3373 masm.movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]
3374 masm.movptr(tmpReg, 0); // consider: xor vs mov
3375 if (os::is_MP()) { masm.lock(); }
3376 masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
3377 } else
3378 if ((EmitSync & 128) == 0) { // avoid ST-before-CAS
3379 masm.movptr(scrReg, boxReg) ;
3380 masm.movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]
3381
3382 // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
3383 if ((EmitSync & 2048) && VM_Version::supports_3dnow() && os::is_MP()) {
3384 // prefetchw [eax + Offset(_owner)-2]
3385 masm.prefetchw(Address(rax, ObjectMonitor::owner_offset_in_bytes()-2));
3386 }
3387
3388 if ((EmitSync & 64) == 0) {
3389 // Optimistic form: consider XORL tmpReg,tmpReg
3390 masm.movptr(tmpReg, 0 ) ;
3391 } else {
3392 // Can suffer RTS->RTO upgrades on shared or cold $ lines
3393 // Test-And-CAS instead of CAS
3394 masm.movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; // rax, = m->_owner
3395 masm.testptr(tmpReg, tmpReg) ; // Locked ?
3396 masm.jccb (Assembler::notZero, DONE_LABEL) ;
3397 }
3398
3399 // Appears unlocked - try to swing _owner from null to non-null.
3400 // Ideally, I'd manifest "Self" with get_thread and then attempt
3401 // to CAS the register containing Self into m->Owner.
3402 // But we don't have enough registers, so instead we can either try to CAS
3403 // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
3404 // we later store "Self" into m->Owner. Transiently storing a stack address
3405 // (rsp or the address of the box) into m->owner is harmless.
3406 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
3407 if (os::is_MP()) { masm.lock(); }
3408 masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
3409 masm.movptr(Address(scrReg, 0), 3) ; // box->_displaced_header = 3
3410 masm.jccb (Assembler::notZero, DONE_LABEL) ;
3570 // each other and there's no need for an explicit barrier (fence).
3571 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
3572
3573 masm.get_thread (boxReg) ;
3574 if ((EmitSync & 4096) && VM_Version::supports_3dnow() && os::is_MP()) {
3575 // prefetchw [ebx + Offset(_owner)-2]
3576 masm.prefetchw(Address(rbx, ObjectMonitor::owner_offset_in_bytes()-2));
3577 }
3578
3579 // Note that we could employ various encoding schemes to reduce
3580 // the number of loads below (currently 4) to just 2 or 3.
3581 // Refer to the comments in synchronizer.cpp.
3582 // In practice the chain of fetches doesn't seem to impact performance, however.
3583 if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
3584 // Attempt to reduce branch density - AMD's branch predictor.
3585 masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
3586 masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
3587 masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ;
3588 masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ;
3589 masm.jccb (Assembler::notZero, DONE_LABEL) ;
3590 masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), 0) ;
3591 masm.jmpb (DONE_LABEL) ;
3592 } else {
3593 masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
3594 masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
3595 masm.jccb (Assembler::notZero, DONE_LABEL) ;
3596 masm.movptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ;
3597 masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ;
3598 masm.jccb (Assembler::notZero, CheckSucc) ;
3599 masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), 0) ;
3600 masm.jmpb (DONE_LABEL) ;
3601 }
3602
3603 // The Following code fragment (EmitSync & 65536) improves the performance of
3604 // contended applications and contended synchronization microbenchmarks.
3605 // Unfortunately the emission of the code - even though not executed - causes regressions
3606 // in scimark and jetstream, evidently because of $ effects. Replacing the code
3607 // with an equal number of never-executed NOPs results in the same regression.
3608 // We leave it off by default.
3609
3610 if ((EmitSync & 65536) != 0) {
3611 Label LSuccess, LGoSlowPath ;
3612
3613 masm.bind (CheckSucc) ;
3614
3615 // Optional pre-test ... it's safe to elide this
3616 if ((EmitSync & 16) == 0) {
3617 masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ;
3618 masm.jccb (Assembler::zero, LGoSlowPath) ;
3619 }
3627 // Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
3628 // (2) If supported, an explicit MFENCE is appealing.
3629 // In older IA32 processors MFENCE is slower than lock:add or xchg
3630 // particularly if the write-buffer is full as might be the case if
3631 // stores closely precede the fence or fence-equivalent instruction.
3632 // In more modern implementations MFENCE appears faster, however.
3633 // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
3634 // The $lines underlying the top-of-stack should be in M-state.
3635 // The locked add instruction is serializing, of course.
3636 // (4) Use xchg, which is serializing
3637 // mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
3638 // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
3639 // The integer condition codes will tell us if succ was 0.
3640 // Since _succ and _owner should reside in the same $line and
3641 // we just stored into _owner, it's likely that the $line
3642 // remains in M-state for the lock:orl.
3643 //
3644 // We currently use (3), although it's likely that switching to (2)
3645 // is correct for the future.
3646
3647 masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), 0) ;
3648 if (os::is_MP()) {
3649 if (VM_Version::supports_sse2() && 1 == FenceInstruction) {
3650 masm.mfence();
3651 } else {
3652 masm.lock () ; masm.addptr(Address(rsp, 0), 0) ;
3653 }
3654 }
3655 // Ratify _succ remains non-null
3656 masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ;
3657 masm.jccb (Assembler::notZero, LSuccess) ;
3658
3659 masm.xorptr(boxReg, boxReg) ; // box is really EAX
3660 if (os::is_MP()) { masm.lock(); }
3661 masm.cmpxchgptr(rsp, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
3662 masm.jccb (Assembler::notEqual, LSuccess) ;
3663 // Since we're low on registers we installed rsp as a placeholder in _owner.
3664 // Now install Self over rsp. This is safe as we're transitioning from
3665 // non-null to non-null.
3666 masm.get_thread (boxReg) ;
3667 masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg) ;
|
3354 // objectmonitor pointer by masking off the "2" bit or we can just
3355 // use tmpReg as an objectmonitor pointer but bias the objectmonitor
3356 // field offsets with "-2" to compensate for and annul the low-order tag bit.
3357 //
3358 // I use the latter as it avoids AGI stalls.
3359 // As such, we write "mov r, [tmpReg+OFFSETOF(Owner)-2]"
3360 // instead of "mov r, [tmpReg+OFFSETOF(Owner)]".
3361 //
3362 #define OFFSET_SKEWED(f) ((ObjectMonitor::f ## _offset_in_bytes())-2)
3363
3364 // boxReg refers to the on-stack BasicLock in the current frame.
3365 // We'd like to write:
3366 // set box->_displaced_header = markOop::unused_mark(). Any non-0 value suffices.
3367 // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
3368 // additional latency as we have another ST in the store buffer that must drain.
3369
3370 if (EmitSync & 8192) {
3371 masm.movptr(Address(boxReg, 0), 3) ; // results in ST-before-CAS penalty
3372 masm.get_thread (scrReg) ;
3373 masm.movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]
3374 masm.movptr(tmpReg, NULL_WORD); // consider: xor vs mov
3375 if (os::is_MP()) { masm.lock(); }
3376 masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
3377 } else
3378 if ((EmitSync & 128) == 0) { // avoid ST-before-CAS
3379 masm.movptr(scrReg, boxReg) ;
3380 masm.movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]
3381
3382 // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
3383 if ((EmitSync & 2048) && VM_Version::supports_3dnow() && os::is_MP()) {
3384 // prefetchw [eax + Offset(_owner)-2]
3385 masm.prefetchw(Address(rax, ObjectMonitor::owner_offset_in_bytes()-2));
3386 }
3387
3388 if ((EmitSync & 64) == 0) {
3389 // Optimistic form: consider XORL tmpReg,tmpReg
3390 masm.movptr(tmpReg, NULL_WORD) ;
3391 } else {
3392 // Can suffer RTS->RTO upgrades on shared or cold $ lines
3393 // Test-And-CAS instead of CAS
3394 masm.movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; // rax, = m->_owner
3395 masm.testptr(tmpReg, tmpReg) ; // Locked ?
3396 masm.jccb (Assembler::notZero, DONE_LABEL) ;
3397 }
3398
3399 // Appears unlocked - try to swing _owner from null to non-null.
3400 // Ideally, I'd manifest "Self" with get_thread and then attempt
3401 // to CAS the register containing Self into m->Owner.
3402 // But we don't have enough registers, so instead we can either try to CAS
3403 // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
3404 // we later store "Self" into m->Owner. Transiently storing a stack address
3405 // (rsp or the address of the box) into m->owner is harmless.
3406 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
3407 if (os::is_MP()) { masm.lock(); }
3408 masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
3409 masm.movptr(Address(scrReg, 0), 3) ; // box->_displaced_header = 3
3410 masm.jccb (Assembler::notZero, DONE_LABEL) ;
3570 // each other and there's no need for an explicit barrier (fence).
3571 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
3572
3573 masm.get_thread (boxReg) ;
3574 if ((EmitSync & 4096) && VM_Version::supports_3dnow() && os::is_MP()) {
3575 // prefetchw [ebx + Offset(_owner)-2]
3576 masm.prefetchw(Address(rbx, ObjectMonitor::owner_offset_in_bytes()-2));
3577 }
3578
3579 // Note that we could employ various encoding schemes to reduce
3580 // the number of loads below (currently 4) to just 2 or 3.
3581 // Refer to the comments in synchronizer.cpp.
3582 // In practice the chain of fetches doesn't seem to impact performance, however.
3583 if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
3584 // Attempt to reduce branch density - AMD's branch predictor.
3585 masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
3586 masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
3587 masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ;
3588 masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ;
3589 masm.jccb (Assembler::notZero, DONE_LABEL) ;
3590 masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ;
3591 masm.jmpb (DONE_LABEL) ;
3592 } else {
3593 masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
3594 masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
3595 masm.jccb (Assembler::notZero, DONE_LABEL) ;
3596 masm.movptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ;
3597 masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ;
3598 masm.jccb (Assembler::notZero, CheckSucc) ;
3599 masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ;
3600 masm.jmpb (DONE_LABEL) ;
3601 }
3602
3603 // The Following code fragment (EmitSync & 65536) improves the performance of
3604 // contended applications and contended synchronization microbenchmarks.
3605 // Unfortunately the emission of the code - even though not executed - causes regressions
3606 // in scimark and jetstream, evidently because of $ effects. Replacing the code
3607 // with an equal number of never-executed NOPs results in the same regression.
3608 // We leave it off by default.
3609
3610 if ((EmitSync & 65536) != 0) {
3611 Label LSuccess, LGoSlowPath ;
3612
3613 masm.bind (CheckSucc) ;
3614
3615 // Optional pre-test ... it's safe to elide this
3616 if ((EmitSync & 16) == 0) {
3617 masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ;
3618 masm.jccb (Assembler::zero, LGoSlowPath) ;
3619 }
3627 // Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
3628 // (2) If supported, an explicit MFENCE is appealing.
3629 // In older IA32 processors MFENCE is slower than lock:add or xchg
3630 // particularly if the write-buffer is full as might be the case if
3631 // stores closely precede the fence or fence-equivalent instruction.
3632 // In more modern implementations MFENCE appears faster, however.
3633 // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
3634 // The $lines underlying the top-of-stack should be in M-state.
3635 // The locked add instruction is serializing, of course.
3636 // (4) Use xchg, which is serializing
3637 // mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
3638 // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
3639 // The integer condition codes will tell us if succ was 0.
3640 // Since _succ and _owner should reside in the same $line and
3641 // we just stored into _owner, it's likely that the $line
3642 // remains in M-state for the lock:orl.
3643 //
3644 // We currently use (3), although it's likely that switching to (2)
3645 // is correct for the future.
3646
3647 masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ;
3648 if (os::is_MP()) {
3649 if (VM_Version::supports_sse2() && 1 == FenceInstruction) {
3650 masm.mfence();
3651 } else {
3652 masm.lock () ; masm.addptr(Address(rsp, 0), 0) ;
3653 }
3654 }
3655 // Ratify _succ remains non-null
3656 masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ;
3657 masm.jccb (Assembler::notZero, LSuccess) ;
3658
3659 masm.xorptr(boxReg, boxReg) ; // box is really EAX
3660 if (os::is_MP()) { masm.lock(); }
3661 masm.cmpxchgptr(rsp, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
3662 masm.jccb (Assembler::notEqual, LSuccess) ;
3663 // Since we're low on registers we installed rsp as a placeholder in _owner.
3664 // Now install Self over rsp. This is safe as we're transitioning from
3665 // non-null to non-null.
3666 masm.get_thread (boxReg) ;
3667 masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg) ;
|