src/cpu/x86/vm/x86_32.ad

Print this page




3354       // objectmonitor pointer by masking off the "2" bit or we can just
3355       // use tmpReg as an objectmonitor pointer but bias the objectmonitor
3356       // field offsets with "-2" to compensate for and annul the low-order tag bit.
3357       //
3358       // I use the latter as it avoids AGI stalls.
3359       // As such, we write "mov r, [tmpReg+OFFSETOF(Owner)-2]"
3360       // instead of "mov r, [tmpReg+OFFSETOF(Owner)]".
3361       //
3362       #define OFFSET_SKEWED(f) ((ObjectMonitor::f ## _offset_in_bytes())-2)
3363 
3364       // boxReg refers to the on-stack BasicLock in the current frame.
3365       // We'd like to write:
3366       //   set box->_displaced_header = markOop::unused_mark().  Any non-0 value suffices.
3367       // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
3368       // additional latency as we have another ST in the store buffer that must drain.
3369 
3370       if (EmitSync & 8192) { 
3371          masm.movptr(Address(boxReg, 0), 3) ;            // results in ST-before-CAS penalty
3372          masm.get_thread (scrReg) ; 
3373          masm.movptr(boxReg, tmpReg);                    // consider: LEA box, [tmp-2] 
3374          masm.movptr(tmpReg, 0);                         // consider: xor vs mov
3375          if (os::is_MP()) { masm.lock(); } 
3376          masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; 
3377       } else 
3378       if ((EmitSync & 128) == 0) {                      // avoid ST-before-CAS
3379          masm.movptr(scrReg, boxReg) ; 
3380          masm.movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2] 
3381 
3382          // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
3383          if ((EmitSync & 2048) && VM_Version::supports_3dnow() && os::is_MP()) {
3384             // prefetchw [eax + Offset(_owner)-2]
3385             masm.prefetchw(Address(rax, ObjectMonitor::owner_offset_in_bytes()-2));
3386          }
3387 
3388          if ((EmitSync & 64) == 0) {
3389            // Optimistic form: consider XORL tmpReg,tmpReg
3390            masm.movptr(tmpReg, 0 ) ; 
3391          } else { 
3392            // Can suffer RTS->RTO upgrades on shared or cold $ lines
3393            // Test-And-CAS instead of CAS
3394            masm.movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;   // rax, = m->_owner
3395            masm.testptr(tmpReg, tmpReg) ;                   // Locked ? 
3396            masm.jccb  (Assembler::notZero, DONE_LABEL) ;                   
3397          }
3398 
3399          // Appears unlocked - try to swing _owner from null to non-null.
3400          // Ideally, I'd manifest "Self" with get_thread and then attempt
3401          // to CAS the register containing Self into m->Owner.
3402          // But we don't have enough registers, so instead we can either try to CAS
3403          // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
3404          // we later store "Self" into m->Owner.  Transiently storing a stack address
3405          // (rsp or the address of the box) into  m->owner is harmless.
3406          // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
3407          if (os::is_MP()) { masm.lock();  }
3408          masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; 
3409          masm.movptr(Address(scrReg, 0), 3) ;          // box->_displaced_header = 3
3410          masm.jccb  (Assembler::notZero, DONE_LABEL) ; 


3570       // each other and there's no need for an explicit barrier (fence).
3571       // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
3572 
3573       masm.get_thread (boxReg) ;
3574       if ((EmitSync & 4096) && VM_Version::supports_3dnow() && os::is_MP()) {
3575         // prefetchw [ebx + Offset(_owner)-2]
3576         masm.prefetchw(Address(rbx, ObjectMonitor::owner_offset_in_bytes()-2));
3577       }
3578 
3579       // Note that we could employ various encoding schemes to reduce
3580       // the number of loads below (currently 4) to just 2 or 3.
3581       // Refer to the comments in synchronizer.cpp.
3582       // In practice the chain of fetches doesn't seem to impact performance, however.
3583       if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
3584          // Attempt to reduce branch density - AMD's branch predictor.
3585          masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;  
3586          masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
3587          masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ; 
3588          masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ; 
3589          masm.jccb  (Assembler::notZero, DONE_LABEL) ; 
3590          masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), 0) ; 
3591          masm.jmpb  (DONE_LABEL) ; 
3592       } else { 
3593          masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;  
3594          masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
3595          masm.jccb  (Assembler::notZero, DONE_LABEL) ; 
3596          masm.movptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ; 
3597          masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ; 
3598          masm.jccb  (Assembler::notZero, CheckSucc) ; 
3599          masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), 0) ; 
3600          masm.jmpb  (DONE_LABEL) ; 
3601       }
3602 
3603       // The Following code fragment (EmitSync & 65536) improves the performance of
3604       // contended applications and contended synchronization microbenchmarks.
3605       // Unfortunately the emission of the code - even though not executed - causes regressions
3606       // in scimark and jetstream, evidently because of $ effects.  Replacing the code
3607       // with an equal number of never-executed NOPs results in the same regression.
3608       // We leave it off by default.
3609 
3610       if ((EmitSync & 65536) != 0) {
3611          Label LSuccess, LGoSlowPath ;
3612 
3613          masm.bind  (CheckSucc) ;
3614 
3615          // Optional pre-test ... it's safe to elide this
3616          if ((EmitSync & 16) == 0) { 
3617             masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ; 
3618             masm.jccb  (Assembler::zero, LGoSlowPath) ; 
3619          }


3627          //     Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
3628          // (2) If supported, an explicit MFENCE is appealing.
3629          //     In older IA32 processors MFENCE is slower than lock:add or xchg
3630          //     particularly if the write-buffer is full as might be the case if
3631          //     stores closely precede the fence or fence-equivalent instruction.
3632          //     In more modern implementations MFENCE appears faster, however.
3633          // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
3634          //     The $lines underlying the top-of-stack should be in M-state.
3635          //     The locked add instruction is serializing, of course.
3636          // (4) Use xchg, which is serializing
3637          //     mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
3638          // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
3639          //     The integer condition codes will tell us if succ was 0.
3640          //     Since _succ and _owner should reside in the same $line and
3641          //     we just stored into _owner, it's likely that the $line
3642          //     remains in M-state for the lock:orl.
3643          //
3644          // We currently use (3), although it's likely that switching to (2)
3645          // is correct for the future.
3646             
3647          masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), 0) ; 
3648          if (os::is_MP()) { 
3649             if (VM_Version::supports_sse2() && 1 == FenceInstruction) { 
3650               masm.mfence();
3651             } else { 
3652               masm.lock () ; masm.addptr(Address(rsp, 0), 0) ; 
3653             }
3654          }
3655          // Ratify _succ remains non-null
3656          masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ; 
3657          masm.jccb  (Assembler::notZero, LSuccess) ; 
3658 
3659          masm.xorptr(boxReg, boxReg) ;                  // box is really EAX
3660          if (os::is_MP()) { masm.lock(); }
3661          masm.cmpxchgptr(rsp, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
3662          masm.jccb  (Assembler::notEqual, LSuccess) ;
3663          // Since we're low on registers we installed rsp as a placeholder in _owner.
3664          // Now install Self over rsp.  This is safe as we're transitioning from
3665          // non-null to non-null
3666          masm.get_thread (boxReg) ;
3667          masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg) ;




3354       // objectmonitor pointer by masking off the "2" bit or we can just
3355       // use tmpReg as an objectmonitor pointer but bias the objectmonitor
3356       // field offsets with "-2" to compensate for and annul the low-order tag bit.
3357       //
3358       // I use the latter as it avoids AGI stalls.
3359       // As such, we write "mov r, [tmpReg+OFFSETOF(Owner)-2]"
3360       // instead of "mov r, [tmpReg+OFFSETOF(Owner)]".
3361       //
3362       #define OFFSET_SKEWED(f) ((ObjectMonitor::f ## _offset_in_bytes())-2)
3363 
3364       // boxReg refers to the on-stack BasicLock in the current frame.
3365       // We'd like to write:
3366       //   set box->_displaced_header = markOop::unused_mark().  Any non-0 value suffices.
3367       // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
3368       // additional latency as we have another ST in the store buffer that must drain.
3369 
3370       if (EmitSync & 8192) { 
3371          masm.movptr(Address(boxReg, 0), 3) ;            // results in ST-before-CAS penalty
3372          masm.get_thread (scrReg) ; 
3373          masm.movptr(boxReg, tmpReg);                    // consider: LEA box, [tmp-2] 
3374          masm.movptr(tmpReg, NULL_WORD);                 // consider: xor vs mov
3375          if (os::is_MP()) { masm.lock(); } 
3376          masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; 
3377       } else 
3378       if ((EmitSync & 128) == 0) {                      // avoid ST-before-CAS
3379          masm.movptr(scrReg, boxReg) ; 
3380          masm.movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2] 
3381 
3382          // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
3383          if ((EmitSync & 2048) && VM_Version::supports_3dnow() && os::is_MP()) {
3384             // prefetchw [eax + Offset(_owner)-2]
3385             masm.prefetchw(Address(rax, ObjectMonitor::owner_offset_in_bytes()-2));
3386          }
3387 
3388          if ((EmitSync & 64) == 0) {
3389            // Optimistic form: consider XORL tmpReg,tmpReg
3390            masm.movptr(tmpReg, NULL_WORD) ; 
3391          } else { 
3392            // Can suffer RTS->RTO upgrades on shared or cold $ lines
3393            // Test-And-CAS instead of CAS
3394            masm.movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;   // rax, = m->_owner
3395            masm.testptr(tmpReg, tmpReg) ;                   // Locked ? 
3396            masm.jccb  (Assembler::notZero, DONE_LABEL) ;                   
3397          }
3398 
3399          // Appears unlocked - try to swing _owner from null to non-null.
3400          // Ideally, I'd manifest "Self" with get_thread and then attempt
3401          // to CAS the register containing Self into m->Owner.
3402          // But we don't have enough registers, so instead we can either try to CAS
3403          // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
3404          // we later store "Self" into m->Owner.  Transiently storing a stack address
3405          // (rsp or the address of the box) into  m->owner is harmless.
3406          // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
3407          if (os::is_MP()) { masm.lock();  }
3408          masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; 
3409          masm.movptr(Address(scrReg, 0), 3) ;          // box->_displaced_header = 3
3410          masm.jccb  (Assembler::notZero, DONE_LABEL) ; 


3570       // each other and there's no need for an explicit barrier (fence).
3571       // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
3572 
3573       masm.get_thread (boxReg) ;
3574       if ((EmitSync & 4096) && VM_Version::supports_3dnow() && os::is_MP()) {
3575         // prefetchw [ebx + Offset(_owner)-2]
3576         masm.prefetchw(Address(rbx, ObjectMonitor::owner_offset_in_bytes()-2));
3577       }
3578 
3579       // Note that we could employ various encoding schemes to reduce
3580       // the number of loads below (currently 4) to just 2 or 3.
3581       // Refer to the comments in synchronizer.cpp.
3582       // In practice the chain of fetches doesn't seem to impact performance, however.
3583       if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
3584          // Attempt to reduce branch density - AMD's branch predictor.
3585          masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;  
3586          masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
3587          masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ; 
3588          masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ; 
3589          masm.jccb  (Assembler::notZero, DONE_LABEL) ; 
3590          masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ; 
3591          masm.jmpb  (DONE_LABEL) ; 
3592       } else { 
3593          masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;  
3594          masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
3595          masm.jccb  (Assembler::notZero, DONE_LABEL) ; 
3596          masm.movptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ; 
3597          masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ; 
3598          masm.jccb  (Assembler::notZero, CheckSucc) ; 
3599          masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ; 
3600          masm.jmpb  (DONE_LABEL) ; 
3601       }
3602 
3603       // The Following code fragment (EmitSync & 65536) improves the performance of
3604       // contended applications and contended synchronization microbenchmarks.
3605       // Unfortunately the emission of the code - even though not executed - causes regressions
3606       // in scimark and jetstream, evidently because of $ effects.  Replacing the code
3607       // with an equal number of never-executed NOPs results in the same regression.
3608       // We leave it off by default.
3609 
3610       if ((EmitSync & 65536) != 0) {
3611          Label LSuccess, LGoSlowPath ;
3612 
3613          masm.bind  (CheckSucc) ;
3614 
3615          // Optional pre-test ... it's safe to elide this
3616          if ((EmitSync & 16) == 0) { 
3617             masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ; 
3618             masm.jccb  (Assembler::zero, LGoSlowPath) ; 
3619          }


3627          //     Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
3628          // (2) If supported, an explicit MFENCE is appealing.
3629          //     In older IA32 processors MFENCE is slower than lock:add or xchg
3630          //     particularly if the write-buffer is full as might be the case if
3631          //     stores closely precede the fence or fence-equivalent instruction.
3632          //     In more modern implementations MFENCE appears faster, however.
3633          // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
3634          //     The $lines underlying the top-of-stack should be in M-state.
3635          //     The locked add instruction is serializing, of course.
3636          // (4) Use xchg, which is serializing
3637          //     mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
3638          // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
3639          //     The integer condition codes will tell us if succ was 0.
3640          //     Since _succ and _owner should reside in the same $line and
3641          //     we just stored into _owner, it's likely that the $line
3642          //     remains in M-state for the lock:orl.
3643          //
3644          // We currently use (3), although it's likely that switching to (2)
3645          // is correct for the future.
3646             
3647          masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ; 
3648          if (os::is_MP()) { 
3649             if (VM_Version::supports_sse2() && 1 == FenceInstruction) { 
3650               masm.mfence();
3651             } else { 
3652               masm.lock () ; masm.addptr(Address(rsp, 0), 0) ; 
3653             }
3654          }
3655          // Ratify _succ remains non-null
3656          masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ; 
3657          masm.jccb  (Assembler::notZero, LSuccess) ; 
3658 
3659          masm.xorptr(boxReg, boxReg) ;                  // box is really EAX
3660          if (os::is_MP()) { masm.lock(); }
3661          masm.cmpxchgptr(rsp, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
3662          masm.jccb  (Assembler::notEqual, LSuccess) ;
3663          // Since we're low on registers we installed rsp as a placeholder in _owner.
3664          // Now install Self over rsp.  This is safe as we're transitioning from
3665          // non-null to non-null
3666          masm.get_thread (boxReg) ;
3667          masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg) ;