Commit c45a4393 by Dmitry Vyukov Committed by Copybara-Service

absl: speed up Mutex::[Reader]TryLock

Tidy up Mutex::[Reader]TryLock codegen by outlining slow path
and non-tail function calls, and un-unrolling the retry loop.

Current codegen:
https://gist.githubusercontent.com/dvyukov/a4d353fd71ac873af9332c1340675b60/raw/226537ffa305b25a79ef3a85277fa870fee5191d/gistfile1.txt

New codegen:
https://gist.githubusercontent.com/dvyukov/686a094c5aa357025689764f155e5a29/raw/e3125c1cdb5669fac60faf336e2f60395e29d888/gistfile1.txt

name                                   old cpu/op   new cpu/op   delta
BM_TryLock                             18.0ns ± 0%  17.7ns ± 0%   -1.64%  (p=0.016 n=4+5)
BM_ReaderTryLock/real_time/threads:1   17.9ns ± 0%  17.9ns ± 0%   -0.10%  (p=0.016 n=5+5)
BM_ReaderTryLock/real_time/threads:72  9.61µs ± 8%  8.42µs ± 7%  -12.37%  (p=0.008 n=5+5)

PiperOrigin-RevId: 567006472
Change-Id: Iea0747e71bbf2dc1f00c70a4235203071d795b99
parent adcaae43
...@@ -1582,26 +1582,36 @@ bool Mutex::AwaitCommon(const Condition& cond, KernelTimeout t) { ...@@ -1582,26 +1582,36 @@ bool Mutex::AwaitCommon(const Condition& cond, KernelTimeout t) {
bool Mutex::TryLock() { bool Mutex::TryLock() {
ABSL_TSAN_MUTEX_PRE_LOCK(this, __tsan_mutex_try_lock); ABSL_TSAN_MUTEX_PRE_LOCK(this, __tsan_mutex_try_lock);
intptr_t v = mu_.load(std::memory_order_relaxed); intptr_t v = mu_.load(std::memory_order_relaxed);
if ((v & (kMuWriter | kMuReader | kMuEvent)) == 0 && // try fast acquire // Try fast acquire.
mu_.compare_exchange_strong(v, kMuWriter | v, std::memory_order_acquire, if (ABSL_PREDICT_TRUE((v & (kMuWriter | kMuReader | kMuEvent)) == 0)) {
std::memory_order_relaxed)) { if (ABSL_PREDICT_TRUE(mu_.compare_exchange_strong(
DebugOnlyLockEnter(this); v, kMuWriter | v, std::memory_order_acquire,
ABSL_TSAN_MUTEX_POST_LOCK(this, __tsan_mutex_try_lock, 0); std::memory_order_relaxed))) {
return true;
}
if ((v & kMuEvent) != 0) { // we're recording events
if ((v & kExclusive->slow_need_zero) == 0 && // try fast acquire
mu_.compare_exchange_strong(
v, (kExclusive->fast_or | v) + kExclusive->fast_add,
std::memory_order_acquire, std::memory_order_relaxed)) {
DebugOnlyLockEnter(this); DebugOnlyLockEnter(this);
PostSynchEvent(this, SYNCH_EV_TRYLOCK_SUCCESS);
ABSL_TSAN_MUTEX_POST_LOCK(this, __tsan_mutex_try_lock, 0); ABSL_TSAN_MUTEX_POST_LOCK(this, __tsan_mutex_try_lock, 0);
return true; return true;
} else {
PostSynchEvent(this, SYNCH_EV_TRYLOCK_FAILED);
} }
} else if (ABSL_PREDICT_FALSE((v & kMuEvent) != 0)) {
// We're recording events.
return TryLockSlow();
}
ABSL_TSAN_MUTEX_POST_LOCK(
this, __tsan_mutex_try_lock | __tsan_mutex_try_lock_failed, 0);
return false;
}
// Slow path of TryLock(), used only when events are being recorded
// (kMuEvent set). Outlined (NOINLINE) so the non-tail PostSynchEvent
// calls stay out of TryLock()'s fast-path codegen.
// Returns true iff the exclusive lock was acquired.
ABSL_ATTRIBUTE_NOINLINE bool Mutex::TryLockSlow() {
  intptr_t v = mu_.load(std::memory_order_relaxed);
  if ((v & kExclusive->slow_need_zero) == 0 &&  // try fast acquire
      mu_.compare_exchange_strong(
          v, (kExclusive->fast_or | v) + kExclusive->fast_add,
          std::memory_order_acquire, std::memory_order_relaxed)) {
    DebugOnlyLockEnter(this);
    PostSynchEvent(this, SYNCH_EV_TRYLOCK_SUCCESS);
    ABSL_TSAN_MUTEX_POST_LOCK(this, __tsan_mutex_try_lock, 0);
    return true;
  }
  PostSynchEvent(this, SYNCH_EV_TRYLOCK_FAILED);
  ABSL_TSAN_MUTEX_POST_LOCK(
      this, __tsan_mutex_try_lock | __tsan_mutex_try_lock_failed, 0);
  return false;
}
...@@ -1611,41 +1621,57 @@ bool Mutex::ReaderTryLock() { ...@@ -1611,41 +1621,57 @@ bool Mutex::ReaderTryLock() {
ABSL_TSAN_MUTEX_PRE_LOCK(this, ABSL_TSAN_MUTEX_PRE_LOCK(this,
__tsan_mutex_read_lock | __tsan_mutex_try_lock); __tsan_mutex_read_lock | __tsan_mutex_try_lock);
intptr_t v = mu_.load(std::memory_order_relaxed); intptr_t v = mu_.load(std::memory_order_relaxed);
// Clang tends to unroll the loop when compiling with optimization.
// But in this case it just unnecessary increases code size.
// If CAS is failing due to contention, the jump cost is negligible.
#if defined(__clang__)
#pragma nounroll
#endif
// The while-loops (here and below) iterate only if the mutex word keeps // The while-loops (here and below) iterate only if the mutex word keeps
// changing (typically because the reader count changes) under the CAS. We // changing (typically because the reader count changes) under the CAS.
// limit the number of attempts to avoid having to think about livelock. // We limit the number of attempts to avoid having to think about livelock.
int loop_limit = 5; for (int loop_limit = 5; loop_limit != 0; loop_limit--) {
while ((v & (kMuWriter | kMuWait | kMuEvent)) == 0 && loop_limit != 0) { if (ABSL_PREDICT_FALSE((v & (kMuWriter | kMuWait | kMuEvent)) != 0)) {
if (mu_.compare_exchange_strong(v, (kMuReader | v) + kMuOne, break;
std::memory_order_acquire, }
std::memory_order_relaxed)) { if (ABSL_PREDICT_TRUE(mu_.compare_exchange_strong(
v, (kMuReader | v) + kMuOne, std::memory_order_acquire,
std::memory_order_relaxed))) {
DebugOnlyLockEnter(this); DebugOnlyLockEnter(this);
ABSL_TSAN_MUTEX_POST_LOCK( ABSL_TSAN_MUTEX_POST_LOCK(
this, __tsan_mutex_read_lock | __tsan_mutex_try_lock, 0); this, __tsan_mutex_read_lock | __tsan_mutex_try_lock, 0);
return true; return true;
} }
loop_limit--;
v = mu_.load(std::memory_order_relaxed);
} }
if ((v & kMuEvent) != 0) { // we're recording events if (ABSL_PREDICT_TRUE((v & kMuEvent) == 0)) {
loop_limit = 5; ABSL_TSAN_MUTEX_POST_LOCK(this,
while ((v & kShared->slow_need_zero) == 0 && loop_limit != 0) { __tsan_mutex_read_lock | __tsan_mutex_try_lock |
if (mu_.compare_exchange_strong(v, (kMuReader | v) + kMuOne, __tsan_mutex_try_lock_failed,
std::memory_order_acquire, 0);
std::memory_order_relaxed)) { return false;
DebugOnlyLockEnter(this); }
PostSynchEvent(this, SYNCH_EV_READERTRYLOCK_SUCCESS); // we're recording events
ABSL_TSAN_MUTEX_POST_LOCK( return ReaderTryLockSlow();
this, __tsan_mutex_read_lock | __tsan_mutex_try_lock, 0); }
return true;
} ABSL_ATTRIBUTE_NOINLINE bool Mutex::ReaderTryLockSlow() {
loop_limit--; intptr_t v = mu_.load(std::memory_order_relaxed);
v = mu_.load(std::memory_order_relaxed); #if defined(__clang__)
} #pragma nounroll
if ((v & kMuEvent) != 0) { #endif
PostSynchEvent(this, SYNCH_EV_READERTRYLOCK_FAILED); for (int loop_limit = 5; loop_limit != 0; loop_limit--) {
if ((v & kShared->slow_need_zero) == 0 &&
mu_.compare_exchange_strong(v, (kMuReader | v) + kMuOne,
std::memory_order_acquire,
std::memory_order_relaxed)) {
DebugOnlyLockEnter(this);
PostSynchEvent(this, SYNCH_EV_READERTRYLOCK_SUCCESS);
ABSL_TSAN_MUTEX_POST_LOCK(
this, __tsan_mutex_read_lock | __tsan_mutex_try_lock, 0);
return true;
} }
} }
PostSynchEvent(this, SYNCH_EV_READERTRYLOCK_FAILED);
ABSL_TSAN_MUTEX_POST_LOCK(this, ABSL_TSAN_MUTEX_POST_LOCK(this,
__tsan_mutex_read_lock | __tsan_mutex_try_lock | __tsan_mutex_read_lock | __tsan_mutex_try_lock |
__tsan_mutex_try_lock_failed, __tsan_mutex_try_lock_failed,
......
...@@ -521,6 +521,10 @@ class ABSL_LOCKABLE Mutex { ...@@ -521,6 +521,10 @@ class ABSL_LOCKABLE Mutex {
int flags) ABSL_ATTRIBUTE_COLD; int flags) ABSL_ATTRIBUTE_COLD;
// slow path release // slow path release
void UnlockSlow(SynchWaitParams* waitp) ABSL_ATTRIBUTE_COLD; void UnlockSlow(SynchWaitParams* waitp) ABSL_ATTRIBUTE_COLD;
// TryLock slow path.
bool TryLockSlow();
// ReaderTryLock slow path.
bool ReaderTryLockSlow();
// Common code between Await() and AwaitWithTimeout/Deadline() // Common code between Await() and AwaitWithTimeout/Deadline()
bool AwaitCommon(const Condition& cond, bool AwaitCommon(const Condition& cond,
synchronization_internal::KernelTimeout t); synchronization_internal::KernelTimeout t);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment