Commit cffc9ef2 by Dmitry Vyukov, committed by Copybara-Service

absl: speed up Mutex::Lock

Currently Mutex::Lock contains a non-inlined, non-tail call:
TryAcquireWithSpinning -> GetMutexGlobals -> LowLevelCallOnce -> init closure
This turns the function into a non-leaf function with stack frame allocation
and additional register use. Remove this call to make the function a leaf,
and move spin iteration initialization to LockSlow.
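
The fast path can then read a plain atomic instead of calling into LowLevelCallOnce. A minimal sketch of that pattern, assuming a NumCPUs-like helper (the names mirror the diff below; these standalone functions are illustrative, not the actual mutex.cc code):

  #include <atomic>

  // -1 means "not yet determined"; initialized lazily off the fast path.
  std::atomic<int> spinloop_iterations{-1};

  // Fast path: a single relaxed load, no out-of-line call, so the caller
  // can remain a leaf function. Before initialization this returns -1,
  // which the spin loop treats as "spin once, then take the slow path".
  int SpinIterations() {
    return spinloop_iterations.load(std::memory_order_relaxed);
  }

  // Slow path only (LockSlow in the diff). Racing initializations are
  // benign because every thread stores the same value.
  void InitSpinIterations(int num_cpus) {
    if (spinloop_iterations.load(std::memory_order_relaxed) < 0) {
      spinloop_iterations.store(num_cpus > 1 ? 1500 : 0,
                                std::memory_order_relaxed);
    }
  }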

Current Lock happy path:

00000000001edc20 <absl::Mutex::Lock()>:
  1edc20:	55                   	push   %rbp
  1edc21:	48 89 e5             	mov    %rsp,%rbp
  1edc24:	53                   	push   %rbx
  1edc25:	50                   	push   %rax
  1edc26:	48 89 fb             	mov    %rdi,%rbx
  1edc29:	48 8b 07             	mov    (%rdi),%rax
  1edc2c:	a8 19                	test   $0x19,%al
  1edc2e:	75 0e                	jne    1edc3e <absl::Mutex::Lock()+0x1e>
  1edc30:	48 89 c1             	mov    %rax,%rcx
  1edc33:	48 83 c9 08          	or     $0x8,%rcx
  1edc37:	f0 48 0f b1 0b       	lock cmpxchg %rcx,(%rbx)
  1edc3c:	74 42                	je     1edc80 <absl::Mutex::Lock()+0x60>
  ... unhappy path ...
  1edc80:	48 83 c4 08          	add    $0x8,%rsp
  1edc84:	5b                   	pop    %rbx
  1edc85:	5d                   	pop    %rbp
  1edc86:	c3                   	ret

New Lock happy path:

00000000001eea80 <absl::Mutex::Lock()>:
  1eea80:	48 8b 07             	mov    (%rdi),%rax
  1eea83:	a8 19                	test   $0x19,%al
  1eea85:	75 0f                	jne    1eea96 <absl::Mutex::Lock()+0x16>
  1eea87:	48 89 c1             	mov    %rax,%rcx
  1eea8a:	48 83 c9 08          	or     $0x8,%rcx
  1eea8e:	f0 48 0f b1 0f       	lock cmpxchg %rcx,(%rdi)
  1eea93:	75 01                	jne    1eea96 <absl::Mutex::Lock()+0x16>
  1eea95:	c3                   	ret
  ... unhappy path ...
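
For reference, a rough mapping of the new happy path back to the source (the hex values match the kMu* bits used in mutex.cc, assuming kMuReader = 0x01, kMuWriter = 0x08 and kMuEvent = 0x10; the mapping is approximate):

  mov    (%rdi),%rax          # v = mu_.load(std::memory_order_relaxed)
  test   $0x19,%al            # v & (kMuWriter | kMuReader | kMuEvent), 0x19 = 0x08|0x01|0x10
  or     $0x8,%rcx            # desired = kMuWriter | v
  lock cmpxchg %rcx,(%rdi)    # mu_.compare_exchange_strong(v, desired, acquire, relaxed)
  ret                         # success: return without ever having set up a stack frame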

PiperOrigin-RevId: 566488042
Change-Id: I62f854b82a322cfb1d42c34f8ed01b4677693fca
parent a5dc018f
@@ -129,11 +129,12 @@ enum DelayMode { AGGRESSIVE, GENTLE };
 struct ABSL_CACHELINE_ALIGNED MutexGlobals {
   absl::once_flag once;
-  int spinloop_iterations = 0;
   int32_t mutex_sleep_spins[2] = {};
   absl::Duration mutex_sleep_time;
 };
 
+std::atomic<int> spinloop_iterations{-1};
+
 absl::Duration MeasureTimeToYield() {
   absl::Time before = absl::Now();
   ABSL_INTERNAL_C_SYMBOL(AbslInternalMutexYield)();
@@ -144,12 +145,11 @@ const MutexGlobals& GetMutexGlobals() {
   ABSL_CONST_INIT static MutexGlobals data;
   absl::base_internal::LowLevelCallOnce(&data.once, [&]() {
     if (absl::base_internal::NumCPUs() > 1) {
-      // If this is multiprocessor, allow spinning. If the mode is
-      // aggressive then spin many times before yielding. If the mode is
-      // gentle then spin only a few times before yielding. Aggressive spinning
-      // is used to ensure that an Unlock() call, which must get the spin lock
-      // for any thread to make progress gets it without undue delay.
-      data.spinloop_iterations = 1500;
+      // If the mode is aggressive then spin many times before yielding.
+      // If the mode is gentle then spin only a few times before yielding.
+      // Aggressive spinning is used to ensure that an Unlock() call,
+      // which must get the spin lock for any thread to make progress gets it
+      // without undue delay.
       data.mutex_sleep_spins[AGGRESSIVE] = 5000;
       data.mutex_sleep_spins[GENTLE] = 250;
       data.mutex_sleep_time = absl::Microseconds(10);
@@ -157,7 +157,6 @@ const MutexGlobals& GetMutexGlobals() {
       // If this a uniprocessor, only yield/sleep. Real-time threads are often
      // unable to yield, so the sleep time needs to be long enough to keep
      // the calling thread asleep until scheduling happens.
-      data.spinloop_iterations = 0;
       data.mutex_sleep_spins[AGGRESSIVE] = 0;
       data.mutex_sleep_spins[GENTLE] = 0;
       data.mutex_sleep_time = MeasureTimeToYield() * 5;
@@ -1487,7 +1486,7 @@ void Mutex::AssertNotHeld() const {
 // Attempt to acquire *mu, and return whether successful. The implementation
 // may spin for a short while if the lock cannot be acquired immediately.
 static bool TryAcquireWithSpinning(std::atomic<intptr_t>* mu) {
-  int c = GetMutexGlobals().spinloop_iterations;
+  int c = spinloop_iterations.load(std::memory_order_relaxed);
   do {  // do/while somewhat faster on AMD
     intptr_t v = mu->load(std::memory_order_relaxed);
     if ((v & (kMuReader | kMuEvent)) != 0) {
@@ -1507,11 +1506,12 @@ void Mutex::Lock() {
   GraphId id = DebugOnlyDeadlockCheck(this);
   intptr_t v = mu_.load(std::memory_order_relaxed);
   // try fast acquire, then spin loop
-  if ((v & (kMuWriter | kMuReader | kMuEvent)) != 0 ||
-      !mu_.compare_exchange_strong(v, kMuWriter | v, std::memory_order_acquire,
-                                   std::memory_order_relaxed)) {
+  if (ABSL_PREDICT_FALSE((v & (kMuWriter | kMuReader | kMuEvent)) != 0) ||
+      ABSL_PREDICT_FALSE(!mu_.compare_exchange_strong(
+          v, kMuWriter | v, std::memory_order_acquire,
+          std::memory_order_relaxed))) {
     // try spin acquire, then slow loop
-    if (!TryAcquireWithSpinning(&this->mu_)) {
+    if (ABSL_PREDICT_FALSE(!TryAcquireWithSpinning(&this->mu_))) {
       this->LockSlow(kExclusive, nullptr, 0);
     }
   }
@@ -1746,6 +1746,16 @@ static intptr_t IgnoreWaitingWritersMask(int flag) {
 // Internal version of LockWhen(). See LockSlowWithDeadline()
 ABSL_ATTRIBUTE_NOINLINE void Mutex::LockSlow(MuHow how, const Condition* cond,
                                              int flags) {
+  if (ABSL_PREDICT_FALSE(spinloop_iterations.load(std::memory_order_relaxed) <
+                         0)) {
+    if (absl::base_internal::NumCPUs() > 1) {
+      // If this is multiprocessor, allow spinning.
+      spinloop_iterations.store(1500, std::memory_order_relaxed);
+    } else {
+      // If this a uniprocessor, only yield/sleep.
+      spinloop_iterations.store(0, std::memory_order_relaxed);
+    }
+  }
   ABSL_RAW_CHECK(
       this->LockSlowWithDeadline(how, cond, KernelTimeout::Never(), flags),
       "condition untrue on return from LockSlow");