Commit f4106724 by Dmitry Vyukov Committed by Copybara-Service

absl: speed up Mutex::Lock

Currently Mutex::Lock contains not inlined non-tail call:
TryAcquireWithSpinning -> GetMutexGlobals -> LowLevelCallOnce -> init closure
This turns the function into a non-leaf function with stack frame allocation
and additional register use. Remove this non-tail call to make the function leaf.
Move spin iterations initialization to LockSlow.

Current Lock happy path:

00000000001edc20 <absl::Mutex::Lock()>:
  1edc20:	55                   	push   %rbp
  1edc21:	48 89 e5             	mov    %rsp,%rbp
  1edc24:	53                   	push   %rbx
  1edc25:	50                   	push   %rax
  1edc26:	48 89 fb             	mov    %rdi,%rbx
  1edc29:	48 8b 07             	mov    (%rdi),%rax
  1edc2c:	a8 19                	test   $0x19,%al
  1edc2e:	75 0e                	jne    1edc3e <absl::Mutex::Lock()+0x1e>
  1edc30:	48 89 c1             	mov    %rax,%rcx
  1edc33:	48 83 c9 08          	or     $0x8,%rcx
  1edc37:	f0 48 0f b1 0b       	lock cmpxchg %rcx,(%rbx)
  1edc3c:	74 42                	je     1edc80 <absl::Mutex::Lock()+0x60>
  ... unhappy path ...
  1edc80:	48 83 c4 08          	add    $0x8,%rsp
  1edc84:	5b                   	pop    %rbx
  1edc85:	5d                   	pop    %rbp
  1edc86:	c3                   	ret

New Lock happy path:

00000000001eea80 <absl::Mutex::Lock()>:
  1eea80:	48 8b 07             	mov    (%rdi),%rax
  1eea83:	a8 19                	test   $0x19,%al
  1eea85:	75 0f                	jne    1eea96 <absl::Mutex::Lock()+0x16>
  1eea87:	48 89 c1             	mov    %rax,%rcx
  1eea8a:	48 83 c9 08          	or     $0x8,%rcx
  1eea8e:	f0 48 0f b1 0f       	lock cmpxchg %rcx,(%rdi)
  1eea93:	75 01                	jne    1eea96 <absl::Mutex::Lock()+0x16>
  1eea95:	c3                   	ret
  ... unhappy path ...

PiperOrigin-RevId: 577790105
Change-Id: I20793534050302ff9f7a20aed93791c088d98562
parent 89d2caa1
...@@ -129,11 +129,15 @@ enum DelayMode { AGGRESSIVE, GENTLE }; ...@@ -129,11 +129,15 @@ enum DelayMode { AGGRESSIVE, GENTLE };
struct ABSL_CACHELINE_ALIGNED MutexGlobals { struct ABSL_CACHELINE_ALIGNED MutexGlobals {
absl::once_flag once; absl::once_flag once;
int spinloop_iterations = 0; // Note: this variable is initialized separately in Mutex::LockSlow,
// so that Mutex::Lock does not have a stack frame in optimized build.
std::atomic<int> spinloop_iterations{0};
int32_t mutex_sleep_spins[2] = {}; int32_t mutex_sleep_spins[2] = {};
absl::Duration mutex_sleep_time; absl::Duration mutex_sleep_time;
}; };
ABSL_CONST_INIT static MutexGlobals globals;
absl::Duration MeasureTimeToYield() { absl::Duration MeasureTimeToYield() {
absl::Time before = absl::Now(); absl::Time before = absl::Now();
ABSL_INTERNAL_C_SYMBOL(AbslInternalMutexYield)(); ABSL_INTERNAL_C_SYMBOL(AbslInternalMutexYield)();
...@@ -141,33 +145,30 @@ absl::Duration MeasureTimeToYield() { ...@@ -141,33 +145,30 @@ absl::Duration MeasureTimeToYield() {
} }
const MutexGlobals& GetMutexGlobals() { const MutexGlobals& GetMutexGlobals() {
ABSL_CONST_INIT static MutexGlobals data; absl::base_internal::LowLevelCallOnce(&globals.once, [&]() {
absl::base_internal::LowLevelCallOnce(&data.once, [&]() {
if (absl::base_internal::NumCPUs() > 1) { if (absl::base_internal::NumCPUs() > 1) {
// If this is multiprocessor, allow spinning. If the mode is // If the mode is aggressive then spin many times before yielding.
// aggressive then spin many times before yielding. If the mode is // If the mode is gentle then spin only a few times before yielding.
// gentle then spin only a few times before yielding. Aggressive spinning // Aggressive spinning is used to ensure that an Unlock() call,
// is used to ensure that an Unlock() call, which must get the spin lock // which must get the spin lock for any thread to make progress gets it
// for any thread to make progress gets it without undue delay. // without undue delay.
data.spinloop_iterations = 1500; globals.mutex_sleep_spins[AGGRESSIVE] = 5000;
data.mutex_sleep_spins[AGGRESSIVE] = 5000; globals.mutex_sleep_spins[GENTLE] = 250;
data.mutex_sleep_spins[GENTLE] = 250; globals.mutex_sleep_time = absl::Microseconds(10);
data.mutex_sleep_time = absl::Microseconds(10);
} else { } else {
// If this is a uniprocessor, only yield/sleep. Real-time threads are often // If this is a uniprocessor, only yield/sleep. Real-time threads are often
// unable to yield, so the sleep time needs to be long enough to keep // unable to yield, so the sleep time needs to be long enough to keep
// the calling thread asleep until scheduling happens. // the calling thread asleep until scheduling happens.
data.spinloop_iterations = 0; globals.mutex_sleep_spins[AGGRESSIVE] = 0;
data.mutex_sleep_spins[AGGRESSIVE] = 0; globals.mutex_sleep_spins[GENTLE] = 0;
data.mutex_sleep_spins[GENTLE] = 0; globals.mutex_sleep_time = MeasureTimeToYield() * 5;
data.mutex_sleep_time = MeasureTimeToYield() * 5; globals.mutex_sleep_time =
data.mutex_sleep_time = std::min(globals.mutex_sleep_time, absl::Milliseconds(1));
std::min(data.mutex_sleep_time, absl::Milliseconds(1)); globals.mutex_sleep_time =
data.mutex_sleep_time = std::max(globals.mutex_sleep_time, absl::Microseconds(10));
std::max(data.mutex_sleep_time, absl::Microseconds(10));
} }
}); });
return data; return globals;
} }
} // namespace } // namespace
...@@ -1487,7 +1488,7 @@ void Mutex::AssertNotHeld() const { ...@@ -1487,7 +1488,7 @@ void Mutex::AssertNotHeld() const {
// Attempt to acquire *mu, and return whether successful. The implementation // Attempt to acquire *mu, and return whether successful. The implementation
// may spin for a short while if the lock cannot be acquired immediately. // may spin for a short while if the lock cannot be acquired immediately.
static bool TryAcquireWithSpinning(std::atomic<intptr_t>* mu) { static bool TryAcquireWithSpinning(std::atomic<intptr_t>* mu) {
int c = GetMutexGlobals().spinloop_iterations; int c = globals.spinloop_iterations.load(std::memory_order_relaxed);
do { // do/while somewhat faster on AMD do { // do/while somewhat faster on AMD
intptr_t v = mu->load(std::memory_order_relaxed); intptr_t v = mu->load(std::memory_order_relaxed);
if ((v & (kMuReader | kMuEvent)) != 0) { if ((v & (kMuReader | kMuEvent)) != 0) {
...@@ -1507,11 +1508,12 @@ void Mutex::Lock() { ...@@ -1507,11 +1508,12 @@ void Mutex::Lock() {
GraphId id = DebugOnlyDeadlockCheck(this); GraphId id = DebugOnlyDeadlockCheck(this);
intptr_t v = mu_.load(std::memory_order_relaxed); intptr_t v = mu_.load(std::memory_order_relaxed);
// try fast acquire, then spin loop // try fast acquire, then spin loop
if ((v & (kMuWriter | kMuReader | kMuEvent)) != 0 || if (ABSL_PREDICT_FALSE((v & (kMuWriter | kMuReader | kMuEvent)) != 0) ||
!mu_.compare_exchange_strong(v, kMuWriter | v, std::memory_order_acquire, ABSL_PREDICT_FALSE(!mu_.compare_exchange_strong(
std::memory_order_relaxed)) { v, kMuWriter | v, std::memory_order_acquire,
std::memory_order_relaxed))) {
// try spin acquire, then slow loop // try spin acquire, then slow loop
if (!TryAcquireWithSpinning(&this->mu_)) { if (ABSL_PREDICT_FALSE(!TryAcquireWithSpinning(&this->mu_))) {
this->LockSlow(kExclusive, nullptr, 0); this->LockSlow(kExclusive, nullptr, 0);
} }
} }
...@@ -1783,6 +1785,22 @@ static intptr_t IgnoreWaitingWritersMask(int flag) { ...@@ -1783,6 +1785,22 @@ static intptr_t IgnoreWaitingWritersMask(int flag) {
// Internal version of LockWhen(). See LockSlowWithDeadline() // Internal version of LockWhen(). See LockSlowWithDeadline()
ABSL_ATTRIBUTE_NOINLINE void Mutex::LockSlow(MuHow how, const Condition* cond, ABSL_ATTRIBUTE_NOINLINE void Mutex::LockSlow(MuHow how, const Condition* cond,
int flags) { int flags) {
// Note: we specifically initialize spinloop_iterations after the first use
// in TryAcquireWithSpinning so that Lock function does not have any non-tail
// calls and consequently a stack frame. It's fine to have spinloop_iterations
// uninitialized (meaning no spinning) in all initial uncontended Lock calls
// and in the first contended call. After that we will have
// spinloop_iterations properly initialized.
if (ABSL_PREDICT_FALSE(
globals.spinloop_iterations.load(std::memory_order_relaxed) == 0)) {
if (absl::base_internal::NumCPUs() > 1) {
// If this is multiprocessor, allow spinning.
globals.spinloop_iterations.store(1500, std::memory_order_relaxed);
} else {
// If this is a uniprocessor, only yield/sleep.
globals.spinloop_iterations.store(-1, std::memory_order_relaxed);
}
}
ABSL_RAW_CHECK( ABSL_RAW_CHECK(
this->LockSlowWithDeadline(how, cond, KernelTimeout::Never(), flags), this->LockSlowWithDeadline(how, cond, KernelTimeout::Never(), flags),
"condition untrue on return from LockSlow"); "condition untrue on return from LockSlow");
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment