Commit cffc9ef2 by Dmitry Vyukov, committed by Copybara-Service

absl: speed up Mutex::Lock

Currently Mutex::Lock contains a non-inlined, non-tail call:
TryAcquireWithSpinning -> GetMutexGlobals -> LowLevelCallOnce -> init closure
This turns the function into a non-leaf function with stack frame allocation
and additional register use. Remove this call to make the function a leaf,
and move spin iteration initialization to LockSlow.
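
The fast path can then read a plain atomic instead of calling into LowLevelCallOnce. A minimal sketch of that pattern, assuming a NumCPUs-like helper (the names mirror the diff below; these standalone functions are illustrative, not the actual mutex.cc code):

  #include <atomic>

  // -1 means "not yet determined"; initialized lazily off the fast path.
  std::atomic<int> spinloop_iterations{-1};

  // Fast path: a single relaxed load, no out-of-line call, so the caller
  // can remain a leaf function. Before initialization this returns -1,
  // which the spin loop treats as "spin once, then take the slow path".
  int SpinIterations() {
    return spinloop_iterations.load(std::memory_order_relaxed);
  }

  // Slow path only (LockSlow in the diff). Racing initializations are
  // benign because every thread stores the same value.
  void InitSpinIterations(int num_cpus) {
    if (spinloop_iterations.load(std::memory_order_relaxed) < 0) {
      spinloop_iterations.store(num_cpus > 1 ? 1500 : 0,
                                std::memory_order_relaxed);
    }
  }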

Current Lock happy path:

00000000001edc20 <absl::Mutex::Lock()>:
  1edc20:	55                   	push   %rbp
  1edc21:	48 89 e5             	mov    %rsp,%rbp
  1edc24:	53                   	push   %rbx
  1edc25:	50                   	push   %rax
  1edc26:	48 89 fb             	mov    %rdi,%rbx
  1edc29:	48 8b 07             	mov    (%rdi),%rax
  1edc2c:	a8 19                	test   $0x19,%al
  1edc2e:	75 0e                	jne    1edc3e <absl::Mutex::Lock()+0x1e>
  1edc30:	48 89 c1             	mov    %rax,%rcx
  1edc33:	48 83 c9 08          	or     $0x8,%rcx
  1edc37:	f0 48 0f b1 0b       	lock cmpxchg %rcx,(%rbx)
  1edc3c:	74 42                	je     1edc80 <absl::Mutex::Lock()+0x60>
  ... unhappy path ...
  1edc80:	48 83 c4 08          	add    $0x8,%rsp
  1edc84:	5b                   	pop    %rbx
  1edc85:	5d                   	pop    %rbp
  1edc86:	c3                   	ret

New Lock happy path:

00000000001eea80 <absl::Mutex::Lock()>:
  1eea80:	48 8b 07             	mov    (%rdi),%rax
  1eea83:	a8 19                	test   $0x19,%al
  1eea85:	75 0f                	jne    1eea96 <absl::Mutex::Lock()+0x16>
  1eea87:	48 89 c1             	mov    %rax,%rcx
  1eea8a:	48 83 c9 08          	or     $0x8,%rcx
  1eea8e:	f0 48 0f b1 0f       	lock cmpxchg %rcx,(%rdi)
  1eea93:	75 01                	jne    1eea96 <absl::Mutex::Lock()+0x16>
  1eea95:	c3                   	ret
  ... unhappy path ...
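
For reference, a rough mapping of the new happy path back to the source (the hex values match the kMu* bits used in mutex.cc, assuming kMuReader = 0x01, kMuWriter = 0x08 and kMuEvent = 0x10; the mapping is approximate):

  mov    (%rdi),%rax          # v = mu_.load(std::memory_order_relaxed)
  test   $0x19,%al            # v & (kMuWriter | kMuReader | kMuEvent), 0x19 = 0x08|0x01|0x10
  or     $0x8,%rcx            # desired = kMuWriter | v
  lock cmpxchg %rcx,(%rdi)    # mu_.compare_exchange_strong(v, desired, acquire, relaxed)
  ret                         # success: return without ever having set up a stack frame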

PiperOrigin-RevId: 566488042
Change-Id: I62f854b82a322cfb1d42c34f8ed01b4677693fca
parent a5dc018f
@@ -129,11 +129,12 @@ enum DelayMode { AGGRESSIVE, GENTLE };
 struct ABSL_CACHELINE_ALIGNED MutexGlobals {
   absl::once_flag once;
-  int spinloop_iterations = 0;
   int32_t mutex_sleep_spins[2] = {};
   absl::Duration mutex_sleep_time;
 };
 
+std::atomic<int> spinloop_iterations{-1};
+
 absl::Duration MeasureTimeToYield() {
   absl::Time before = absl::Now();
   ABSL_INTERNAL_C_SYMBOL(AbslInternalMutexYield)();
@@ -144,12 +145,11 @@ const MutexGlobals& GetMutexGlobals() {
   ABSL_CONST_INIT static MutexGlobals data;
   absl::base_internal::LowLevelCallOnce(&data.once, [&]() {
     if (absl::base_internal::NumCPUs() > 1) {
-      // If this is multiprocessor, allow spinning. If the mode is
-      // aggressive then spin many times before yielding. If the mode is
-      // gentle then spin only a few times before yielding. Aggressive spinning
-      // is used to ensure that an Unlock() call, which must get the spin lock
-      // for any thread to make progress gets it without undue delay.
-      data.spinloop_iterations = 1500;
+      // If the mode is aggressive then spin many times before yielding.
+      // If the mode is gentle then spin only a few times before yielding.
+      // Aggressive spinning is used to ensure that an Unlock() call,
+      // which must get the spin lock for any thread to make progress gets it
+      // without undue delay.
       data.mutex_sleep_spins[AGGRESSIVE] = 5000;
       data.mutex_sleep_spins[GENTLE] = 250;
       data.mutex_sleep_time = absl::Microseconds(10);
@@ -157,7 +157,6 @@ const MutexGlobals& GetMutexGlobals() {
       // If this a uniprocessor, only yield/sleep. Real-time threads are often
      // unable to yield, so the sleep time needs to be long enough to keep
      // the calling thread asleep until scheduling happens.
-      data.spinloop_iterations = 0;
       data.mutex_sleep_spins[AGGRESSIVE] = 0;
       data.mutex_sleep_spins[GENTLE] = 0;
       data.mutex_sleep_time = MeasureTimeToYield() * 5;
@@ -1487,7 +1486,7 @@ void Mutex::AssertNotHeld() const {
 // Attempt to acquire *mu, and return whether successful. The implementation
 // may spin for a short while if the lock cannot be acquired immediately.
 static bool TryAcquireWithSpinning(std::atomic<intptr_t>* mu) {
-  int c = GetMutexGlobals().spinloop_iterations;
+  int c = spinloop_iterations.load(std::memory_order_relaxed);
   do {  // do/while somewhat faster on AMD
     intptr_t v = mu->load(std::memory_order_relaxed);
     if ((v & (kMuReader | kMuEvent)) != 0) {
@@ -1507,11 +1506,12 @@ void Mutex::Lock() {
   GraphId id = DebugOnlyDeadlockCheck(this);
   intptr_t v = mu_.load(std::memory_order_relaxed);
   // try fast acquire, then spin loop
-  if ((v & (kMuWriter | kMuReader | kMuEvent)) != 0 ||
-      !mu_.compare_exchange_strong(v, kMuWriter | v, std::memory_order_acquire,
-                                   std::memory_order_relaxed)) {
+  if (ABSL_PREDICT_FALSE((v & (kMuWriter | kMuReader | kMuEvent)) != 0) ||
+      ABSL_PREDICT_FALSE(!mu_.compare_exchange_strong(
+          v, kMuWriter | v, std::memory_order_acquire,
+          std::memory_order_relaxed))) {
     // try spin acquire, then slow loop
-    if (!TryAcquireWithSpinning(&this->mu_)) {
+    if (ABSL_PREDICT_FALSE(!TryAcquireWithSpinning(&this->mu_))) {
       this->LockSlow(kExclusive, nullptr, 0);
     }
   }
@@ -1746,6 +1746,16 @@ static intptr_t IgnoreWaitingWritersMask(int flag) {
 // Internal version of LockWhen(). See LockSlowWithDeadline()
 ABSL_ATTRIBUTE_NOINLINE void Mutex::LockSlow(MuHow how, const Condition* cond,
                                              int flags) {
+  if (ABSL_PREDICT_FALSE(spinloop_iterations.load(std::memory_order_relaxed) <
+                         0)) {
+    if (absl::base_internal::NumCPUs() > 1) {
+      // If this is multiprocessor, allow spinning.
+      spinloop_iterations.store(1500, std::memory_order_relaxed);
+    } else {
+      // If this a uniprocessor, only yield/sleep.
+      spinloop_iterations.store(0, std::memory_order_relaxed);
+    }
+  }
   ABSL_RAW_CHECK(
       this->LockSlowWithDeadline(how, cond, KernelTimeout::Never(), flags),
       "condition untrue on return from LockSlow");