Commit c45a4393 by Dmitry Vyukov Committed by Copybara-Service

absl: speed up Mutex::[Reader]TryLock

Tidy up Mutex::[Reader]TryLock codegen by outlining slow path
and non-tail function calls, and un-unrolling the retry loop.

Current codegen:
https://gist.githubusercontent.com/dvyukov/a4d353fd71ac873af9332c1340675b60/raw/226537ffa305b25a79ef3a85277fa870fee5191d/gistfile1.txt

New codegen:
https://gist.githubusercontent.com/dvyukov/686a094c5aa357025689764f155e5a29/raw/e3125c1cdb5669fac60faf336e2f60395e29d888/gistfile1.txt

name                                   old cpu/op   new cpu/op   delta
BM_TryLock                             18.0ns ± 0%  17.7ns ± 0%   -1.64%  (p=0.016 n=4+5)
BM_ReaderTryLock/real_time/threads:1   17.9ns ± 0%  17.9ns ± 0%   -0.10%  (p=0.016 n=5+5)
BM_ReaderTryLock/real_time/threads:72  9.61µs ± 8%  8.42µs ± 7%  -12.37%  (p=0.008 n=5+5)

PiperOrigin-RevId: 567006472
Change-Id: Iea0747e71bbf2dc1f00c70a4235203071d795b99
parent adcaae43
...@@ -1582,26 +1582,36 @@ bool Mutex::AwaitCommon(const Condition& cond, KernelTimeout t) { ...@@ -1582,26 +1582,36 @@ bool Mutex::AwaitCommon(const Condition& cond, KernelTimeout t) {
bool Mutex::TryLock() { bool Mutex::TryLock() {
ABSL_TSAN_MUTEX_PRE_LOCK(this, __tsan_mutex_try_lock); ABSL_TSAN_MUTEX_PRE_LOCK(this, __tsan_mutex_try_lock);
intptr_t v = mu_.load(std::memory_order_relaxed); intptr_t v = mu_.load(std::memory_order_relaxed);
if ((v & (kMuWriter | kMuReader | kMuEvent)) == 0 && // try fast acquire // Try fast acquire.
mu_.compare_exchange_strong(v, kMuWriter | v, std::memory_order_acquire, if (ABSL_PREDICT_TRUE((v & (kMuWriter | kMuReader | kMuEvent)) == 0)) {
std::memory_order_relaxed)) { if (ABSL_PREDICT_TRUE(mu_.compare_exchange_strong(
DebugOnlyLockEnter(this); v, kMuWriter | v, std::memory_order_acquire,
ABSL_TSAN_MUTEX_POST_LOCK(this, __tsan_mutex_try_lock, 0); std::memory_order_relaxed))) {
return true;
}
if ((v & kMuEvent) != 0) { // we're recording events
if ((v & kExclusive->slow_need_zero) == 0 && // try fast acquire
mu_.compare_exchange_strong(
v, (kExclusive->fast_or | v) + kExclusive->fast_add,
std::memory_order_acquire, std::memory_order_relaxed)) {
DebugOnlyLockEnter(this); DebugOnlyLockEnter(this);
PostSynchEvent(this, SYNCH_EV_TRYLOCK_SUCCESS);
ABSL_TSAN_MUTEX_POST_LOCK(this, __tsan_mutex_try_lock, 0); ABSL_TSAN_MUTEX_POST_LOCK(this, __tsan_mutex_try_lock, 0);
return true; return true;
} else {
PostSynchEvent(this, SYNCH_EV_TRYLOCK_FAILED);
} }
} else if (ABSL_PREDICT_FALSE((v & kMuEvent) != 0)) {
// We're recording events.
return TryLockSlow();
}
ABSL_TSAN_MUTEX_POST_LOCK(
this, __tsan_mutex_try_lock | __tsan_mutex_try_lock_failed, 0);
return false;
}
// Slow path of TryLock(), used only when events are being recorded
// (kMuEvent set). Outlined (NOINLINE) so the non-tail PostSynchEvent
// calls stay out of TryLock()'s fast-path codegen.
// Returns true iff the exclusive lock was acquired.
ABSL_ATTRIBUTE_NOINLINE bool Mutex::TryLockSlow() {
  intptr_t v = mu_.load(std::memory_order_relaxed);
  if ((v & kExclusive->slow_need_zero) == 0 &&  // try fast acquire
      mu_.compare_exchange_strong(
          v, (kExclusive->fast_or | v) + kExclusive->fast_add,
          std::memory_order_acquire, std::memory_order_relaxed)) {
    DebugOnlyLockEnter(this);
    PostSynchEvent(this, SYNCH_EV_TRYLOCK_SUCCESS);
    ABSL_TSAN_MUTEX_POST_LOCK(this, __tsan_mutex_try_lock, 0);
    return true;
  }
  PostSynchEvent(this, SYNCH_EV_TRYLOCK_FAILED);
  ABSL_TSAN_MUTEX_POST_LOCK(
      this, __tsan_mutex_try_lock | __tsan_mutex_try_lock_failed, 0);
  return false;
}
...@@ -1611,41 +1621,57 @@ bool Mutex::ReaderTryLock() { ...@@ -1611,41 +1621,57 @@ bool Mutex::ReaderTryLock() {
ABSL_TSAN_MUTEX_PRE_LOCK(this, ABSL_TSAN_MUTEX_PRE_LOCK(this,
__tsan_mutex_read_lock | __tsan_mutex_try_lock); __tsan_mutex_read_lock | __tsan_mutex_try_lock);
intptr_t v = mu_.load(std::memory_order_relaxed); intptr_t v = mu_.load(std::memory_order_relaxed);
// Clang tends to unroll the loop when compiling with optimization.
// But in this case it just unnecessary increases code size.
// If CAS is failing due to contention, the jump cost is negligible.
#if defined(__clang__)
#pragma nounroll
#endif
// The while-loops (here and below) iterate only if the mutex word keeps // The while-loops (here and below) iterate only if the mutex word keeps
// changing (typically because the reader count changes) under the CAS. We // changing (typically because the reader count changes) under the CAS.
// limit the number of attempts to avoid having to think about livelock. // We limit the number of attempts to avoid having to think about livelock.
int loop_limit = 5; for (int loop_limit = 5; loop_limit != 0; loop_limit--) {
while ((v & (kMuWriter | kMuWait | kMuEvent)) == 0 && loop_limit != 0) { if (ABSL_PREDICT_FALSE((v & (kMuWriter | kMuWait | kMuEvent)) != 0)) {
if (mu_.compare_exchange_strong(v, (kMuReader | v) + kMuOne, break;
std::memory_order_acquire, }
std::memory_order_relaxed)) { if (ABSL_PREDICT_TRUE(mu_.compare_exchange_strong(
v, (kMuReader | v) + kMuOne, std::memory_order_acquire,
std::memory_order_relaxed))) {
DebugOnlyLockEnter(this); DebugOnlyLockEnter(this);
ABSL_TSAN_MUTEX_POST_LOCK( ABSL_TSAN_MUTEX_POST_LOCK(
this, __tsan_mutex_read_lock | __tsan_mutex_try_lock, 0); this, __tsan_mutex_read_lock | __tsan_mutex_try_lock, 0);
return true; return true;
} }
loop_limit--;
v = mu_.load(std::memory_order_relaxed);
} }
if ((v & kMuEvent) != 0) { // we're recording events if (ABSL_PREDICT_TRUE((v & kMuEvent) == 0)) {
loop_limit = 5; ABSL_TSAN_MUTEX_POST_LOCK(this,
while ((v & kShared->slow_need_zero) == 0 && loop_limit != 0) { __tsan_mutex_read_lock | __tsan_mutex_try_lock |
if (mu_.compare_exchange_strong(v, (kMuReader | v) + kMuOne, __tsan_mutex_try_lock_failed,
std::memory_order_acquire, 0);
std::memory_order_relaxed)) { return false;
DebugOnlyLockEnter(this); }
PostSynchEvent(this, SYNCH_EV_READERTRYLOCK_SUCCESS); // we're recording events
ABSL_TSAN_MUTEX_POST_LOCK( return ReaderTryLockSlow();
this, __tsan_mutex_read_lock | __tsan_mutex_try_lock, 0); }
return true;
} ABSL_ATTRIBUTE_NOINLINE bool Mutex::ReaderTryLockSlow() {
loop_limit--; intptr_t v = mu_.load(std::memory_order_relaxed);
v = mu_.load(std::memory_order_relaxed); #if defined(__clang__)
} #pragma nounroll
if ((v & kMuEvent) != 0) { #endif
PostSynchEvent(this, SYNCH_EV_READERTRYLOCK_FAILED); for (int loop_limit = 5; loop_limit != 0; loop_limit--) {
if ((v & kShared->slow_need_zero) == 0 &&
mu_.compare_exchange_strong(v, (kMuReader | v) + kMuOne,
std::memory_order_acquire,
std::memory_order_relaxed)) {
DebugOnlyLockEnter(this);
PostSynchEvent(this, SYNCH_EV_READERTRYLOCK_SUCCESS);
ABSL_TSAN_MUTEX_POST_LOCK(
this, __tsan_mutex_read_lock | __tsan_mutex_try_lock, 0);
return true;
} }
} }
PostSynchEvent(this, SYNCH_EV_READERTRYLOCK_FAILED);
ABSL_TSAN_MUTEX_POST_LOCK(this, ABSL_TSAN_MUTEX_POST_LOCK(this,
__tsan_mutex_read_lock | __tsan_mutex_try_lock | __tsan_mutex_read_lock | __tsan_mutex_try_lock |
__tsan_mutex_try_lock_failed, __tsan_mutex_try_lock_failed,
......
...@@ -521,6 +521,10 @@ class ABSL_LOCKABLE Mutex { ...@@ -521,6 +521,10 @@ class ABSL_LOCKABLE Mutex {
int flags) ABSL_ATTRIBUTE_COLD; int flags) ABSL_ATTRIBUTE_COLD;
// slow path release // slow path release
void UnlockSlow(SynchWaitParams* waitp) ABSL_ATTRIBUTE_COLD; void UnlockSlow(SynchWaitParams* waitp) ABSL_ATTRIBUTE_COLD;
// TryLock slow path.
bool TryLockSlow();
// ReaderTryLock slow path.
bool ReaderTryLockSlow();
// Common code between Await() and AwaitWithTimeout/Deadline() // Common code between Await() and AwaitWithTimeout/Deadline()
bool AwaitCommon(const Condition& cond, bool AwaitCommon(const Condition& cond,
synchronization_internal::KernelTimeout t); synchronization_internal::KernelTimeout t);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment