Commit 40b2776e by Connal de Souza Committed by Copybara-Service

Optimize GrowIntoSingleGroupShuffleControlBytes.

This implementation is designed to avoid needing to copy to an intermediate buffer and then read from it again, which is an expensive Read-after-Write hazard.

PiperOrigin-RevId: 638071429
Change-Id: I390b4d38b8c1bd7fffba3d403baba6f1511555b0
parent cf071bb3
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include "absl/base/attributes.h" #include "absl/base/attributes.h"
#include "absl/base/config.h" #include "absl/base/config.h"
#include "absl/base/dynamic_annotations.h" #include "absl/base/dynamic_annotations.h"
#include "absl/base/internal/endian.h"
#include "absl/base/optimization.h" #include "absl/base/optimization.h"
#include "absl/container/internal/container_memory.h" #include "absl/container/internal/container_memory.h"
#include "absl/container/internal/hashtablez_sampler.h" #include "absl/container/internal/hashtablez_sampler.h"
...@@ -342,77 +343,126 @@ void ClearBackingArray(CommonFields& c, const PolicyFunctions& policy, ...@@ -342,77 +343,126 @@ void ClearBackingArray(CommonFields& c, const PolicyFunctions& policy,
} }
// Fills `new_ctrl` for a single-group table of `new_capacity` from the old control bytes: the second half of the old bytes is moved to the front, the copied old sentinel becomes kEmpty, the remaining bytes are set to kEmpty, the leading bytes are cloned after position `new_capacity`, and finally the sentinel is written at `new_ctrl[new_capacity]`.
void HashSetResizeHelper::GrowIntoSingleGroupShuffleControlBytes( void HashSetResizeHelper::GrowIntoSingleGroupShuffleControlBytes(
ctrl_t* new_ctrl, size_t new_capacity) const { ctrl_t* __restrict new_ctrl, size_t new_capacity) const {
assert(is_single_group(new_capacity)); assert(is_single_group(new_capacity));
constexpr size_t kHalfWidth = Group::kWidth / 2; constexpr size_t kHalfWidth = Group::kWidth / 2;
constexpr size_t kQuarterWidth = Group::kWidth / 4;
assert(old_capacity_ < kHalfWidth); assert(old_capacity_ < kHalfWidth);
static_assert(sizeof(uint64_t) >= kHalfWidth,
"Group size is too large. The ctrl bytes for half a group must "
"fit into a uint64_t for this implementation.");
static_assert(sizeof(uint64_t) <= Group::kWidth,
"Group size is too small. The ctrl bytes for a group must "
"cover a uint64_t for this implementation.");
const size_t half_old_capacity = old_capacity_ / 2; const size_t half_old_capacity = old_capacity_ / 2;
// NOTE: operations are done with compile time known size = kHalfWidth. // NOTE: operations are done with compile time known size = kHalfWidth.
// Compiler optimizes that into single ASM operation. // Compiler optimizes that into single ASM operation.
// Copy second half of bytes to the beginning. // Load the bytes from half_old_capacity + 1. This contains the last half of
// We potentially copy more bytes in order to have compile time known size. // old_ctrl bytes, followed by the sentinel byte, and then the first half of
// Mirrored bytes from the old_ctrl() will also be copied. // the cloned bytes. This effectively shuffles the control bytes.
// In case of old_capacity_ == 3, we will copy 1st element twice. uint64_t copied_bytes = 0;
copied_bytes =
absl::little_endian::Load64(old_ctrl() + half_old_capacity + 1);
// We change the sentinel byte to kEmpty before storing to both the start of
// the new_ctrl, and past the end of the new_ctrl later for the new cloned
// bytes. Note that this is faster than setting the sentinel byte to kEmpty
// after the copy directly in new_ctrl because we are limited on store
// bandwidth.
constexpr uint64_t kEmptyXorSentinel =
static_cast<uint8_t>(ctrl_t::kEmpty) ^
static_cast<uint8_t>(ctrl_t::kSentinel);
const uint64_t mask_convert_old_sentinel_to_empty =
kEmptyXorSentinel << (half_old_capacity * 8);
copied_bytes ^= mask_convert_old_sentinel_to_empty;
// Copy second half of bytes to the beginning. This correctly sets the bytes
// [0, old_capacity]. We potentially copy more bytes in order to have compile
// time known size. Mirrored bytes from the old_ctrl() will also be copied. In
// case of old_capacity_ == 3, we will copy the last element (2) twice.
// Examples: // Examples:
// (old capacity = 1)
// old_ctrl = 0S0EEEEEEE... // old_ctrl = 0S0EEEEEEE...
// new_ctrl = S0EEEEEEEE... // new_ctrl = E0EEEEEE??...
// //
// old_ctrl = 01S01EEEEE... // (old capacity = 3)
// new_ctrl = 1S01EEEEEE... // old_ctrl = 012S012EEEEE...
// new_ctrl = 2E012EEE????...
// //
// (old capacity = 7)
// old_ctrl = 0123456S0123456EE... // old_ctrl = 0123456S0123456EE...
// new_ctrl = 456S0123?????????... // new_ctrl = 456E0123?????????...
std::memcpy(new_ctrl, old_ctrl() + half_old_capacity + 1, kHalfWidth); absl::little_endian::Store64(new_ctrl, copied_bytes);
// Clean up copied kSentinel from old_ctrl.
new_ctrl[half_old_capacity] = ctrl_t::kEmpty; // Set the space [old_capacity + 1, new_capacity] to empty as these bytes will
// not be written again. This is safe because
// Clean up damaged or uninitialized bytes. // NumControlBytes = new_capacity + kWidth and new_capacity >=
// old_capacity+1.
// Clean bytes after the intended size of the copy.
// Example:
// new_ctrl = 1E01EEEEEEE????
// *new_ctrl= 1E0EEEEEEEE????
// position /
std::memset(new_ctrl + old_capacity_ + 1, static_cast<int8_t>(ctrl_t::kEmpty),
kHalfWidth);
// Clean non-mirrored bytes that are not initialized.
// For small old_capacity that may be inside of mirrored bytes zone.
// Examples: // Examples:
// new_ctrl = 1E0EEEEEEEE??????????.... // (old_capacity = 3, new_capacity = 15)
// *new_ctrl= 1E0EEEEEEEEEEEEE?????.... // new_ctrl = 2E012EEE?????????????...??
// position / // *new_ctrl = 2E01EEEEEEEEEEEEEEEE?...??
// position / S
// //
// new_ctrl = 456E0123???????????... // (old_capacity = 7, new_capacity = 15)
// *new_ctrl= 456E0123EEEEEEEE???... // new_ctrl = 456E0123?????????????????...??
// position / // *new_ctrl = 456E0123EEEEEEEEEEEEEEEE?...??
std::memset(new_ctrl + kHalfWidth, static_cast<int8_t>(ctrl_t::kEmpty), // position / S
kHalfWidth); std::memset(new_ctrl + old_capacity_ + 1, static_cast<int8_t>(ctrl_t::kEmpty),
// Clean last mirrored bytes that are not initialized Group::kWidth);
// and will not be overwritten by mirroring.
// Set the last kHalfWidth bytes to empty, to ensure the bytes all the way to
// the end are initialized.
// Examples: // Examples:
// new_ctrl = 1E0EEEEEEEEEEEEE???????? // new_ctrl = 2E01EEEEEEEEEEEEEEEE?...???????
// *new_ctrl= 1E0EEEEEEEEEEEEEEEEEEEEE // *new_ctrl = 2E01EEEEEEEEEEEEEEEE???EEEEEEEE
// position S / // position S /
// //
// new_ctrl = 456E0123EEEEEEEE??????????????? // new_ctrl = 456E0123EEEEEEEEEEEEEEEE???????
// *new_ctrl= 456E0123EEEEEEEE???????EEEEEEEE // *new_ctrl = 456E0123EEEEEEEEEEEEEEEEEEEEEEE
// position S / // position S /
std::memset(new_ctrl + new_capacity + kHalfWidth, std::memset(new_ctrl + NumControlBytes(new_capacity) - kHalfWidth,
static_cast<int8_t>(ctrl_t::kEmpty), kHalfWidth); static_cast<int8_t>(ctrl_t::kEmpty), kHalfWidth);
// Create mirrored bytes. old_capacity_ < kHalfWidth // Copy the first bytes to the end (starting at new_capacity + 1) to set the
// Example: // cloned bytes. Note that we use the already copied bytes from old_ctrl here
// new_ctrl = 456E0123EEEEEEEE???????EEEEEEEE // rather than copying from new_ctrl to avoid a Read-after-Write hazard, since
// *new_ctrl= 456E0123EEEEEEEE456E0123EEEEEEE // new_ctrl was just written to. The first old_capacity-1 bytes are set
// position S/ // correctly. Then there may be up to old_capacity bytes that need to be
ctrl_t g[kHalfWidth]; // overwritten, and any remaining bytes will be correctly set to empty. This
std::memcpy(g, new_ctrl, kHalfWidth); // sets [new_capacity + 1, new_capacity + 1 + old_capacity] correctly.
std::memcpy(new_ctrl + new_capacity + 1, g, kHalfWidth); // Examples:
// new_ctrl = 2E01EEEEEEEEEEEEEEEE?...???????
// *new_ctrl = 2E01EEEEEEEEEEEE2E012EEEEEEEEEE
// position S/
//
// new_ctrl = 456E0123EEEEEEEE?...???EEEEEEEE
// *new_ctrl = 456E0123EEEEEEEE456E0123EEEEEEE
// position S/
absl::little_endian::Store64(new_ctrl + new_capacity + 1, copied_bytes);
// Set the remaining bytes at the end past the cloned bytes to empty. The
// incorrectly set bytes are [new_capacity + old_capacity + 2,
// min(new_capacity + 1 + kHalfWidth, new_capacity + old_capacity + 2 +
// half_old_capacity)]. Taking the difference, we need to set min(kHalfWidth -
// (old_capacity + 1), half_old_capacity). Since old_capacity < kHalfWidth,
// half_old_capacity < kQuarterWidth, so we set kQuarterWidth beginning at
// new_capacity + old_capacity + 2 to kEmpty.
// Examples:
// new_ctrl = 2E01EEEEEEEEEEEE2E012EEEEEEEEEE
// *new_ctrl = 2E01EEEEEEEEEEEE2E01EEEEEEEEEEE
// position S /
//
// new_ctrl = 456E0123EEEEEEEE456E0123EEEEEEE
// *new_ctrl = 456E0123EEEEEEEE456E0123EEEEEEE (no change)
// position S /
std::memset(new_ctrl + new_capacity + old_capacity_ + 2,
static_cast<int8_t>(ctrl_t::kEmpty), kQuarterWidth);
// Finally set sentinel to its place. // Finally, we set the new sentinel byte.
new_ctrl[new_capacity] = ctrl_t::kSentinel; new_ctrl[new_capacity] = ctrl_t::kSentinel;
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment