Commit 9fb8a388 by Abseil Team Committed by Copybara-Service

The current implementation of control byte checking on x86 has an unnecessary sign…

The current implementation of control byte checking on x86 has an unnecessary sign extension after doing the control byte comparison. Changing the bitmask object
 to explicitly track only 16 bits (instead of 32) eliminates this, saving an instruction / cycle. This speeds up hit checking by up to 6% on Milan and up to 15% on CLX.

PiperOrigin-RevId: 572965182
Change-Id: Ifda0e3250d409266d6dcef89cba6ada91d879291
parent 5a7fca7d
...@@ -408,7 +408,9 @@ class NonIterableBitMask { ...@@ -408,7 +408,9 @@ class NonIterableBitMask {
uint32_t LeadingZeros() const { uint32_t LeadingZeros() const {
constexpr int total_significant_bits = SignificantBits << Shift; constexpr int total_significant_bits = SignificantBits << Shift;
constexpr int extra_bits = sizeof(T) * 8 - total_significant_bits; constexpr int extra_bits = sizeof(T) * 8 - total_significant_bits;
return static_cast<uint32_t>(countl_zero(mask_ << extra_bits)) >> Shift; return static_cast<uint32_t>(
countl_zero(static_cast<T>(mask_ << extra_bits))) >>
Shift;
} }
T mask_; T mask_;
...@@ -614,29 +616,31 @@ struct GroupSse2Impl { ...@@ -614,29 +616,31 @@ struct GroupSse2Impl {
} }
// Returns a bitmask representing the positions of slots that match hash. // Returns a bitmask representing the positions of slots that match hash.
BitMask<uint32_t, kWidth> Match(h2_t hash) const { BitMask<uint16_t, kWidth> Match(h2_t hash) const {
auto match = _mm_set1_epi8(static_cast<char>(hash)); auto match = _mm_set1_epi8(static_cast<char>(hash));
return BitMask<uint32_t, kWidth>( BitMask<uint16_t, kWidth> result = BitMask<uint16_t, kWidth>(0);
static_cast<uint32_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(match, ctrl)))); result = BitMask<uint16_t, kWidth>(
static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(match, ctrl))));
return result;
} }
// Returns a bitmask representing the positions of empty slots. // Returns a bitmask representing the positions of empty slots.
NonIterableBitMask<uint32_t, kWidth> MaskEmpty() const { NonIterableBitMask<uint16_t, kWidth> MaskEmpty() const {
#ifdef ABSL_INTERNAL_HAVE_SSSE3 #ifdef ABSL_INTERNAL_HAVE_SSSE3
// This only works because ctrl_t::kEmpty is -128. // This only works because ctrl_t::kEmpty is -128.
return NonIterableBitMask<uint32_t, kWidth>( return NonIterableBitMask<uint16_t, kWidth>(
static_cast<uint32_t>(_mm_movemask_epi8(_mm_sign_epi8(ctrl, ctrl)))); static_cast<uint16_t>(_mm_movemask_epi8(_mm_sign_epi8(ctrl, ctrl))));
#else #else
auto match = _mm_set1_epi8(static_cast<char>(ctrl_t::kEmpty)); auto match = _mm_set1_epi8(static_cast<char>(ctrl_t::kEmpty));
return NonIterableBitMask<uint32_t, kWidth>( return NonIterableBitMask<uint16_t, kWidth>(
static_cast<uint32_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(match, ctrl)))); static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(match, ctrl))));
#endif #endif
} }
// Returns a bitmask representing the positions of empty or deleted slots. // Returns a bitmask representing the positions of empty or deleted slots.
NonIterableBitMask<uint32_t, kWidth> MaskEmptyOrDeleted() const { NonIterableBitMask<uint16_t, kWidth> MaskEmptyOrDeleted() const {
auto special = _mm_set1_epi8(static_cast<char>(ctrl_t::kSentinel)); auto special = _mm_set1_epi8(static_cast<char>(ctrl_t::kSentinel));
return NonIterableBitMask<uint32_t, kWidth>(static_cast<uint32_t>( return NonIterableBitMask<uint16_t, kWidth>(static_cast<uint16_t>(
_mm_movemask_epi8(_mm_cmpgt_epi8_fixed(special, ctrl)))); _mm_movemask_epi8(_mm_cmpgt_epi8_fixed(special, ctrl))));
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment