Commit 7383f346 by Abseil Team Committed by Copybara-Service

Optimize SwissMap iteration by another 5-10% for ARM

https://pastebin.com/fDvgWgHe

After having a chat with Dougall Johnson (https://twitter.com/dougallj/status/1534213050944802816), we realized that __clzll works with zero arguments per documentation:

https://developer.arm.com/documentation/101028/0009/Data-processing-intrinsics

```
Returns the number of leading zero bits in x. When x is zero it returns the argument width, i.e. 32 or 64.
```

Codegen improves https://godbolt.org/z/ebadf717Y

Thus we can use a little bit different construction not involving CLS but using more understandable CLZ and removing some operations.

PiperOrigin-RevId: 453879080
Change-Id: Ie2d7f834f63364d7bd50dd6a682c107985f21942
parent e0a32c2a
......@@ -860,21 +860,6 @@ static_assert(ABSL_INTERNAL_INLINE_NAMESPACE_STR[0] != 'h' ||
#define ABSL_INTERNAL_HAVE_SSSE3 1
#endif
// ABSL_INTERNAL_HAVE_ARM_ACLE is used for compile-time detection of ACLE (ARM
// C language extensions).
#ifdef ABSL_INTERNAL_HAVE_ARM_ACLE
#error ABSL_INTERNAL_HAVE_ARM_ACLE cannot be directly set
// __cls, __rbit were added quite late in clang. They are not supported
// by GCC as well. __cls can be replaced with __builtin_clrsb but clang does
// not recognize cls instruction in latest versions.
// TODO(b/233604649): Relax to __builtin_clrsb and __builtin_bitreverse64 (note
// that the latter is not supported by GCC).
#elif defined(__ARM_ACLE) && defined(__clang__) && \
ABSL_HAVE_BUILTIN(__builtin_arm_cls64) && \
ABSL_HAVE_BUILTIN(__builtin_arm_rbit64)
#define ABSL_INTERNAL_HAVE_ARM_ACLE 1
#endif
// ABSL_INTERNAL_HAVE_ARM_NEON is used for compile-time detection of NEON (ARM
// SIMD).
#ifdef ABSL_INTERNAL_HAVE_ARM_NEON
......
......@@ -188,10 +188,6 @@
#include <arm_neon.h>
#endif
#ifdef __ARM_ACLE
#include <arm_acle.h>
#endif
#include <algorithm>
#include <cmath>
#include <cstdint>
......@@ -634,29 +630,12 @@ struct GroupAArch64Impl {
uint32_t CountLeadingEmptyOrDeleted() const {
uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(ctrl), 0);
assert(IsEmptyOrDeleted(static_cast<ctrl_t>(mask & 0xff)));
constexpr uint64_t gaps = 0x00FEFEFEFEFEFEFEULL;
#if defined(ABSL_INTERNAL_HAVE_ARM_ACLE)
// cls: Count leading sign bits.
// clsll(1ull << 63) -> 0
// clsll((1ull << 63) | (1ull << 62)) -> 1
// clsll((1ull << 63) | (1ull << 61)) -> 0
// clsll(~0ull) -> 63
// clsll(1) -> 62
// clsll(3) -> 61
// clsll(5) -> 60
// Note that CountLeadingEmptyOrDeleted is called when first control block
// is kDeleted or kEmpty. The implementation is similar to GroupPortableImpl
// but avoids +1 and __clsll returns result not including the high bit. Thus
// saves one cycle.
// kEmpty = -128, // 0b10000000
// kDeleted = -2, // 0b11111110
// ~ctrl & (ctrl >> 7) will have the lowest bit set to 1. After rbit,
// it will the highest one.
return (__clsll(__rbitll((~mask & (mask >> 7)) | gaps)) + 8) >> 3;
#else
return (TrailingZeros(((~mask & (mask >> 7)) | gaps) + 1) + 7) >> 3;
#endif // ABSL_INTERNAL_HAVE_ARM_ACLE
// ctrl | ~(ctrl >> 7) will have the lowest bit set to zero for kEmpty and
// kDeleted. We lower all other bits and count number of trailing zeros.
// Clang and GCC optimize countr_zero to rbit+clz without any check for 0,
// so we should be fine.
constexpr uint64_t bits = 0x0101010101010101ULL;
return countr_zero((mask | ~(mask >> 7)) & bits) >> 3;
}
void ConvertSpecialToEmptyAndFullToDeleted(ctrl_t* dst) const {
......@@ -711,8 +690,10 @@ struct GroupPortableImpl {
}
uint32_t CountLeadingEmptyOrDeleted() const {
constexpr uint64_t gaps = 0x00FEFEFEFEFEFEFEULL;
return (TrailingZeros(((~ctrl & (ctrl >> 7)) | gaps) + 1) + 7) >> 3;
// ctrl | ~(ctrl >> 7) will have the lowest bit set to zero for kEmpty and
// kDeleted. We lower all other bits and count number of trailing zeros.
constexpr uint64_t bits = 0x0101010101010101ULL;
return countr_zero((ctrl | ~(ctrl >> 7)) & bits) >> 3;
}
void ConvertSpecialToEmptyAndFullToDeleted(ctrl_t* dst) const {
......
......@@ -266,20 +266,16 @@ TEST(Group, CountLeadingEmptyOrDeleted) {
for (ctrl_t empty : empty_examples) {
std::vector<ctrl_t> e(Group::kWidth, empty);
EXPECT_TRUE(IsEmptyOrDeleted(e[0]));
EXPECT_EQ(Group::kWidth, Group{e.data()}.CountLeadingEmptyOrDeleted());
for (ctrl_t full : full_examples) {
// First is always kEmpty or kDeleted.
for (size_t i = 1; i != Group::kWidth; ++i) {
for (size_t i = 0; i != Group::kWidth; ++i) {
std::vector<ctrl_t> f(Group::kWidth, empty);
f[i] = full;
EXPECT_TRUE(IsEmptyOrDeleted(f[0]));
EXPECT_EQ(i, Group{f.data()}.CountLeadingEmptyOrDeleted());
}
std::vector<ctrl_t> f(Group::kWidth, empty);
f[Group::kWidth * 2 / 3] = full;
f[Group::kWidth / 2] = full;
EXPECT_TRUE(IsEmptyOrDeleted(f[0]));
EXPECT_EQ(
Group::kWidth / 2, Group{f.data()}.CountLeadingEmptyOrDeleted());
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment