Commit 64814435 by Abseil Team Committed by Copybara-Service

Optimize SwissMap for ARM by 3-8% for all operations

https://pastebin.com/CmnzwUFN

The key idea is to avoid 16-byte NEON and instead use 8-byte NEON, which has lower latency for BitMask::Match. Even though 16-byte NEON achieves higher throughput, in SwissMap it is very important to find these matches with low latency, because probing happens at most once on average.

I also introduced NonIterableMask, because ARM has very efficient cbnz instructions and the additional AND on the scalar mask cost one extra cycle of latency.

PiperOrigin-RevId: 453216147
Change-Id: I842c50d323954f8383ae156491232ced55aacb78
parent 48419595
...@@ -898,4 +898,13 @@ static_assert(ABSL_INTERNAL_INLINE_NAMESPACE_STR[0] != 'h' || ...@@ -898,4 +898,13 @@ static_assert(ABSL_INTERNAL_INLINE_NAMESPACE_STR[0] != 'h' ||
#define ABSL_INTERNAL_HAVE_ARM_ACLE 1 #define ABSL_INTERNAL_HAVE_ARM_ACLE 1
#endif #endif
// ABSL_INTERNAL_HAVE_ARM_NEON is used for compile-time detection of NEON (ARM
// SIMD).
//
// The macro is derived solely from `__ARM_NEON`: it is defined to 1 when
// `__ARM_NEON` is defined and is left undefined otherwise. Code should test it
// with `#ifdef ABSL_INTERNAL_HAVE_ARM_NEON`.
//
// Defining the macro before this point (e.g. on the command line) is rejected
// with a hard error so that its value can never disagree with `__ARM_NEON`.
#ifdef ABSL_INTERNAL_HAVE_ARM_NEON
#error ABSL_INTERNAL_HAVE_ARM_NEON cannot be directly set
#elif defined(__ARM_NEON)
#define ABSL_INTERNAL_HAVE_ARM_NEON 1
#endif
#endif // ABSL_BASE_CONFIG_H_ #endif // ABSL_BASE_CONFIG_H_
...@@ -336,27 +336,27 @@ void BM_Group_Match(benchmark::State& state) { ...@@ -336,27 +336,27 @@ void BM_Group_Match(benchmark::State& state) {
} }
BENCHMARK(BM_Group_Match); BENCHMARK(BM_Group_Match);
void BM_Group_MatchEmpty(benchmark::State& state) { void BM_Group_MaskEmpty(benchmark::State& state) {
std::array<ctrl_t, Group::kWidth> group; std::array<ctrl_t, Group::kWidth> group;
Iota(group.begin(), group.end(), -4); Iota(group.begin(), group.end(), -4);
Group g{group.data()}; Group g{group.data()};
for (auto _ : state) { for (auto _ : state) {
::benchmark::DoNotOptimize(g); ::benchmark::DoNotOptimize(g);
::benchmark::DoNotOptimize(g.MatchEmpty()); ::benchmark::DoNotOptimize(g.MaskEmpty());
} }
} }
BENCHMARK(BM_Group_MatchEmpty); BENCHMARK(BM_Group_MaskEmpty);
void BM_Group_MatchEmptyOrDeleted(benchmark::State& state) { void BM_Group_MaskEmptyOrDeleted(benchmark::State& state) {
std::array<ctrl_t, Group::kWidth> group; std::array<ctrl_t, Group::kWidth> group;
Iota(group.begin(), group.end(), -4); Iota(group.begin(), group.end(), -4);
Group g{group.data()}; Group g{group.data()};
for (auto _ : state) { for (auto _ : state) {
::benchmark::DoNotOptimize(g); ::benchmark::DoNotOptimize(g);
::benchmark::DoNotOptimize(g.MatchEmptyOrDeleted()); ::benchmark::DoNotOptimize(g.MaskEmptyOrDeleted());
} }
} }
BENCHMARK(BM_Group_MatchEmptyOrDeleted); BENCHMARK(BM_Group_MaskEmptyOrDeleted);
void BM_Group_CountLeadingEmptyOrDeleted(benchmark::State& state) { void BM_Group_CountLeadingEmptyOrDeleted(benchmark::State& state) {
std::array<ctrl_t, Group::kWidth> group; std::array<ctrl_t, Group::kWidth> group;
...@@ -375,7 +375,7 @@ void BM_Group_MatchFirstEmptyOrDeleted(benchmark::State& state) { ...@@ -375,7 +375,7 @@ void BM_Group_MatchFirstEmptyOrDeleted(benchmark::State& state) {
Group g{group.data()}; Group g{group.data()};
for (auto _ : state) { for (auto _ : state) {
::benchmark::DoNotOptimize(g); ::benchmark::DoNotOptimize(g);
::benchmark::DoNotOptimize(*g.MatchEmptyOrDeleted()); ::benchmark::DoNotOptimize(g.MaskEmptyOrDeleted().LowestBitSet());
} }
} }
BENCHMARK(BM_Group_MatchFirstEmptyOrDeleted); BENCHMARK(BM_Group_MatchFirstEmptyOrDeleted);
......
...@@ -195,35 +195,39 @@ TEST(Group, Match) { ...@@ -195,35 +195,39 @@ TEST(Group, Match) {
} }
} }
TEST(Group, MatchEmpty) { TEST(Group, MaskEmpty) {
if (Group::kWidth == 16) { if (Group::kWidth == 16) {
ctrl_t group[] = {ctrl_t::kEmpty, CtrlT(1), ctrl_t::kDeleted, CtrlT(3), ctrl_t group[] = {ctrl_t::kEmpty, CtrlT(1), ctrl_t::kDeleted, CtrlT(3),
ctrl_t::kEmpty, CtrlT(5), ctrl_t::kSentinel, CtrlT(7), ctrl_t::kEmpty, CtrlT(5), ctrl_t::kSentinel, CtrlT(7),
CtrlT(7), CtrlT(5), CtrlT(3), CtrlT(1), CtrlT(7), CtrlT(5), CtrlT(3), CtrlT(1),
CtrlT(1), CtrlT(1), CtrlT(1), CtrlT(1)}; CtrlT(1), CtrlT(1), CtrlT(1), CtrlT(1)};
EXPECT_THAT(Group{group}.MatchEmpty(), ElementsAre(0, 4)); EXPECT_THAT(Group{group}.MaskEmpty().LowestBitSet(), 0);
EXPECT_THAT(Group{group}.MaskEmpty().HighestBitSet(), 4);
} else if (Group::kWidth == 8) { } else if (Group::kWidth == 8) {
ctrl_t group[] = {ctrl_t::kEmpty, CtrlT(1), CtrlT(2), ctrl_t group[] = {ctrl_t::kEmpty, CtrlT(1), CtrlT(2),
ctrl_t::kDeleted, CtrlT(2), CtrlT(1), ctrl_t::kDeleted, CtrlT(2), CtrlT(1),
ctrl_t::kSentinel, CtrlT(1)}; ctrl_t::kSentinel, CtrlT(1)};
EXPECT_THAT(Group{group}.MatchEmpty(), ElementsAre(0)); EXPECT_THAT(Group{group}.MaskEmpty().LowestBitSet(), 0);
EXPECT_THAT(Group{group}.MaskEmpty().HighestBitSet(), 0);
} else { } else {
FAIL() << "No test coverage for Group::kWidth==" << Group::kWidth; FAIL() << "No test coverage for Group::kWidth==" << Group::kWidth;
} }
} }
TEST(Group, MatchEmptyOrDeleted) { TEST(Group, MaskEmptyOrDeleted) {
if (Group::kWidth == 16) { if (Group::kWidth == 16) {
ctrl_t group[] = {ctrl_t::kEmpty, CtrlT(1), ctrl_t::kDeleted, CtrlT(3), ctrl_t group[] = {ctrl_t::kEmpty, CtrlT(1), ctrl_t::kEmpty, CtrlT(3),
ctrl_t::kEmpty, CtrlT(5), ctrl_t::kSentinel, CtrlT(7), ctrl_t::kDeleted, CtrlT(5), ctrl_t::kSentinel, CtrlT(7),
CtrlT(7), CtrlT(5), CtrlT(3), CtrlT(1), CtrlT(7), CtrlT(5), CtrlT(3), CtrlT(1),
CtrlT(1), CtrlT(1), CtrlT(1), CtrlT(1)}; CtrlT(1), CtrlT(1), CtrlT(1), CtrlT(1)};
EXPECT_THAT(Group{group}.MatchEmptyOrDeleted(), ElementsAre(0, 2, 4)); EXPECT_THAT(Group{group}.MaskEmptyOrDeleted().LowestBitSet(), 0);
EXPECT_THAT(Group{group}.MaskEmptyOrDeleted().HighestBitSet(), 4);
} else if (Group::kWidth == 8) { } else if (Group::kWidth == 8) {
ctrl_t group[] = {ctrl_t::kEmpty, CtrlT(1), CtrlT(2), ctrl_t group[] = {ctrl_t::kEmpty, CtrlT(1), CtrlT(2),
ctrl_t::kDeleted, CtrlT(2), CtrlT(1), ctrl_t::kDeleted, CtrlT(2), CtrlT(1),
ctrl_t::kSentinel, CtrlT(1)}; ctrl_t::kSentinel, CtrlT(1)};
EXPECT_THAT(Group{group}.MatchEmptyOrDeleted(), ElementsAre(0, 3)); EXPECT_THAT(Group{group}.MaskEmptyOrDeleted().LowestBitSet(), 0);
EXPECT_THAT(Group{group}.MaskEmptyOrDeleted().HighestBitSet(), 3);
} else { } else {
FAIL() << "No test coverage for Group::kWidth==" << Group::kWidth; FAIL() << "No test coverage for Group::kWidth==" << Group::kWidth;
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment