Commit 64814435 by Abseil Team Committed by Copybara-Service

Optimize SwissMap for ARM by 3-8% for all operations

https://pastebin.com/CmnzwUFN

The key idea is to avoid using 16 byte NEON and use 8 byte NEON which has lower latency for BitMask::Match. Even though 16 byte NEON achieves higher throughput, in SwissMap it's very important to catch these Matches with low latency as probing on average happens at most once.

I also introduced NonIterableMask as ARM has really great cbnz instructions and additional AND on scalar mask had 1 extra latency cycle

PiperOrigin-RevId: 453216147
Change-Id: I842c50d323954f8383ae156491232ced55aacb78
parent 48419595
......@@ -898,4 +898,13 @@ static_assert(ABSL_INTERNAL_INLINE_NAMESPACE_STR[0] != 'h' ||
#define ABSL_INTERNAL_HAVE_ARM_ACLE 1
#endif
// ABSL_INTERNAL_HAVE_ARM_NEON is used for compile-time detection of NEON (ARM
// SIMD).
#ifdef ABSL_INTERNAL_HAVE_ARM_NEON
#error ABSL_INTERNAL_HAVE_ARM_NEON cannot be directly set
#elif defined(__ARM_NEON)
#define ABSL_INTERNAL_HAVE_ARM_NEON 1
#endif
#endif // ABSL_BASE_CONFIG_H_
......@@ -336,27 +336,27 @@ void BM_Group_Match(benchmark::State& state) {
}
BENCHMARK(BM_Group_Match);
void BM_Group_MatchEmpty(benchmark::State& state) {
void BM_Group_MaskEmpty(benchmark::State& state) {
std::array<ctrl_t, Group::kWidth> group;
Iota(group.begin(), group.end(), -4);
Group g{group.data()};
for (auto _ : state) {
::benchmark::DoNotOptimize(g);
::benchmark::DoNotOptimize(g.MatchEmpty());
::benchmark::DoNotOptimize(g.MaskEmpty());
}
}
BENCHMARK(BM_Group_MatchEmpty);
BENCHMARK(BM_Group_MaskEmpty);
void BM_Group_MatchEmptyOrDeleted(benchmark::State& state) {
void BM_Group_MaskEmptyOrDeleted(benchmark::State& state) {
std::array<ctrl_t, Group::kWidth> group;
Iota(group.begin(), group.end(), -4);
Group g{group.data()};
for (auto _ : state) {
::benchmark::DoNotOptimize(g);
::benchmark::DoNotOptimize(g.MatchEmptyOrDeleted());
::benchmark::DoNotOptimize(g.MaskEmptyOrDeleted());
}
}
BENCHMARK(BM_Group_MatchEmptyOrDeleted);
BENCHMARK(BM_Group_MaskEmptyOrDeleted);
void BM_Group_CountLeadingEmptyOrDeleted(benchmark::State& state) {
std::array<ctrl_t, Group::kWidth> group;
......@@ -375,7 +375,7 @@ void BM_Group_MatchFirstEmptyOrDeleted(benchmark::State& state) {
Group g{group.data()};
for (auto _ : state) {
::benchmark::DoNotOptimize(g);
::benchmark::DoNotOptimize(*g.MatchEmptyOrDeleted());
::benchmark::DoNotOptimize(g.MaskEmptyOrDeleted().LowestBitSet());
}
}
BENCHMARK(BM_Group_MatchFirstEmptyOrDeleted);
......
......@@ -195,35 +195,39 @@ TEST(Group, Match) {
}
}
TEST(Group, MatchEmpty) {
TEST(Group, MaskEmpty) {
if (Group::kWidth == 16) {
ctrl_t group[] = {ctrl_t::kEmpty, CtrlT(1), ctrl_t::kDeleted, CtrlT(3),
ctrl_t::kEmpty, CtrlT(5), ctrl_t::kSentinel, CtrlT(7),
CtrlT(7), CtrlT(5), CtrlT(3), CtrlT(1),
CtrlT(1), CtrlT(1), CtrlT(1), CtrlT(1)};
EXPECT_THAT(Group{group}.MatchEmpty(), ElementsAre(0, 4));
EXPECT_THAT(Group{group}.MaskEmpty().LowestBitSet(), 0);
EXPECT_THAT(Group{group}.MaskEmpty().HighestBitSet(), 4);
} else if (Group::kWidth == 8) {
ctrl_t group[] = {ctrl_t::kEmpty, CtrlT(1), CtrlT(2),
ctrl_t::kDeleted, CtrlT(2), CtrlT(1),
ctrl_t::kSentinel, CtrlT(1)};
EXPECT_THAT(Group{group}.MatchEmpty(), ElementsAre(0));
EXPECT_THAT(Group{group}.MaskEmpty().LowestBitSet(), 0);
EXPECT_THAT(Group{group}.MaskEmpty().HighestBitSet(), 0);
} else {
FAIL() << "No test coverage for Group::kWidth==" << Group::kWidth;
}
}
TEST(Group, MatchEmptyOrDeleted) {
TEST(Group, MaskEmptyOrDeleted) {
if (Group::kWidth == 16) {
ctrl_t group[] = {ctrl_t::kEmpty, CtrlT(1), ctrl_t::kDeleted, CtrlT(3),
ctrl_t::kEmpty, CtrlT(5), ctrl_t::kSentinel, CtrlT(7),
CtrlT(7), CtrlT(5), CtrlT(3), CtrlT(1),
CtrlT(1), CtrlT(1), CtrlT(1), CtrlT(1)};
EXPECT_THAT(Group{group}.MatchEmptyOrDeleted(), ElementsAre(0, 2, 4));
ctrl_t group[] = {ctrl_t::kEmpty, CtrlT(1), ctrl_t::kEmpty, CtrlT(3),
ctrl_t::kDeleted, CtrlT(5), ctrl_t::kSentinel, CtrlT(7),
CtrlT(7), CtrlT(5), CtrlT(3), CtrlT(1),
CtrlT(1), CtrlT(1), CtrlT(1), CtrlT(1)};
EXPECT_THAT(Group{group}.MaskEmptyOrDeleted().LowestBitSet(), 0);
EXPECT_THAT(Group{group}.MaskEmptyOrDeleted().HighestBitSet(), 4);
} else if (Group::kWidth == 8) {
ctrl_t group[] = {ctrl_t::kEmpty, CtrlT(1), CtrlT(2),
ctrl_t::kDeleted, CtrlT(2), CtrlT(1),
ctrl_t::kSentinel, CtrlT(1)};
EXPECT_THAT(Group{group}.MatchEmptyOrDeleted(), ElementsAre(0, 3));
EXPECT_THAT(Group{group}.MaskEmptyOrDeleted().LowestBitSet(), 0);
EXPECT_THAT(Group{group}.MaskEmptyOrDeleted().HighestBitSet(), 3);
} else {
FAIL() << "No test coverage for Group::kWidth==" << Group::kWidth;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment