Commit 4cb6c389 by Ilya Tokar Committed by Copybara-Service

Add prefetch to crc32

We already prefetch for large inputs; do the same for
medium-sized inputs as well. This is performance-neutral
in most cases, so this change also adds a new benchmark
whose working set is much larger than the CPU cache
(working size >> cache size) to ensure that we are seeing
the performance benefits of prefetching. The main benefits
are on AMD with hardware prefetchers turned off:

AMD prefetchers on:
name                           old time/op  new time/op  delta
BM_Calculate/0                 2.43ns ± 1%  2.43ns ± 1%     ~     (p=0.814 n=40+40)
BM_Calculate/1                 2.50ns ± 2%  2.50ns ± 2%     ~     (p=0.745 n=39+39)
BM_Calculate/100               9.17ns ± 1%  9.17ns ± 2%     ~     (p=0.747 n=40+40)
BM_Calculate/10000              474ns ± 1%   474ns ± 2%     ~     (p=0.749 n=40+40)
BM_Calculate/500000            22.8µs ± 1%  22.9µs ± 2%     ~     (p=0.298 n=39+40)
BM_Extend/0                    1.38ns ± 1%  1.38ns ± 1%     ~     (p=0.651 n=40+40)
BM_Extend/1                    1.53ns ± 2%  1.53ns ± 1%     ~     (p=0.957 n=40+39)
BM_Extend/100                  9.48ns ± 1%  9.48ns ± 2%     ~     (p=1.000 n=40+40)
BM_Extend/10000                 474ns ± 2%   474ns ± 1%     ~     (p=0.928 n=40+40)
BM_Extend/500000               22.8µs ± 1%  22.9µs ± 2%     ~     (p=0.331 n=40+40)
BM_Extend/100000000            4.79ms ± 1%  4.79ms ± 1%     ~     (p=0.753 n=38+38)
BM_ExtendCacheMiss/10          25.5ms ± 2%  25.5ms ± 2%     ~     (p=0.988 n=38+40)
BM_ExtendCacheMiss/100         23.1ms ± 2%  23.1ms ± 2%     ~     (p=0.792 n=40+40)
BM_ExtendCacheMiss/1000        37.2ms ± 1%  28.6ms ± 2%  -23.00%  (p=0.000 n=38+40)
BM_ExtendCacheMiss/100000      7.77ms ± 2%  7.74ms ± 2%   -0.45%  (p=0.006 n=40+40)

AMD prefetchers off:
name                           old time/op  new time/op  delta
BM_Calculate/0                 2.43ns ± 2%  2.43ns ± 2%     ~     (p=0.351 n=40+39)
BM_Calculate/1                 2.51ns ± 2%  2.51ns ± 1%     ~     (p=0.535 n=40+40)
BM_Calculate/100               9.18ns ± 2%  9.15ns ± 2%     ~     (p=0.120 n=38+39)
BM_Calculate/10000              475ns ± 2%   475ns ± 2%     ~     (p=0.852 n=40+40)
BM_Calculate/500000            22.9µs ± 2%  22.8µs ± 2%     ~     (p=0.396 n=40+40)
BM_Extend/0                    1.38ns ± 2%  1.38ns ± 2%     ~     (p=0.466 n=40+40)
BM_Extend/1                    1.53ns ± 2%  1.53ns ± 2%     ~     (p=0.914 n=40+39)
BM_Extend/100                  9.49ns ± 2%  9.49ns ± 2%     ~     (p=0.802 n=40+40)
BM_Extend/10000                 475ns ± 2%   474ns ± 1%     ~     (p=0.589 n=40+40)
BM_Extend/500000               22.8µs ± 2%  22.8µs ± 2%     ~     (p=0.872 n=39+40)
BM_Extend/100000000            10.0ms ± 3%  10.0ms ± 4%     ~     (p=0.355 n=40+40)
BM_ExtendCacheMiss/10           196ms ± 2%   196ms ± 2%     ~     (p=0.698 n=40+40)
BM_ExtendCacheMiss/100          129ms ± 1%   129ms ± 1%     ~     (p=0.602 n=36+37)
BM_ExtendCacheMiss/1000        88.6ms ± 1%  57.2ms ± 1%  -35.49%  (p=0.000 n=36+38)
BM_ExtendCacheMiss/100000      14.9ms ± 1%  14.9ms ± 1%     ~     (p=0.888 n=39+40)

Intel Skylake:
BM_Calculate/0                 2.49ns ± 2%  2.44ns ± 4%  -2.15%  (p=0.001 n=31+34)
BM_Calculate/1                 3.04ns ± 2%  2.98ns ± 9%  -1.95%  (p=0.003 n=31+35)
BM_Calculate/100               8.64ns ± 3%  8.53ns ± 5%    ~     (p=0.065 n=31+35)
BM_Calculate/10000              290ns ± 3%   285ns ± 7%  -1.80%  (p=0.004 n=28+34)
BM_Calculate/500000            11.8µs ± 2%  11.6µs ± 8%  -1.59%  (p=0.003 n=26+34)
BM_Extend/0                    1.56ns ± 1%  1.52ns ± 3%  -2.44%  (p=0.000 n=26+35)
BM_Extend/1                    1.88ns ± 3%  1.83ns ± 6%  -2.17%  (p=0.001 n=27+35)
BM_Extend/100                  9.31ns ± 3%  9.13ns ± 7%  -1.92%  (p=0.000 n=33+38)
BM_Extend/10000                 290ns ± 3%   283ns ± 3%  -2.45%  (p=0.000 n=32+38)
BM_Extend/500000               11.8µs ± 2%  11.5µs ± 8%  -1.80%  (p=0.001 n=35+37)
BM_Extend/100000000            6.39ms ±10%  6.11ms ± 8%  -4.34%  (p=0.000 n=40+40)
BM_ExtendCacheMiss/10          36.2ms ± 7%  35.8ms ±14%    ~     (p=0.281 n=33+37)
BM_ExtendCacheMiss/100         26.9ms ±15%  25.9ms ±12%  -3.93%  (p=0.000 n=40+40)
BM_ExtendCacheMiss/1000        23.8ms ± 5%  23.4ms ± 5%  -1.68%  (p=0.001 n=39+40)
BM_ExtendCacheMiss/100000      10.1ms ± 5%  10.0ms ± 4%    ~     (p=0.051 n=39+39)

PiperOrigin-RevId: 495119444
Change-Id: I67bcf3b0282b5e1c43122de2837a24c16b8aded7
parent 1887dece
...@@ -204,6 +204,7 @@ cc_binary( ...@@ -204,6 +204,7 @@ cc_binary(
deps = [ deps = [
":crc32c", ":crc32c",
"//absl/memory", "//absl/memory",
"//absl/strings",
"@com_github_google_benchmark//:benchmark_main", "@com_github_google_benchmark//:benchmark_main",
], ],
) )
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include "absl/crc/crc32c.h" #include "absl/crc/crc32c.h"
#include "absl/crc/internal/crc32c.h" #include "absl/crc/internal/crc32c.h"
#include "absl/memory/memory.h" #include "absl/memory/memory.h"
#include "absl/strings/string_view.h"
#include "benchmark/benchmark.h" #include "benchmark/benchmark.h"
namespace { namespace {
...@@ -52,7 +53,27 @@ void BM_Extend(benchmark::State& state) { ...@@ -52,7 +53,27 @@ void BM_Extend(benchmark::State& state) {
benchmark::DoNotOptimize(crc); benchmark::DoNotOptimize(crc);
} }
} }
BENCHMARK(BM_Extend)->Arg(0)->Arg(1)->Arg(100)->Arg(10000)->Arg(500000); BENCHMARK(BM_Extend)->Arg(0)->Arg(1)->Arg(100)->Arg(10000)->Arg(500000)->Arg(
100 * 1000 * 1000);
// Benchmarks absl::ExtendCrc32c() on data that is unlikely to be in cache.
//
// Makes the working set >> CPU cache size to benchmark prefetches better: a
// 300 MB buffer is walked front to back, and every ExtendCrc32c() call reads
// one `len`-byte window, where `len` is state.range(0). Consecutive windows
// start 2 * len bytes apart, so the gap between two windows is never read.
void BM_ExtendCacheMiss(benchmark::State& state) {
  constexpr int total = 300 * 1000 * 1000;
  // Keep the full 64-bit argument instead of narrowing it to int.
  const int64_t len = state.range(0);
  // A non-positive length would never advance the inner loop, and a length
  // larger than the buffer leaves no complete window to read.
  if (len <= 0 || len > total) {
    state.SkipWithError("BM_ExtendCacheMiss requires 0 < len <= 300000000");
    return;
  }
  std::string extension = TestString(total);
  absl::crc32c_t base = absl::crc32c_t{0xC99465AA};  // CRC32C of "Hello World"
  const int64_t stride = len * 2;
  // Last offset at which a whole `len`-byte window still fits in the buffer;
  // stopping here keeps the final string_view inside `extension` even when
  // the stride does not divide `total`.
  const int64_t last_start = total - len;
  for (auto s : state) {
    for (int64_t i = 0; i <= last_start; i += stride) {
      benchmark::DoNotOptimize(base);
      benchmark::DoNotOptimize(extension);
      absl::crc32c_t crc = absl::ExtendCrc32c(
          base,
          absl::string_view(extension.data() + i, static_cast<size_t>(len)));
      benchmark::DoNotOptimize(crc);
    }
  }
  // Bytes actually fed to ExtendCrc32c() per benchmark iteration. This equals
  // total / 2 whenever 2 * len divides total, which holds for every argument
  // registered below.
  const int64_t windows = last_start / stride + 1;
  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * windows *
                          len);
}
BENCHMARK(BM_ExtendCacheMiss)->Arg(10)->Arg(100)->Arg(1000)->Arg(100000);
void BM_ExtendByZeroes(benchmark::State& state) { void BM_ExtendByZeroes(benchmark::State& state) {
absl::crc32c_t base = absl::crc32c_t{0xC99465AA}; // CRC32C of "Hello World" absl::crc32c_t base = absl::crc32c_t{0xC99465AA}; // CRC32C of "Hello World"
......
...@@ -29,6 +29,8 @@ namespace crc_internal { ...@@ -29,6 +29,8 @@ namespace crc_internal {
// Prefetch constants used in some Extend() implementations // Prefetch constants used in some Extend() implementations
constexpr int kPrefetchHorizon = ABSL_CACHELINE_SIZE * 4; // Prefetch this far constexpr int kPrefetchHorizon = ABSL_CACHELINE_SIZE * 4; // Prefetch this far
// Shorter prefetch distance for smaller buffers: a single cache line
// (ABSL_CACHELINE_SIZE * 1).
constexpr int kPrefetchHorizonMedium = ABSL_CACHELINE_SIZE * 1;
static_assert(kPrefetchHorizon >= 64, "CRCPrefetchHorizon less than loop len"); static_assert(kPrefetchHorizon >= 64, "CRCPrefetchHorizon less than loop len");
// We require the Scramble() function: // We require the Scramble() function:
......
...@@ -429,6 +429,12 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams ...@@ -429,6 +429,12 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2); ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2); ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2); ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
base_internal::PrefetchT0(
reinterpret_cast<const char*>(p + kPrefetchHorizonMedium));
base_internal::PrefetchT0(
reinterpret_cast<const char*>(p1 + kPrefetchHorizonMedium));
base_internal::PrefetchT0(
reinterpret_cast<const char*>(p2 + kPrefetchHorizonMedium));
} }
// Don't run crc on last 8 bytes. // Don't run crc on last 8 bytes.
ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2); ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment