Commit 66665d8d by Derek Mauro Committed by Copybara-Service

Fixes many compilation issues that come from having no external CI

coverage of the accelerated CRC implementation and some differences
between the internal and external implementations.

This change adds CI coverage to the
linux_clang-latest_libstdcxx_bazel.sh script, assuming this script
always runs on machines of at least the Intel Haswell generation.

Fixes include:
 * Remove the use of the deprecated xor operator on crc32c_t
 * Remove #pragma unroll_completely, which isn't known by GCC or Clang:
   https://godbolt.org/z/97j4vbacs
 * Fixes for -Wsign-compare, -Wsign-conversion and -Wshorten-64-to-32

PiperOrigin-RevId: 491965029
Change-Id: Ic5e1f3a20f69fcd35fe81ebef63443ad26bf7931
parent 94e9ee3f
......@@ -15,6 +15,7 @@
#include "absl/crc/crc32c.h"
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <string>
......@@ -90,7 +91,8 @@ TEST(CRC32C, ExtendByZeroes) {
std::string base = "hello world";
absl::crc32c_t base_crc = absl::crc32c_t{0xc99465aa};
for (const size_t extend_by : {100, 10000, 100000}) {
constexpr size_t kExtendByValues[] = {100, 10000, 100000};
for (const size_t extend_by : kExtendByValues) {
SCOPED_TRACE(extend_by);
absl::crc32c_t crc2 = absl::ExtendCrc32cByZeroes(base_crc, extend_by);
EXPECT_EQ(crc2, absl::ComputeCrc32c(base + std::string(extend_by, '\0')));
......@@ -98,10 +100,13 @@ TEST(CRC32C, ExtendByZeroes) {
}
TEST(CRC32C, UnextendByZeroes) {
constexpr size_t kExtendByValues[] = {2, 200, 20000, 200000, 20000000};
constexpr size_t kUnextendByValues[] = {0, 100, 10000, 100000, 10000000};
for (auto seed_crc : {absl::crc32c_t{0}, absl::crc32c_t{0xc99465aa}}) {
SCOPED_TRACE(seed_crc);
for (const size_t size_1 : {2, 200, 20000, 200000, 20000000}) {
for (const size_t size_2 : {0, 100, 10000, 100000, 10000000}) {
for (const size_t size_1 : kExtendByValues) {
for (const size_t size_2 : kUnextendByValues) {
size_t extend_size = std::max(size_1, size_2);
size_t unextend_size = std::min(size_1, size_2);
SCOPED_TRACE(extend_size);
......@@ -120,7 +125,9 @@ TEST(CRC32C, UnextendByZeroes) {
}
}
}
for (const size_t size : {0, 1, 100, 10000}) {
constexpr size_t kSizes[] = {0, 1, 100, 10000};
for (const size_t size : kSizes) {
SCOPED_TRACE(size);
std::string string_before = TestString(size);
std::string string_after = string_before + std::string(size, '\0');
......@@ -146,7 +153,8 @@ TEST(CRC32C, Concat) {
}
TEST(CRC32C, Memcpy) {
for (size_t bytes : {0, 1, 20, 500, 100000}) {
constexpr size_t kBytesSize[] = {0, 1, 20, 500, 100000};
for (size_t bytes : kBytesSize) {
SCOPED_TRACE(bytes);
std::string sample_string = TestString(bytes);
std::string target_buffer = std::string(bytes, '\0');
......
......@@ -129,7 +129,7 @@ inline uint32_t CRC32_u32(uint32_t crc, uint32_t v) {
}
inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {
return _mm_crc32_u64(crc, v);
return static_cast<uint32_t>(_mm_crc32_u64(crc, v));
}
inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }
......@@ -157,7 +157,7 @@ inline V128 V128_Xor(const V128 l, const V128 r) { return _mm_xor_si128(l, r); }
inline V128 V128_And(const V128 l, const V128 r) { return _mm_and_si128(l, r); }
inline V128 V128_From2x64(const uint64_t l, const uint64_t r) {
return _mm_set_epi64x(l, r);
return _mm_set_epi64x(static_cast<int64_t>(l), static_cast<int64_t>(r));
}
template <int imm>
......
......@@ -34,7 +34,7 @@ inline bool ExtendCrc32cInline(uint32_t* crc, const char* p, size_t n) {
constexpr uint32_t kCrc32Xor = 0xffffffffU;
*crc ^= kCrc32Xor;
if (n & 1) {
*crc = CRC32_u8(*crc, *p);
*crc = CRC32_u8(*crc, static_cast<uint8_t>(*p));
n--;
p++;
}
......
......@@ -82,21 +82,18 @@ inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
return crc32c_t{crc_uint32};
}
constexpr int kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t);
constexpr size_t kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t);
// Common function for copying the tails of multiple large regions.
template <int vec_regions, int int_regions>
template <size_t vec_regions, size_t int_regions>
inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
size_t region_size, size_t copy_rounds) {
std::array<__m128i, vec_regions> data;
std::array<uint64_t, kIntLoadsPerVec * int_regions> int_data;
while (copy_rounds > 0) {
#ifdef __GNUC__
#pragma unroll_completely
#endif
for (int i = 0; i < vec_regions; i++) {
int region = i;
for (size_t i = 0; i < vec_regions; i++) {
size_t region = i;
auto* vsrc =
reinterpret_cast<const __m128i*>(*src + region_size * region);
......@@ -109,27 +106,23 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
_mm_store_si128(vdst, data[i]);
// Compute the running CRC
crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
static_cast<uint32_t>(crcs[region]), _mm_extract_epi64(data[i], 0)))};
crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
static_cast<uint32_t>(crcs[region]), _mm_extract_epi64(data[i], 1)))};
crcs[region] = crc32c_t{static_cast<uint32_t>(
_mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
static_cast<uint64_t>(_mm_extract_epi64(data[i], 0))))};
crcs[region] = crc32c_t{static_cast<uint32_t>(
_mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
static_cast<uint64_t>(_mm_extract_epi64(data[i], 1))))};
}
#ifdef __GNUC__
#pragma unroll_completely
#endif
for (int i = 0; i < int_regions; i++) {
int region = vec_regions + i;
for (size_t i = 0; i < int_regions; i++) {
size_t region = vec_regions + i;
auto* usrc =
reinterpret_cast<const uint64_t*>(*src + region_size * region);
auto* udst = reinterpret_cast<uint64_t*>(*dst + region_size * region);
#ifdef __GNUC__
#pragma unroll_completely
#endif
for (int j = 0; j < kIntLoadsPerVec; j++) {
int data_index = i * kIntLoadsPerVec + j;
for (size_t j = 0; j < kIntLoadsPerVec; j++) {
size_t data_index = i * kIntLoadsPerVec + j;
int_data[data_index] = *(usrc + j);
crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
......@@ -148,7 +141,7 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
} // namespace
template <int vec_regions, int int_regions>
template <size_t vec_regions, size_t int_regions>
class AcceleratedCrcMemcpyEngine : public CrcMemcpyEngine {
public:
AcceleratedCrcMemcpyEngine() = default;
......@@ -160,12 +153,12 @@ class AcceleratedCrcMemcpyEngine : public CrcMemcpyEngine {
std::size_t length, crc32c_t initial_crc) const override;
};
template <int vec_regions, int int_regions>
template <size_t vec_regions, size_t int_regions>
crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
void* __restrict dst, const void* __restrict src, std::size_t length,
crc32c_t initial_crc) const {
constexpr std::size_t kRegions = vec_regions + int_regions;
constexpr crc32c_t kCrcDataXor = crc32c_t{0xffffffff};
constexpr uint32_t kCrcDataXor = uint32_t{0xffffffff};
constexpr std::size_t kBlockSize = sizeof(__m128i);
constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize;
......@@ -201,7 +194,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
// Start work on the CRC: undo the XOR from the previous calculation or set up
// the initial value of the CRC.
// initial_crc ^= kCrcDataXor;
initial_crc = initial_crc ^ kCrcDataXor;
initial_crc = crc32c_t{static_cast<uint32_t>(initial_crc) ^ kCrcDataXor};
// Do an initial alignment copy, so we can use aligned store instructions to
// the destination pointer. We align the destination pointer because the
......@@ -229,13 +222,13 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
// Initialize CRCs for kRegions regions.
crc32c_t crcs[kRegions];
crcs[0] = initial_crc;
for (int i = 1; i < kRegions; i++) {
crcs[i] = kCrcDataXor;
for (size_t i = 1; i < kRegions; i++) {
crcs[i] = crc32c_t{kCrcDataXor};
}
// Find the number of rounds to copy and the region size. Also compute the
// tail size here.
int64_t copy_rounds = length / kCopyRoundSize;
size_t copy_rounds = length / kCopyRoundSize;
// Find the size of each region and the size of the tail.
const std::size_t region_size = copy_rounds * kBlockSize;
......@@ -248,10 +241,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
// Main loop.
while (copy_rounds > kBlocksPerCacheLine) {
// Prefetch kPrefetchAhead bytes ahead of each pointer.
#ifdef __GNUC__
#pragma unroll_completely
#endif
for (int i = 0; i < kRegions; i++) {
for (size_t i = 0; i < kRegions; i++) {
absl::base_internal::PrefetchT0(src_bytes + kPrefetchAhead +
region_size * i);
absl::base_internal::PrefetchT0(dst_bytes + kPrefetchAhead +
......@@ -259,58 +249,46 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
}
// Load and store data, computing CRC on the way.
#ifdef __GNUC__
#pragma unroll_completely
#endif
for (int i = 0; i < kBlocksPerCacheLine; i++) {
for (size_t i = 0; i < kBlocksPerCacheLine; i++) {
// Copy and CRC the data for the CRC regions.
#ifdef __GNUC__
#pragma unroll_completely
#endif
for (int j = 0; j < vec_regions; j++) {
for (size_t j = 0; j < vec_regions; j++) {
// Cycle which regions get vector load/store and integer load/store, to
// engage prefetching logic around vector load/stores and save issue
// slots by using the integer registers.
int region = (j + i) % kRegions;
size_t region = (j + i) % kRegions;
auto* src =
auto* vsrc =
reinterpret_cast<const __m128i*>(src_bytes + region_size * region);
auto* dst =
auto* vdst =
reinterpret_cast<__m128i*>(dst_bytes + region_size * region);
// Load and CRC data.
vec_data[j] = _mm_loadu_si128(src + i);
crcs[region] = crc32c_t{static_cast<uint32_t>(
_mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
_mm_extract_epi64(vec_data[j], 0)))};
crcs[region] = crc32c_t{static_cast<uint32_t>(
_mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
_mm_extract_epi64(vec_data[j], 1)))};
vec_data[j] = _mm_loadu_si128(vsrc + i);
crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
static_cast<uint32_t>(crcs[region]),
static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 0))))};
crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
static_cast<uint32_t>(crcs[region]),
static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 1))))};
// Store the data.
_mm_store_si128(dst + i, vec_data[j]);
_mm_store_si128(vdst + i, vec_data[j]);
}
// Preload the partial CRCs for the CLMUL subregions.
#ifdef __GNUC__
#pragma unroll_completely
#endif
for (int j = 0; j < int_regions; j++) {
for (size_t j = 0; j < int_regions; j++) {
// Cycle which regions get vector load/store and integer load/store, to
// engage prefetching logic around vector load/stores and save issue
// slots by using the integer registers.
int region = (j + vec_regions + i) % kRegions;
size_t region = (j + vec_regions + i) % kRegions;
auto* usrc =
reinterpret_cast<const uint64_t*>(src_bytes + region_size * region);
auto* udst =
reinterpret_cast<uint64_t*>(dst_bytes + region_size * region);
#ifdef __GNUC__
#pragma unroll_completely
#endif
for (int k = 0; k < kIntLoadsPerVec; k++) {
int data_index = j * kIntLoadsPerVec + k;
for (size_t k = 0; k < kIntLoadsPerVec; k++) {
size_t data_index = j * kIntLoadsPerVec + k;
// Load and CRC the data.
int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k);
......@@ -339,13 +317,13 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
// Finalize the first CRCs: XOR the internal CRCs by the XOR mask to undo the
// XOR done before doing block copy + CRCs.
for (int i = 0; i < kRegions - 1; i++) {
crcs[i] = crcs[i] ^ kCrcDataXor;
for (size_t i = 0; i + 1 < kRegions; i++) {
crcs[i] = crc32c_t{static_cast<uint32_t>(crcs[i]) ^ kCrcDataXor};
}
// Build a CRC of the first kRegions - 1 regions.
crc32c_t full_crc = crcs[0];
for (int i = 1; i < kRegions - 1; i++) {
for (size_t i = 1; i + 1 < kRegions; i++) {
full_crc = ConcatCrc32c(full_crc, crcs[i], region_size);
}
......@@ -360,7 +338,8 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
crcs[kRegions - 1]);
// Finalize and concatenate the final CRC, then return.
crcs[kRegions - 1] = crcs[kRegions - 1] ^ kCrcDataXor;
crcs[kRegions - 1] =
crc32c_t{static_cast<uint32_t>(crcs[kRegions - 1]) ^ kCrcDataXor};
return ConcatCrc32c(full_crc, crcs[kRegions - 1], region_size + tail_size);
}
......
......@@ -74,7 +74,7 @@ inline void *non_temporal_store_memcpy(void *__restrict dst,
uintptr_t bytes_before_alignment_boundary =
kCacheLineSize -
(reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1));
int header_len = (std::min)(bytes_before_alignment_boundary, len);
size_t header_len = (std::min)(bytes_before_alignment_boundary, len);
assert(bytes_before_alignment_boundary < kCacheLineSize);
memcpy(d, s, header_len);
d += header_len;
......@@ -87,7 +87,7 @@ inline void *non_temporal_store_memcpy(void *__restrict dst,
__m128i *dst_cacheline = reinterpret_cast<__m128i *>(d);
const __m128i *src_cacheline = reinterpret_cast<const __m128i *>(s);
constexpr int kOpsPerCacheLine = kCacheLineSize / sizeof(__m128i);
uint64_t loops = len / kCacheLineSize;
size_t loops = len / kCacheLineSize;
while (len >= kCacheLineSize) {
__m128i temp1, temp2, temp3, temp4;
......@@ -132,7 +132,7 @@ inline void *non_temporal_store_memcpy_avx(void *__restrict dst,
uintptr_t bytes_before_alignment_boundary =
kCacheLineSize -
(reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1));
int header_len = (std::min)(bytes_before_alignment_boundary, len);
size_t header_len = (std::min)(bytes_before_alignment_boundary, len);
assert(bytes_before_alignment_boundary < kCacheLineSize);
memcpy(d, s, header_len);
d += header_len;
......@@ -145,7 +145,7 @@ inline void *non_temporal_store_memcpy_avx(void *__restrict dst,
__m256i *dst_cacheline = reinterpret_cast<__m256i *>(d);
const __m256i *src_cacheline = reinterpret_cast<const __m256i *>(s);
constexpr int kOpsPerCacheLine = kCacheLineSize / sizeof(__m256i);
int loops = len / kCacheLineSize;
size_t loops = len / kCacheLineSize;
while (len >= kCacheLineSize) {
__m256i temp1, temp2;
......
......@@ -77,6 +77,7 @@ for std in ${STD}; do
--copt="--gcc-toolchain=/usr/local" \
--copt="-DGTEST_REMOVE_LEGACY_TEST_CASEAPI_=1" \
--copt="${exceptions_mode}" \
--copt="-march=haswell" \
--copt=-Werror \
--define="absl=1" \
--distdir="/bazel-distdir" \
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment