Commit 66665d8d by Derek Mauro Committed by Copybara-Service

Fixes many compilation issues that come from having no external CI

coverage of the accelerated CRC implementation and some differences
between the internal and external implementations.

This change adds CI coverage to the
linux_clang-latest_libstdcxx_bazel.sh script, assuming this script
always runs on machines of at least the Intel Haswell generation.

Fixes include:
 * Remove the use of the deprecated xor operator on crc32c_t
 * Remove #pragma unroll_completely, which isn't known by GCC or Clang:
   https://godbolt.org/z/97j4vbacs
 * Fixes for -Wsign-compare, -Wsign-conversion and -Wshorten-64-to-32

PiperOrigin-RevId: 491965029
Change-Id: Ic5e1f3a20f69fcd35fe81ebef63443ad26bf7931
parent 94e9ee3f
......@@ -15,6 +15,7 @@
#include "absl/crc/crc32c.h"
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <string>
......@@ -90,7 +91,8 @@ TEST(CRC32C, ExtendByZeroes) {
std::string base = "hello world";
absl::crc32c_t base_crc = absl::crc32c_t{0xc99465aa};
for (const size_t extend_by : {100, 10000, 100000}) {
constexpr size_t kExtendByValues[] = {100, 10000, 100000};
for (const size_t extend_by : kExtendByValues) {
SCOPED_TRACE(extend_by);
absl::crc32c_t crc2 = absl::ExtendCrc32cByZeroes(base_crc, extend_by);
EXPECT_EQ(crc2, absl::ComputeCrc32c(base + std::string(extend_by, '\0')));
......@@ -98,10 +100,13 @@ TEST(CRC32C, ExtendByZeroes) {
}
TEST(CRC32C, UnextendByZeroes) {
constexpr size_t kExtendByValues[] = {2, 200, 20000, 200000, 20000000};
constexpr size_t kUnextendByValues[] = {0, 100, 10000, 100000, 10000000};
for (auto seed_crc : {absl::crc32c_t{0}, absl::crc32c_t{0xc99465aa}}) {
SCOPED_TRACE(seed_crc);
for (const size_t size_1 : {2, 200, 20000, 200000, 20000000}) {
for (const size_t size_2 : {0, 100, 10000, 100000, 10000000}) {
for (const size_t size_1 : kExtendByValues) {
for (const size_t size_2 : kUnextendByValues) {
size_t extend_size = std::max(size_1, size_2);
size_t unextend_size = std::min(size_1, size_2);
SCOPED_TRACE(extend_size);
......@@ -120,7 +125,9 @@ TEST(CRC32C, UnextendByZeroes) {
}
}
}
for (const size_t size : {0, 1, 100, 10000}) {
constexpr size_t kSizes[] = {0, 1, 100, 10000};
for (const size_t size : kSizes) {
SCOPED_TRACE(size);
std::string string_before = TestString(size);
std::string string_after = string_before + std::string(size, '\0');
......@@ -146,7 +153,8 @@ TEST(CRC32C, Concat) {
}
TEST(CRC32C, Memcpy) {
for (size_t bytes : {0, 1, 20, 500, 100000}) {
constexpr size_t kBytesSize[] = {0, 1, 20, 500, 100000};
for (size_t bytes : kBytesSize) {
SCOPED_TRACE(bytes);
std::string sample_string = TestString(bytes);
std::string target_buffer = std::string(bytes, '\0');
......
......@@ -129,7 +129,7 @@ inline uint32_t CRC32_u32(uint32_t crc, uint32_t v) {
}
inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {
return _mm_crc32_u64(crc, v);
return static_cast<uint32_t>(_mm_crc32_u64(crc, v));
}
inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }
......@@ -157,7 +157,7 @@ inline V128 V128_Xor(const V128 l, const V128 r) { return _mm_xor_si128(l, r); }
inline V128 V128_And(const V128 l, const V128 r) { return _mm_and_si128(l, r); }
inline V128 V128_From2x64(const uint64_t l, const uint64_t r) {
return _mm_set_epi64x(l, r);
return _mm_set_epi64x(static_cast<int64_t>(l), static_cast<int64_t>(r));
}
template <int imm>
......
......@@ -34,7 +34,7 @@ inline bool ExtendCrc32cInline(uint32_t* crc, const char* p, size_t n) {
constexpr uint32_t kCrc32Xor = 0xffffffffU;
*crc ^= kCrc32Xor;
if (n & 1) {
*crc = CRC32_u8(*crc, *p);
*crc = CRC32_u8(*crc, static_cast<uint8_t>(*p));
n--;
p++;
}
......
......@@ -82,21 +82,18 @@ inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
return crc32c_t{crc_uint32};
}
constexpr int kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t);
constexpr size_t kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t);
// Common function for copying the tails of multiple large regions.
template <int vec_regions, int int_regions>
template <size_t vec_regions, size_t int_regions>
inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
size_t region_size, size_t copy_rounds) {
std::array<__m128i, vec_regions> data;
std::array<uint64_t, kIntLoadsPerVec * int_regions> int_data;
while (copy_rounds > 0) {
#ifdef __GNUC__
#pragma unroll_completely
#endif
for (int i = 0; i < vec_regions; i++) {
int region = i;
for (size_t i = 0; i < vec_regions; i++) {
size_t region = i;
auto* vsrc =
reinterpret_cast<const __m128i*>(*src + region_size * region);
......@@ -109,27 +106,23 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
_mm_store_si128(vdst, data[i]);
// Compute the running CRC
crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
static_cast<uint32_t>(crcs[region]), _mm_extract_epi64(data[i], 0)))};
crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
static_cast<uint32_t>(crcs[region]), _mm_extract_epi64(data[i], 1)))};
crcs[region] = crc32c_t{static_cast<uint32_t>(
_mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
static_cast<uint64_t>(_mm_extract_epi64(data[i], 0))))};
crcs[region] = crc32c_t{static_cast<uint32_t>(
_mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
static_cast<uint64_t>(_mm_extract_epi64(data[i], 1))))};
}
#ifdef __GNUC__
#pragma unroll_completely
#endif
for (int i = 0; i < int_regions; i++) {
int region = vec_regions + i;
for (size_t i = 0; i < int_regions; i++) {
size_t region = vec_regions + i;
auto* usrc =
reinterpret_cast<const uint64_t*>(*src + region_size * region);
auto* udst = reinterpret_cast<uint64_t*>(*dst + region_size * region);
#ifdef __GNUC__
#pragma unroll_completely
#endif
for (int j = 0; j < kIntLoadsPerVec; j++) {
int data_index = i * kIntLoadsPerVec + j;
for (size_t j = 0; j < kIntLoadsPerVec; j++) {
size_t data_index = i * kIntLoadsPerVec + j;
int_data[data_index] = *(usrc + j);
crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
......@@ -148,7 +141,7 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
} // namespace
template <int vec_regions, int int_regions>
template <size_t vec_regions, size_t int_regions>
class AcceleratedCrcMemcpyEngine : public CrcMemcpyEngine {
public:
AcceleratedCrcMemcpyEngine() = default;
......@@ -160,12 +153,12 @@ class AcceleratedCrcMemcpyEngine : public CrcMemcpyEngine {
std::size_t length, crc32c_t initial_crc) const override;
};
template <int vec_regions, int int_regions>
template <size_t vec_regions, size_t int_regions>
crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
void* __restrict dst, const void* __restrict src, std::size_t length,
crc32c_t initial_crc) const {
constexpr std::size_t kRegions = vec_regions + int_regions;
constexpr crc32c_t kCrcDataXor = crc32c_t{0xffffffff};
constexpr uint32_t kCrcDataXor = uint32_t{0xffffffff};
constexpr std::size_t kBlockSize = sizeof(__m128i);
constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize;
......@@ -201,7 +194,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
// Start work on the CRC: undo the XOR from the previous calculation or set up
// the initial value of the CRC.
// initial_crc ^= kCrcDataXor;
initial_crc = initial_crc ^ kCrcDataXor;
initial_crc = crc32c_t{static_cast<uint32_t>(initial_crc) ^ kCrcDataXor};
// Do an initial alignment copy, so we can use aligned store instructions to
// the destination pointer. We align the destination pointer because the
......@@ -229,13 +222,13 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
// Initialize CRCs for kRegions regions.
crc32c_t crcs[kRegions];
crcs[0] = initial_crc;
for (int i = 1; i < kRegions; i++) {
crcs[i] = kCrcDataXor;
for (size_t i = 1; i < kRegions; i++) {
crcs[i] = crc32c_t{kCrcDataXor};
}
// Find the number of rounds to copy and the region size. Also compute the
// tail size here.
int64_t copy_rounds = length / kCopyRoundSize;
size_t copy_rounds = length / kCopyRoundSize;
// Find the size of each region and the size of the tail.
const std::size_t region_size = copy_rounds * kBlockSize;
......@@ -248,10 +241,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
// Main loop.
while (copy_rounds > kBlocksPerCacheLine) {
// Prefetch kPrefetchAhead bytes ahead of each pointer.
#ifdef __GNUC__
#pragma unroll_completely
#endif
for (int i = 0; i < kRegions; i++) {
for (size_t i = 0; i < kRegions; i++) {
absl::base_internal::PrefetchT0(src_bytes + kPrefetchAhead +
region_size * i);
absl::base_internal::PrefetchT0(dst_bytes + kPrefetchAhead +
......@@ -259,58 +249,46 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
}
// Load and store data, computing CRC on the way.
#ifdef __GNUC__
#pragma unroll_completely
#endif
for (int i = 0; i < kBlocksPerCacheLine; i++) {
for (size_t i = 0; i < kBlocksPerCacheLine; i++) {
// Copy and CRC the data for the CRC regions.
#ifdef __GNUC__
#pragma unroll_completely
#endif
for (int j = 0; j < vec_regions; j++) {
for (size_t j = 0; j < vec_regions; j++) {
// Cycle which regions get vector load/store and integer load/store, to
// engage prefetching logic around vector load/stores and save issue
// slots by using the integer registers.
int region = (j + i) % kRegions;
size_t region = (j + i) % kRegions;
auto* src =
auto* vsrc =
reinterpret_cast<const __m128i*>(src_bytes + region_size * region);
auto* dst =
auto* vdst =
reinterpret_cast<__m128i*>(dst_bytes + region_size * region);
// Load and CRC data.
vec_data[j] = _mm_loadu_si128(src + i);
crcs[region] = crc32c_t{static_cast<uint32_t>(
_mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
_mm_extract_epi64(vec_data[j], 0)))};
crcs[region] = crc32c_t{static_cast<uint32_t>(
_mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
_mm_extract_epi64(vec_data[j], 1)))};
vec_data[j] = _mm_loadu_si128(vsrc + i);
crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
static_cast<uint32_t>(crcs[region]),
static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 0))))};
crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
static_cast<uint32_t>(crcs[region]),
static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 1))))};
// Store the data.
_mm_store_si128(dst + i, vec_data[j]);
_mm_store_si128(vdst + i, vec_data[j]);
}
// Preload the partial CRCs for the CLMUL subregions.
#ifdef __GNUC__
#pragma unroll_completely
#endif
for (int j = 0; j < int_regions; j++) {
for (size_t j = 0; j < int_regions; j++) {
// Cycle which regions get vector load/store and integer load/store, to
// engage prefetching logic around vector load/stores and save issue
// slots by using the integer registers.
int region = (j + vec_regions + i) % kRegions;
size_t region = (j + vec_regions + i) % kRegions;
auto* usrc =
reinterpret_cast<const uint64_t*>(src_bytes + region_size * region);
auto* udst =
reinterpret_cast<uint64_t*>(dst_bytes + region_size * region);
#ifdef __GNUC__
#pragma unroll_completely
#endif
for (int k = 0; k < kIntLoadsPerVec; k++) {
int data_index = j * kIntLoadsPerVec + k;
for (size_t k = 0; k < kIntLoadsPerVec; k++) {
size_t data_index = j * kIntLoadsPerVec + k;
// Load and CRC the data.
int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k);
......@@ -339,13 +317,13 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
// Finalize the first CRCs: XOR the internal CRCs by the XOR mask to undo the
// XOR done before doing block copy + CRCs.
for (int i = 0; i < kRegions - 1; i++) {
crcs[i] = crcs[i] ^ kCrcDataXor;
for (size_t i = 0; i + 1 < kRegions; i++) {
crcs[i] = crc32c_t{static_cast<uint32_t>(crcs[i]) ^ kCrcDataXor};
}
// Build a CRC of the first kRegions - 1 regions.
crc32c_t full_crc = crcs[0];
for (int i = 1; i < kRegions - 1; i++) {
for (size_t i = 1; i + 1 < kRegions; i++) {
full_crc = ConcatCrc32c(full_crc, crcs[i], region_size);
}
......@@ -360,7 +338,8 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
crcs[kRegions - 1]);
// Finalize and concatenate the final CRC, then return.
crcs[kRegions - 1] = crcs[kRegions - 1] ^ kCrcDataXor;
crcs[kRegions - 1] =
crc32c_t{static_cast<uint32_t>(crcs[kRegions - 1]) ^ kCrcDataXor};
return ConcatCrc32c(full_crc, crcs[kRegions - 1], region_size + tail_size);
}
......
......@@ -74,7 +74,7 @@ inline void *non_temporal_store_memcpy(void *__restrict dst,
uintptr_t bytes_before_alignment_boundary =
kCacheLineSize -
(reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1));
int header_len = (std::min)(bytes_before_alignment_boundary, len);
size_t header_len = (std::min)(bytes_before_alignment_boundary, len);
assert(bytes_before_alignment_boundary < kCacheLineSize);
memcpy(d, s, header_len);
d += header_len;
......@@ -87,7 +87,7 @@ inline void *non_temporal_store_memcpy(void *__restrict dst,
__m128i *dst_cacheline = reinterpret_cast<__m128i *>(d);
const __m128i *src_cacheline = reinterpret_cast<const __m128i *>(s);
constexpr int kOpsPerCacheLine = kCacheLineSize / sizeof(__m128i);
uint64_t loops = len / kCacheLineSize;
size_t loops = len / kCacheLineSize;
while (len >= kCacheLineSize) {
__m128i temp1, temp2, temp3, temp4;
......@@ -132,7 +132,7 @@ inline void *non_temporal_store_memcpy_avx(void *__restrict dst,
uintptr_t bytes_before_alignment_boundary =
kCacheLineSize -
(reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1));
int header_len = (std::min)(bytes_before_alignment_boundary, len);
size_t header_len = (std::min)(bytes_before_alignment_boundary, len);
assert(bytes_before_alignment_boundary < kCacheLineSize);
memcpy(d, s, header_len);
d += header_len;
......@@ -145,7 +145,7 @@ inline void *non_temporal_store_memcpy_avx(void *__restrict dst,
__m256i *dst_cacheline = reinterpret_cast<__m256i *>(d);
const __m256i *src_cacheline = reinterpret_cast<const __m256i *>(s);
constexpr int kOpsPerCacheLine = kCacheLineSize / sizeof(__m256i);
int loops = len / kCacheLineSize;
size_t loops = len / kCacheLineSize;
while (len >= kCacheLineSize) {
__m256i temp1, temp2;
......
......@@ -77,6 +77,7 @@ for std in ${STD}; do
--copt="--gcc-toolchain=/usr/local" \
--copt="-DGTEST_REMOVE_LEGACY_TEST_CASEAPI_=1" \
--copt="${exceptions_mode}" \
--copt="-march=haswell" \
--copt=-Werror \
--define="absl=1" \
--distdir="/bazel-distdir" \
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment