Add support for ARM intrinsics in crc_memcpy

This change replaces inline x86 intrinsics with generic versions that compile for both x86 and ARM depending on the target arch. This change does not enable the accelerated crc memcpy engine on ARM. That will be done in a subsequent change after the optimal number of vector and integer regions for different CPUs is determined. PiperOrigin-RevId: 562785420 Change-Id: I8ba4aa8de17587cedd92532f03767059a481f159

Add support for ARM intrinsics in crc_memcpy
This change replaces inline x86 intrinsics with generic versions that compile for both x86 and ARM depending on the target arch. This change does not enable the accelerated crc memcpy engine on ARM. That will be done in a subsequent change after the optimal number of vector and integer regions for different CPUs is determined. PiperOrigin-RevId: 562785420 Change-Id: I8ba4aa8de17587cedd92532f03767059a481f159
1a882833 · Abseil Team · Copybara-Service · fc44fa05 · 1a882833 · 1a882833
Commit 1a882833 authored Sep 05, 2023 by Abseil Team Committed by Copybara-Service Sep 05, 2023
7 changed files
--- a/CMake/AbseilDll.cmake
+++ b/CMake/AbseilDll.cmake
@@ -107,7 +107,7 @@ set(ABSL_INTERNAL_DLL_FILES
  "crc/internal/crc_x86_arm_combined.cc"
  "crc/internal/crc_memcpy_fallback.cc"
  "crc/internal/crc_memcpy.h"
-  "crc/internal/crc_memcpy_x86_64.cc"
+  "crc/internal/crc_memcpy_x86_arm_combined.cc"
  "crc/internal/crc_non_temporal_memcpy.cc"
  "crc/internal/crc_x86_arm_combined.cc"
  "crc/internal/non_temporal_arm_intrinsics.h"

--- a/absl/crc/BUILD.bazel
+++ b/absl/crc/BUILD.bazel
@@ -54,10 +54,8 @@ cc_library(
    visibility = ["//visibility:private"],
    deps = [
        ":cpu_detect",
-        "//absl/base",
        "//absl/base:config",
        "//absl/base:core_headers",
-        "//absl/base:dynamic_annotations",
        "//absl/base:endian",
        "//absl/base:prefetch",
        "//absl/base:raw_logging_internal",
@@ -72,7 +70,7 @@ cc_library(
        "crc32c.cc",
        "internal/crc32c_inline.h",
        "internal/crc_memcpy_fallback.cc",
-        "internal/crc_memcpy_x86_64.cc",
+        "internal/crc_memcpy_x86_arm_combined.cc",
        "internal/crc_non_temporal_memcpy.cc",
    ],
    hdrs = [
@@ -89,7 +87,6 @@ cc_library(
        ":non_temporal_memcpy",
        "//absl/base:config",
        "//absl/base:core_headers",
-        "//absl/base:dynamic_annotations",
        "//absl/base:endian",
        "//absl/base:prefetch",
        "//absl/strings",

--- a/absl/crc/CMakeLists.txt
+++ b/absl/crc/CMakeLists.txt
@@ -42,10 +42,8 @@ absl_cc_library(
    ${ABSL_DEFAULT_COPTS}
  DEPS
    absl::crc_cpu_detect
-    absl::base
    absl::config
    absl::core_headers
-    absl::dynamic_annotations
    absl::endian
    absl::prefetch
    absl::raw_logging_internal
@@ -64,7 +62,7 @@ absl_cc_library(
    "crc32c.cc"
    "internal/crc32c_inline.h"
    "internal/crc_memcpy_fallback.cc"
-    "internal/crc_memcpy_x86_64.cc"
+    "internal/crc_memcpy_x86_arm_combined.cc"
    "internal/crc_non_temporal_memcpy.cc"
  COPTS
    ${ABSL_DEFAULT_COPTS}
@@ -74,7 +72,6 @@ absl_cc_library(
    absl::non_temporal_memcpy
    absl::config
    absl::core_headers
-    absl::dynamic_annotations
    absl::endian
    absl::prefetch
    absl::str_format

--- a/absl/crc/internal/crc32_x86_arm_combined_simd.h
+++ b/absl/crc/internal/crc32_x86_arm_combined_simd.h
@@ -58,8 +58,10 @@ namespace crc_internal {
 #if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
 using V128 = uint64x2_t;
+using V128u = uint64x2_t;
 #else
 using V128 = __m128i;
+using V128u = __m128i_u;
 #endif
 // Starting with the initial value in |crc|, accumulates a CRC32 value for
@@ -76,7 +78,10 @@ uint32_t CRC32_u64(uint32_t crc, uint64_t v);
 V128 V128_Load(const V128* src);
 // Load 128 bits of integer data. |src| does not need to be aligned.
-V128 V128_LoadU(const V128* src);
+V128 V128_LoadU(const V128u* src);
+// Store 128 bits of integer data. |src| must be 16-byte aligned.
+void V128_Store(V128* dst, V128 data);
 // Polynomially multiplies the high 64 bits of |l| and |r|.
 V128 V128_PMulHi(const V128 l, const V128 r);
@@ -109,6 +114,10 @@ V128 V128_ShiftRight(const V128 l);
 template <int imm>
 int V128_Extract32(const V128 l);
+// Extracts a 64-bit integer from |l|, selected with |imm|.
+template <int imm>
+uint64_t V128_Extract64(const V128 l);
 // Extracts the low 64 bits from V128.
 int64_t V128_Low64(const V128 l);
@@ -137,7 +146,9 @@ inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {
 inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }
-inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }
+inline V128 V128_LoadU(const V128u* src) { return _mm_loadu_si128(src); }
+inline void V128_Store(V128* dst, V128 data) { _mm_store_si128(dst, data); }
 inline V128 V128_PMulHi(const V128 l, const V128 r) {
  return _mm_clmulepi64_si128(l, r, 0x11);
@@ -173,6 +184,11 @@ inline int V128_Extract32(const V128 l) {
  return _mm_extract_epi32(l, imm);
 }
+template <int imm>
+inline uint64_t V128_Extract64(const V128 l) {
+  return static_cast<uint64_t>(_mm_extract_epi64(l, imm));
+}
 inline int64_t V128_Low64(const V128 l) { return _mm_cvtsi128_si64(l); }
 inline V128 V128_ShiftLeft64(const V128 l, const V128 r) {
@@ -199,10 +215,14 @@ inline V128 V128_Load(const V128* src) {
  return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
 }
-inline V128 V128_LoadU(const V128* src) {
+inline V128 V128_LoadU(const V128u* src) {
  return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
 }
+inline void V128_Store(V128* dst, V128 data) {
+  vst1q_u64(reinterpret_cast<uint64_t*>(dst), data);
+}
 // Using inline assembly as clang does not generate the pmull2 instruction and
 // performance drops by 15-20%.
 // TODO(b/193678732): Investigate why the compiler decides not to generate
@@ -252,6 +272,11 @@ inline int V128_Extract32(const V128 l) {
  return vgetq_lane_s32(vreinterpretq_s32_u64(l), imm);
 }
+template <int imm>
+inline uint64_t V128_Extract64(const V128 l) {
+  return vgetq_lane_s64(vreinterpretq_s64_u64(l), imm);
+}
 inline int64_t V128_Low64(const V128 l) {
  return vgetq_lane_s64(vreinterpretq_s64_u64(l), 0);
 }

--- a/absl/crc/internal/crc_memcpy.h
+++ b/absl/crc/internal/crc_memcpy.h
@@ -20,12 +20,15 @@
 #include "absl/base/config.h"
 #include "absl/crc/crc32c.h"
+#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
 // Defined if the class AcceleratedCrcMemcpyEngine exists.
 #if defined(__x86_64__) && defined(__SSE4_2__)
 #define ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE 1
 #elif defined(_MSC_VER) && defined(__AVX__)
 #define ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE 1
+#elif defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
+#define ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE 1
 #endif
 namespace absl {

--- a/absl/crc/internal/crc_memcpy_fallback.cc
+++ b/absl/crc/internal/crc_memcpy_fallback.cc
@@ -54,7 +54,8 @@ absl::crc32c_t FallbackCrcMemcpyEngine::Compute(void* __restrict dst,
 }
 // Compile the following only if we don't have
-#ifndef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
+#if !defined(ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE) && \
+    !defined(ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE)
 CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() {
  CrcMemcpy::ArchSpecificEngines engines;
@@ -68,7 +69,8 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int /*vector*/,
  return std::make_unique<FallbackCrcMemcpyEngine>();
 }
-#endif  // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
+#endif  // !ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE &&
+        // !ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE
 }  // namespace crc_internal
 ABSL_NAMESPACE_END

--- a/absl/crc/internal/crc_memcpy_x86_64.cc
+++ b/absl/crc/internal/crc_memcpy_x86_64.cc
@@ -12,9 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-// Simultaneous memcopy and CRC-32C for x86-64.  Uses integer registers because
+// Simultaneous memcopy and CRC-32C for x86-64 and ARM 64. Uses integer
-// XMM registers do not support the CRC instruction (yet).  While copying,
+// registers because XMM registers do not support the CRC instruction (yet).
-// compute the running CRC of the data being copied.
+// While copying, compute the running CRC of the data being copied.
 //
 // It is assumed that any CPU running this code has SSE4.2 instructions
 // available (for CRC32C).  This file will do nothing if that is not true.
@@ -57,10 +57,12 @@
 #include "absl/base/prefetch.h"
 #include "absl/crc/crc32c.h"
 #include "absl/crc/internal/cpu_detect.h"
+#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
 #include "absl/crc/internal/crc_memcpy.h"
 #include "absl/strings/string_view.h"
-#ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
+#if defined(ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE) || \
+    defined(ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE)
 namespace absl {
 ABSL_NAMESPACE_BEGIN
@@ -75,7 +77,7 @@ inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
  uint32_t crc_uint32 = static_cast<uint32_t>(crc);
  for (std::size_t i = 0; i < length; i++) {
    uint8_t data = *reinterpret_cast<const uint8_t*>(src);
-    crc_uint32 = _mm_crc32_u8(crc_uint32, data);
+    crc_uint32 = CRC32_u8(crc_uint32, data);
    *reinterpret_cast<uint8_t*>(dst) = data;
    ++src;
    ++dst;
@@ -83,36 +85,35 @@ inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
  return crc32c_t{crc_uint32};
 }
-constexpr size_t kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t);
+constexpr size_t kIntLoadsPerVec = sizeof(V128) / sizeof(uint64_t);
 // Common function for copying the tails of multiple large regions.
 template <size_t vec_regions, size_t int_regions>
 inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
                          size_t region_size, size_t copy_rounds) {
-  std::array<__m128i, vec_regions> data;
+  std::array<V128, vec_regions> data;
  std::array<uint64_t, kIntLoadsPerVec * int_regions> int_data;
  while (copy_rounds > 0) {
    for (size_t i = 0; i < vec_regions; i++) {
      size_t region = i;
-      auto* vsrc =
+      auto* vsrc = reinterpret_cast<const V128u*>(*src + region_size * region);
-          reinterpret_cast<const __m128i*>(*src + region_size * region);
+      auto* vdst = reinterpret_cast<V128*>(*dst + region_size * region);
-      auto* vdst = reinterpret_cast<__m128i*>(*dst + region_size * region);
      // Load the blocks, unaligned
-      data[i] = _mm_loadu_si128(vsrc);
+      data[i] = V128_LoadU(vsrc);
      // Store the blocks, aligned
-      _mm_store_si128(vdst, data[i]);
+      V128_Store(vdst, data[i]);
      // Compute the running CRC
      crcs[region] = crc32c_t{static_cast<uint32_t>(
-          _mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
+          CRC32_u64(static_cast<uint32_t>(crcs[region]),
-                        static_cast<uint64_t>(_mm_extract_epi64(data[i], 0))))};
+                    static_cast<uint64_t>(V128_Extract64<0>(data[i]))))};
      crcs[region] = crc32c_t{static_cast<uint32_t>(
-          _mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
+          CRC32_u64(static_cast<uint32_t>(crcs[region]),
-                        static_cast<uint64_t>(_mm_extract_epi64(data[i], 1))))};
+                    static_cast<uint64_t>(V128_Extract64<1>(data[i]))))};
    }
    for (size_t i = 0; i < int_regions; i++) {
@@ -126,7 +127,7 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
        size_t data_index = i * kIntLoadsPerVec + j;
        int_data[data_index] = *(usrc + j);
-        crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
+        crcs[region] = crc32c_t{static_cast<uint32_t>(CRC32_u64(
            static_cast<uint32_t>(crcs[region]), int_data[data_index]))};
        *(udst + j) = int_data[data_index];
@@ -134,8 +135,8 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
    }
    // Increment pointers
-    *src += sizeof(__m128i);
+    *src += sizeof(V128);
-    *dst += sizeof(__m128i);
+    *dst += sizeof(V128);
    --copy_rounds;
  }
 }
@@ -161,7 +162,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
  constexpr std::size_t kRegions = vec_regions + int_regions;
  static_assert(kRegions > 0, "Must specify at least one region.");
  constexpr uint32_t kCrcDataXor = uint32_t{0xffffffff};
-  constexpr std::size_t kBlockSize = sizeof(__m128i);
+  constexpr std::size_t kBlockSize = sizeof(V128);
  constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize;
  // Number of blocks per cacheline.
@@ -237,7 +238,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
  const std::size_t tail_size = length - (kRegions * region_size);
  // Holding registers for data in each region.
-  std::array<__m128i, vec_regions> vec_data;
+  std::array<V128, vec_regions> vec_data;
  std::array<uint64_t, int_regions * kIntLoadsPerVec> int_data;
  // Main loop.
@@ -245,7 +246,10 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
    // Prefetch kPrefetchAhead bytes ahead of each pointer.
    for (size_t i = 0; i < kRegions; i++) {
      absl::PrefetchToLocalCache(src_bytes + kPrefetchAhead + region_size * i);
+#ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
+      // TODO(b/297082454): investigate dropping prefetch on x86.
      absl::PrefetchToLocalCache(dst_bytes + kPrefetchAhead + region_size * i);
+#endif
    }
    // Load and store data, computing CRC on the way.
@@ -258,21 +262,20 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
        size_t region = (j + i) % kRegions;
        auto* vsrc =
-            reinterpret_cast<const __m128i*>(src_bytes + region_size * region);
+            reinterpret_cast<const V128u*>(src_bytes + region_size * region);
-        auto* vdst =
+        auto* vdst = reinterpret_cast<V128*>(dst_bytes + region_size * region);
-            reinterpret_cast<__m128i*>(dst_bytes + region_size * region);
        // Load and CRC data.
-        vec_data[j] = _mm_loadu_si128(vsrc + i);
+        vec_data[j] = V128_LoadU(vsrc + i);
-        crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
+        crcs[region] = crc32c_t{static_cast<uint32_t>(
-            static_cast<uint32_t>(crcs[region]),
+            CRC32_u64(static_cast<uint32_t>(crcs[region]),
-            static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 0))))};
+                      static_cast<uint64_t>(V128_Extract64<0>(vec_data[j]))))};
-        crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
+        crcs[region] = crc32c_t{static_cast<uint32_t>(
-            static_cast<uint32_t>(crcs[region]),
+            CRC32_u64(static_cast<uint32_t>(crcs[region]),
-            static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 1))))};
+                      static_cast<uint64_t>(V128_Extract64<1>(vec_data[j]))))};
        // Store the data.
-        _mm_store_si128(vdst + i, vec_data[j]);
+        V128_Store(vdst + i, vec_data[j]);
      }
      // Preload the partial CRCs for the CLMUL subregions.
@@ -292,7 +295,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
          // Load and CRC the data.
          int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k);
-          crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
+          crcs[region] = crc32c_t{static_cast<uint32_t>(CRC32_u64(
              static_cast<uint32_t>(crcs[region]), int_data[data_index]))};
          // Store the data.
@@ -443,4 +446,5 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int vector,
 ABSL_NAMESPACE_END
 }  // namespace absl
-#endif  // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
+#endif  // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE ||
+        // ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE