Commit 1a882833 by Abseil Team Committed by Copybara-Service

Add support for ARM intrinsics in crc_memcpy

This change replaces inline x86 intrinsics with generic versions that compile
for both x86 and ARM depending on the target arch.

This change does not enable the accelerated crc memcpy engine on ARM. That will
be done in a subsequent change after the optimal number of vector and integer
regions for different CPUs is determined.

PiperOrigin-RevId: 562785420
Change-Id: I8ba4aa8de17587cedd92532f03767059a481f159
parent fc44fa05
...@@ -107,7 +107,7 @@ set(ABSL_INTERNAL_DLL_FILES ...@@ -107,7 +107,7 @@ set(ABSL_INTERNAL_DLL_FILES
"crc/internal/crc_x86_arm_combined.cc" "crc/internal/crc_x86_arm_combined.cc"
"crc/internal/crc_memcpy_fallback.cc" "crc/internal/crc_memcpy_fallback.cc"
"crc/internal/crc_memcpy.h" "crc/internal/crc_memcpy.h"
"crc/internal/crc_memcpy_x86_64.cc" "crc/internal/crc_memcpy_x86_arm_combined.cc"
"crc/internal/crc_non_temporal_memcpy.cc" "crc/internal/crc_non_temporal_memcpy.cc"
"crc/internal/crc_x86_arm_combined.cc" "crc/internal/crc_x86_arm_combined.cc"
"crc/internal/non_temporal_arm_intrinsics.h" "crc/internal/non_temporal_arm_intrinsics.h"
......
...@@ -54,10 +54,8 @@ cc_library( ...@@ -54,10 +54,8 @@ cc_library(
visibility = ["//visibility:private"], visibility = ["//visibility:private"],
deps = [ deps = [
":cpu_detect", ":cpu_detect",
"//absl/base",
"//absl/base:config", "//absl/base:config",
"//absl/base:core_headers", "//absl/base:core_headers",
"//absl/base:dynamic_annotations",
"//absl/base:endian", "//absl/base:endian",
"//absl/base:prefetch", "//absl/base:prefetch",
"//absl/base:raw_logging_internal", "//absl/base:raw_logging_internal",
...@@ -72,7 +70,7 @@ cc_library( ...@@ -72,7 +70,7 @@ cc_library(
"crc32c.cc", "crc32c.cc",
"internal/crc32c_inline.h", "internal/crc32c_inline.h",
"internal/crc_memcpy_fallback.cc", "internal/crc_memcpy_fallback.cc",
"internal/crc_memcpy_x86_64.cc", "internal/crc_memcpy_x86_arm_combined.cc",
"internal/crc_non_temporal_memcpy.cc", "internal/crc_non_temporal_memcpy.cc",
], ],
hdrs = [ hdrs = [
...@@ -89,7 +87,6 @@ cc_library( ...@@ -89,7 +87,6 @@ cc_library(
":non_temporal_memcpy", ":non_temporal_memcpy",
"//absl/base:config", "//absl/base:config",
"//absl/base:core_headers", "//absl/base:core_headers",
"//absl/base:dynamic_annotations",
"//absl/base:endian", "//absl/base:endian",
"//absl/base:prefetch", "//absl/base:prefetch",
"//absl/strings", "//absl/strings",
......
...@@ -42,10 +42,8 @@ absl_cc_library( ...@@ -42,10 +42,8 @@ absl_cc_library(
${ABSL_DEFAULT_COPTS} ${ABSL_DEFAULT_COPTS}
DEPS DEPS
absl::crc_cpu_detect absl::crc_cpu_detect
absl::base
absl::config absl::config
absl::core_headers absl::core_headers
absl::dynamic_annotations
absl::endian absl::endian
absl::prefetch absl::prefetch
absl::raw_logging_internal absl::raw_logging_internal
...@@ -64,7 +62,7 @@ absl_cc_library( ...@@ -64,7 +62,7 @@ absl_cc_library(
"crc32c.cc" "crc32c.cc"
"internal/crc32c_inline.h" "internal/crc32c_inline.h"
"internal/crc_memcpy_fallback.cc" "internal/crc_memcpy_fallback.cc"
"internal/crc_memcpy_x86_64.cc" "internal/crc_memcpy_x86_arm_combined.cc"
"internal/crc_non_temporal_memcpy.cc" "internal/crc_non_temporal_memcpy.cc"
COPTS COPTS
${ABSL_DEFAULT_COPTS} ${ABSL_DEFAULT_COPTS}
...@@ -74,7 +72,6 @@ absl_cc_library( ...@@ -74,7 +72,6 @@ absl_cc_library(
absl::non_temporal_memcpy absl::non_temporal_memcpy
absl::config absl::config
absl::core_headers absl::core_headers
absl::dynamic_annotations
absl::endian absl::endian
absl::prefetch absl::prefetch
absl::str_format absl::str_format
......
...@@ -58,8 +58,10 @@ namespace crc_internal { ...@@ -58,8 +58,10 @@ namespace crc_internal {
#if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) #if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
using V128 = uint64x2_t; using V128 = uint64x2_t;
using V128u = uint64x2_t;
#else #else
using V128 = __m128i; using V128 = __m128i;
using V128u = __m128i_u;
#endif #endif
// Starting with the initial value in |crc|, accumulates a CRC32 value for // Starting with the initial value in |crc|, accumulates a CRC32 value for
...@@ -76,7 +78,10 @@ uint32_t CRC32_u64(uint32_t crc, uint64_t v); ...@@ -76,7 +78,10 @@ uint32_t CRC32_u64(uint32_t crc, uint64_t v);
V128 V128_Load(const V128* src); V128 V128_Load(const V128* src);
// Load 128 bits of integer data. |src| does not need to be aligned. // Load 128 bits of integer data. |src| does not need to be aligned.
V128 V128_LoadU(const V128* src); V128 V128_LoadU(const V128u* src);
// Store 128 bits of integer data. |src| must be 16-byte aligned.
void V128_Store(V128* dst, V128 data);
// Polynomially multiplies the high 64 bits of |l| and |r|. // Polynomially multiplies the high 64 bits of |l| and |r|.
V128 V128_PMulHi(const V128 l, const V128 r); V128 V128_PMulHi(const V128 l, const V128 r);
...@@ -109,6 +114,10 @@ V128 V128_ShiftRight(const V128 l); ...@@ -109,6 +114,10 @@ V128 V128_ShiftRight(const V128 l);
template <int imm> template <int imm>
int V128_Extract32(const V128 l); int V128_Extract32(const V128 l);
// Extracts a 64-bit integer from |l|, selected with |imm|.
template <int imm>
uint64_t V128_Extract64(const V128 l);
// Extracts the low 64 bits from V128. // Extracts the low 64 bits from V128.
int64_t V128_Low64(const V128 l); int64_t V128_Low64(const V128 l);
...@@ -137,7 +146,9 @@ inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) { ...@@ -137,7 +146,9 @@ inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {
inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); } inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }
inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); } inline V128 V128_LoadU(const V128u* src) { return _mm_loadu_si128(src); }
inline void V128_Store(V128* dst, V128 data) { _mm_store_si128(dst, data); }
inline V128 V128_PMulHi(const V128 l, const V128 r) { inline V128 V128_PMulHi(const V128 l, const V128 r) {
return _mm_clmulepi64_si128(l, r, 0x11); return _mm_clmulepi64_si128(l, r, 0x11);
...@@ -173,6 +184,11 @@ inline int V128_Extract32(const V128 l) { ...@@ -173,6 +184,11 @@ inline int V128_Extract32(const V128 l) {
return _mm_extract_epi32(l, imm); return _mm_extract_epi32(l, imm);
} }
template <int imm>
inline uint64_t V128_Extract64(const V128 l) {
return static_cast<uint64_t>(_mm_extract_epi64(l, imm));
}
inline int64_t V128_Low64(const V128 l) { return _mm_cvtsi128_si64(l); } inline int64_t V128_Low64(const V128 l) { return _mm_cvtsi128_si64(l); }
inline V128 V128_ShiftLeft64(const V128 l, const V128 r) { inline V128 V128_ShiftLeft64(const V128 l, const V128 r) {
...@@ -199,10 +215,14 @@ inline V128 V128_Load(const V128* src) { ...@@ -199,10 +215,14 @@ inline V128 V128_Load(const V128* src) {
return vld1q_u64(reinterpret_cast<const uint64_t*>(src)); return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
} }
inline V128 V128_LoadU(const V128* src) { inline V128 V128_LoadU(const V128u* src) {
return vld1q_u64(reinterpret_cast<const uint64_t*>(src)); return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
} }
inline void V128_Store(V128* dst, V128 data) {
vst1q_u64(reinterpret_cast<uint64_t*>(dst), data);
}
// Using inline assembly as clang does not generate the pmull2 instruction and // Using inline assembly as clang does not generate the pmull2 instruction and
// performance drops by 15-20%. // performance drops by 15-20%.
// TODO(b/193678732): Investigate why the compiler decides not to generate // TODO(b/193678732): Investigate why the compiler decides not to generate
...@@ -252,6 +272,11 @@ inline int V128_Extract32(const V128 l) { ...@@ -252,6 +272,11 @@ inline int V128_Extract32(const V128 l) {
return vgetq_lane_s32(vreinterpretq_s32_u64(l), imm); return vgetq_lane_s32(vreinterpretq_s32_u64(l), imm);
} }
template <int imm>
inline uint64_t V128_Extract64(const V128 l) {
return vgetq_lane_s64(vreinterpretq_s64_u64(l), imm);
}
inline int64_t V128_Low64(const V128 l) { inline int64_t V128_Low64(const V128 l) {
return vgetq_lane_s64(vreinterpretq_s64_u64(l), 0); return vgetq_lane_s64(vreinterpretq_s64_u64(l), 0);
} }
......
...@@ -20,12 +20,15 @@ ...@@ -20,12 +20,15 @@
#include "absl/base/config.h" #include "absl/base/config.h"
#include "absl/crc/crc32c.h" #include "absl/crc/crc32c.h"
#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
// Defined if the class AcceleratedCrcMemcpyEngine exists. // Defined if the class AcceleratedCrcMemcpyEngine exists.
#if defined(__x86_64__) && defined(__SSE4_2__) #if defined(__x86_64__) && defined(__SSE4_2__)
#define ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE 1 #define ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE 1
#elif defined(_MSC_VER) && defined(__AVX__) #elif defined(_MSC_VER) && defined(__AVX__)
#define ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE 1 #define ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE 1
#elif defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
#define ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE 1
#endif #endif
namespace absl { namespace absl {
......
...@@ -54,7 +54,8 @@ absl::crc32c_t FallbackCrcMemcpyEngine::Compute(void* __restrict dst, ...@@ -54,7 +54,8 @@ absl::crc32c_t FallbackCrcMemcpyEngine::Compute(void* __restrict dst,
} }
// Compile the following only if we don't have // Compile the following only if we don't have
#ifndef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE #if !defined(ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE) && \
!defined(ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE)
CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() { CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() {
CrcMemcpy::ArchSpecificEngines engines; CrcMemcpy::ArchSpecificEngines engines;
...@@ -68,7 +69,8 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int /*vector*/, ...@@ -68,7 +69,8 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int /*vector*/,
return std::make_unique<FallbackCrcMemcpyEngine>(); return std::make_unique<FallbackCrcMemcpyEngine>();
} }
#endif // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE #endif // !ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE &&
// !ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE
} // namespace crc_internal } // namespace crc_internal
ABSL_NAMESPACE_END ABSL_NAMESPACE_END
......
...@@ -12,9 +12,9 @@ ...@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
// Simultaneous memcopy and CRC-32C for x86-64. Uses integer registers because // Simultaneous memcopy and CRC-32C for x86-64 and ARM 64. Uses integer
// XMM registers do not support the CRC instruction (yet). While copying, // registers because XMM registers do not support the CRC instruction (yet).
// compute the running CRC of the data being copied. // While copying, compute the running CRC of the data being copied.
// //
// It is assumed that any CPU running this code has SSE4.2 instructions // It is assumed that any CPU running this code has SSE4.2 instructions
// available (for CRC32C). This file will do nothing if that is not true. // available (for CRC32C). This file will do nothing if that is not true.
...@@ -57,10 +57,12 @@ ...@@ -57,10 +57,12 @@
#include "absl/base/prefetch.h" #include "absl/base/prefetch.h"
#include "absl/crc/crc32c.h" #include "absl/crc/crc32c.h"
#include "absl/crc/internal/cpu_detect.h" #include "absl/crc/internal/cpu_detect.h"
#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
#include "absl/crc/internal/crc_memcpy.h" #include "absl/crc/internal/crc_memcpy.h"
#include "absl/strings/string_view.h" #include "absl/strings/string_view.h"
#ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE #if defined(ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE) || \
defined(ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE)
namespace absl { namespace absl {
ABSL_NAMESPACE_BEGIN ABSL_NAMESPACE_BEGIN
...@@ -75,7 +77,7 @@ inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length, ...@@ -75,7 +77,7 @@ inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
uint32_t crc_uint32 = static_cast<uint32_t>(crc); uint32_t crc_uint32 = static_cast<uint32_t>(crc);
for (std::size_t i = 0; i < length; i++) { for (std::size_t i = 0; i < length; i++) {
uint8_t data = *reinterpret_cast<const uint8_t*>(src); uint8_t data = *reinterpret_cast<const uint8_t*>(src);
crc_uint32 = _mm_crc32_u8(crc_uint32, data); crc_uint32 = CRC32_u8(crc_uint32, data);
*reinterpret_cast<uint8_t*>(dst) = data; *reinterpret_cast<uint8_t*>(dst) = data;
++src; ++src;
++dst; ++dst;
...@@ -83,36 +85,35 @@ inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length, ...@@ -83,36 +85,35 @@ inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
return crc32c_t{crc_uint32}; return crc32c_t{crc_uint32};
} }
constexpr size_t kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t); constexpr size_t kIntLoadsPerVec = sizeof(V128) / sizeof(uint64_t);
// Common function for copying the tails of multiple large regions. // Common function for copying the tails of multiple large regions.
template <size_t vec_regions, size_t int_regions> template <size_t vec_regions, size_t int_regions>
inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src, inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
size_t region_size, size_t copy_rounds) { size_t region_size, size_t copy_rounds) {
std::array<__m128i, vec_regions> data; std::array<V128, vec_regions> data;
std::array<uint64_t, kIntLoadsPerVec * int_regions> int_data; std::array<uint64_t, kIntLoadsPerVec * int_regions> int_data;
while (copy_rounds > 0) { while (copy_rounds > 0) {
for (size_t i = 0; i < vec_regions; i++) { for (size_t i = 0; i < vec_regions; i++) {
size_t region = i; size_t region = i;
auto* vsrc = auto* vsrc = reinterpret_cast<const V128u*>(*src + region_size * region);
reinterpret_cast<const __m128i*>(*src + region_size * region); auto* vdst = reinterpret_cast<V128*>(*dst + region_size * region);
auto* vdst = reinterpret_cast<__m128i*>(*dst + region_size * region);
// Load the blocks, unaligned // Load the blocks, unaligned
data[i] = _mm_loadu_si128(vsrc); data[i] = V128_LoadU(vsrc);
// Store the blocks, aligned // Store the blocks, aligned
_mm_store_si128(vdst, data[i]); V128_Store(vdst, data[i]);
// Compute the running CRC // Compute the running CRC
crcs[region] = crc32c_t{static_cast<uint32_t>( crcs[region] = crc32c_t{static_cast<uint32_t>(
_mm_crc32_u64(static_cast<uint32_t>(crcs[region]), CRC32_u64(static_cast<uint32_t>(crcs[region]),
static_cast<uint64_t>(_mm_extract_epi64(data[i], 0))))}; static_cast<uint64_t>(V128_Extract64<0>(data[i]))))};
crcs[region] = crc32c_t{static_cast<uint32_t>( crcs[region] = crc32c_t{static_cast<uint32_t>(
_mm_crc32_u64(static_cast<uint32_t>(crcs[region]), CRC32_u64(static_cast<uint32_t>(crcs[region]),
static_cast<uint64_t>(_mm_extract_epi64(data[i], 1))))}; static_cast<uint64_t>(V128_Extract64<1>(data[i]))))};
} }
for (size_t i = 0; i < int_regions; i++) { for (size_t i = 0; i < int_regions; i++) {
...@@ -126,7 +127,7 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src, ...@@ -126,7 +127,7 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
size_t data_index = i * kIntLoadsPerVec + j; size_t data_index = i * kIntLoadsPerVec + j;
int_data[data_index] = *(usrc + j); int_data[data_index] = *(usrc + j);
crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64( crcs[region] = crc32c_t{static_cast<uint32_t>(CRC32_u64(
static_cast<uint32_t>(crcs[region]), int_data[data_index]))}; static_cast<uint32_t>(crcs[region]), int_data[data_index]))};
*(udst + j) = int_data[data_index]; *(udst + j) = int_data[data_index];
...@@ -134,8 +135,8 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src, ...@@ -134,8 +135,8 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
} }
// Increment pointers // Increment pointers
*src += sizeof(__m128i); *src += sizeof(V128);
*dst += sizeof(__m128i); *dst += sizeof(V128);
--copy_rounds; --copy_rounds;
} }
} }
...@@ -161,7 +162,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( ...@@ -161,7 +162,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
constexpr std::size_t kRegions = vec_regions + int_regions; constexpr std::size_t kRegions = vec_regions + int_regions;
static_assert(kRegions > 0, "Must specify at least one region."); static_assert(kRegions > 0, "Must specify at least one region.");
constexpr uint32_t kCrcDataXor = uint32_t{0xffffffff}; constexpr uint32_t kCrcDataXor = uint32_t{0xffffffff};
constexpr std::size_t kBlockSize = sizeof(__m128i); constexpr std::size_t kBlockSize = sizeof(V128);
constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize; constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize;
// Number of blocks per cacheline. // Number of blocks per cacheline.
...@@ -237,7 +238,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( ...@@ -237,7 +238,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
const std::size_t tail_size = length - (kRegions * region_size); const std::size_t tail_size = length - (kRegions * region_size);
// Holding registers for data in each region. // Holding registers for data in each region.
std::array<__m128i, vec_regions> vec_data; std::array<V128, vec_regions> vec_data;
std::array<uint64_t, int_regions * kIntLoadsPerVec> int_data; std::array<uint64_t, int_regions * kIntLoadsPerVec> int_data;
// Main loop. // Main loop.
...@@ -245,7 +246,10 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( ...@@ -245,7 +246,10 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
// Prefetch kPrefetchAhead bytes ahead of each pointer. // Prefetch kPrefetchAhead bytes ahead of each pointer.
for (size_t i = 0; i < kRegions; i++) { for (size_t i = 0; i < kRegions; i++) {
absl::PrefetchToLocalCache(src_bytes + kPrefetchAhead + region_size * i); absl::PrefetchToLocalCache(src_bytes + kPrefetchAhead + region_size * i);
#ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
// TODO(b/297082454): investigate dropping prefetch on x86.
absl::PrefetchToLocalCache(dst_bytes + kPrefetchAhead + region_size * i); absl::PrefetchToLocalCache(dst_bytes + kPrefetchAhead + region_size * i);
#endif
} }
// Load and store data, computing CRC on the way. // Load and store data, computing CRC on the way.
...@@ -258,21 +262,20 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( ...@@ -258,21 +262,20 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
size_t region = (j + i) % kRegions; size_t region = (j + i) % kRegions;
auto* vsrc = auto* vsrc =
reinterpret_cast<const __m128i*>(src_bytes + region_size * region); reinterpret_cast<const V128u*>(src_bytes + region_size * region);
auto* vdst = auto* vdst = reinterpret_cast<V128*>(dst_bytes + region_size * region);
reinterpret_cast<__m128i*>(dst_bytes + region_size * region);
// Load and CRC data. // Load and CRC data.
vec_data[j] = _mm_loadu_si128(vsrc + i); vec_data[j] = V128_LoadU(vsrc + i);
crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64( crcs[region] = crc32c_t{static_cast<uint32_t>(
static_cast<uint32_t>(crcs[region]), CRC32_u64(static_cast<uint32_t>(crcs[region]),
static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 0))))}; static_cast<uint64_t>(V128_Extract64<0>(vec_data[j]))))};
crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64( crcs[region] = crc32c_t{static_cast<uint32_t>(
static_cast<uint32_t>(crcs[region]), CRC32_u64(static_cast<uint32_t>(crcs[region]),
static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 1))))}; static_cast<uint64_t>(V128_Extract64<1>(vec_data[j]))))};
// Store the data. // Store the data.
_mm_store_si128(vdst + i, vec_data[j]); V128_Store(vdst + i, vec_data[j]);
} }
// Preload the partial CRCs for the CLMUL subregions. // Preload the partial CRCs for the CLMUL subregions.
...@@ -292,7 +295,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( ...@@ -292,7 +295,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
// Load and CRC the data. // Load and CRC the data.
int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k); int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k);
crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64( crcs[region] = crc32c_t{static_cast<uint32_t>(CRC32_u64(
static_cast<uint32_t>(crcs[region]), int_data[data_index]))}; static_cast<uint32_t>(crcs[region]), int_data[data_index]))};
// Store the data. // Store the data.
...@@ -443,4 +446,5 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int vector, ...@@ -443,4 +446,5 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int vector,
ABSL_NAMESPACE_END ABSL_NAMESPACE_END
} // namespace absl } // namespace absl
#endif // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE #endif // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE ||
// ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment