Commit 461f1e49 by Abseil Team Committed by Copybara-Service

Rollback adding support for ARM intrinsics

In some configurations this change causes compilation errors. We will roll this
forward again after those issues are addressed.

PiperOrigin-RevId: 562810916
Change-Id: I45b2a8d456273e9eff188f36da8f11323c4dfe66
parent 1a882833
...@@ -107,7 +107,7 @@ set(ABSL_INTERNAL_DLL_FILES ...@@ -107,7 +107,7 @@ set(ABSL_INTERNAL_DLL_FILES
"crc/internal/crc_x86_arm_combined.cc" "crc/internal/crc_x86_arm_combined.cc"
"crc/internal/crc_memcpy_fallback.cc" "crc/internal/crc_memcpy_fallback.cc"
"crc/internal/crc_memcpy.h" "crc/internal/crc_memcpy.h"
"crc/internal/crc_memcpy_x86_arm_combined.cc" "crc/internal/crc_memcpy_x86_64.cc"
"crc/internal/crc_non_temporal_memcpy.cc" "crc/internal/crc_non_temporal_memcpy.cc"
"crc/internal/crc_x86_arm_combined.cc" "crc/internal/crc_x86_arm_combined.cc"
"crc/internal/non_temporal_arm_intrinsics.h" "crc/internal/non_temporal_arm_intrinsics.h"
......
...@@ -54,8 +54,10 @@ cc_library( ...@@ -54,8 +54,10 @@ cc_library(
visibility = ["//visibility:private"], visibility = ["//visibility:private"],
deps = [ deps = [
":cpu_detect", ":cpu_detect",
"//absl/base",
"//absl/base:config", "//absl/base:config",
"//absl/base:core_headers", "//absl/base:core_headers",
"//absl/base:dynamic_annotations",
"//absl/base:endian", "//absl/base:endian",
"//absl/base:prefetch", "//absl/base:prefetch",
"//absl/base:raw_logging_internal", "//absl/base:raw_logging_internal",
...@@ -70,7 +72,7 @@ cc_library( ...@@ -70,7 +72,7 @@ cc_library(
"crc32c.cc", "crc32c.cc",
"internal/crc32c_inline.h", "internal/crc32c_inline.h",
"internal/crc_memcpy_fallback.cc", "internal/crc_memcpy_fallback.cc",
"internal/crc_memcpy_x86_arm_combined.cc", "internal/crc_memcpy_x86_64.cc",
"internal/crc_non_temporal_memcpy.cc", "internal/crc_non_temporal_memcpy.cc",
], ],
hdrs = [ hdrs = [
...@@ -87,6 +89,7 @@ cc_library( ...@@ -87,6 +89,7 @@ cc_library(
":non_temporal_memcpy", ":non_temporal_memcpy",
"//absl/base:config", "//absl/base:config",
"//absl/base:core_headers", "//absl/base:core_headers",
"//absl/base:dynamic_annotations",
"//absl/base:endian", "//absl/base:endian",
"//absl/base:prefetch", "//absl/base:prefetch",
"//absl/strings", "//absl/strings",
......
...@@ -42,8 +42,10 @@ absl_cc_library( ...@@ -42,8 +42,10 @@ absl_cc_library(
${ABSL_DEFAULT_COPTS} ${ABSL_DEFAULT_COPTS}
DEPS DEPS
absl::crc_cpu_detect absl::crc_cpu_detect
absl::base
absl::config absl::config
absl::core_headers absl::core_headers
absl::dynamic_annotations
absl::endian absl::endian
absl::prefetch absl::prefetch
absl::raw_logging_internal absl::raw_logging_internal
...@@ -62,7 +64,7 @@ absl_cc_library( ...@@ -62,7 +64,7 @@ absl_cc_library(
"crc32c.cc" "crc32c.cc"
"internal/crc32c_inline.h" "internal/crc32c_inline.h"
"internal/crc_memcpy_fallback.cc" "internal/crc_memcpy_fallback.cc"
"internal/crc_memcpy_x86_arm_combined.cc" "internal/crc_memcpy_x86_64.cc"
"internal/crc_non_temporal_memcpy.cc" "internal/crc_non_temporal_memcpy.cc"
COPTS COPTS
${ABSL_DEFAULT_COPTS} ${ABSL_DEFAULT_COPTS}
...@@ -72,6 +74,7 @@ absl_cc_library( ...@@ -72,6 +74,7 @@ absl_cc_library(
absl::non_temporal_memcpy absl::non_temporal_memcpy
absl::config absl::config
absl::core_headers absl::core_headers
absl::dynamic_annotations
absl::endian absl::endian
absl::prefetch absl::prefetch
absl::str_format absl::str_format
......
...@@ -58,10 +58,8 @@ namespace crc_internal { ...@@ -58,10 +58,8 @@ namespace crc_internal {
#if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) #if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
using V128 = uint64x2_t; using V128 = uint64x2_t;
using V128u = uint64x2_t;
#else #else
using V128 = __m128i; using V128 = __m128i;
using V128u = __m128i_u;
#endif #endif
// Starting with the initial value in |crc|, accumulates a CRC32 value for // Starting with the initial value in |crc|, accumulates a CRC32 value for
...@@ -78,10 +76,7 @@ uint32_t CRC32_u64(uint32_t crc, uint64_t v); ...@@ -78,10 +76,7 @@ uint32_t CRC32_u64(uint32_t crc, uint64_t v);
V128 V128_Load(const V128* src); V128 V128_Load(const V128* src);
// Load 128 bits of integer data. |src| does not need to be aligned. // Load 128 bits of integer data. |src| does not need to be aligned.
V128 V128_LoadU(const V128u* src); V128 V128_LoadU(const V128* src);
// Store 128 bits of integer data. |src| must be 16-byte aligned.
void V128_Store(V128* dst, V128 data);
// Polynomially multiplies the high 64 bits of |l| and |r|. // Polynomially multiplies the high 64 bits of |l| and |r|.
V128 V128_PMulHi(const V128 l, const V128 r); V128 V128_PMulHi(const V128 l, const V128 r);
...@@ -114,10 +109,6 @@ V128 V128_ShiftRight(const V128 l); ...@@ -114,10 +109,6 @@ V128 V128_ShiftRight(const V128 l);
template <int imm> template <int imm>
int V128_Extract32(const V128 l); int V128_Extract32(const V128 l);
// Extracts a 64-bit integer from |l|, selected with |imm|.
template <int imm>
uint64_t V128_Extract64(const V128 l);
// Extracts the low 64 bits from V128. // Extracts the low 64 bits from V128.
int64_t V128_Low64(const V128 l); int64_t V128_Low64(const V128 l);
...@@ -146,9 +137,7 @@ inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) { ...@@ -146,9 +137,7 @@ inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {
inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); } inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }
inline V128 V128_LoadU(const V128u* src) { return _mm_loadu_si128(src); } inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }
inline void V128_Store(V128* dst, V128 data) { _mm_store_si128(dst, data); }
inline V128 V128_PMulHi(const V128 l, const V128 r) { inline V128 V128_PMulHi(const V128 l, const V128 r) {
return _mm_clmulepi64_si128(l, r, 0x11); return _mm_clmulepi64_si128(l, r, 0x11);
...@@ -184,11 +173,6 @@ inline int V128_Extract32(const V128 l) { ...@@ -184,11 +173,6 @@ inline int V128_Extract32(const V128 l) {
return _mm_extract_epi32(l, imm); return _mm_extract_epi32(l, imm);
} }
template <int imm>
inline uint64_t V128_Extract64(const V128 l) {
return static_cast<uint64_t>(_mm_extract_epi64(l, imm));
}
inline int64_t V128_Low64(const V128 l) { return _mm_cvtsi128_si64(l); } inline int64_t V128_Low64(const V128 l) { return _mm_cvtsi128_si64(l); }
inline V128 V128_ShiftLeft64(const V128 l, const V128 r) { inline V128 V128_ShiftLeft64(const V128 l, const V128 r) {
...@@ -215,14 +199,10 @@ inline V128 V128_Load(const V128* src) { ...@@ -215,14 +199,10 @@ inline V128 V128_Load(const V128* src) {
return vld1q_u64(reinterpret_cast<const uint64_t*>(src)); return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
} }
inline V128 V128_LoadU(const V128u* src) { inline V128 V128_LoadU(const V128* src) {
return vld1q_u64(reinterpret_cast<const uint64_t*>(src)); return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
} }
inline void V128_Store(V128* dst, V128 data) {
vst1q_u64(reinterpret_cast<uint64_t*>(dst), data);
}
// Using inline assembly as clang does not generate the pmull2 instruction and // Using inline assembly as clang does not generate the pmull2 instruction and
// performance drops by 15-20%. // performance drops by 15-20%.
// TODO(b/193678732): Investigate why the compiler decides not to generate // TODO(b/193678732): Investigate why the compiler decides not to generate
...@@ -272,11 +252,6 @@ inline int V128_Extract32(const V128 l) { ...@@ -272,11 +252,6 @@ inline int V128_Extract32(const V128 l) {
return vgetq_lane_s32(vreinterpretq_s32_u64(l), imm); return vgetq_lane_s32(vreinterpretq_s32_u64(l), imm);
} }
template <int imm>
inline uint64_t V128_Extract64(const V128 l) {
return vgetq_lane_s64(vreinterpretq_s64_u64(l), imm);
}
inline int64_t V128_Low64(const V128 l) { inline int64_t V128_Low64(const V128 l) {
return vgetq_lane_s64(vreinterpretq_s64_u64(l), 0); return vgetq_lane_s64(vreinterpretq_s64_u64(l), 0);
} }
......
...@@ -20,15 +20,12 @@ ...@@ -20,15 +20,12 @@
#include "absl/base/config.h" #include "absl/base/config.h"
#include "absl/crc/crc32c.h" #include "absl/crc/crc32c.h"
#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
// Defined if the class AcceleratedCrcMemcpyEngine exists. // Defined if the class AcceleratedCrcMemcpyEngine exists.
#if defined(__x86_64__) && defined(__SSE4_2__) #if defined(__x86_64__) && defined(__SSE4_2__)
#define ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE 1 #define ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE 1
#elif defined(_MSC_VER) && defined(__AVX__) #elif defined(_MSC_VER) && defined(__AVX__)
#define ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE 1 #define ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE 1
#elif defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
#define ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE 1
#endif #endif
namespace absl { namespace absl {
......
...@@ -54,8 +54,7 @@ absl::crc32c_t FallbackCrcMemcpyEngine::Compute(void* __restrict dst, ...@@ -54,8 +54,7 @@ absl::crc32c_t FallbackCrcMemcpyEngine::Compute(void* __restrict dst,
} }
// Compile the following only if we don't have // Compile the following only if we don't have
#if !defined(ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE) && \ #ifndef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
!defined(ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE)
CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() { CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() {
CrcMemcpy::ArchSpecificEngines engines; CrcMemcpy::ArchSpecificEngines engines;
...@@ -69,8 +68,7 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int /*vector*/, ...@@ -69,8 +68,7 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int /*vector*/,
return std::make_unique<FallbackCrcMemcpyEngine>(); return std::make_unique<FallbackCrcMemcpyEngine>();
} }
#endif // !ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE && #endif // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
// !ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE
} // namespace crc_internal } // namespace crc_internal
ABSL_NAMESPACE_END ABSL_NAMESPACE_END
......
...@@ -12,9 +12,9 @@ ...@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
// Simultaneous memcopy and CRC-32C for x86-64 and ARM 64. Uses integer // Simultaneous memcopy and CRC-32C for x86-64. Uses integer registers because
// registers because XMM registers do not support the CRC instruction (yet). // XMM registers do not support the CRC instruction (yet). While copying,
// While copying, compute the running CRC of the data being copied. // compute the running CRC of the data being copied.
// //
// It is assumed that any CPU running this code has SSE4.2 instructions // It is assumed that any CPU running this code has SSE4.2 instructions
// available (for CRC32C). This file will do nothing if that is not true. // available (for CRC32C). This file will do nothing if that is not true.
...@@ -57,12 +57,10 @@ ...@@ -57,12 +57,10 @@
#include "absl/base/prefetch.h" #include "absl/base/prefetch.h"
#include "absl/crc/crc32c.h" #include "absl/crc/crc32c.h"
#include "absl/crc/internal/cpu_detect.h" #include "absl/crc/internal/cpu_detect.h"
#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
#include "absl/crc/internal/crc_memcpy.h" #include "absl/crc/internal/crc_memcpy.h"
#include "absl/strings/string_view.h" #include "absl/strings/string_view.h"
#if defined(ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE) || \ #ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
defined(ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE)
namespace absl { namespace absl {
ABSL_NAMESPACE_BEGIN ABSL_NAMESPACE_BEGIN
...@@ -77,7 +75,7 @@ inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length, ...@@ -77,7 +75,7 @@ inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
uint32_t crc_uint32 = static_cast<uint32_t>(crc); uint32_t crc_uint32 = static_cast<uint32_t>(crc);
for (std::size_t i = 0; i < length; i++) { for (std::size_t i = 0; i < length; i++) {
uint8_t data = *reinterpret_cast<const uint8_t*>(src); uint8_t data = *reinterpret_cast<const uint8_t*>(src);
crc_uint32 = CRC32_u8(crc_uint32, data); crc_uint32 = _mm_crc32_u8(crc_uint32, data);
*reinterpret_cast<uint8_t*>(dst) = data; *reinterpret_cast<uint8_t*>(dst) = data;
++src; ++src;
++dst; ++dst;
...@@ -85,35 +83,36 @@ inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length, ...@@ -85,35 +83,36 @@ inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
return crc32c_t{crc_uint32}; return crc32c_t{crc_uint32};
} }
constexpr size_t kIntLoadsPerVec = sizeof(V128) / sizeof(uint64_t); constexpr size_t kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t);
// Common function for copying the tails of multiple large regions. // Common function for copying the tails of multiple large regions.
template <size_t vec_regions, size_t int_regions> template <size_t vec_regions, size_t int_regions>
inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src, inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
size_t region_size, size_t copy_rounds) { size_t region_size, size_t copy_rounds) {
std::array<V128, vec_regions> data; std::array<__m128i, vec_regions> data;
std::array<uint64_t, kIntLoadsPerVec * int_regions> int_data; std::array<uint64_t, kIntLoadsPerVec * int_regions> int_data;
while (copy_rounds > 0) { while (copy_rounds > 0) {
for (size_t i = 0; i < vec_regions; i++) { for (size_t i = 0; i < vec_regions; i++) {
size_t region = i; size_t region = i;
auto* vsrc = reinterpret_cast<const V128u*>(*src + region_size * region); auto* vsrc =
auto* vdst = reinterpret_cast<V128*>(*dst + region_size * region); reinterpret_cast<const __m128i*>(*src + region_size * region);
auto* vdst = reinterpret_cast<__m128i*>(*dst + region_size * region);
// Load the blocks, unaligned // Load the blocks, unaligned
data[i] = V128_LoadU(vsrc); data[i] = _mm_loadu_si128(vsrc);
// Store the blocks, aligned // Store the blocks, aligned
V128_Store(vdst, data[i]); _mm_store_si128(vdst, data[i]);
// Compute the running CRC // Compute the running CRC
crcs[region] = crc32c_t{static_cast<uint32_t>( crcs[region] = crc32c_t{static_cast<uint32_t>(
CRC32_u64(static_cast<uint32_t>(crcs[region]), _mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
static_cast<uint64_t>(V128_Extract64<0>(data[i]))))}; static_cast<uint64_t>(_mm_extract_epi64(data[i], 0))))};
crcs[region] = crc32c_t{static_cast<uint32_t>( crcs[region] = crc32c_t{static_cast<uint32_t>(
CRC32_u64(static_cast<uint32_t>(crcs[region]), _mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
static_cast<uint64_t>(V128_Extract64<1>(data[i]))))}; static_cast<uint64_t>(_mm_extract_epi64(data[i], 1))))};
} }
for (size_t i = 0; i < int_regions; i++) { for (size_t i = 0; i < int_regions; i++) {
...@@ -127,7 +126,7 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src, ...@@ -127,7 +126,7 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
size_t data_index = i * kIntLoadsPerVec + j; size_t data_index = i * kIntLoadsPerVec + j;
int_data[data_index] = *(usrc + j); int_data[data_index] = *(usrc + j);
crcs[region] = crc32c_t{static_cast<uint32_t>(CRC32_u64( crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
static_cast<uint32_t>(crcs[region]), int_data[data_index]))}; static_cast<uint32_t>(crcs[region]), int_data[data_index]))};
*(udst + j) = int_data[data_index]; *(udst + j) = int_data[data_index];
...@@ -135,8 +134,8 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src, ...@@ -135,8 +134,8 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
} }
// Increment pointers // Increment pointers
*src += sizeof(V128); *src += sizeof(__m128i);
*dst += sizeof(V128); *dst += sizeof(__m128i);
--copy_rounds; --copy_rounds;
} }
} }
...@@ -162,7 +161,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( ...@@ -162,7 +161,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
constexpr std::size_t kRegions = vec_regions + int_regions; constexpr std::size_t kRegions = vec_regions + int_regions;
static_assert(kRegions > 0, "Must specify at least one region."); static_assert(kRegions > 0, "Must specify at least one region.");
constexpr uint32_t kCrcDataXor = uint32_t{0xffffffff}; constexpr uint32_t kCrcDataXor = uint32_t{0xffffffff};
constexpr std::size_t kBlockSize = sizeof(V128); constexpr std::size_t kBlockSize = sizeof(__m128i);
constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize; constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize;
// Number of blocks per cacheline. // Number of blocks per cacheline.
...@@ -238,7 +237,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( ...@@ -238,7 +237,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
const std::size_t tail_size = length - (kRegions * region_size); const std::size_t tail_size = length - (kRegions * region_size);
// Holding registers for data in each region. // Holding registers for data in each region.
std::array<V128, vec_regions> vec_data; std::array<__m128i, vec_regions> vec_data;
std::array<uint64_t, int_regions * kIntLoadsPerVec> int_data; std::array<uint64_t, int_regions * kIntLoadsPerVec> int_data;
// Main loop. // Main loop.
...@@ -246,10 +245,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( ...@@ -246,10 +245,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
// Prefetch kPrefetchAhead bytes ahead of each pointer. // Prefetch kPrefetchAhead bytes ahead of each pointer.
for (size_t i = 0; i < kRegions; i++) { for (size_t i = 0; i < kRegions; i++) {
absl::PrefetchToLocalCache(src_bytes + kPrefetchAhead + region_size * i); absl::PrefetchToLocalCache(src_bytes + kPrefetchAhead + region_size * i);
#ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
// TODO(b/297082454): investigate dropping prefetch on x86.
absl::PrefetchToLocalCache(dst_bytes + kPrefetchAhead + region_size * i); absl::PrefetchToLocalCache(dst_bytes + kPrefetchAhead + region_size * i);
#endif
} }
// Load and store data, computing CRC on the way. // Load and store data, computing CRC on the way.
...@@ -262,20 +258,21 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( ...@@ -262,20 +258,21 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
size_t region = (j + i) % kRegions; size_t region = (j + i) % kRegions;
auto* vsrc = auto* vsrc =
reinterpret_cast<const V128u*>(src_bytes + region_size * region); reinterpret_cast<const __m128i*>(src_bytes + region_size * region);
auto* vdst = reinterpret_cast<V128*>(dst_bytes + region_size * region); auto* vdst =
reinterpret_cast<__m128i*>(dst_bytes + region_size * region);
// Load and CRC data. // Load and CRC data.
vec_data[j] = V128_LoadU(vsrc + i); vec_data[j] = _mm_loadu_si128(vsrc + i);
crcs[region] = crc32c_t{static_cast<uint32_t>( crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
CRC32_u64(static_cast<uint32_t>(crcs[region]), static_cast<uint32_t>(crcs[region]),
static_cast<uint64_t>(V128_Extract64<0>(vec_data[j]))))}; static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 0))))};
crcs[region] = crc32c_t{static_cast<uint32_t>( crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
CRC32_u64(static_cast<uint32_t>(crcs[region]), static_cast<uint32_t>(crcs[region]),
static_cast<uint64_t>(V128_Extract64<1>(vec_data[j]))))}; static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 1))))};
// Store the data. // Store the data.
V128_Store(vdst + i, vec_data[j]); _mm_store_si128(vdst + i, vec_data[j]);
} }
// Preload the partial CRCs for the CLMUL subregions. // Preload the partial CRCs for the CLMUL subregions.
...@@ -295,7 +292,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( ...@@ -295,7 +292,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
// Load and CRC the data. // Load and CRC the data.
int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k); int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k);
crcs[region] = crc32c_t{static_cast<uint32_t>(CRC32_u64( crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
static_cast<uint32_t>(crcs[region]), int_data[data_index]))}; static_cast<uint32_t>(crcs[region]), int_data[data_index]))};
// Store the data. // Store the data.
...@@ -446,5 +443,4 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int vector, ...@@ -446,5 +443,4 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int vector,
ABSL_NAMESPACE_END ABSL_NAMESPACE_END
} // namespace absl } // namespace absl
#endif // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE || #endif // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
// ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment