Commit c2e9ce1d by Derek Mauro Committed by Copybara-Service

CRC: Get CPU detection and hardware acceleration working on MSVC x86(_64)

Using /arch:AVX on MSVC now uses the accelerated implementation

PiperOrigin-RevId: 490550573
Change-Id: I924259845f38ee41d15f23f95ad085ad664642b5
parent 4c5eb49d
...@@ -24,30 +24,29 @@ ...@@ -24,30 +24,29 @@
#include <sys/auxv.h> #include <sys/auxv.h>
#endif #endif
#if defined(_WIN32) || defined(_WIN64)
#include <intrin.h>
#endif
namespace absl { namespace absl {
ABSL_NAMESPACE_BEGIN ABSL_NAMESPACE_BEGIN
namespace crc_internal { namespace crc_internal {
#if defined(__x86_64__) #if defined(__x86_64__) || defined(_M_X64)
// Inline cpuid instruction. %rbx is occasionally used to address stack
// variables in presence of dynamic allocas. Preserve the %rbx register via
// %rdi to work around a clang bug https://bugs.llvm.org/show_bug.cgi?id=17907
// (%rbx in an output constraint is not considered a clobbered register).
//
// a_inp and c_inp are the input parameters eax and ecx of the CPUID
// instruction.
// a, b, c, and d contain the contents of eax, ebx, ecx, and edx as returned by
// the CPUID instruction
#define ABSL_INTERNAL_GETCPUID(a, b, c, d, a_inp, c_inp) \
asm("mov %%rbx, %%rdi\n" \
"cpuid\n" \
"xchg %%rdi, %%rbx\n" \
: "=a"(a), "=D"(b), "=c"(c), "=d"(d) \
: "a"(a_inp), "2"(c_inp))
namespace { namespace {
#if !defined(_WIN32) && !defined(_WIN64)
// MSVC defines this function for us.
// https://learn.microsoft.com/en-us/cpp/intrinsics/cpuid-cpuidex
static void __cpuid(int cpu_info[4], int info_type) {
__asm__ volatile("cpuid \n\t"
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]),
"=d"(cpu_info[3])
: "a"(info_type), "c"(0));
}
#endif // !defined(_WIN32) && !defined(_WIN64)
enum class Vendor { enum class Vendor {
kUnknown, kUnknown,
kIntel, kIntel,
...@@ -55,14 +54,14 @@ enum class Vendor { ...@@ -55,14 +54,14 @@ enum class Vendor {
}; };
Vendor GetVendor() { Vendor GetVendor() {
uint32_t eax, ebx, ecx, edx; // Get the vendor string (issue CPUID with eax = 0).
int cpu_info[4];
__cpuid(cpu_info, 0);
// Get vendor string (issue CPUID with eax = 0)
ABSL_INTERNAL_GETCPUID(eax, ebx, ecx, edx, 0, 0);
std::string vendor; std::string vendor;
vendor.append(reinterpret_cast<char*>(&ebx), 4); vendor.append(reinterpret_cast<char*>(&cpu_info[1]), 4);
vendor.append(reinterpret_cast<char*>(&edx), 4); vendor.append(reinterpret_cast<char*>(&cpu_info[3]), 4);
vendor.append(reinterpret_cast<char*>(&ecx), 4); vendor.append(reinterpret_cast<char*>(&cpu_info[2]), 4);
if (vendor == "GenuineIntel") { if (vendor == "GenuineIntel") {
return Vendor::kIntel; return Vendor::kIntel;
} else if (vendor == "AuthenticAmd") { } else if (vendor == "AuthenticAmd") {
...@@ -73,13 +72,14 @@ Vendor GetVendor() { ...@@ -73,13 +72,14 @@ Vendor GetVendor() {
} }
CpuType GetIntelCpuType() { CpuType GetIntelCpuType() {
uint32_t eax, ebx, ecx, edx; // To get general information and extended features we send eax = 1 and
// to get general information and extended features we send eax = 1 and
// ecx = 0 to cpuid. The response is returned in eax, ebx, ecx and edx. // ecx = 0 to cpuid. The response is returned in eax, ebx, ecx and edx.
// (See Intel 64 and IA-32 Architectures Software Developer's Manual // (See Intel 64 and IA-32 Architectures Software Developer's Manual
// Volume 2A: Instruction Set Reference, A-M CPUID). // Volume 2A: Instruction Set Reference, A-M CPUID).
// https://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-vol-2a-manual.html // https://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-vol-2a-manual.html
ABSL_INTERNAL_GETCPUID(eax, ebx, ecx, edx, 1, 0); // https://learn.microsoft.com/en-us/cpp/intrinsics/cpuid-cpuidex
int cpu_info[4];
__cpuid(cpu_info, 1);
// Response in eax bits as follows: // Response in eax bits as follows:
// 0-3 (stepping id) // 0-3 (stepping id)
...@@ -89,12 +89,12 @@ CpuType GetIntelCpuType() { ...@@ -89,12 +89,12 @@ CpuType GetIntelCpuType() {
// 16-19 (extended model) // 16-19 (extended model)
// 20-27 (extended family) // 20-27 (extended family)
int family = (eax >> 8) & 0x0f; int family = (cpu_info[0] >> 8) & 0x0f;
int model_num = (eax >> 4) & 0x0f; int model_num = (cpu_info[0] >> 4) & 0x0f;
int ext_family = (eax >> 20) & 0xff; int ext_family = (cpu_info[0] >> 20) & 0xff;
int ext_model_num = (eax >> 16) & 0x0f; int ext_model_num = (cpu_info[0] >> 16) & 0x0f;
int brand_id = ebx & 0xff; int brand_id = cpu_info[1] & 0xff;
// Process the extended family and model info if necessary // Process the extended family and model info if necessary
if (family == 0x0f) { if (family == 0x0f) {
...@@ -123,7 +123,7 @@ CpuType GetIntelCpuType() { ...@@ -123,7 +123,7 @@ CpuType GetIntelCpuType() {
case 0x56: // BroadwellDE case 0x56: // BroadwellDE
return CpuType::kIntelBroadwell; return CpuType::kIntelBroadwell;
case 0x55: // Skylake Xeon case 0x55: // Skylake Xeon
if ((eax & 0x0f) < 5) { // stepping < 5 is skylake if ((cpu_info[0] & 0x0f) < 5) { // stepping < 5 is skylake
return CpuType::kIntelSkylakeXeon; return CpuType::kIntelSkylakeXeon;
} else { // stepping >= 5 is cascadelake } else { // stepping >= 5 is cascadelake
return CpuType::kIntelCascadelakeXeon; return CpuType::kIntelCascadelakeXeon;
...@@ -142,12 +142,13 @@ CpuType GetIntelCpuType() { ...@@ -142,12 +142,13 @@ CpuType GetIntelCpuType() {
} }
CpuType GetAmdCpuType() { CpuType GetAmdCpuType() {
uint32_t eax, ebx, ecx, edx; // To get general information and extended features we send eax = 1 and
// to get general information and extended features we send eax = 1 and
// ecx = 0 to cpuid. The response is returned in eax, ebx, ecx and edx. // ecx = 0 to cpuid. The response is returned in eax, ebx, ecx and edx.
// (See Intel 64 and IA-32 Architectures Software Developer's Manual // (See Intel 64 and IA-32 Architectures Software Developer's Manual
// Volume 2A: Instruction Set Reference, A-M CPUID). // Volume 2A: Instruction Set Reference, A-M CPUID).
ABSL_INTERNAL_GETCPUID(eax, ebx, ecx, edx, 1, 0); // https://learn.microsoft.com/en-us/cpp/intrinsics/cpuid-cpuidex
int cpu_info[4];
__cpuid(cpu_info, 1);
// Response in eax bits as follows: // Response in eax bits as follows:
// 0-3 (stepping id) // 0-3 (stepping id)
...@@ -157,10 +158,10 @@ CpuType GetAmdCpuType() { ...@@ -157,10 +158,10 @@ CpuType GetAmdCpuType() {
// 16-19 (extended model) // 16-19 (extended model)
// 20-27 (extended family) // 20-27 (extended family)
int family = (eax >> 8) & 0x0f; int family = (cpu_info[0] >> 8) & 0x0f;
int model_num = (eax >> 4) & 0x0f; int model_num = (cpu_info[0] >> 4) & 0x0f;
int ext_family = (eax >> 20) & 0xff; int ext_family = (cpu_info[0] >> 20) & 0xff;
int ext_model_num = (eax >> 16) & 0x0f; int ext_model_num = (cpu_info[0] >> 16) & 0x0f;
if (family == 0x0f) { if (family == 0x0f) {
family += ext_family; family += ext_family;
......
...@@ -25,12 +25,18 @@ ...@@ -25,12 +25,18 @@
// We define a translation layer for both x86 and ARM for the ease of use and // We define a translation layer for both x86 and ARM for the ease of use and
// most performance gains. // most performance gains.
// We need CRC (part of sse4.2) and PCLMULQDQ instructions. // We need CRC (part of SSE 4.2) and PCLMULQDQ instructions.
#if defined(__SSE4_2__) && defined(__PCLMUL__) #if defined(__SSE4_2__) && defined(__PCLMUL__)
#include <x86intrin.h> #include <x86intrin.h>
#define ABSL_CRC_INTERNAL_HAVE_X86_SIMD #define ABSL_CRC_INTERNAL_HAVE_X86_SIMD
#elif defined(_MSC_VER) && defined(__AVX__)
// MSVC AVX (/arch:AVX) implies SSE 4.2 and PCLMULQDQ.
#include <intrin.h>
#define ABSL_CRC_INTERNAL_HAVE_X86_SIMD
#elif defined(__aarch64__) && defined(__LITTLE_ENDIAN__) && \ #elif defined(__aarch64__) && defined(__LITTLE_ENDIAN__) && \
defined(__ARM_FEATURE_CRC32) && defined(__ARM_NEON) defined(__ARM_FEATURE_CRC32) && defined(__ARM_NEON)
......
...@@ -38,24 +38,27 @@ ...@@ -38,24 +38,27 @@
// using 3 CRCs over fixed-size blocks where the zero-extensions required for // using 3 CRCs over fixed-size blocks where the zero-extensions required for
// CRC32C::Concat can be precomputed. // CRC32C::Concat can be precomputed.
#include <cstddef>
#include <cstdint>
#include "absl/crc/crc32c.h"
#include "absl/strings/string_view.h"
#ifdef __SSE4_2__ #ifdef __SSE4_2__
#include <immintrin.h>
#endif
#include <emmintrin.h> #ifdef _MSC_VER
#include <x86intrin.h> #include <intrin.h>
#endif
#include <cstddef>
#include <cstdint>
#include <type_traits> #include <type_traits>
#include "absl/base/dynamic_annotations.h" #include "absl/base/dynamic_annotations.h"
#include "absl/base/internal/prefetch.h" #include "absl/base/internal/prefetch.h"
#include "absl/base/optimization.h" #include "absl/base/optimization.h"
#include "absl/crc/crc32c.h"
#include "absl/crc/internal/cpu_detect.h" #include "absl/crc/internal/cpu_detect.h"
#include "absl/crc/internal/crc_memcpy.h" #include "absl/crc/internal/crc_memcpy.h"
#include "absl/strings/string_view.h"
#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(__AVX__))
namespace absl { namespace absl {
ABSL_NAMESPACE_BEGIN ABSL_NAMESPACE_BEGIN
...@@ -88,7 +91,9 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src, ...@@ -88,7 +91,9 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
uint64_t int_data[kIntLoadsPerVec * int_regions]; uint64_t int_data[kIntLoadsPerVec * int_regions];
while (copy_rounds > 0) { while (copy_rounds > 0) {
#ifdef __GNUC__
#pragma unroll_completely #pragma unroll_completely
#endif
for (int i = 0; i < vec_regions; i++) { for (int i = 0; i < vec_regions; i++) {
int region = i; int region = i;
...@@ -109,7 +114,9 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src, ...@@ -109,7 +114,9 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
_mm_extract_epi64(data[i], 1))); _mm_extract_epi64(data[i], 1)));
} }
#ifdef __GNUC__
#pragma unroll_completely #pragma unroll_completely
#endif
for (int i = 0; i < int_regions; i++) { for (int i = 0; i < int_regions; i++) {
int region = vec_regions + i; int region = vec_regions + i;
...@@ -117,7 +124,9 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src, ...@@ -117,7 +124,9 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
reinterpret_cast<const uint64_t*>(*src + region_size * region); reinterpret_cast<const uint64_t*>(*src + region_size * region);
auto* udst = reinterpret_cast<uint64_t*>(*dst + region_size * region); auto* udst = reinterpret_cast<uint64_t*>(*dst + region_size * region);
#ifdef __GNUC__
#pragma unroll_completely #pragma unroll_completely
#endif
for (int j = 0; j < kIntLoadsPerVec; j++) { for (int j = 0; j < kIntLoadsPerVec; j++) {
int data_index = i * kIntLoadsPerVec + j; int data_index = i * kIntLoadsPerVec + j;
...@@ -238,7 +247,9 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( ...@@ -238,7 +247,9 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
// Main loop. // Main loop.
while (copy_rounds > kBlocksPerCacheLine) { while (copy_rounds > kBlocksPerCacheLine) {
// Prefetch kPrefetchAhead bytes ahead of each pointer. // Prefetch kPrefetchAhead bytes ahead of each pointer.
#ifdef __GNUC__
#pragma unroll_completely #pragma unroll_completely
#endif
for (int i = 0; i < kRegions; i++) { for (int i = 0; i < kRegions; i++) {
absl::base_internal::PrefetchT0(src_bytes + kPrefetchAhead + absl::base_internal::PrefetchT0(src_bytes + kPrefetchAhead +
region_size * i); region_size * i);
...@@ -247,10 +258,14 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( ...@@ -247,10 +258,14 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
} }
// Load and store data, computing CRC on the way. // Load and store data, computing CRC on the way.
#ifdef __GNUC__
#pragma unroll_completely #pragma unroll_completely
#endif
for (int i = 0; i < kBlocksPerCacheLine; i++) { for (int i = 0; i < kBlocksPerCacheLine; i++) {
// Copy and CRC the data for the CRC regions. // Copy and CRC the data for the CRC regions.
#ifdef __GNUC__
#pragma unroll_completely #pragma unroll_completely
#endif
for (int j = 0; j < vec_regions; j++) { for (int j = 0; j < vec_regions; j++) {
// Cycle which regions get vector load/store and integer load/store, to // Cycle which regions get vector load/store and integer load/store, to
// engage prefetching logic around vector load/stores and save issue // engage prefetching logic around vector load/stores and save issue
...@@ -276,7 +291,9 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( ...@@ -276,7 +291,9 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
} }
// Preload the partial CRCs for the CLMUL subregions. // Preload the partial CRCs for the CLMUL subregions.
#ifdef __GNUC__
#pragma unroll_completely #pragma unroll_completely
#endif
for (int j = 0; j < int_regions; j++) { for (int j = 0; j < int_regions; j++) {
// Cycle which regions get vector load/store and integer load/store, to // Cycle which regions get vector load/store and integer load/store, to
// engage prefetching logic around vector load/stores and save issue // engage prefetching logic around vector load/stores and save issue
...@@ -288,7 +305,9 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( ...@@ -288,7 +305,9 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
auto* udst = auto* udst =
reinterpret_cast<uint64_t*>(dst_bytes + region_size * region); reinterpret_cast<uint64_t*>(dst_bytes + region_size * region);
#ifdef __GNUC__
#pragma unroll_completely #pragma unroll_completely
#endif
for (int k = 0; k < kIntLoadsPerVec; k++) { for (int k = 0; k < kIntLoadsPerVec; k++) {
int data_index = j * kIntLoadsPerVec + k; int data_index = j * kIntLoadsPerVec + k;
...@@ -432,4 +451,4 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int vector, ...@@ -432,4 +451,4 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int vector,
ABSL_NAMESPACE_END ABSL_NAMESPACE_END
} // namespace absl } // namespace absl
#endif // __SSE4_2__ #endif // defined(__SSE4_2__) || (defined(_MSC_VER) && defined(__AVX__))
...@@ -35,6 +35,9 @@ ...@@ -35,6 +35,9 @@
#define ABSL_INTERNAL_CAN_USE_SIMD_CRC32C #define ABSL_INTERNAL_CAN_USE_SIMD_CRC32C
#elif defined(__SSE4_2__) && defined(__PCLMUL__) #elif defined(__SSE4_2__) && defined(__PCLMUL__)
#define ABSL_INTERNAL_CAN_USE_SIMD_CRC32C #define ABSL_INTERNAL_CAN_USE_SIMD_CRC32C
#elif defined(_MSC_VER) && defined(__AVX__)
// MSVC AVX support (/arch:AVX) implies SSE 4.2 and PCLMUL support.
#define ABSL_INTERNAL_CAN_USE_SIMD_CRC32C
#endif #endif
namespace absl { namespace absl {
......
...@@ -15,46 +15,56 @@ ...@@ -15,46 +15,56 @@
#ifndef ABSL_CRC_INTERNAL_NON_TEMPORAL_MEMCPY_H_ #ifndef ABSL_CRC_INTERNAL_NON_TEMPORAL_MEMCPY_H_
#define ABSL_CRC_INTERNAL_NON_TEMPORAL_MEMCPY_H_ #define ABSL_CRC_INTERNAL_NON_TEMPORAL_MEMCPY_H_
#include <algorithm> #ifdef _MSC_VER
#include <cassert> #include <intrin.h>
#include <cstring> #endif
#include <iostream>
#include "absl/base/config.h"
#include "absl/base/optimization.h"
#ifdef __SSE__ #ifdef __SSE__
// Only include if we're running on a CPU that supports SSE ISA, needed for #include <xmmintrin.h>
// sfence
#include <immintrin.h> // IWYU pragma: keep
#endif #endif
#ifdef __SSE2__ #ifdef __SSE2__
// Only include if we're running on a CPU that supports SSE2 ISA, needed for #include <emmintrin.h>
// movdqa, movdqu, movntdq #endif
#include <emmintrin.h> // IWYU pragma: keep
#ifdef __SSE3__
#include <pmmintrin.h>
#endif
#ifdef __AVX__
#include <immintrin.h>
#endif #endif
#ifdef __aarch64__ #ifdef __aarch64__
// Only include if we're running on a CPU that supports ARM NEON ISA, needed for
// sfence, movdqa, movdqu, movntdq
#include "absl/crc/internal/non_temporal_arm_intrinsics.h" #include "absl/crc/internal/non_temporal_arm_intrinsics.h"
#endif #endif
#include <algorithm>
#include <cassert>
#include <cstring>
#include <iostream>
#include "absl/base/config.h"
#include "absl/base/optimization.h"
namespace absl { namespace absl {
ABSL_NAMESPACE_BEGIN ABSL_NAMESPACE_BEGIN
namespace crc_internal { namespace crc_internal {
// This non-temporal memcpy does regular load and non-temporal store memory // This non-temporal memcpy does regular load and non-temporal store memory
// copy. It is compatible to both 16-byte aligned and unaligned addresses. If // copy. It is compatible to both 16-byte aligned and unaligned addresses. If
// data at the destination is not immediately accessed, using non-temporal // data at the destination is not immediately accessed, using non-temporal
// memcpy can save 1 DRAM load of the destination cacheline. // memcpy can save 1 DRAM load of the destination cacheline.
constexpr size_t kCacheLineSize = ABSL_CACHELINE_SIZE;
constexpr int kCacheLineSize = ABSL_CACHELINE_SIZE;
// If the objects overlap, the behavior is undefined. // If the objects overlap, the behavior is undefined.
// MSVC does not have proper header support for some of these intrinsics,
// so it should go to fallback
inline void *non_temporal_store_memcpy(void *__restrict dst, inline void *non_temporal_store_memcpy(void *__restrict dst,
const void *__restrict src, size_t len) { const void *__restrict src, size_t len) {
#if (defined(__SSE3__) || defined(__aarch64__)) && !defined(_MSC_VER) #if defined(__SSE3__) || defined(__aarch64__) || \
(defined(_MSC_VER) && defined(__AVX__))
// This implementation requires SSE3.
// MSVC cannot target SSE3 directly, but when MSVC targets AVX,
// SSE3 support is implied.
uint8_t *d = reinterpret_cast<uint8_t *>(dst); uint8_t *d = reinterpret_cast<uint8_t *>(dst);
const uint8_t *s = reinterpret_cast<const uint8_t *>(src); const uint8_t *s = reinterpret_cast<const uint8_t *>(src);
...@@ -104,17 +114,15 @@ inline void *non_temporal_store_memcpy(void *__restrict dst, ...@@ -104,17 +114,15 @@ inline void *non_temporal_store_memcpy(void *__restrict dst,
} }
return dst; return dst;
#else #else
// Fallback to regular memcpy when SSE2/3 & aarch64 is not available. // Fallback to regular memcpy.
return memcpy(dst, src, len); return memcpy(dst, src, len);
#endif // __SSE3__ || __aarch64__ #endif // __SSE3__ || __aarch64__ || (_MSC_VER && __AVX__)
} }
// MSVC does not have proper header support for some of these intrinsics,
// so it should go to fallback
inline void *non_temporal_store_memcpy_avx(void *__restrict dst, inline void *non_temporal_store_memcpy_avx(void *__restrict dst,
const void *__restrict src, const void *__restrict src,
size_t len) { size_t len) {
#if defined(__AVX__) && !defined(_MSC_VER) #ifdef __AVX__
uint8_t *d = reinterpret_cast<uint8_t *>(dst); uint8_t *d = reinterpret_cast<uint8_t *>(dst);
const uint8_t *s = reinterpret_cast<const uint8_t *>(src); const uint8_t *s = reinterpret_cast<const uint8_t *>(src);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment