Commit 18018aa4 by Abseil Team Committed by Copybara-Service

Adjust conditonal compilation in non_temporal_memcpy.h

This change will allow the AVX version of non-temporal memcpy to be compiled
even if the compiler isn't run with AVX support. This allows runtime dispatch
to select the AVX implementation for CPUs that are known to be compatible with
AVX instructions.

PiperOrigin-RevId: 619594422
Change-Id: Ia7d92404ef8d10d152030b29b71948ed954f28f5
parent a1ced69b
......@@ -19,19 +19,8 @@
#include <intrin.h>
#endif
#ifdef __SSE__
#include <xmmintrin.h>
#endif
#ifdef __SSE2__
#include <emmintrin.h>
#endif
#ifdef __SSE3__
#include <pmmintrin.h>
#endif
#ifdef __AVX__
#if defined(__SSE__) || defined(__AVX__)
// Pulls in both SSE and AVX intrinsics.
#include <immintrin.h>
#endif
......@@ -44,6 +33,7 @@
#include <cstdint>
#include <cstring>
#include "absl/base/attributes.h"
#include "absl/base/config.h"
#include "absl/base/optimization.h"
......@@ -57,7 +47,9 @@ namespace crc_internal {
// memcpy can save 1 DRAM load of the destination cacheline.
constexpr size_t kCacheLineSize = ABSL_CACHELINE_SIZE;
// If the objects overlap, the behavior is undefined.
// If the objects overlap, the behavior is undefined. Uses regular memcpy
// instead of non-temporal memcpy if the required CPU intrinsics are unavailable
// at compile time.
inline void *non_temporal_store_memcpy(void *__restrict dst,
const void *__restrict src, size_t len) {
#if defined(__SSE3__) || defined(__aarch64__) || \
......@@ -119,10 +111,20 @@ inline void *non_temporal_store_memcpy(void *__restrict dst,
#endif // __SSE3__ || __aarch64__ || (_MSC_VER && __AVX__)
}
// If the objects overlap, the behavior is undefined. Uses regular memcpy
// instead of non-temporal memcpy if the required CPU intrinsics are unavailable
// at compile time.
#if ABSL_HAVE_CPP_ATTRIBUTE(gnu::target) && \
(defined(__x86_64__) || defined(__i386__))
[[gnu::target("avx")]]
#endif
inline void *non_temporal_store_memcpy_avx(void *__restrict dst,
const void *__restrict src,
size_t len) {
#ifdef __AVX__
// This function requires AVX. For clang and gcc we compile it with AVX even
// if the translation unit isn't built with AVX support. This works because we
// only select this implementation at runtime if the CPU supports AVX.
#if defined(__SSE3__) || (defined(_MSC_VER) && defined(__AVX__))
uint8_t *d = reinterpret_cast<uint8_t *>(dst);
const uint8_t *s = reinterpret_cast<const uint8_t *>(src);
......@@ -168,9 +170,8 @@ inline void *non_temporal_store_memcpy_avx(void *__restrict dst,
}
return dst;
#else
// Fallback to regular memcpy when AVX is not available.
return memcpy(dst, src, len);
#endif // __AVX__
#endif // __SSE3__ || (_MSC_VER && __AVX__)
}
} // namespace crc_internal
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment