Commit 75d25251 by Martijn Vels, committed by Copybara-Service

Replace absl::base_internal::Prefetch* calls with absl::Prefetch* calls

PiperOrigin-RevId: 505184961
Change-Id: I64482558a76abda6896bec4b2d323833b6cd7edf
parent 8a0693b2
......@@ -738,7 +738,10 @@ cc_library(
],
copts = ABSL_DEFAULT_COPTS,
linkopts = ABSL_DEFAULT_LINKOPTS,
deps = [":config"],
deps = [
":config",
":core_headers", # TODO(b/265984188): remove
],
)
cc_test(
......
......@@ -657,6 +657,7 @@ absl_cc_library(
${ABSL_DEFAULT_LINKOPTS}
DEPS
absl::config
absl::core_headers # TODO(b/265984188): remove
)
absl_cc_test(
......
......@@ -12,10 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// TODO(b/265984188): remove all uses and delete this header.
#ifndef ABSL_BASE_INTERNAL_PREFETCH_H_
#define ABSL_BASE_INTERNAL_PREFETCH_H_
#include "absl/base/attributes.h"
#include "absl/base/config.h"
#include "absl/base/prefetch.h"
#ifdef __SSE__
#include <xmmintrin.h>
......@@ -72,10 +76,21 @@ namespace absl {
ABSL_NAMESPACE_BEGIN
namespace base_internal {
void PrefetchT0(const void* addr);
// Deprecated compatibility shim: forwards to the public absl:: API.
ABSL_DEPRECATED("Use absl::PrefetchToLocalCache() instead")
inline void PrefetchT0(const void* addr) { absl::PrefetchToLocalCache(addr); }
// Deprecated compatibility shim: forwards to the public non-temporal
// prefetch API. The deprecation message previously pointed callers at
// absl::PrefetchToLocalCache(), but this function forwards to the NTA
// variant, so the message must name PrefetchToLocalCacheNta().
ABSL_DEPRECATED("Use absl::PrefetchToLocalCacheNta() instead")
inline void PrefetchNta(const void* address) {
  absl::PrefetchToLocalCacheNta(address);
}
ABSL_DEPRECATED("Use __builtin_prefetch() for advanced prefetch logic instead")
void PrefetchT1(const void* addr);
ABSL_DEPRECATED("Use __builtin_prefetch() for advanced prefetch logic instead")
void PrefetchT2(const void* addr);
void PrefetchNta(const void* addr);
// Implementation details follow.
......@@ -90,10 +105,6 @@ void PrefetchNta(const void* addr);
// safe for all currently supported platforms. However, prefetch for
// store may have problems depending on the target platform.
//
// Read-prefetch with maximum temporal locality (hint 3); on x86 this
// lowers to the prefetcht0 instruction.
inline void PrefetchT0(const void* address) {
  __builtin_prefetch(address, /*rw=*/0, /*locality=*/3);
}
inline void PrefetchT1(const void* addr) {
// Note: this uses prefetcht1 on Intel.
__builtin_prefetch(addr, 0, 2);
......@@ -102,33 +113,21 @@ inline void PrefetchT2(const void* addr) {
// Note: this uses prefetcht2 on Intel.
__builtin_prefetch(addr, 0, 1);
}
// Non-temporal read-prefetch (locality hint 0); on x86 this lowers to the
// prefetchnta instruction (the original comment's "prefetchtnta" was a typo),
// which minimizes pollution of the cache hierarchy.
inline void PrefetchNta(const void* address) {
  __builtin_prefetch(address, /*rw=*/0, /*locality=*/0);
}
#elif defined(ABSL_INTERNAL_HAVE_SSE)
#define ABSL_INTERNAL_HAVE_PREFETCH 1
// SSE variant: prefetch into every level of the cache hierarchy (T0 hint).
inline void PrefetchT0(const void* address) {
  _mm_prefetch(static_cast<const char*>(address), _MM_HINT_T0);
}
// SSE variant: prefetch into L2 and outward (T1 hint).
inline void PrefetchT1(const void* address) {
  _mm_prefetch(static_cast<const char*>(address), _MM_HINT_T1);
}
// SSE variant: prefetch into L3 and outward (T2 hint).
inline void PrefetchT2(const void* address) {
  _mm_prefetch(static_cast<const char*>(address), _MM_HINT_T2);
}
// SSE variant: non-temporal prefetch, minimizing cache pollution (NTA hint).
inline void PrefetchNta(const void* address) {
  _mm_prefetch(static_cast<const char*>(address), _MM_HINT_NTA);
}
#else
// Fallback for toolchains with neither __builtin_prefetch nor SSE:
// prefetching is purely a performance hint, so these stubs do nothing.
inline void PrefetchT0(const void*) {}
inline void PrefetchT1(const void*) {}
inline void PrefetchT2(const void*) {}
inline void PrefetchNta(const void*) {}
#endif
} // namespace base_internal
......
......@@ -30,9 +30,11 @@
#include <xmmintrin.h>
#endif
#if defined(_MSC_VER) && defined(ABSL_INTERNAL_HAVE_SSE)
#if defined(_MSC_VER) && _MSC_VER >= 1900 && \
(defined(_M_X64) || defined(_M_IX86))
#include <intrin.h>
#pragma intrinsic(_mm_prefetch)
#pragma intrinsic(_m_prefetchw)
#endif
namespace absl {
......@@ -174,10 +176,15 @@ inline void PrefetchToLocalCacheNta(const void* addr) {
inline void PrefetchToLocalCacheForWrite(const void* addr) {
#if defined(_MM_HINT_ET0)
_mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_ET0);
#elif defined(__x86_64__)
#elif defined(_MSC_VER) && _MSC_VER >= 1900 && \
(defined(_M_X64) || defined(_M_IX86))
// MSVC 2015 and up on x86/x64 supports prefetchw (feature listed as 3DNOW)
_m_prefetchw(const_cast<void*>(addr));
#elif !defined(_MSC_VER) && defined(__x86_64__)
// _MM_HINT_ET0 is not universally supported. As we commented further
// up, PREFETCHW is recognized as a no-op on older Intel processors
// and has been present on AMD processors since the K6-2
// and has been present on AMD processors since the K6-2. We have this
// disabled for MSVC compilers as this miscompiles on older MSVC compilers.
asm("prefetchw (%0)" : : "r"(addr));
#endif
}
......
......@@ -185,10 +185,10 @@
#include "absl/base/config.h"
#include "absl/base/internal/endian.h"
#include "absl/base/internal/prefetch.h"
#include "absl/base/internal/raw_logging.h"
#include "absl/base/optimization.h"
#include "absl/base/port.h"
#include "absl/base/prefetch.h"
#include "absl/container/internal/common.h"
#include "absl/container/internal/compressed_tuple.h"
#include "absl/container/internal/container_memory.h"
......@@ -2117,12 +2117,12 @@ class raw_hash_set {
void prefetch(const key_arg<K>& key) const {
(void)key;
// Avoid probing if we won't be able to prefetch the addresses received.
#ifdef ABSL_INTERNAL_HAVE_PREFETCH
#ifdef ABSL_HAVE_PREFETCH
prefetch_heap_block();
auto seq = probe(common(), hash_ref()(key));
base_internal::PrefetchT0(control() + seq.offset());
base_internal::PrefetchT0(slot_array() + seq.offset());
#endif // ABSL_INTERNAL_HAVE_PREFETCH
PrefetchToLocalCache(control() + seq.offset());
PrefetchToLocalCache(slot_array() + seq.offset());
#endif // ABSL_HAVE_PREFETCH
}
// The API of find() has two extensions.
......@@ -2529,10 +2529,14 @@ class raw_hash_set {
// See `CapacityToGrowth()`.
size_t& growth_left() { return common().growth_left(); }
// Prefetch the heap-allocated memory region to resolve potential TLB misses.
// This is intended to overlap with execution of calculating the hash for a
// key.
void prefetch_heap_block() const { base_internal::PrefetchT2(control()); }
// Prefetch the heap-allocated memory region to resolve potential TLB and
// cache misses. This is intended to overlap with execution of calculating the
// hash for a key.
// Issues a read-prefetch of the control bytes when the compiler provides
// __builtin_prefetch; a no-op otherwise.
void prefetch_heap_block() const {
#if ABSL_HAVE_BUILTIN(__builtin_prefetch) || defined(__GNUC__)
// rw=0 (read), locality=1: fetch into outer cache levels only, keeping
// pollution of the innermost cache low for a one-shot probe.
__builtin_prefetch(control(), 0, 1);
#endif
}
CommonFields& common() { return settings_.template get<0>(); }
const CommonFields& common() const { return settings_.template get<0>(); }
......
......@@ -40,8 +40,8 @@
#include "absl/base/attributes.h"
#include "absl/base/config.h"
#include "absl/base/internal/cycleclock.h"
#include "absl/base/internal/prefetch.h"
#include "absl/base/internal/raw_logging.h"
#include "absl/base/prefetch.h"
#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "absl/container/internal/container_memory.h"
......
......@@ -44,8 +44,8 @@
#include <cstdint>
#include "absl/base/internal/endian.h"
#include "absl/base/internal/prefetch.h"
#include "absl/base/internal/raw_logging.h"
#include "absl/base/prefetch.h"
#include "absl/crc/internal/crc_internal.h"
namespace absl {
......@@ -309,7 +309,7 @@ void CRC32::Extend(uint32_t* crc, const void* bytes, size_t length) const {
// Process kStride interleaved swaths through the data in parallel.
while ((e - p) > kPrefetchHorizon) {
base_internal::PrefetchNta(
PrefetchToLocalCacheNta(
reinterpret_cast<const void*>(p + kPrefetchHorizon));
// Process 64 bytes at a time
step_stride();
......
......@@ -52,8 +52,8 @@
#include <type_traits>
#include "absl/base/dynamic_annotations.h"
#include "absl/base/internal/prefetch.h"
#include "absl/base/optimization.h"
#include "absl/base/prefetch.h"
#include "absl/crc/crc32c.h"
#include "absl/crc/internal/cpu_detect.h"
#include "absl/crc/internal/crc_memcpy.h"
......@@ -242,10 +242,8 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
while (copy_rounds > kBlocksPerCacheLine) {
// Prefetch kPrefetchAhead bytes ahead of each pointer.
for (size_t i = 0; i < kRegions; i++) {
absl::base_internal::PrefetchT0(src_bytes + kPrefetchAhead +
region_size * i);
absl::base_internal::PrefetchT0(dst_bytes + kPrefetchAhead +
region_size * i);
absl::PrefetchToLocalCache(src_bytes + kPrefetchAhead + region_size * i);
absl::PrefetchToLocalCache(dst_bytes + kPrefetchAhead + region_size * i);
}
// Load and store data, computing CRC on the way.
......
......@@ -21,7 +21,7 @@
#include "absl/base/config.h"
#include "absl/base/dynamic_annotations.h"
#include "absl/base/internal/endian.h"
#include "absl/base/internal/prefetch.h"
#include "absl/base/prefetch.h"
#include "absl/crc/internal/cpu_detect.h"
#include "absl/crc/internal/crc.h"
#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
......@@ -429,11 +429,11 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
base_internal::PrefetchT0(
PrefetchToLocalCache(
reinterpret_cast<const char*>(p + kPrefetchHorizonMedium));
base_internal::PrefetchT0(
PrefetchToLocalCache(
reinterpret_cast<const char*>(p1 + kPrefetchHorizonMedium));
base_internal::PrefetchT0(
PrefetchToLocalCache(
reinterpret_cast<const char*>(p2 + kPrefetchHorizonMedium));
}
// Don't run crc on last 8 bytes.
......@@ -517,12 +517,12 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
for (size_t i = 1; i < bs; i++) {
// Prefetch data for next iterations.
for (size_t j = 0; j < num_crc_streams; j++) {
base_internal::PrefetchT0(
PrefetchToLocalCache(
reinterpret_cast<const char*>(crc_streams[j] + kPrefetchHorizon));
}
for (size_t j = 0; j < num_pclmul_streams; j++) {
base_internal::PrefetchT0(reinterpret_cast<const char*>(
pclmul_streams[j] + kPrefetchHorizon));
PrefetchToLocalCache(reinterpret_cast<const char*>(pclmul_streams[j] +
kPrefetchHorizon));
}
// We process each stream in 64 byte blocks. This can be written as
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment