123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198 |
- // Copyright 2023 The Abseil Authors
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // https://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
- //
- // -----------------------------------------------------------------------------
- // File: prefetch.h
- // -----------------------------------------------------------------------------
- //
- // This header file defines prefetch functions to prefetch memory contents
- // into the first level cache (L1) for the current CPU. The prefetch logic
- // offered in this header is limited to prefetching first level cachelines
- // only, and is aimed at relatively 'simple' prefetching logic.
- //
- #ifndef Y_ABSL_BASE_PREFETCH_H_
- #define Y_ABSL_BASE_PREFETCH_H_
- #include "y_absl/base/config.h"
- #if defined(Y_ABSL_INTERNAL_HAVE_SSE)
- #include <xmmintrin.h>
- #endif
- #if defined(_MSC_VER) && _MSC_VER >= 1900 && \
- (defined(_M_X64) || defined(_M_IX86))
- #include <intrin.h>
- #pragma intrinsic(_mm_prefetch)
- #endif
- namespace y_absl {
- Y_ABSL_NAMESPACE_BEGIN
- // Moves data into the L1 cache before it is read, or "prefetches" it.
- //
- // The value of `addr` is the address of the memory to prefetch. If
- // the target and compiler support it, data prefetch instructions are
- // generated. If the prefetch is done some time before the memory is
- // read, it may be in the cache by the time the read occurs.
- //
- // This method prefetches data with the highest degree of temporal locality;
- // data is prefetched where possible into all levels of the cache.
- //
- // Incorrect or gratuitous use of this function can degrade performance.
- // Use this function only when representative benchmarks show an improvement.
- //
- // Example:
- //
- // // Computes incremental checksum for `data`.
- // int ComputeChecksum(int sum, y_absl::string_view data);
- //
- // // Computes cumulative checksum for all values in `data`
- // int ComputeChecksum(y_absl::Span<const TString> data) {
- // int sum = 0;
- // auto it = data.begin();
- // auto pit = data.begin();
- // auto end = data.end();
- // for (int dist = 8; dist > 0 && pit != data.end(); --dist, ++pit) {
- // y_absl::PrefetchToLocalCache(pit->data());
- // }
- // for (; pit != end; ++pit, ++it) {
- // sum = ComputeChecksum(sum, *it);
- // y_absl::PrefetchToLocalCache(pit->data());
- // }
- // for (; it != end; ++it) {
- // sum = ComputeChecksum(sum, *it);
- // }
- // return sum;
- // }
- //
- void PrefetchToLocalCache(const void* addr);
- // Moves data into the L1 cache before it is read, or "prefetches" it.
- //
- // This function is identical to `PrefetchToLocalCache()` except that it has
- // non-temporal locality: the fetched data should not be left in any of the
- // cache tiers. This is useful for cases where the data is used only once /
- // short term, for example, invoking a destructor on an object.
- //
- // Incorrect or gratuitous use of this function can degrade performance.
- // Use this function only when representative benchmarks show an improvement.
- //
- // Example:
- //
- // template <typename Iterator>
- // void DestroyPointers(Iterator begin, Iterator end) {
- // int dist = 8;
- // auto prefetch_it = begin;
- // while (prefetch_it != end && --dist) {
- // y_absl::PrefetchToLocalCacheNta(*prefetch_it++);
- // }
- // while (prefetch_it != end) {
- // delete *begin++;
- // y_absl::PrefetchToLocalCacheNta(*prefetch_it++);
- // }
- // while (begin != end) {
- // delete *begin++;
- // }
- // }
- //
- void PrefetchToLocalCacheNta(const void* addr);
- // Moves data into the L1 cache with the intent to modify it.
- //
- // This function is similar to `PrefetchToLocalCache()` except that it
- // prefetches cachelines with an 'intent to modify'. This typically includes
- // invalidating cache entries for this address in all other cache tiers, and an
- // exclusive access intent.
- //
- // Incorrect or gratuitous use of this function can degrade performance. As this
- // function can invalidate cached cachelines on other caches and computer cores,
- // incorrect usage of this function can have an even greater negative impact
- // than incorrect regular prefetches.
- // Use this function only when representative benchmarks show an improvement.
- //
- // Example:
- //
- // void* Arena::Allocate(size_t size) {
- // void* ptr = AllocateBlock(size);
- // y_absl::PrefetchToLocalCacheForWrite(ptr);
- // return ptr;
- // }
- //
- void PrefetchToLocalCacheForWrite(const void* addr);
- #if Y_ABSL_HAVE_BUILTIN(__builtin_prefetch) || defined(__GNUC__)
- #define Y_ABSL_HAVE_PREFETCH 1
- // See __builtin_prefetch:
- // https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html.
- //
// Issues a read prefetch for `addr` through the compiler builtin.
//
// The second argument (0) requests a prefetch for reading; the third (3) is
// the strongest temporal-locality hint, asking that the data be kept in all
// cache levels where possible.
inline void PrefetchToLocalCache(const void* addr) {
  __builtin_prefetch(addr, /*rw=*/0, /*locality=*/3);
}
// Issues a non-temporal read prefetch for `addr` through the compiler builtin.
//
// The second argument (0) requests a prefetch for reading; the third (0) is
// the "no temporal locality" hint, i.e. the data need not be left in the
// cache after the access.
inline void PrefetchToLocalCacheNta(const void* addr) {
  __builtin_prefetch(addr, /*rw=*/0, /*locality=*/0);
}
// Prefetches the cache line containing `addr` with intent to write.
//
// [x86-64] gcc/clang don't generate PREFETCHW for __builtin_prefetch(.., 1)
// unless -march=broadwell or newer (signalled by __PRFCHW__); this is not
// generally the default, so in that case we manually emit prefetchw.
// PREFETCHW is recognized as a no-op on older Intel processors and has been
// present on AMD processors since the K6-2. When __PRFCHW__ is defined, or on
// any other architecture, the builtin is used directly.
inline void PrefetchToLocalCacheForWrite(const void* addr) {
#if defined(__x86_64__) && !defined(__PRFCHW__)
  // The location is passed as a memory ("m") operand instead of a register
  // spliced into a hand-written "(%0)": the compiler then prints the operand
  // in whichever assembler dialect is active (AT&T or -masm=intel) and may
  // fold the address computation into the addressing mode. The operand is
  // only named for the instruction; this statement emits no load.
  asm("prefetchw %0" : : "m"(*static_cast<const char*>(addr)));
#else
  __builtin_prefetch(addr, /*rw=*/1, /*locality=*/3);
#endif
}
- #elif defined(Y_ABSL_INTERNAL_HAVE_SSE)
- #define Y_ABSL_HAVE_PREFETCH 1
- inline void PrefetchToLocalCache(const void* addr) {
- _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_T0);
- }
- inline void PrefetchToLocalCacheNta(const void* addr) {
- _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_NTA);
- }
// Intrinsic-branch variant: prefetches the cache line containing `addr` with
// intent to write, using the best primitive the toolchain exposes.
inline void PrefetchToLocalCacheForWrite(const void* addr) {
#if defined(_MM_HINT_ET0)
  _mm_prefetch(static_cast<const char*>(addr), _MM_HINT_ET0);
#elif !defined(_MSC_VER) && defined(__x86_64__)
  // _MM_HINT_ET0 is not universally supported. PREFETCHW is recognized as a
  // no-op on older Intel processors and has been present on AMD processors
  // since the K6-2. We have this disabled for MSVC compilers as this
  // miscompiles on older MSVC compilers.
  //
  // The location is passed as a memory ("m") operand so the compiler prints
  // it in the active assembler dialect; the operand is only named for the
  // instruction and no load is emitted by this statement.
  asm("prefetchw %0" : : "m"(*static_cast<const char*>(addr)));
#else
  // No write-prefetch primitive is available in this configuration: the call
  // is deliberately a no-op. Consume `addr` so unused-parameter diagnostics
  // stay quiet for every includer of this header.
  static_cast<void>(addr);
#endif
}
- #else
// Fallback when no prefetch primitive is available: a no-op. The parameter is
// left unnamed so that unused-parameter warnings are not raised in every
// translation unit that includes this header.
inline void PrefetchToLocalCache(const void* /*addr*/) {}
// Fallback when no prefetch primitive is available: a no-op. The parameter is
// left unnamed so that unused-parameter warnings are not raised in every
// translation unit that includes this header.
inline void PrefetchToLocalCacheNta(const void* /*addr*/) {}
// Fallback when no prefetch primitive is available: a no-op. The parameter is
// left unnamed so that unused-parameter warnings are not raised in every
// translation unit that includes this header.
inline void PrefetchToLocalCacheForWrite(const void* /*addr*/) {}
- #endif
- Y_ABSL_NAMESPACE_END
- } // namespace y_absl
- #endif // Y_ABSL_BASE_PREFETCH_H_
|