#pragma once

// This file contains the core parts of YTAlloc but no malloc/free bridge.
// The latter bridge is placed into alloc.cpp, which includes (sic!) core-inl.h.
// This ensures that AllocateInline/FreeInline calls are properly inlined into malloc/free.
// Also core-inl.h can be directly included in, e.g., benchmarks.

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#ifdef _linux_
#include
#endif

#include
#include
#include

#ifndef MAP_POPULATE
#define MAP_POPULATE 0x08000
#endif

// A MAP_FIXED variant that doesn't unmap the underlying mapping.
// Linux kernels older than 4.17 silently ignore this flag.
#ifndef MAP_FIXED_NOREPLACE
#ifdef _linux_
#define MAP_FIXED_NOREPLACE 0x100000
#else
#define MAP_FIXED_NOREPLACE 0
#endif
#endif

#ifndef MADV_POPULATE
#define MADV_POPULATE 0x59410003
#endif

#ifndef MADV_STOCKPILE
#define MADV_STOCKPILE 0x59410004
#endif

#ifndef MADV_FREE
#define MADV_FREE 8
#endif

#ifndef MADV_DONTDUMP
#define MADV_DONTDUMP 16
#endif

#ifndef NDEBUG
#define YTALLOC_PARANOID
#endif

#ifdef YTALLOC_PARANOID
#define YTALLOC_NERVOUS
#endif

#define YTALLOC_VERIFY(condition) \
    do { \
        if (Y_UNLIKELY(!(condition))) { \
            ::NYT::NYTAlloc::AssertTrap("Assertion failed: " #condition, __FILE__, __LINE__); \
        } \
    } while (false)

#ifdef NDEBUG
#define YTALLOC_ASSERT(condition) (true || (condition))
#else
#define YTALLOC_ASSERT(condition) YTALLOC_VERIFY(condition)
#endif

#ifdef YTALLOC_PARANOID
#define YTALLOC_PARANOID_ASSERT(condition) YTALLOC_VERIFY(condition)
#else
#define YTALLOC_PARANOID_ASSERT(condition) (true || (condition))
#endif

#define YTALLOC_TRAP(message) ::NYT::NYTAlloc::AssertTrap(message, __FILE__, __LINE__)

namespace NYT::NYTAlloc {

////////////////////////////////////////////////////////////////////////////////

// Allocations are classified into three types:
//
// a) Small chunks (less than LargeAllocationSizeThreshold)
//    These are the fastest and are extensively cached (both per-thread and globally).
//    Memory claimed for these allocations is never reclaimed back.
//    Code dealing with such allocations is heavily optimized, with all hot paths
//    as streamlined as possible. The implementation is mostly inspired by LFAlloc.
//
// b) Large blobs (from LargeAllocationSizeThreshold to HugeAllocationSizeThreshold)
//    These are cached as well. We expect such allocations to be less frequent
//    than small ones but still do our best to provide good scalability.
//    In particular, thread-sharded concurrent data structures are used to provide access to
//    cached blobs. Memory is claimed via madvise(MADV_POPULATE) and reclaimed back
//    via madvise(MADV_FREE).
//
// c) Huge blobs (from HugeAllocationSizeThreshold)
//    These should be rare; we delegate directly to mmap and munmap for each allocation.
//
// We also provide a separate allocator for all system allocations (those needed by YTAlloc itself).
// These are rare and also delegate to mmap/munmap.

// Periods between background activities.
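// (Roughly once per BackgroundInterval the background thread is expected to invoke the
// RunBackgroundTasks hooks of the managers defined below, e.g. TTimingManager and
// TMappedMemoryManager.)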
constexpr auto BackgroundInterval = TDuration::Seconds(1); static_assert(LargeRankCount - MinLargeRank <= 16, "Too many large ranks"); static_assert(SmallRankCount <= 32, "Too many small ranks"); constexpr size_t SmallZoneSize = 1_TB; constexpr size_t LargeZoneSize = 16_TB; constexpr size_t HugeZoneSize = 1_TB; constexpr size_t SystemZoneSize = 1_TB; constexpr size_t MaxCachedChunksPerRank = 256; constexpr uintptr_t UntaggedSmallZonesStart = 0; constexpr uintptr_t UntaggedSmallZonesEnd = UntaggedSmallZonesStart + 32 * SmallZoneSize; constexpr uintptr_t MinUntaggedSmallPtr = UntaggedSmallZonesStart + SmallZoneSize * 1; constexpr uintptr_t MaxUntaggedSmallPtr = UntaggedSmallZonesStart + SmallZoneSize * SmallRankCount; constexpr uintptr_t TaggedSmallZonesStart = UntaggedSmallZonesEnd; constexpr uintptr_t TaggedSmallZonesEnd = TaggedSmallZonesStart + 32 * SmallZoneSize; constexpr uintptr_t MinTaggedSmallPtr = TaggedSmallZonesStart + SmallZoneSize * 1; constexpr uintptr_t MaxTaggedSmallPtr = TaggedSmallZonesStart + SmallZoneSize * SmallRankCount; constexpr uintptr_t DumpableLargeZoneStart = TaggedSmallZonesEnd; constexpr uintptr_t DumpableLargeZoneEnd = DumpableLargeZoneStart + LargeZoneSize; constexpr uintptr_t UndumpableLargeZoneStart = DumpableLargeZoneEnd; constexpr uintptr_t UndumpableLargeZoneEnd = UndumpableLargeZoneStart + LargeZoneSize; constexpr uintptr_t LargeZoneStart(bool dumpable) { return dumpable ? DumpableLargeZoneStart : UndumpableLargeZoneStart; } constexpr uintptr_t LargeZoneEnd(bool dumpable) { return dumpable ? DumpableLargeZoneEnd : UndumpableLargeZoneEnd; } constexpr uintptr_t HugeZoneStart = UndumpableLargeZoneEnd; constexpr uintptr_t HugeZoneEnd = HugeZoneStart + HugeZoneSize; constexpr uintptr_t SystemZoneStart = HugeZoneEnd; constexpr uintptr_t SystemZoneEnd = SystemZoneStart + SystemZoneSize; // We leave 64_KB at the end of 256_MB block and never use it. // That serves two purposes: // 1. SmallExtentSize % SmallSegmentSize == 0 // 2. Every small object satisfies RightReadableArea requirement. constexpr size_t SmallExtentAllocSize = 256_MB; constexpr size_t SmallExtentSize = SmallExtentAllocSize - 64_KB; constexpr size_t SmallSegmentSize = 96_KB; // LCM(SmallRankToSize) constexpr ui16 SmallRankBatchSize[SmallRankCount] = { 0, 256, 256, 256, 256, 256, 256, 256, 256, 256, 192, 128, 96, 64, 48, 32, 24, 16, 12, 8, 6, 4, 3 }; constexpr bool CheckSmallSizes() { for (size_t rank = 0; rank < SmallRankCount; rank++) { auto size = SmallRankToSize[rank]; if (size == 0) { continue; } if (SmallSegmentSize % size != 0) { return false; } if (SmallRankBatchSize[rank] > MaxCachedChunksPerRank) { return false; } } return true; } static_assert(CheckSmallSizes()); static_assert(SmallExtentSize % SmallSegmentSize == 0); static_assert(SmallSegmentSize % PageSize == 0); constexpr size_t LargeExtentSize = 1_GB; static_assert(LargeExtentSize >= LargeAllocationSizeThreshold, "LargeExtentSize < LargeAllocationSizeThreshold"); constexpr const char* BackgroundThreadName = "YTAllocBack"; constexpr const char* StockpileThreadName = "YTAllocStock"; DEFINE_ENUM(EAllocationKind, (Untagged) (Tagged) ); // Forward declarations. 
struct TThreadState; struct TLargeArena; struct TLargeBlobExtent; //////////////////////////////////////////////////////////////////////////////// // Traps and assertions [[noreturn]] void OomTrap() { _exit(9); } [[noreturn]] void AssertTrap(const char* message, const char* file, int line) { ::fprintf(stderr, "*** YTAlloc has detected an internal trap at %s:%d\n*** %s\n", file, line, message); __builtin_trap(); } template void AssertBlobState(T* header, E expectedState) { auto actualState = header->State; if (Y_UNLIKELY(actualState != expectedState)) { char message[256]; snprintf(message, sizeof(message), "Invalid blob header state at %p: expected %" PRIx64 ", actual %" PRIx64, header, static_cast(expectedState), static_cast(actualState)); YTALLOC_TRAP(message); } } //////////////////////////////////////////////////////////////////////////////// // Provides a never-dying singleton with explicit construction. template class TExplicitlyConstructableSingleton { public: TExplicitlyConstructableSingleton() { } ~TExplicitlyConstructableSingleton() { } template void Construct(Ts&&... args) { new (&Storage_) T(std::forward(args)...); #ifndef NDEBUG Constructed_ = true; #endif } Y_FORCE_INLINE T* Get() { #ifndef NDEBUG YTALLOC_PARANOID_ASSERT(Constructed_); #endif return &Storage_; } Y_FORCE_INLINE const T* Get() const { #ifndef NDEBUG YTALLOC_PARANOID_ASSERT(Constructed_); #endif return &Storage_; } Y_FORCE_INLINE T* operator->() { return Get(); } Y_FORCE_INLINE const T* operator->() const { return Get(); } Y_FORCE_INLINE T& operator*() { return *Get(); } Y_FORCE_INLINE const T& operator*() const { return *Get(); } private: union { T Storage_; }; #ifndef NDEBUG bool Constructed_; #endif }; //////////////////////////////////////////////////////////////////////////////// // Initializes all singletons. // Safe to call multiple times. // Guaranteed to not allocate. void InitializeGlobals(); // Spawns the background thread, if it's time. // Safe to call multiple times. // Must be called on allocation slow path. void StartBackgroundThread(); //////////////////////////////////////////////////////////////////////////////// class TLogManager { public: // Sets the handler to be invoked for each log event produced by YTAlloc. void EnableLogging(TLogHandler logHandler) { LogHandler_.store(logHandler); } // Checks (in a racy way) that logging is enabled. bool IsLoggingEnabled() { return LogHandler_.load() != nullptr; } // Logs the message via log handler (if any). template void LogMessage(ELogEventSeverity severity, const char* format, Ts&&... args) { auto logHandler = LogHandler_.load(); if (!logHandler) { return; } std::array buffer; auto len = ::snprintf(buffer.data(), buffer.size(), format, std::forward(args)...); TLogEvent event; event.Severity = severity; event.Message = TStringBuf(buffer.data(), len); logHandler(event); } // A special case of zero args. void LogMessage(ELogEventSeverity severity, const char* message) { LogMessage(severity, "%s", message); } private: std::atomic LogHandler_= nullptr; }; TExplicitlyConstructableSingleton LogManager; #define YTALLOC_LOG_EVENT(...) LogManager->LogMessage(__VA_ARGS__) #define YTALLOC_LOG_DEBUG(...) YTALLOC_LOG_EVENT(ELogEventSeverity::Debug, __VA_ARGS__) #define YTALLOC_LOG_INFO(...) YTALLOC_LOG_EVENT(ELogEventSeverity::Info, __VA_ARGS__) #define YTALLOC_LOG_WARNING(...) YTALLOC_LOG_EVENT(ELogEventSeverity::Warning, __VA_ARGS__) #define YTALLOC_LOG_ERROR(...) 
YTALLOC_LOG_EVENT(ELogEventSeverity::Error, __VA_ARGS__) //////////////////////////////////////////////////////////////////////////////// Y_FORCE_INLINE size_t GetUsed(ssize_t allocated, ssize_t freed) { return allocated >= freed ? static_cast(allocated - freed) : 0; } template Y_FORCE_INLINE void* HeaderToPtr(T* header) { return header + 1; } template Y_FORCE_INLINE T* PtrToHeader(void* ptr) { return static_cast(ptr) - 1; } template Y_FORCE_INLINE const T* PtrToHeader(const void* ptr) { return static_cast(ptr) - 1; } Y_FORCE_INLINE size_t PtrToSmallRank(const void* ptr) { return (reinterpret_cast(ptr) >> 40) & 0x1f; } Y_FORCE_INLINE char* AlignDownToSmallSegment(char* extent, char* ptr) { auto offset = static_cast(ptr - extent); // NB: This modulo operation is always performed using multiplication. offset -= (offset % SmallSegmentSize); return extent + offset; } Y_FORCE_INLINE char* AlignUpToSmallSegment(char* extent, char* ptr) { return AlignDownToSmallSegment(extent, ptr + SmallSegmentSize - 1); } template static Y_FORCE_INLINE void UnalignPtr(void*& ptr) { if (reinterpret_cast(ptr) % PageSize == 0) { reinterpret_cast(ptr) -= PageSize - sizeof (T); } YTALLOC_PARANOID_ASSERT(reinterpret_cast(ptr) % PageSize == sizeof (T)); } template static Y_FORCE_INLINE void UnalignPtr(const void*& ptr) { if (reinterpret_cast(ptr) % PageSize == 0) { reinterpret_cast(ptr) -= PageSize - sizeof (T); } YTALLOC_PARANOID_ASSERT(reinterpret_cast(ptr) % PageSize == sizeof (T)); } template Y_FORCE_INLINE size_t GetRawBlobSize(size_t size) { return AlignUp(size + sizeof (T) + RightReadableAreaSize, PageSize); } template Y_FORCE_INLINE size_t GetBlobAllocationSize(size_t size) { size += sizeof(T); size += RightReadableAreaSize; size = AlignUp(size, PageSize); size -= sizeof(T); size -= RightReadableAreaSize; return size; } Y_FORCE_INLINE size_t GetLargeRank(size_t size) { size_t rank = 64 - __builtin_clzl(size); if (size == (1ULL << (rank - 1))) { --rank; } return rank; } Y_FORCE_INLINE void PoisonRange(void* ptr, size_t size, ui32 magic) { #ifdef YTALLOC_PARANOID size = ::AlignUp(size, 4); std::fill(static_cast(ptr), static_cast(ptr) + size / 4, magic); #else Y_UNUSED(ptr); Y_UNUSED(size); Y_UNUSED(magic); #endif } Y_FORCE_INLINE void PoisonFreedRange(void* ptr, size_t size) { PoisonRange(ptr, size, 0xdeadbeef); } Y_FORCE_INLINE void PoisonUninitializedRange(void* ptr, size_t size) { PoisonRange(ptr, size, 0xcafebabe); } // Checks that the header size is divisible by 16 (as needed due to alignment restrictions). #define CHECK_HEADER_ALIGNMENT(T) static_assert(sizeof(T) % 16 == 0, "sizeof(" #T ") % 16 != 0"); //////////////////////////////////////////////////////////////////////////////// static_assert(sizeof(TFreeList) == CacheLineSize, "sizeof(TFreeList) != CacheLineSize"); //////////////////////////////////////////////////////////////////////////////// constexpr size_t ShardCount = 16; std::atomic GlobalCurrentShardIndex; // Provides a context for working with sharded data structures. // Captures the initial shard index upon construction (indicating the shard // where all insertions go). Maintains the current shard index (round-robin, // indicating the shard currently used for extraction). // Can be or be not thread-safe depending on TCounter. 
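// Illustrative usage sketch (TItem is a placeholder item type, not part of YTAlloc):
//   TGlobalShardedState shardingState;
//   TShardedFreeList<TItem> freeList;
//   freeList.Put(&shardingState, item);                  // goes to the state's initial shard
//   auto* extracted = freeList.Extract(&shardingState);  // initial shard first, then round-robin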
template class TShardedState { public: TShardedState() : InitialShardIndex_(GlobalCurrentShardIndex++ % ShardCount) , CurrentShardIndex_(InitialShardIndex_) { } Y_FORCE_INLINE size_t GetInitialShardIndex() const { return InitialShardIndex_; } Y_FORCE_INLINE size_t GetNextShardIndex() { return ++CurrentShardIndex_ % ShardCount; } private: const size_t InitialShardIndex_; TCounter CurrentShardIndex_; }; using TLocalShardedState = TShardedState; using TGlobalShardedState = TShardedState>; // Implemented as a collection of free lists (each called a shard). // One needs TShardedState to access the sharded data structure. template class TShardedFreeList { public: // First tries to extract an item from the initial shard; // if failed then proceeds to all shards in round-robin fashion. template T* Extract(TState* state) { if (auto* item = Shards_[state->GetInitialShardIndex()].Extract()) { return item; } return ExtractRoundRobin(state); } // Attempts to extract an item from all shards in round-robin fashion. template T* ExtractRoundRobin(TState* state) { for (size_t index = 0; index < ShardCount; ++index) { if (auto* item = Shards_[state->GetNextShardIndex()].Extract()) { return item; } } return nullptr; } // Extracts items from all shards linking them together. T* ExtractAll() { T* head = nullptr; T* tail = nullptr; for (auto& shard : Shards_) { auto* item = shard.ExtractAll(); if (!head) { head = item; } if (tail) { YTALLOC_PARANOID_ASSERT(!tail->Next); tail->Next = item; } else { tail = item; } while (tail && tail->Next) { tail = tail->Next; } } return head; } template void Put(TState* state, T* item) { Shards_[state->GetInitialShardIndex()].Put(item); } private: std::array, ShardCount> Shards_; }; //////////////////////////////////////////////////////////////////////////////// // Holds YTAlloc control knobs. // Thread safe. 
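// Illustrative runtime tuning (sketch); the setters below are plain atomic stores
// and may be invoked from any thread at any time:
//   ConfigurationManager->SetStockpileInterval(TDuration::MilliSeconds(50));
//   ConfigurationManager->EnableStockpile();
//   ConfigurationManager->SetLargeUnreclaimableCoeff(0.1);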
class TConfigurationManager { public: void SetLargeUnreclaimableCoeff(double value) { LargeUnreclaimableCoeff_.store(value); } double GetLargeUnreclaimableCoeff() const { return LargeUnreclaimableCoeff_.load(std::memory_order_relaxed); } void SetMinLargeUnreclaimableBytes(size_t value) { MinLargeUnreclaimableBytes_.store(value); } void SetMaxLargeUnreclaimableBytes(size_t value) { MaxLargeUnreclaimableBytes_.store(value); } size_t GetMinLargeUnreclaimableBytes() const { return MinLargeUnreclaimableBytes_.load(std::memory_order_relaxed); } size_t GetMaxLargeUnreclaimableBytes() const { return MaxLargeUnreclaimableBytes_.load(std::memory_order_relaxed); } void SetTimingEventThreshold(TDuration value) { TimingEventThresholdNs_.store(value.MicroSeconds() * 1000); } i64 GetTimingEventThresholdNs() const { return TimingEventThresholdNs_.load(std::memory_order_relaxed); } void SetAllocationProfilingEnabled(bool value); bool IsAllocationProfilingEnabled() const { return AllocationProfilingEnabled_.load(); } Y_FORCE_INLINE bool GetAllocationProfilingSamplingRate() { return AllocationProfilingSamplingRate_.load(); } void SetAllocationProfilingSamplingRate(double rate) { if (rate < 0) { rate = 0; } if (rate > 1) { rate = 1; } i64 rateX64K = static_cast(rate * (1ULL << 16)); AllocationProfilingSamplingRateX64K_.store(ClampVal(rateX64K, 0, std::numeric_limits::max() + 1)); AllocationProfilingSamplingRate_.store(rate); } Y_FORCE_INLINE bool IsSmallArenaAllocationProfilingEnabled(size_t rank) { return SmallArenaAllocationProfilingEnabled_[rank].load(std::memory_order_relaxed); } Y_FORCE_INLINE bool IsSmallArenaAllocationProfiled(size_t rank) { return IsSmallArenaAllocationProfilingEnabled(rank) && IsAllocationSampled(); } void SetSmallArenaAllocationProfilingEnabled(size_t rank, bool value) { if (rank >= SmallRankCount) { return; } SmallArenaAllocationProfilingEnabled_[rank].store(value); } Y_FORCE_INLINE bool IsLargeArenaAllocationProfilingEnabled(size_t rank) { return LargeArenaAllocationProfilingEnabled_[rank].load(std::memory_order_relaxed); } Y_FORCE_INLINE bool IsLargeArenaAllocationProfiled(size_t rank) { return IsLargeArenaAllocationProfilingEnabled(rank) && IsAllocationSampled(); } void SetLargeArenaAllocationProfilingEnabled(size_t rank, bool value) { if (rank >= LargeRankCount) { return; } LargeArenaAllocationProfilingEnabled_[rank].store(value); } Y_FORCE_INLINE int GetProfilingBacktraceDepth() { return ProfilingBacktraceDepth_.load(); } void SetProfilingBacktraceDepth(int depth) { if (depth < 1) { return; } if (depth > MaxAllocationProfilingBacktraceDepth) { depth = MaxAllocationProfilingBacktraceDepth; } ProfilingBacktraceDepth_.store(depth); } Y_FORCE_INLINE size_t GetMinProfilingBytesUsedToReport() { return MinProfilingBytesUsedToReport_.load(); } void SetMinProfilingBytesUsedToReport(size_t size) { MinProfilingBytesUsedToReport_.store(size); } void SetEnableEagerMemoryRelease(bool value) { EnableEagerMemoryRelease_.store(value); } bool GetEnableEagerMemoryRelease() { return EnableEagerMemoryRelease_.load(std::memory_order_relaxed); } void SetEnableMadvisePopulate(bool value) { EnableMadvisePopulate_.store(value); } bool GetEnableMadvisePopulate() { return EnableMadvisePopulate_.load(std::memory_order_relaxed); } void EnableStockpile() { StockpileEnabled_.store(true); } bool IsStockpileEnabled() { return StockpileEnabled_.load(); } void SetStockpileInterval(TDuration value) { StockpileInterval_.store(value); } TDuration GetStockpileInterval() { return StockpileInterval_.load(); } void 
SetStockpileThreadCount(int count) { StockpileThreadCount_.store(count); } int GetStockpileThreadCount() { return ClampVal(StockpileThreadCount_.load(), 0, MaxStockpileThreadCount); } void SetStockpileSize(size_t value) { StockpileSize_.store(value); } size_t GetStockpileSize() { return StockpileSize_.load(); } private: std::atomic LargeUnreclaimableCoeff_ = 0.05; std::atomic MinLargeUnreclaimableBytes_ = 128_MB; std::atomic MaxLargeUnreclaimableBytes_ = 10_GB; std::atomic TimingEventThresholdNs_ = 10000000; // in ns, 10 ms by default std::atomic AllocationProfilingEnabled_ = false; std::atomic AllocationProfilingSamplingRate_ = 1.0; std::atomic AllocationProfilingSamplingRateX64K_ = std::numeric_limits::max(); std::array, SmallRankCount> SmallArenaAllocationProfilingEnabled_ = {}; std::array, LargeRankCount> LargeArenaAllocationProfilingEnabled_ = {}; std::atomic ProfilingBacktraceDepth_ = 10; std::atomic MinProfilingBytesUsedToReport_ = 1_MB; std::atomic EnableEagerMemoryRelease_ = true; std::atomic EnableMadvisePopulate_ = false; std::atomic StockpileEnabled_ = false; std::atomic StockpileInterval_ = TDuration::MilliSeconds(10); static constexpr int MaxStockpileThreadCount = 8; std::atomic StockpileThreadCount_ = 4; std::atomic StockpileSize_ = 1_GB; private: bool IsAllocationSampled() { Y_POD_STATIC_THREAD(ui16) Counter; return Counter++ < AllocationProfilingSamplingRateX64K_.load(); } }; TExplicitlyConstructableSingleton ConfigurationManager; //////////////////////////////////////////////////////////////////////////////// template class TEventLogManagerBase { public: void DisableForCurrentThread() { TManager::DisabledForCurrentThread_ = true; } template void EnqueueEvent(TArgs&&... args) { if (TManager::DisabledForCurrentThread_) { return; } auto timestamp = TInstant::Now(); auto fiberId = NYTAlloc::GetCurrentFiberId(); auto guard = Guard(EventLock_); auto event = TEvent(args...); OnEvent(event); if (EventCount_ >= EventBufferSize) { return; } auto& enqueuedEvent = Events_[EventCount_++]; enqueuedEvent = std::move(event); enqueuedEvent.Timestamp = timestamp; enqueuedEvent.FiberId = fiberId; } void RunBackgroundTasks() { if (LogManager->IsLoggingEnabled()) { for (const auto& event : PullEvents()) { ProcessEvent(event); } } } protected: NThreading::TForkAwareSpinLock EventLock_; virtual void OnEvent(const TEvent& event) = 0; virtual void ProcessEvent(const TEvent& event) = 0; private: static constexpr size_t EventBufferSize = 1000; size_t EventCount_ = 0; std::array Events_; std::vector PullEvents() { std::vector events; events.reserve(EventBufferSize); auto guard = Guard(EventLock_); for (size_t index = 0; index < EventCount_; ++index) { events.push_back(Events_[index]); } EventCount_ = 0; return events; } }; //////////////////////////////////////////////////////////////////////////////// struct TTimingEvent { ETimingEventType Type; TDuration Duration; size_t Size; TInstant Timestamp; TFiberId FiberId; TTimingEvent() { } TTimingEvent( ETimingEventType type, TDuration duration, size_t size) : Type(type) , Duration(duration) , Size(size) { } }; class TTimingManager : public TEventLogManagerBase { public: TEnumIndexedArray GetTimingEventCounters() { auto guard = Guard(EventLock_); return EventCounters_; } private: TEnumIndexedArray EventCounters_; Y_POD_STATIC_THREAD(bool) DisabledForCurrentThread_; friend class TEventLogManagerBase; virtual void OnEvent(const TTimingEvent& event) override { auto& counters = EventCounters_[event.Type]; counters.Count += 1; counters.Size += event.Size; } 
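    // Invoked from RunBackgroundTasks for each event pulled from the buffer;
    // only reached when a log handler has been installed (see TEventLogManagerBase).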
virtual void ProcessEvent(const TTimingEvent& event) override { YTALLOC_LOG_DEBUG("Timing event logged (Type: %s, Duration: %s, Size: %zu, Timestamp: %s, FiberId: %" PRIu64 ")", ToString(event.Type).c_str(), ToString(event.Duration).c_str(), event.Size, ToString(event.Timestamp).c_str(), event.FiberId); } }; Y_POD_THREAD(bool) TTimingManager::DisabledForCurrentThread_; TExplicitlyConstructableSingleton TimingManager; //////////////////////////////////////////////////////////////////////////////// i64 GetElapsedNs(const struct timespec& startTime, const struct timespec& endTime) { if (Y_LIKELY(startTime.tv_sec == endTime.tv_sec)) { return static_cast(endTime.tv_nsec) - static_cast(startTime.tv_nsec); } return static_cast(endTime.tv_nsec) - static_cast(startTime.tv_nsec) + (static_cast(endTime.tv_sec) - static_cast(startTime.tv_sec)) * 1000000000; } // Used to log statistics about long-running syscalls and lock acquisitions. class TTimingGuard : public TNonCopyable { public: explicit TTimingGuard(ETimingEventType eventType, size_t size = 0) : EventType_(eventType) , Size_(size) { ::clock_gettime(CLOCK_MONOTONIC, &StartTime_); } ~TTimingGuard() { auto elapsedNs = GetElapsedNs(); if (elapsedNs > ConfigurationManager->GetTimingEventThresholdNs()) { TimingManager->EnqueueEvent(EventType_, TDuration::MicroSeconds(elapsedNs / 1000), Size_); } } private: const ETimingEventType EventType_; const size_t Size_; struct timespec StartTime_; i64 GetElapsedNs() const { struct timespec endTime; ::clock_gettime(CLOCK_MONOTONIC, &endTime); return NYTAlloc::GetElapsedNs(StartTime_, endTime); } }; template Y_FORCE_INLINE TGuard GuardWithTiming(const T& lock) { TTimingGuard timingGuard(ETimingEventType::Locking); TGuard lockingGuard(lock); return lockingGuard; } //////////////////////////////////////////////////////////////////////////////// // A wrapper for mmap, mumap, and madvise calls. // The latter are invoked with MADV_POPULATE (if enabled) and MADV_FREE flags // and may fail if the OS support is missing. These failures are logged (once) and // handled as follows: // * if MADV_POPULATE fails then we fallback to manual per-page prefault // for all subsequent attempts; // * if MADV_FREE fails then it (and all subsequent attempts) is replaced with MADV_DONTNEED // (which is non-lazy and is less efficient but will somehow do). // Also this class mlocks all VMAs on startup to prevent pagefaults in our heavy binaries // from disturbing latency tails. class TMappedMemoryManager { public: void* Map(uintptr_t hint, size_t size, int flags) { TTimingGuard timingGuard(ETimingEventType::Mmap, size); auto* result = ::mmap( reinterpret_cast(hint), size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | flags, -1, 0); if (result == MAP_FAILED) { auto error = errno; if (error == EEXIST && (flags & MAP_FIXED_NOREPLACE)) { // Caller must retry with different hint address. return result; } YTALLOC_VERIFY(error == ENOMEM); ::fprintf(stderr, "*** YTAlloc has received ENOMEM error while trying to mmap %zu bytes\n", size); OomTrap(); } return result; } void Unmap(void* ptr, size_t size) { TTimingGuard timingGuard(ETimingEventType::Munmap, size); auto result = ::munmap(ptr, size); YTALLOC_VERIFY(result == 0); } void DontDump(void* ptr, size_t size) { auto result = ::madvise(ptr, size, MADV_DONTDUMP); // Must not fail. 
YTALLOC_VERIFY(result == 0); } void PopulateFile(void* ptr, size_t size) { TTimingGuard timingGuard(ETimingEventType::FilePrefault, size); auto* begin = static_cast(ptr); for (auto* current = begin; current < begin + size; current += PageSize) { *current; } } void PopulateReadOnly(void* ptr, size_t size) { if (!MadvisePopulateUnavailable_.load(std::memory_order_relaxed) && ConfigurationManager->GetEnableMadvisePopulate()) { if (!TryMadvisePopulate(ptr, size)) { MadvisePopulateUnavailable_.store(true); } } } void Populate(void* ptr, size_t size) { if (MadvisePopulateUnavailable_.load(std::memory_order_relaxed) || !ConfigurationManager->GetEnableMadvisePopulate()) { DoPrefault(ptr, size); } else if (!TryMadvisePopulate(ptr, size)) { MadvisePopulateUnavailable_.store(true); DoPrefault(ptr, size); } } void Release(void* ptr, size_t size) { if (CanUseMadviseFree() && !ConfigurationManager->GetEnableEagerMemoryRelease()) { DoMadviseFree(ptr, size); } else { DoMadviseDontNeed(ptr, size); } } bool Stockpile(size_t size) { if (MadviseStockpileUnavailable_.load(std::memory_order_relaxed)) { return false; } if (!TryMadviseStockpile(size)) { MadviseStockpileUnavailable_.store(true); return false; } return true; } void RunBackgroundTasks() { if (!LogManager->IsLoggingEnabled()) { return; } if (IsBuggyKernel() && !BuggyKernelLogged_) { YTALLOC_LOG_WARNING("Kernel is buggy; see KERNEL-118"); BuggyKernelLogged_ = true; } if (MadviseFreeSupported_ && !MadviseFreeSupportedLogged_) { YTALLOC_LOG_INFO("MADV_FREE is supported"); MadviseFreeSupportedLogged_ = true; } if (MadviseFreeNotSupported_ && !MadviseFreeNotSupportedLogged_) { YTALLOC_LOG_WARNING("MADV_FREE is not supported"); MadviseFreeNotSupportedLogged_ = true; } if (MadvisePopulateUnavailable_.load() && !MadvisePopulateUnavailableLogged_) { YTALLOC_LOG_WARNING("MADV_POPULATE is not supported"); MadvisePopulateUnavailableLogged_ = true; } if (MadviseStockpileUnavailable_.load() && !MadviseStockpileUnavailableLogged_) { YTALLOC_LOG_WARNING("MADV_STOCKPILE is not supported"); MadviseStockpileUnavailableLogged_ = true; } } private: bool BuggyKernelLogged_ = false; std::atomic MadviseFreeSupported_ = false; bool MadviseFreeSupportedLogged_ = false; std::atomic MadviseFreeNotSupported_ = false; bool MadviseFreeNotSupportedLogged_ = false; std::atomic MadvisePopulateUnavailable_ = false; bool MadvisePopulateUnavailableLogged_ = false; std::atomic MadviseStockpileUnavailable_ = false; bool MadviseStockpileUnavailableLogged_ = false; private: bool TryMadvisePopulate(void* ptr, size_t size) { TTimingGuard timingGuard(ETimingEventType::MadvisePopulate, size); auto result = ::madvise(ptr, size, MADV_POPULATE); if (result != 0) { auto error = errno; YTALLOC_VERIFY(error == EINVAL || error == ENOMEM); if (error == ENOMEM) { ::fprintf(stderr, "*** YTAlloc has received ENOMEM error while trying to madvise(MADV_POPULATE) %zu bytes\n", size); OomTrap(); } return false; } return true; } void DoPrefault(void* ptr, size_t size) { TTimingGuard timingGuard(ETimingEventType::Prefault, size); auto* begin = static_cast(ptr); for (auto* current = begin; current < begin + size; current += PageSize) { *current = 0; } } bool CanUseMadviseFree() { if (MadviseFreeSupported_.load()) { return true; } if (MadviseFreeNotSupported_.load()) { return false; } if (IsBuggyKernel()) { MadviseFreeNotSupported_.store(true); } else { auto* ptr = Map(0, PageSize, 0); if (::madvise(ptr, PageSize, MADV_FREE) == 0) { MadviseFreeSupported_.store(true); } else { MadviseFreeNotSupported_.store(true); 
} Unmap(ptr, PageSize); } // Will not recurse. return CanUseMadviseFree(); } void DoMadviseDontNeed(void* ptr, size_t size) { TTimingGuard timingGuard(ETimingEventType::MadviseDontNeed, size); auto result = ::madvise(ptr, size, MADV_DONTNEED); if (result != 0) { auto error = errno; // Failure is possible for locked pages. Y_ABORT_UNLESS(error == EINVAL); } } void DoMadviseFree(void* ptr, size_t size) { TTimingGuard timingGuard(ETimingEventType::MadviseFree, size); auto result = ::madvise(ptr, size, MADV_FREE); if (result != 0) { auto error = errno; // Failure is possible for locked pages. YTALLOC_VERIFY(error == EINVAL); } } bool TryMadviseStockpile(size_t size) { auto result = ::madvise(nullptr, size, MADV_STOCKPILE); if (result != 0) { auto error = errno; if (error == ENOMEM || error == EAGAIN || error == EINTR) { // The call is advisory, ignore ENOMEM, EAGAIN, and EINTR. return true; } YTALLOC_VERIFY(error == EINVAL); return false; } return true; } // Some kernels are known to contain bugs in MADV_FREE; see https://st.yandex-team.ru/KERNEL-118. bool IsBuggyKernel() { #ifdef _linux_ static const bool result = [] () { struct utsname buf; YTALLOC_VERIFY(uname(&buf) == 0); if (strverscmp(buf.release, "4.4.1-1") >= 0 && strverscmp(buf.release, "4.4.96-44") < 0) { return true; } if (strverscmp(buf.release, "4.14.1-1") >= 0 && strverscmp(buf.release, "4.14.79-33") < 0) { return true; } return false; }(); return result; #else return false; #endif } }; TExplicitlyConstructableSingleton MappedMemoryManager; //////////////////////////////////////////////////////////////////////////////// // System allocator // Each system allocation is prepended with such a header. struct TSystemBlobHeader { explicit TSystemBlobHeader(size_t size) : Size(size) { } size_t Size; char Padding[8]; }; CHECK_HEADER_ALIGNMENT(TSystemBlobHeader) // Used for some internal allocations. // Delgates directly to TMappedMemoryManager. class TSystemAllocator { public: void* Allocate(size_t size); void Free(void* ptr); private: std::atomic CurrentPtr_ = SystemZoneStart; }; TExplicitlyConstructableSingleton SystemAllocator; //////////////////////////////////////////////////////////////////////////////// // Deriving from this class makes instances bound to TSystemAllocator. struct TSystemAllocatable { void* operator new(size_t size) noexcept { return SystemAllocator->Allocate(size); } void* operator new[](size_t size) noexcept { return SystemAllocator->Allocate(size); } void operator delete(void* ptr) noexcept { SystemAllocator->Free(ptr); } void operator delete[](void* ptr) noexcept { SystemAllocator->Free(ptr); } }; //////////////////////////////////////////////////////////////////////////////// // Maintains a pool of objects. // Objects are allocated in groups each containing BatchSize instances. // The actual allocation is carried out by TSystemAllocator. // Memory is never actually reclaimed; freed instances are put into TFreeList. template class TSystemPool { public: T* Allocate() { while (true) { auto* obj = FreeList_.Extract(); if (Y_LIKELY(obj)) { new (obj) T(); return obj; } AllocateMore(); } } void Free(T* obj) { obj->T::~T(); PoisonFreedRange(obj, sizeof(T)); FreeList_.Put(obj); } private: TFreeList FreeList_; private: void AllocateMore() { auto* objs = static_cast(SystemAllocator->Allocate(sizeof(T) * BatchSize)); for (size_t index = 0; index < BatchSize; ++index) { auto* obj = objs + index; FreeList_.Put(obj); } } }; // A sharded analogue TSystemPool. 
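// Unlike TSystemPool, each batch allocated by AllocateMore is spread across all shards
// (index % ShardCount), so concurrently allocating threads mostly touch disjoint free lists.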
template class TShardedSystemPool { public: template T* Allocate(TState* state) { if (auto* obj = FreeLists_[state->GetInitialShardIndex()].Extract()) { new (obj) T(); return obj; } while (true) { for (size_t index = 0; index < ShardCount; ++index) { if (auto* obj = FreeLists_[state->GetNextShardIndex()].Extract()) { new (obj) T(); return obj; } } AllocateMore(); } } template void Free(TState* state, T* obj) { obj->T::~T(); PoisonFreedRange(obj, sizeof(T)); FreeLists_[state->GetInitialShardIndex()].Put(obj); } private: std::array, ShardCount> FreeLists_; private: void AllocateMore() { auto* objs = static_cast(SystemAllocator->Allocate(sizeof(T) * BatchSize)); for (size_t index = 0; index < BatchSize; ++index) { auto* obj = objs + index; FreeLists_[index % ShardCount].Put(obj); } } }; //////////////////////////////////////////////////////////////////////////////// // Handles allocations inside a zone of memory given by its start and end pointers. // Each allocation is a separate mapped region of memory. // A special care is taken to guarantee that all allocated regions fall inside the zone. class TZoneAllocator { public: TZoneAllocator(uintptr_t zoneStart, uintptr_t zoneEnd) : ZoneStart_(zoneStart) , ZoneEnd_(zoneEnd) , Current_(zoneStart) { YTALLOC_VERIFY(ZoneStart_ % PageSize == 0); } void* Allocate(size_t size, int flags) { YTALLOC_VERIFY(size % PageSize == 0); bool restarted = false; while (true) { auto hint = (Current_ += size) - size; if (reinterpret_cast(hint) + size > ZoneEnd_) { if (restarted) { ::fprintf(stderr, "*** YTAlloc was unable to mmap %zu bytes in zone %" PRIx64 "--%" PRIx64 "\n", size, ZoneStart_, ZoneEnd_); OomTrap(); } restarted = true; Current_ = ZoneStart_; } else { char* ptr = static_cast(MappedMemoryManager->Map( hint, size, MAP_FIXED_NOREPLACE | flags)); if (reinterpret_cast(ptr) == hint) { return ptr; } if (ptr != MAP_FAILED) { MappedMemoryManager->Unmap(ptr, size); } } } } void Free(void* ptr, size_t size) { MappedMemoryManager->Unmap(ptr, size); } private: const uintptr_t ZoneStart_; const uintptr_t ZoneEnd_; std::atomic Current_; }; //////////////////////////////////////////////////////////////////////////////// // YTAlloc supports tagged allocations. // Since the total number of tags can be huge, a two-level scheme is employed. // Possible tags are arranged into sets each containing TaggedCounterSetSize tags. // There are up to MaxTaggedCounterSets in total. // Upper 4 sets are reserved for profiled allocations. 
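// Illustrative mapping (sketch): tag t is counted in set t / TaggedCounterSetSize at
// slot t % TaggedCounterSetSize; e.g. with TaggedCounterSetSize == 16384, tag 100000
// lands in set 6, slot 1696.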
constexpr size_t TaggedCounterSetSize = 16384; constexpr size_t AllocationProfilingTaggedCounterSets = 4; constexpr size_t MaxTaggedCounterSets = 256 + AllocationProfilingTaggedCounterSets; constexpr size_t MaxCapturedAllocationBacktraces = 65000; static_assert( MaxCapturedAllocationBacktraces < AllocationProfilingTaggedCounterSets * TaggedCounterSetSize, "MaxCapturedAllocationBacktraces is too big"); constexpr TMemoryTag AllocationProfilingMemoryTagBase = TaggedCounterSetSize * (MaxTaggedCounterSets - AllocationProfilingTaggedCounterSets); constexpr TMemoryTag AllocationProfilingUnknownMemoryTag = AllocationProfilingMemoryTagBase + MaxCapturedAllocationBacktraces; static_assert( MaxMemoryTag == TaggedCounterSetSize * (MaxTaggedCounterSets - AllocationProfilingTaggedCounterSets) - 1, "Wrong MaxMemoryTag"); template using TUntaggedTotalCounters = TEnumIndexedArray; template struct TTaggedTotalCounterSet : public TSystemAllocatable { std::array, TaggedCounterSetSize> Counters; }; using TLocalTaggedBasicCounterSet = TTaggedTotalCounterSet; using TGlobalTaggedBasicCounterSet = TTaggedTotalCounterSet>; template struct TTotalCounters { // The sum of counters across all tags. TUntaggedTotalCounters CumulativeTaggedCounters; // Counters for untagged allocations. TUntaggedTotalCounters UntaggedCounters; // Access to tagged counters may involve creation of a new tag set. // For simplicity, we separate the read-side (TaggedCounterSets) and the write-side (TaggedCounterSetHolders). // These arrays contain virtually identical data (up to std::unique_ptr and std::atomic semantic differences). std::array*>, MaxTaggedCounterSets> TaggedCounterSets{}; std::array>, MaxTaggedCounterSets> TaggedCounterSetHolders; // Protects TaggedCounterSetHolders from concurrent updates. NThreading::TForkAwareSpinLock TaggedCounterSetsLock; // Returns null if the set is not yet constructed. Y_FORCE_INLINE TTaggedTotalCounterSet* FindTaggedCounterSet(size_t index) const { return TaggedCounterSets[index].load(); } // Constructs the set on first access. 
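    // Double-checked: the lock-free load below handles the common (already constructed)
    // case; TaggedCounterSetsLock is taken only when a new set must be created.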
TTaggedTotalCounterSet* GetOrCreateTaggedCounterSet(size_t index) { auto* set = TaggedCounterSets[index].load(); if (Y_LIKELY(set)) { return set; } auto guard = GuardWithTiming(TaggedCounterSetsLock); auto& setHolder = TaggedCounterSetHolders[index]; if (!setHolder) { setHolder = std::make_unique>(); TaggedCounterSets[index] = setHolder.get(); } return setHolder.get(); } }; using TLocalSystemCounters = TEnumIndexedArray; using TGlobalSystemCounters = TEnumIndexedArray>; using TLocalSmallCounters = TEnumIndexedArray; using TGlobalSmallCounters = TEnumIndexedArray>; using TLocalLargeCounters = TEnumIndexedArray; using TGlobalLargeCounters = TEnumIndexedArray>; using TLocalHugeCounters = TEnumIndexedArray; using TGlobalHugeCounters = TEnumIndexedArray>; using TLocalUndumpableCounters = TEnumIndexedArray; using TGlobalUndumpableCounters = TEnumIndexedArray>; Y_FORCE_INLINE ssize_t LoadCounter(ssize_t counter) { return counter; } Y_FORCE_INLINE ssize_t LoadCounter(const std::atomic& counter) { return counter.load(); } //////////////////////////////////////////////////////////////////////////////// struct TMmapObservationEvent { size_t Size; std::array Frames; int FrameCount; TInstant Timestamp; TFiberId FiberId; TMmapObservationEvent() = default; TMmapObservationEvent( size_t size, std::array frames, int frameCount) : Size(size) , Frames(frames) , FrameCount(frameCount) { } }; class TMmapObservationManager : public TEventLogManagerBase { public: void SetBacktraceFormatter(TBacktraceFormatter formatter) { BacktraceFormatter_.store(formatter); } private: std::atomic BacktraceFormatter_ = nullptr; Y_POD_STATIC_THREAD(bool) DisabledForCurrentThread_; friend class TEventLogManagerBase; virtual void OnEvent(const TMmapObservationEvent& /*event*/) override { } virtual void ProcessEvent(const TMmapObservationEvent& event) override { YTALLOC_LOG_DEBUG("Large arena mmap observed (Size: %zu, Timestamp: %s, FiberId: %" PRIx64 ")", event.Size, ToString(event.Timestamp).c_str(), event.FiberId); if (auto backtraceFormatter = BacktraceFormatter_.load()) { auto backtrace = backtraceFormatter(const_cast(event.Frames.data()), event.FrameCount); YTALLOC_LOG_DEBUG("YTAlloc stack backtrace (Stack: %s)", backtrace.c_str()); } } }; Y_POD_THREAD(bool) TMmapObservationManager::DisabledForCurrentThread_; TExplicitlyConstructableSingleton MmapObservationManager; //////////////////////////////////////////////////////////////////////////////// // A per-thread structure containing counters, chunk caches etc. struct TThreadState : public TFreeListItemBase , public TLocalShardedState { // TThreadState instances of all alive threads are put into a double-linked intrusive list. // This is a pair of next/prev pointers connecting an instance of TThreadState to its neighbors. TIntrusiveLinkedListNode RegistryNode; // Pointers to the respective parts of TThreadManager::ThreadControlWord_. // If null then the thread is already destroyed (but TThreadState may still live for a while // due to ref-counting). ui8* AllocationProfilingEnabled; ui8* BackgroundThreadStarted; // TThreadStates are ref-counted. // TThreadManager::EnumerateThreadStates enumerates the registered states and acquires // a temporary reference preventing these states from being destructed. This provides // for shorter periods of time the global lock needs to be held. int RefCounter = 1; // Per-thread counters. TTotalCounters TotalCounters; std::array LargeArenaCounters; TLocalUndumpableCounters UndumpableCounters; // Each thread maintains caches of small chunks. 
// One cache is for tagged chunks; the other is for untagged ones. // Each cache contains up to MaxCachedChunksPerRank chunks per any rank. // Special sentinels are placed to distinguish the boundaries of region containing // pointers of a specific rank. This enables a tiny-bit faster inplace boundary checks. static constexpr uintptr_t LeftSentinel = 1; static constexpr uintptr_t RightSentinel = 2; struct TSmallBlobCache { TSmallBlobCache() { void** chunkPtrs = CachedChunks.data(); for (size_t rank = 0; rank < SmallRankCount; ++rank) { RankToCachedChunkPtrHead[rank] = chunkPtrs; chunkPtrs[0] = reinterpret_cast(LeftSentinel); chunkPtrs[MaxCachedChunksPerRank + 1] = reinterpret_cast(RightSentinel); #ifdef YTALLOC_PARANOID RankToCachedChunkPtrTail[rank] = chunkPtrs; CachedChunkFull[rank] = false; RankToCachedChunkLeftBorder[rank] = chunkPtrs; RankToCachedChunkRightBorder[rank] = chunkPtrs + MaxCachedChunksPerRank + 1; #endif chunkPtrs += MaxCachedChunksPerRank + 2; } } // For each rank we have a segment of pointers in CachedChunks with the following layout: // LCC[C]........R // Legend: // . = garbage // L = left sentinel // R = right sentinel // C = cached pointer // [C] = current cached pointer // // Under YTALLOC_PARANOID the following layout is used: // L.[T]CCC[H]...R // Legend: // [H] = head cached pointer, put chunks here // [T] = tail cached pointer, take chunks from here // +2 is for two sentinels std::array CachedChunks{}; // Pointer to [P] for each rank. std::array RankToCachedChunkPtrHead{}; #ifdef YTALLOC_PARANOID // Pointers to [L] and [R] for each rank. std::array RankToCachedChunkLeftBorder{}; std::array RankToCachedChunkRightBorder{}; std::array RankToCachedChunkPtrTail{}; std::array CachedChunkFull{}; #endif }; TEnumIndexedArray SmallBlobCache; }; struct TThreadStateToRegistryNode { auto operator() (TThreadState* state) const { return &state->RegistryNode; } }; // Manages all registered threads and controls access to TThreadState. class TThreadManager { public: TThreadManager() { pthread_key_create(&ThreadDtorKey_, DestroyThread); NThreading::RegisterAtForkHandlers( nullptr, nullptr, [=] { AfterFork(); }); } // Returns TThreadState for the current thread; the caller guarantees that this // state is initialized and is not destroyed yet. static TThreadState* GetThreadStateUnchecked(); // Returns TThreadState for the current thread; may return null. static TThreadState* FindThreadState(); // Returns TThreadState for the current thread; may not return null // (but may crash if TThreadState is already destroyed). static TThreadState* GetThreadStateChecked() { auto* state = FindThreadState(); YTALLOC_VERIFY(state); return state; } // Enumerates all threads and invokes func passing TThreadState instances. // func must not throw but can take arbitrary time; no locks are being held while it executes. template void EnumerateThreadStatesAsync(const THandler& handler) noexcept { TMemoryTagGuard guard(NullMemoryTag); std::vector states; states.reserve(1024); // must be enough in most cases auto unrefStates = [&] { // Releasing references also requires global lock to be held to avoid getting zombies above. auto guard = GuardWithTiming(ThreadRegistryLock_); for (auto* state : states) { UnrefThreadState(state); } }; auto tryRefStates = [&] { // Only hold this guard for a small period of time to reference all the states. 
auto guard = GuardWithTiming(ThreadRegistryLock_); auto* current = ThreadRegistry_.GetFront(); while (current) { if (states.size() == states.capacity()) { // Cannot allocate while holding ThreadRegistryLock_ due to a possible deadlock as follows: // EnumerateThreadStatesAsync -> StartBackgroundThread -> EnumerateThreadStatesSync // (many other scenarios are also possible). guard.Release(); unrefStates(); states.clear(); states.reserve(states.capacity() * 2); return false; } RefThreadState(current); states.push_back(current); current = current->RegistryNode.Next; } return true; }; while (!tryRefStates()) ; for (auto* state : states) { handler(state); } unrefStates(); } // Similar to EnumerateThreadStatesAsync but holds the global lock while enumerating the threads. // Also invokes a given prologue functor while holding the thread registry lock. // Handler and prologue calls must be fast and must not allocate. template void EnumerateThreadStatesSync(const TPrologue& prologue, const THandler& handler) noexcept { auto guard = GuardWithTiming(ThreadRegistryLock_); prologue(); auto* current = ThreadRegistry_.GetFront(); while (current) { handler(current); current = current->RegistryNode.Next; } } // We store a special 64-bit "thread control word" in TLS encapsulating the following // crucial per-thread parameters: // * the current memory tag // * a flag indicating that a valid TThreadState is known to exists // (and can be obtained via GetThreadStateUnchecked) // * a flag indicating that allocation profiling is enabled // * a flag indicating that background thread is started // Thread control word is fetched via GetThreadControlWord and is compared // against FastPathControlWord to see if the fast path can be taken. // The latter happens when no memory tagging is configured, TThreadState is // valid, allocation profiling is disabled, and background thread is started. // The mask for extracting memory tag from thread control word. static constexpr ui64 MemoryTagControlWordMask = 0xffffffff; // ThreadStateValid is on. static constexpr ui64 ThreadStateValidControlWordMask = (1ULL << 32); // AllocationProfiling is on. static constexpr ui64 AllocationProfilingEnabledControlWordMask = (1ULL << 40); // All background thread are properly started. static constexpr ui64 BackgroundThreadStartedControlWorkMask = (1ULL << 48); // Memory tag is NullMemoryTag; thread state is valid. 
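    // Illustrative fast-path check (sketch): a single 64-bit comparison
    //   GetThreadControlWord() == FastPathControlWord
    // simultaneously verifies that no memory tag is set, TThreadState is valid,
    // allocation profiling is off, and the background thread has been started.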
static constexpr ui64 FastPathControlWord = BackgroundThreadStartedControlWorkMask | ThreadStateValidControlWordMask | NullMemoryTag; Y_FORCE_INLINE static ui64 GetThreadControlWord() { return (&ThreadControlWord_)->Value; } static TMemoryTag GetCurrentMemoryTag() { return (&ThreadControlWord_)->Parts.MemoryTag; } static void SetCurrentMemoryTag(TMemoryTag tag) { Y_ABORT_UNLESS(tag <= MaxMemoryTag); (&ThreadControlWord_)->Parts.MemoryTag = tag; } static EMemoryZone GetCurrentMemoryZone() { return CurrentMemoryZone_; } static void SetCurrentMemoryZone(EMemoryZone zone) { CurrentMemoryZone_ = zone; } static void SetCurrentFiberId(TFiberId id) { CurrentFiberId_ = id; } static TFiberId GetCurrentFiberId() { return CurrentFiberId_; } private: static void DestroyThread(void*); TThreadState* AllocateThreadState(); void RefThreadState(TThreadState* state) { auto result = ++state->RefCounter; Y_ABORT_UNLESS(result > 1); } void UnrefThreadState(TThreadState* state) { auto result = --state->RefCounter; Y_ABORT_UNLESS(result >= 0); if (result == 0) { DestroyThreadState(state); } } void DestroyThreadState(TThreadState* state); void AfterFork(); private: // TThreadState instance for the current thread. // Initially null, then initialized when first needed. // TThreadState is destroyed upon thread termination (which is detected with // the help of pthread_key_create machinery), so this pointer can become null again. Y_POD_STATIC_THREAD(TThreadState*) ThreadState_; // Initially false, then set to true then TThreadState is destroyed. // If the thread requests for its state afterwards, null is returned and no new state is (re-)created. // The caller must be able to deal with it. Y_POD_STATIC_THREAD(bool) ThreadStateDestroyed_; union TThreadControlWord { ui64 __attribute__((__may_alias__)) Value; struct TParts { // The current memory tag used in all allocations by this thread. ui32 __attribute__((__may_alias__)) MemoryTag; // Indicates if a valid TThreadState exists and can be obtained via GetThreadStateUnchecked. ui8 __attribute__((__may_alias__)) ThreadStateValid; // Indicates if allocation profiling is on. ui8 __attribute__((__may_alias__)) AllocationProfilingEnabled; // Indicates if all background threads are properly started. ui8 __attribute__((__may_alias__)) BackgroundThreadStarted; ui8 Padding[2]; } Parts; }; Y_POD_STATIC_THREAD(TThreadControlWord) ThreadControlWord_; // See memory zone API. Y_POD_STATIC_THREAD(EMemoryZone) CurrentMemoryZone_; // See fiber id API. Y_POD_STATIC_THREAD(TFiberId) CurrentFiberId_; pthread_key_t ThreadDtorKey_; static constexpr size_t ThreadStatesBatchSize = 1; TSystemPool ThreadStatePool_; NThreading::TForkAwareSpinLock ThreadRegistryLock_; TIntrusiveLinkedList ThreadRegistry_; }; Y_POD_THREAD(TThreadState*) TThreadManager::ThreadState_; Y_POD_THREAD(bool) TThreadManager::ThreadStateDestroyed_; Y_POD_THREAD(TThreadManager::TThreadControlWord) TThreadManager::ThreadControlWord_; Y_POD_THREAD(EMemoryZone) TThreadManager::CurrentMemoryZone_; Y_POD_THREAD(TFiberId) TThreadManager::CurrentFiberId_; TExplicitlyConstructableSingleton ThreadManager; //////////////////////////////////////////////////////////////////////////////// void TConfigurationManager::SetAllocationProfilingEnabled(bool value) { // Update threads' TLS. 
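    // The prologue below runs while EnumerateThreadStatesSync holds the thread registry
    // lock, so no thread can register or unregister between storing the global flag and
    // patching the per-thread copies.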
ThreadManager->EnumerateThreadStatesSync( [&] { AllocationProfilingEnabled_.store(value); }, [&] (auto* state) { if (state->AllocationProfilingEnabled) { *state->AllocationProfilingEnabled = value; } }); } //////////////////////////////////////////////////////////////////////////////// // Backtrace Manager // // Captures backtraces observed during allocations and assigns memory tags to them. // Memory tags are chosen sequentially starting from AllocationProfilingMemoryTagBase. // // For each backtrace we compute a 64-bit hash and use it as a key in a certain concurrent hashmap. // This hashmap is organized into BucketCount buckets, each consisting of BucketSize slots. // // Backtrace hash is translated into bucket index by taking the appropriate number of // its lower bits. For each slot, we remember a 32-bit fingerprint, which is // just the next 32 bits of the backtrace's hash, and the (previously assigned) memory tag. // // Upon access to the hashtable, the bucket is first scanned optimistically, without taking // any locks. In case of a miss, a per-bucket spinlock is acquired and the bucket is rescanned. // // The above scheme may involve collisions but we neglect their probability. // // If the whole hash table overflows (i.e. a total of MaxCapturedAllocationBacktraces // backtraces are captured) or the bucket overflows (i.e. all of its slots become occupied), // the allocation is annotated with AllocationProfilingUnknownMemoryTag. Such allocations // appear as having no backtrace whatsoever in the profiling reports. class TBacktraceManager { public: // Sets the provider used for collecting backtraces when allocation profiling // is turned ON. void SetBacktraceProvider(TBacktraceProvider provider) { BacktraceProvider_.store(provider); } // Captures the backtrace and inserts it into the hashtable. TMemoryTag GetMemoryTagFromBacktrace(int framesToSkip) { std::array frames; auto backtraceProvider = BacktraceProvider_.load(); if (!backtraceProvider) { return NullMemoryTag; } auto frameCount = backtraceProvider(frames.data(), ConfigurationManager->GetProfilingBacktraceDepth(), framesToSkip); auto hash = GetBacktraceHash(frames.data(), frameCount); return CaptureBacktrace(hash, frames.data(), frameCount); } // Returns the backtrace corresponding to the given tag, if any. 
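    // Lock-free read: relies on CaptureBacktrace setting Captured only after the frames
    // have been fully copied into the entry.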
std::optional FindBacktrace(TMemoryTag tag) { if (tag < AllocationProfilingMemoryTagBase || tag >= AllocationProfilingMemoryTagBase + MaxCapturedAllocationBacktraces) { return std::nullopt; } const auto& entry = Backtraces_[tag - AllocationProfilingMemoryTagBase]; if (!entry.Captured.load()) { return std::nullopt; } return entry.Backtrace; } private: static constexpr int Log2BucketCount = 16; static constexpr int BucketCount = 1 << Log2BucketCount; static constexpr int BucketSize = 8; std::atomic BacktraceProvider_ = nullptr; std::array, BucketSize>, BucketCount> Fingerprints_= {}; std::array, BucketSize>, BucketCount> MemoryTags_ = {}; std::array BucketLocks_; std::atomic CurrentMemoryTag_ = AllocationProfilingMemoryTagBase; struct TBacktraceEntry { TBacktrace Backtrace; std::atomic Captured = false; }; std::array Backtraces_; private: static size_t GetBacktraceHash(void** frames, int frameCount) { size_t hash = 0; for (int index = 0; index < frameCount; ++index) { hash = CombineHashes(hash, THash()(frames[index])); } return hash; } TMemoryTag CaptureBacktrace(size_t hash, void** frames, int frameCount) { size_t bucketIndex = hash % BucketCount; ui32 fingerprint = (hash >> Log2BucketCount) & 0xffffffff; // Zero fingerprint indicates the slot is free; check and adjust to ensure // that regular fingerprints are non-zero. if (fingerprint == 0) { fingerprint = 1; } for (int slotIndex = 0; slotIndex < BucketSize; ++slotIndex) { auto currentFingerprint = Fingerprints_[bucketIndex][slotIndex].load(std::memory_order_relaxed); if (currentFingerprint == fingerprint) { return MemoryTags_[bucketIndex][slotIndex].load(); } } auto guard = Guard(BucketLocks_[bucketIndex]); int spareSlotIndex = -1; for (int slotIndex = 0; slotIndex < BucketSize; ++slotIndex) { auto currentFingerprint = Fingerprints_[bucketIndex][slotIndex].load(std::memory_order_relaxed); if (currentFingerprint == fingerprint) { return MemoryTags_[bucketIndex][slotIndex]; } if (currentFingerprint == 0) { spareSlotIndex = slotIndex; break; } } if (spareSlotIndex < 0) { return AllocationProfilingUnknownMemoryTag; } auto memoryTag = CurrentMemoryTag_++; if (memoryTag >= AllocationProfilingMemoryTagBase + MaxCapturedAllocationBacktraces) { return AllocationProfilingUnknownMemoryTag; } MemoryTags_[bucketIndex][spareSlotIndex].store(memoryTag); Fingerprints_[bucketIndex][spareSlotIndex].store(fingerprint); auto& entry = Backtraces_[memoryTag - AllocationProfilingMemoryTagBase]; entry.Backtrace.FrameCount = frameCount; ::memcpy(entry.Backtrace.Frames.data(), frames, sizeof (void*) * frameCount); entry.Captured.store(true); return memoryTag; } }; TExplicitlyConstructableSingleton BacktraceManager; //////////////////////////////////////////////////////////////////////////////// // Mimics the counters of TThreadState but uses std::atomic to survive concurrent access. struct TGlobalState : public TGlobalShardedState { TTotalCounters> TotalCounters; std::array LargeArenaCounters; TGlobalUndumpableCounters UndumpableCounters; }; TExplicitlyConstructableSingleton GlobalState; //////////////////////////////////////////////////////////////////////////////// // Accumulates various allocation statistics. class TStatisticsManager { public: template static Y_FORCE_INLINE void IncrementTotalCounter(TState* state, TMemoryTag tag, EBasicCounter counter, ssize_t delta) { // This branch is typically resolved at compile time. 
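        // (Kind appears to be a non-type template parameter: the Untagged instantiation drops
        // the tagged path entirely, and only the Tagged instantiation tests the tag at run time.)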
if (Kind == EAllocationKind::Tagged && tag != NullMemoryTag) { IncrementTaggedTotalCounter(&state->TotalCounters, tag, counter, delta); } else { IncrementUntaggedTotalCounter(&state->TotalCounters, counter, delta); } } static Y_FORCE_INLINE void IncrementTotalCounter(TMemoryTag tag, EBasicCounter counter, ssize_t delta) { IncrementTotalCounter(GlobalState.Get(), tag, counter, delta); } void IncrementSmallArenaCounter(ESmallArenaCounter counter, size_t rank, ssize_t delta) { SmallArenaCounters_[rank][counter] += delta; } template static Y_FORCE_INLINE void IncrementLargeArenaCounter(TState* state, size_t rank, ELargeArenaCounter counter, ssize_t delta) { state->LargeArenaCounters[rank][counter] += delta; } template static Y_FORCE_INLINE void IncrementUndumpableCounter(TState* state, EUndumpableCounter counter, ssize_t delta) { state->UndumpableCounters[counter] += delta; } void IncrementHugeCounter(EHugeCounter counter, ssize_t delta) { HugeCounters_[counter] += delta; } void IncrementHugeUndumpableCounter(EUndumpableCounter counter, ssize_t delta) { HugeUndumpableCounters_[counter] += delta; } void IncrementSystemCounter(ESystemCounter counter, ssize_t delta) { SystemCounters_[counter] += delta; } // Computes memory usage for a list of tags by aggregating counters across threads. void GetTaggedMemoryCounters(const TMemoryTag* tags, size_t count, TEnumIndexedArray* counters) { TMemoryTagGuard guard(NullMemoryTag); for (size_t index = 0; index < count; ++index) { counters[index][EBasicCounter::BytesAllocated] = 0; counters[index][EBasicCounter::BytesFreed] = 0; } for (size_t index = 0; index < count; ++index) { auto tag = tags[index]; counters[index][EBasicCounter::BytesAllocated] += LoadTaggedTotalCounter(GlobalState->TotalCounters, tag, EBasicCounter::BytesAllocated); counters[index][EBasicCounter::BytesFreed] += LoadTaggedTotalCounter(GlobalState->TotalCounters, tag, EBasicCounter::BytesFreed); } ThreadManager->EnumerateThreadStatesAsync( [&] (const auto* state) { for (size_t index = 0; index < count; ++index) { auto tag = tags[index]; counters[index][EBasicCounter::BytesAllocated] += LoadTaggedTotalCounter(state->TotalCounters, tag, EBasicCounter::BytesAllocated); counters[index][EBasicCounter::BytesFreed] += LoadTaggedTotalCounter(state->TotalCounters, tag, EBasicCounter::BytesFreed); } }); for (size_t index = 0; index < count; ++index) { counters[index][EBasicCounter::BytesUsed] = GetUsed(counters[index][EBasicCounter::BytesAllocated], counters[index][EBasicCounter::BytesFreed]); } } void GetTaggedMemoryUsage(const TMemoryTag* tags, size_t count, size_t* results) { TMemoryTagGuard guard(NullMemoryTag); std::vector> counters; counters.resize(count); GetTaggedMemoryCounters(tags, count, counters.data()); for (size_t index = 0; index < count; ++index) { results[index] = counters[index][EBasicCounter::BytesUsed]; } } TEnumIndexedArray GetTotalAllocationCounters() { TEnumIndexedArray result; auto accumulate = [&] (const auto& counters) { result[ETotalCounter::BytesAllocated] += LoadCounter(counters[EBasicCounter::BytesAllocated]); result[ETotalCounter::BytesFreed] += LoadCounter(counters[EBasicCounter::BytesFreed]); }; accumulate(GlobalState->TotalCounters.UntaggedCounters); accumulate(GlobalState->TotalCounters.CumulativeTaggedCounters); ThreadManager->EnumerateThreadStatesAsync( [&] (const auto* state) { accumulate(state->TotalCounters.UntaggedCounters); accumulate(state->TotalCounters.CumulativeTaggedCounters); }); result[ETotalCounter::BytesUsed] = GetUsed( 
result[ETotalCounter::BytesAllocated], result[ETotalCounter::BytesFreed]); auto systemCounters = GetSystemAllocationCounters(); result[ETotalCounter::BytesCommitted] += systemCounters[EBasicCounter::BytesUsed]; auto hugeCounters = GetHugeAllocationCounters(); result[ETotalCounter::BytesCommitted] += hugeCounters[EHugeCounter::BytesUsed]; auto smallArenaCounters = GetSmallArenaAllocationCounters(); for (size_t rank = 0; rank < SmallRankCount; ++rank) { result[ETotalCounter::BytesCommitted] += smallArenaCounters[rank][ESmallArenaCounter::BytesCommitted]; } auto largeArenaCounters = GetLargeArenaAllocationCounters(); for (size_t rank = 0; rank < LargeRankCount; ++rank) { result[ETotalCounter::BytesCommitted] += largeArenaCounters[rank][ELargeArenaCounter::BytesCommitted]; } result[ETotalCounter::BytesUnaccounted] = std::max(GetProcessRss() - result[ETotalCounter::BytesCommitted], 0); return result; } TEnumIndexedArray GetSmallAllocationCounters() { TEnumIndexedArray result; auto totalCounters = GetTotalAllocationCounters(); result[ESmallCounter::BytesAllocated] = totalCounters[ETotalCounter::BytesAllocated]; result[ESmallCounter::BytesFreed] = totalCounters[ETotalCounter::BytesFreed]; result[ESmallCounter::BytesUsed] = totalCounters[ETotalCounter::BytesUsed]; auto largeArenaCounters = GetLargeArenaAllocationCounters(); for (size_t rank = 0; rank < LargeRankCount; ++rank) { result[ESmallCounter::BytesAllocated] -= largeArenaCounters[rank][ELargeArenaCounter::BytesAllocated]; result[ESmallCounter::BytesFreed] -= largeArenaCounters[rank][ELargeArenaCounter::BytesFreed]; result[ESmallCounter::BytesUsed] -= largeArenaCounters[rank][ELargeArenaCounter::BytesUsed]; } auto hugeCounters = GetHugeAllocationCounters(); result[ESmallCounter::BytesAllocated] -= hugeCounters[EHugeCounter::BytesAllocated]; result[ESmallCounter::BytesFreed] -= hugeCounters[EHugeCounter::BytesFreed]; result[ESmallCounter::BytesUsed] -= hugeCounters[EHugeCounter::BytesUsed]; return result; } std::array GetSmallArenaAllocationCounters() { std::array result; for (size_t rank = 0; rank < SmallRankCount; ++rank) { for (auto counter : TEnumTraits::GetDomainValues()) { result[rank][counter] = SmallArenaCounters_[rank][counter].load(); } } return result; } TEnumIndexedArray GetLargeAllocationCounters() { TEnumIndexedArray result; auto largeArenaCounters = GetLargeArenaAllocationCounters(); for (size_t rank = 0; rank < LargeRankCount; ++rank) { result[ESmallCounter::BytesAllocated] += largeArenaCounters[rank][ELargeArenaCounter::BytesAllocated]; result[ESmallCounter::BytesFreed] += largeArenaCounters[rank][ELargeArenaCounter::BytesFreed]; result[ESmallCounter::BytesUsed] += largeArenaCounters[rank][ELargeArenaCounter::BytesUsed]; } return result; } std::array GetLargeArenaAllocationCounters() { std::array result{}; for (size_t rank = 0; rank < LargeRankCount; ++rank) { for (auto counter : TEnumTraits::GetDomainValues()) { result[rank][counter] = GlobalState->LargeArenaCounters[rank][counter].load(); } } ThreadManager->EnumerateThreadStatesAsync( [&] (const auto* state) { for (size_t rank = 0; rank < LargeRankCount; ++rank) { for (auto counter : TEnumTraits::GetDomainValues()) { result[rank][counter] += state->LargeArenaCounters[rank][counter]; } } }); for (size_t rank = 0; rank < LargeRankCount; ++rank) { result[rank][ELargeArenaCounter::BytesUsed] = GetUsed(result[rank][ELargeArenaCounter::BytesAllocated], result[rank][ELargeArenaCounter::BytesFreed]); result[rank][ELargeArenaCounter::BlobsUsed] = 
GetUsed(result[rank][ELargeArenaCounter::BlobsAllocated], result[rank][ELargeArenaCounter::BlobsFreed]); } return result; } TLocalSystemCounters GetSystemAllocationCounters() { TLocalSystemCounters result; for (auto counter : TEnumTraits::GetDomainValues()) { result[counter] = SystemCounters_[counter].load(); } result[ESystemCounter::BytesUsed] = GetUsed(result[ESystemCounter::BytesAllocated], result[ESystemCounter::BytesFreed]); return result; } TLocalHugeCounters GetHugeAllocationCounters() { TLocalHugeCounters result; for (auto counter : TEnumTraits::GetDomainValues()) { result[counter] = HugeCounters_[counter].load(); } result[EHugeCounter::BytesUsed] = GetUsed(result[EHugeCounter::BytesAllocated], result[EHugeCounter::BytesFreed]); result[EHugeCounter::BlobsUsed] = GetUsed(result[EHugeCounter::BlobsAllocated], result[EHugeCounter::BlobsFreed]); return result; } TLocalUndumpableCounters GetUndumpableAllocationCounters() { TLocalUndumpableCounters result; for (auto counter : TEnumTraits::GetDomainValues()) { result[counter] = HugeUndumpableCounters_[counter].load(); result[counter] += GlobalState->UndumpableCounters[counter].load(); } ThreadManager->EnumerateThreadStatesAsync( [&] (const auto* state) { result[EUndumpableCounter::BytesAllocated] += LoadCounter(state->UndumpableCounters[EUndumpableCounter::BytesAllocated]); result[EUndumpableCounter::BytesFreed] += LoadCounter(state->UndumpableCounters[EUndumpableCounter::BytesFreed]); }); result[EUndumpableCounter::BytesUsed] = GetUsed(result[EUndumpableCounter::BytesAllocated], result[EUndumpableCounter::BytesFreed]); return result; } // Called before TThreadState is destroyed. // Adds the counter values from TThreadState to the global counters. void AccumulateLocalCounters(TThreadState* state) { for (auto counter : TEnumTraits::GetDomainValues()) { GlobalState->TotalCounters.CumulativeTaggedCounters[counter] += state->TotalCounters.CumulativeTaggedCounters[counter]; GlobalState->TotalCounters.UntaggedCounters[counter] += state->TotalCounters.UntaggedCounters[counter]; } for (size_t index = 0; index < MaxTaggedCounterSets; ++index) { const auto* localSet = state->TotalCounters.FindTaggedCounterSet(index); if (!localSet) { continue; } auto* globalSet = GlobalState->TotalCounters.GetOrCreateTaggedCounterSet(index); for (size_t jndex = 0; jndex < TaggedCounterSetSize; ++jndex) { for (auto counter : TEnumTraits::GetDomainValues()) { globalSet->Counters[jndex][counter] += localSet->Counters[jndex][counter]; } } } for (size_t rank = 0; rank < LargeRankCount; ++rank) { for (auto counter : TEnumTraits::GetDomainValues()) { GlobalState->LargeArenaCounters[rank][counter] += state->LargeArenaCounters[rank][counter]; } } for (auto counter : TEnumTraits::GetDomainValues()) { GlobalState->UndumpableCounters[counter] += state->UndumpableCounters[counter]; } } private: template static ssize_t LoadTaggedTotalCounter(const TTotalCounters& counters, TMemoryTag tag, EBasicCounter counter) { const auto* set = counters.FindTaggedCounterSet(tag / TaggedCounterSetSize); if (Y_UNLIKELY(!set)) { return 0; } return LoadCounter(set->Counters[tag % TaggedCounterSetSize][counter]); } template static Y_FORCE_INLINE void IncrementUntaggedTotalCounter(TTotalCounters* counters, EBasicCounter counter, ssize_t delta) { counters->UntaggedCounters[counter] += delta; } template static Y_FORCE_INLINE void IncrementTaggedTotalCounter(TTotalCounters* counters, TMemoryTag tag, EBasicCounter counter, ssize_t delta) { counters->CumulativeTaggedCounters[counter] += delta; auto* set 
= counters->GetOrCreateTaggedCounterSet(tag / TaggedCounterSetSize); set->Counters[tag % TaggedCounterSetSize][counter] += delta; } static ssize_t GetProcessRss() { auto* file = ::fopen("/proc/self/statm", "r"); if (!file) { return 0; } ssize_t dummy; ssize_t rssPages; auto readResult = fscanf(file, "%zd %zd", &dummy, &rssPages); ::fclose(file); if (readResult != 2) { return 0; } return rssPages * PageSize; } private: TGlobalSystemCounters SystemCounters_; std::array SmallArenaCounters_; TGlobalHugeCounters HugeCounters_; TGlobalUndumpableCounters HugeUndumpableCounters_; }; TExplicitlyConstructableSingleton StatisticsManager; //////////////////////////////////////////////////////////////////////////////// void* TSystemAllocator::Allocate(size_t size) { auto rawSize = GetRawBlobSize(size); void* mmappedPtr; while (true) { auto currentPtr = CurrentPtr_.fetch_add(rawSize); Y_ABORT_UNLESS(currentPtr + rawSize <= SystemZoneEnd); mmappedPtr = MappedMemoryManager->Map( currentPtr, rawSize, MAP_FIXED_NOREPLACE | MAP_POPULATE); if (mmappedPtr == reinterpret_cast(currentPtr)) { break; } if (mmappedPtr != MAP_FAILED) { MappedMemoryManager->Unmap(mmappedPtr, rawSize); } } auto* blob = static_cast(mmappedPtr); new (blob) TSystemBlobHeader(size); auto* result = HeaderToPtr(blob); PoisonUninitializedRange(result, size); StatisticsManager->IncrementSystemCounter(ESystemCounter::BytesAllocated, rawSize); return result; } void TSystemAllocator::Free(void* ptr) { auto* blob = PtrToHeader(ptr); auto rawSize = GetRawBlobSize(blob->Size); MappedMemoryManager->Unmap(blob, rawSize); StatisticsManager->IncrementSystemCounter(ESystemCounter::BytesFreed, rawSize); } //////////////////////////////////////////////////////////////////////////////// // Small allocator // // Allocations (called small chunks) are grouped by their sizes. Two most-significant binary digits are // used to determine the rank of a chunk, which guarantees 25% overhead in the worst case. // A pair of helper arrays (SizeToSmallRank1 and SizeToSmallRank2) are used to compute ranks; we expect // them to be permanently cached. // // Chunks of the same rank are served by a (small) arena allocator. // In fact, there are two arenas for each rank: one is for tagged allocations and another is for untagged ones. // // We encode chunk's rank and whether it is tagged or not in the resulting pointer as follows: // 0- 3: must be zero due to alignment // 4-39: varies // 40-44: rank // 45: 0 for untagged allocations, 1 for tagged ones // 45-63: zeroes // This enables computing chunk's rank and also determining if it is tagged in constant time // without any additional lookups. Also, one pays no space overhead for untagged allocations // and pays 16 bytes for each tagged one. // // Each arena allocates extents of memory by calling mmap for each extent of SmallExtentSize bytes. // (Recall that this memory is never reclaimed.) // Each extent is then sliced into segments of SmallSegmentSize bytes. // Whenever a new segment is acquired, its memory is pre-faulted by madvise(MADV_POPULATE). // New segments are acquired in a lock-free manner. // // Each thread maintains a separate cache of chunks of each rank (two caches to be precise: one // for tagged allocations and the other for untagged). These caches are fully thread-local and // involve no atomic operations. // // There are also global caches (per rank, for tagged and untagged allocations). 
// Instead of keeping individual chunks these work with chunk groups (collections of up to ChunksPerGroup // arbitrary chunks). // // When the local cache becomes exhausted, a group of chunks is fetched from the global cache // (if the latter is empty then the arena allocator is consulted). // Vice versa, if the local cache overflows, a group of chunks is moved from it to the global cache. // // Global caches and arena allocators also take care of (rare) cases when Allocate/Free is called // without a valid thread state (which happens during thread shutdown when TThreadState is already destroyed). // // Each arena allocates memory in a certain "data" zone of SmallZoneSize. // In addition to that zone, up to two "shadow" zones are maintained. // // The first one contains memory tags of chunks residing in the primary zone. // The second one (which is present if YTALLOC_NERVOUS is defined) contains // states of chunks. These states enable some simple internal sanity checks // (e.g. detect attempts to double-free a chunk). // // Addresses in the data zone are directly mapped to offsets in shadow zones. // When a segment of a small arena zone is allocated, the relevant portions of shadow // zones get initialized (and also accounted for as a system allocation). // // Shadow zones are memory-mapped with MAP_NORESERVE flag and are quite sparse. // These zones are omitted from core dumps due to their huge size and sparsity. // For each small rank i, gives max K such that 2^k <= SmallRankToSize[i]. // Chunk pointer is mapped to its shadow image via GetShadowOffset helper. // Note that chunk size is not always a power of 2. To avoid costly integer division, // chunk pointer is translated by means of bitwise shift only (leaving some bytes // of shadow zones unused). This array provides the needed shifts. 
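// Illustrative aside (not part of the original allocator): a minimal sketch of how the pointer
// encoding described above can be decoded, assuming ptr is already known to lie in one of the
// small zones. The helper names below are hypothetical and exist only to clarify the bit layout;
// the real decoding helpers (PtrToSmallRank, GetShadowOffset) live elsewhere in YTAlloc.
// Since SmallZoneSize == 1_TB == 2^40, bits 40-44 select the small zone (i.e. the rank) and
// bit 45 distinguishes the tagged zones (which start at 32_TB) from the untagged ones.
// The SmallRankToLogSize table that follows supplies the shift used for shadow-zone offsets.

constexpr size_t ExamplePtrToSmallRank(uintptr_t ptr)
{
    // Bits 40-44: index of the 1_TB small zone within the tagged or untagged range.
    return (ptr >> 40) & 0x1f;
}

constexpr bool ExampleIsTaggedSmallPtr(uintptr_t ptr)
{
    // Bit 45: set for pointers residing in the tagged small zones.
    return (ptr & (1ULL << 45)) != 0;
}

constexpr size_t ExampleShadowOffset(uintptr_t ptr, uintptr_t dataZoneStart, int logSize)
{
    // Mirrors GetShadowOffset: a single shift by SmallRankToLogSize[rank] replaces a division
    // by the exact chunk size, at the cost of leaving some shadow bytes unused.
    return (ptr - dataZoneStart) >> logSize;
}

// Sanity checks for the sketch above: a pointer at the start of the tagged small zone for rank 6
// decodes to rank 6 and is recognized as tagged.
static_assert(ExamplePtrToSmallRank(TaggedSmallZonesStart + 6 * SmallZoneSize) == 6);
static_assert(ExampleIsTaggedSmallPtr(TaggedSmallZonesStart + 6 * SmallZoneSize));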
constexpr int SmallRankToLogSize[SmallRankCount] = { 0, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15 }; enum class ESmallChunkState : ui8 { Spare = 0, Allocated = 0x61, // a Freed = 0x66 // f }; class TSmallArenaAllocator { public: TSmallArenaAllocator(EAllocationKind kind, size_t rank, uintptr_t dataZoneStart) : Kind_(kind) , Rank_(rank) , LogSize_(SmallRankToLogSize[Rank_]) , ChunkSize_(SmallRankToSize[Rank_]) , DataZoneStart_(dataZoneStart) , DataZoneAllocator_(DataZoneStart_, DataZoneStart_ + SmallZoneSize) { } size_t PullMany(void** batch, size_t maxCount) { size_t count; while (true) { count = TryAllocateFromCurrentExtent(batch, maxCount); if (Y_LIKELY(count != 0)) { break; } PopulateAnotherExtent(); } return count; } void* Allocate(size_t size) { void* ptr; auto count = PullMany(&ptr, 1); YTALLOC_PARANOID_ASSERT(count == 1); YTALLOC_PARANOID_ASSERT(PtrToSmallRank(ptr) == Rank_); PoisonUninitializedRange(ptr, size); UpdateChunkState(ptr, ESmallChunkState::Freed, ESmallChunkState::Allocated); return ptr; } TMemoryTag GetAndResetMemoryTag(const void* ptr) { auto& tag = MemoryTagZoneStart_[GetShadowOffset(ptr)]; auto currentTag = tag; tag = NullMemoryTag; return currentTag; } void SetMemoryTag(void* ptr, TMemoryTag tag) { MemoryTagZoneStart_[GetShadowOffset(ptr)] = tag; } void UpdateChunkState(const void* ptr, ESmallChunkState expectedState, ESmallChunkState newState) { #ifdef YTALLOC_NERVOUS auto& state = ChunkStateZoneStart_[GetShadowOffset(ptr)]; auto actualState = state; if (Y_UNLIKELY(actualState != expectedState)) { char message[256]; snprintf(message, sizeof(message), "Invalid small chunk state at %p: expected %" PRIx8 ", actual %" PRIx8, ptr, static_cast(expectedState), static_cast(actualState)); YTALLOC_TRAP(message); } state = newState; #else Y_UNUSED(ptr); Y_UNUSED(expectedState); Y_UNUSED(newState); #endif } private: size_t TryAllocateFromCurrentExtent(void** batch, size_t maxCount) { auto* oldPtr = CurrentPtr_.load(); if (Y_UNLIKELY(!oldPtr)) { return 0; } auto* currentExtent = CurrentExtent_.load(std::memory_order_relaxed); if (Y_UNLIKELY(!currentExtent)) { return 0; } char* newPtr; while (true) { if (Y_UNLIKELY(oldPtr < currentExtent || oldPtr + ChunkSize_ + RightReadableAreaSize > currentExtent + SmallExtentSize)) { return 0; } newPtr = std::min( oldPtr + ChunkSize_ * maxCount, currentExtent + SmallExtentSize); auto* alignedNewPtr = AlignDownToSmallSegment(currentExtent, newPtr); if (alignedNewPtr > oldPtr) { newPtr = alignedNewPtr; } if (Y_LIKELY(CurrentPtr_.compare_exchange_weak(oldPtr, newPtr))) { break; } } auto* firstSegment = AlignUpToSmallSegment(currentExtent, oldPtr); auto* nextSegment = AlignUpToSmallSegment(currentExtent, newPtr); if (firstSegment != nextSegment) { auto size = nextSegment - firstSegment; MappedMemoryManager->PopulateReadOnly(firstSegment, size); StatisticsManager->IncrementSmallArenaCounter(ESmallArenaCounter::BytesCommitted, Rank_, size); StatisticsManager->IncrementSmallArenaCounter(ESmallArenaCounter::PagesCommitted, Rank_, size / PageSize); if (Kind_ == EAllocationKind::Tagged) { StatisticsManager->IncrementSystemCounter(ESystemCounter::BytesAllocated, size / ChunkSize_ * sizeof(TMemoryTag)); } #ifdef YTALLOC_NERVOUS StatisticsManager->IncrementSystemCounter(ESystemCounter::BytesAllocated, size / ChunkSize_ * sizeof(ESmallChunkState)); #endif } size_t count = 0; while (oldPtr != newPtr) { UpdateChunkState(oldPtr, ESmallChunkState::Spare, ESmallChunkState::Freed); batch[count] = oldPtr; oldPtr += ChunkSize_; 
count++; } return count; } void PopulateAnotherExtent() { auto lockGuard = GuardWithTiming(ExtentLock_); auto* currentPtr = CurrentPtr_.load(); auto* currentExtent = CurrentExtent_.load(); if (currentPtr && currentPtr + ChunkSize_ + RightReadableAreaSize <= currentExtent + SmallExtentSize) { // No need for a new extent. return; } auto* newExtent = static_cast(DataZoneAllocator_.Allocate(SmallExtentAllocSize, 0)); AllocateShadowZones(); YTALLOC_VERIFY(reinterpret_cast(newExtent) % SmallExtentAllocSize == 0); CurrentPtr_ = CurrentExtent_ = newExtent; StatisticsManager->IncrementSmallArenaCounter(ESmallArenaCounter::BytesMapped, Rank_, SmallExtentAllocSize); StatisticsManager->IncrementSmallArenaCounter(ESmallArenaCounter::PagesMapped, Rank_, SmallExtentAllocSize / PageSize); } private: const EAllocationKind Kind_; const size_t Rank_; const size_t LogSize_; const size_t ChunkSize_; const uintptr_t DataZoneStart_; TZoneAllocator DataZoneAllocator_; bool ShadowZonesAllocated_ = false; TMemoryTag* MemoryTagZoneStart_; #ifdef YTALLOC_NERVOUS ESmallChunkState* ChunkStateZoneStart_; #endif NThreading::TForkAwareSpinLock ExtentLock_; std::atomic CurrentPtr_ = nullptr; std::atomic CurrentExtent_ = nullptr; size_t GetShadowOffset(const void* ptr) { return (reinterpret_cast(ptr) - DataZoneStart_) >> LogSize_; } void AllocateShadowZones() { if (ShadowZonesAllocated_) { return; } if (Kind_ == EAllocationKind::Tagged) { MemoryTagZoneStart_ = MapShadowZone(); } #ifdef YTALLOC_NERVOUS ChunkStateZoneStart_ = MapShadowZone(); #endif ShadowZonesAllocated_ = true; } template T* MapShadowZone() { auto size = AlignUp((SmallZoneSize >> LogSize_) * sizeof (T), PageSize); auto* ptr = static_cast(MappedMemoryManager->Map(SystemZoneStart, size, MAP_NORESERVE)); MappedMemoryManager->DontDump(ptr, size); return ptr; } }; TExplicitlyConstructableSingleton, SmallRankCount>>> SmallArenaAllocators; //////////////////////////////////////////////////////////////////////////////// constexpr size_t ChunksPerGroup = 128; constexpr size_t GroupsBatchSize = 1024; static_assert(ChunksPerGroup <= MaxCachedChunksPerRank, "ChunksPerGroup > MaxCachedChunksPerRank"); class TChunkGroup : public TFreeListItemBase { public: bool IsEmpty() const { return Size_ == 0; } size_t ExtractAll(void** ptrs) { auto count = Size_; ::memcpy(ptrs, Ptrs_.data(), count * sizeof(void*)); Size_ = 0; return count; } void PutOne(void* ptr) { PutMany(&ptr, 1); } void PutMany(void** ptrs, size_t count) { YTALLOC_PARANOID_ASSERT(Size_ == 0); YTALLOC_PARANOID_ASSERT(count <= ChunksPerGroup); ::memcpy(Ptrs_.data(), ptrs, count * sizeof(void*)); Size_ = count; } private: size_t Size_ = 0; // <= ChunksPerGroup std::array Ptrs_; }; class TGlobalSmallChunkCache { public: explicit TGlobalSmallChunkCache(EAllocationKind kind) : Kind_(kind) { } #ifdef YTALLOC_PARANOID void CanonizeChunkPtrs(TThreadState* state, size_t rank) { auto& chunkPtrPtr = state->SmallBlobCache[Kind_].RankToCachedChunkPtrHead[rank]; auto leftBorder = state->SmallBlobCache[Kind_].RankToCachedChunkLeftBorder[rank]; auto rightBorder = state->SmallBlobCache[Kind_].RankToCachedChunkRightBorder[rank]; state->SmallBlobCache[Kind_].CachedChunkFull[rank] = false; if (chunkPtrPtr + 1 == rightBorder) { chunkPtrPtr = leftBorder; state->SmallBlobCache[Kind_].CachedChunkFull[rank] = true; } state->SmallBlobCache[Kind_].RankToCachedChunkPtrTail[rank] = leftBorder; } #endif bool TryMoveGroupToLocal(TThreadState* state, size_t rank) { auto& groups = RankToChunkGroups_[rank]; auto* group = groups.Extract(state); if 
(!Y_LIKELY(group)) { return false; } YTALLOC_PARANOID_ASSERT(!group->IsEmpty()); auto& chunkPtrPtr = state->SmallBlobCache[Kind_].RankToCachedChunkPtrHead[rank]; #ifdef YTALLOC_PARANOID chunkPtrPtr = state->SmallBlobCache[Kind_].RankToCachedChunkLeftBorder[rank]; state->SmallBlobCache[Kind_].RankToCachedChunkPtrTail[rank] = chunkPtrPtr; #endif auto chunkCount = group->ExtractAll(chunkPtrPtr + 1); chunkPtrPtr += chunkCount; #ifdef YTALLOC_PARANOID CanonizeChunkPtrs(state, rank); #endif GroupPool_.Free(state, group); return true; } void MoveGroupToGlobal(TThreadState* state, size_t rank) { auto* group = GroupPool_.Allocate(state); auto& chunkPtrPtr = state->SmallBlobCache[Kind_].RankToCachedChunkPtrHead[rank]; YTALLOC_PARANOID_ASSERT(*(chunkPtrPtr + 1) == reinterpret_cast(TThreadState::RightSentinel)); group->PutMany(chunkPtrPtr - ChunksPerGroup + 1, ChunksPerGroup); chunkPtrPtr -= ChunksPerGroup; #ifdef YTALLOC_PARANOID ::memset(chunkPtrPtr + 1, 0, sizeof(void*) * ChunksPerGroup); CanonizeChunkPtrs(state, rank); #endif auto& groups = RankToChunkGroups_[rank]; YTALLOC_PARANOID_ASSERT(!group->IsEmpty()); groups.Put(state, group); } void MoveOneToGlobal(void* ptr, size_t rank) { auto* group = GroupPool_.Allocate(&GlobalShardedState_); group->PutOne(ptr); auto& groups = RankToChunkGroups_[rank]; YTALLOC_PARANOID_ASSERT(!group->IsEmpty()); groups.Put(&GlobalShardedState_, group); } #ifdef YTALLOC_PARANOID void MoveAllToGlobal(TThreadState* state, size_t rank) { auto leftSentinelBorder = state->SmallBlobCache[Kind_].RankToCachedChunkLeftBorder[rank]; auto rightSentinelBorder = state->SmallBlobCache[Kind_].RankToCachedChunkRightBorder[rank]; auto& headPtr = state->SmallBlobCache[Kind_].RankToCachedChunkPtrHead[rank]; auto& tailPtr = state->SmallBlobCache[Kind_].RankToCachedChunkPtrTail[rank]; if (tailPtr == headPtr && !state->SmallBlobCache[Kind_].CachedChunkFull[rank]) { headPtr = leftSentinelBorder; return; } // (leftBorder, rightBorder] auto moveIntervalToGlobal = [=] (void** leftBorder, void** rightBorder) { while (true) { size_t count = 0; while (count < ChunksPerGroup && rightBorder != leftBorder) { --rightBorder; ++count; } if (count == 0) { break; } auto* group = GroupPool_.Allocate(state); group->PutMany(rightBorder + 1, count); ::memset(rightBorder + 1, 0, sizeof(void*) * count); auto& groups = RankToChunkGroups_[rank]; groups.Put(state, group); } }; if (tailPtr >= headPtr) { moveIntervalToGlobal(tailPtr, rightSentinelBorder - 1); moveIntervalToGlobal(leftSentinelBorder, headPtr); } else { moveIntervalToGlobal(tailPtr, headPtr); } headPtr = leftSentinelBorder; } #else void MoveAllToGlobal(TThreadState* state, size_t rank) { auto& chunkPtrPtr = state->SmallBlobCache[Kind_].RankToCachedChunkPtrHead[rank]; while (true) { size_t count = 0; while (count < ChunksPerGroup && *chunkPtrPtr != reinterpret_cast(TThreadState::LeftSentinel)) { --chunkPtrPtr; ++count; } if (count == 0) { break; } auto* group = GroupPool_.Allocate(state); group->PutMany(chunkPtrPtr + 1, count); auto& groups = RankToChunkGroups_[rank]; groups.Put(state, group); } } #endif private: const EAllocationKind Kind_; TGlobalShardedState GlobalShardedState_; TShardedSystemPool GroupPool_; std::array, SmallRankCount> RankToChunkGroups_; }; TExplicitlyConstructableSingleton>> GlobalSmallChunkCaches; //////////////////////////////////////////////////////////////////////////////// class TSmallAllocator { public: template static Y_FORCE_INLINE void* Allocate(TMemoryTag tag, size_t rank) { auto* state = 
TThreadManager::FindThreadState(); if (Y_LIKELY(state)) { return Allocate(tag, rank, state); } auto size = SmallRankToSize[rank]; return AllocateGlobal(tag, rank, size); } #ifdef YTALLOC_PARANOID template static Y_FORCE_INLINE void* Allocate(TMemoryTag tag, size_t rank, TThreadState* state) { auto& localCache = state->SmallBlobCache[Kind]; auto& allocator = *(*SmallArenaAllocators)[Kind][rank]; size_t size = SmallRankToSize[rank]; StatisticsManager->IncrementTotalCounter(state, tag, EBasicCounter::BytesAllocated, size); auto leftBorder = localCache.RankToCachedChunkLeftBorder[rank]; auto rightBorder = localCache.RankToCachedChunkRightBorder[rank]; void* result; while (true) { auto& chunkHeadPtr = localCache.RankToCachedChunkPtrHead[rank]; auto& cachedHeadPtr = *(chunkHeadPtr + 1); auto* headPtr = cachedHeadPtr; auto& chunkTailPtr = localCache.RankToCachedChunkPtrTail[rank]; auto& cachedTailPtr = *(chunkTailPtr + 1); auto* tailPtr = cachedTailPtr; auto& chunkFull = localCache.CachedChunkFull[rank]; if (Y_LIKELY(chunkFull || headPtr != tailPtr)) { YTALLOC_PARANOID_ASSERT(tailPtr); cachedTailPtr = nullptr; ++chunkTailPtr; if (Y_LIKELY(chunkTailPtr + 1 == rightBorder)) { chunkTailPtr = leftBorder; } chunkFull = false; result = tailPtr; PoisonUninitializedRange(result, size); allocator.UpdateChunkState(result, ESmallChunkState::Freed, ESmallChunkState::Allocated); break; } auto& globalCache = *(*GlobalSmallChunkCaches)[Kind]; if (!globalCache.TryMoveGroupToLocal(state, rank)) { result = allocator.Allocate(size); break; } } if constexpr(Kind == EAllocationKind::Tagged) { allocator.SetMemoryTag(result, tag); } return result; } template static Y_FORCE_INLINE void Free(void* ptr) { auto rank = PtrToSmallRank(ptr); auto size = SmallRankToSize[rank]; auto& allocator = *(*SmallArenaAllocators)[Kind][rank]; auto tag = NullMemoryTag; if constexpr(Kind == EAllocationKind::Tagged) { tag = allocator.GetAndResetMemoryTag(ptr); YTALLOC_PARANOID_ASSERT(tag != NullMemoryTag); } allocator.UpdateChunkState(ptr, ESmallChunkState::Allocated, ESmallChunkState::Freed); PoisonFreedRange(ptr, size); auto* state = TThreadManager::FindThreadState(); if (Y_UNLIKELY(!state)) { FreeGlobal(tag, ptr, rank, size); return; } StatisticsManager->IncrementTotalCounter(state, tag, EBasicCounter::BytesFreed, size); auto& localCache = state->SmallBlobCache[Kind]; auto leftBorder = localCache.RankToCachedChunkLeftBorder[rank]; auto rightBorder = localCache.RankToCachedChunkRightBorder[rank]; while (true) { auto& chunkHeadPtr = localCache.RankToCachedChunkPtrHead[rank]; auto& headPtr = *(chunkHeadPtr + 1); auto& chunkTailPtr = localCache.RankToCachedChunkPtrTail[rank]; auto& chunkFull = localCache.CachedChunkFull[rank]; if (Y_LIKELY(!chunkFull)) { headPtr = ptr; ++chunkHeadPtr; if (Y_LIKELY(chunkHeadPtr + 1 == rightBorder)) { chunkHeadPtr = leftBorder; } chunkFull = (chunkHeadPtr == chunkTailPtr); break; } chunkHeadPtr = rightBorder - 1; chunkTailPtr = leftBorder; auto& globalCache = *(*GlobalSmallChunkCaches)[Kind]; globalCache.MoveGroupToGlobal(state, rank); } } #else template static Y_FORCE_INLINE void* Allocate(TMemoryTag tag, size_t rank, TThreadState* state) { size_t size = SmallRankToSize[rank]; StatisticsManager->IncrementTotalCounter(state, tag, EBasicCounter::BytesAllocated, size); auto& localCache = state->SmallBlobCache[Kind]; auto& allocator = *(*SmallArenaAllocators)[Kind][rank]; void* result; while (true) { auto& chunkPtr = localCache.RankToCachedChunkPtrHead[rank]; auto& cachedPtr = *chunkPtr; auto* ptr = cachedPtr; if 
(Y_LIKELY(ptr != reinterpret_cast(TThreadState::LeftSentinel))) { --chunkPtr; result = ptr; allocator.UpdateChunkState(result, ESmallChunkState::Freed, ESmallChunkState::Allocated); PoisonUninitializedRange(result, size); break; } auto& globalCache = *(*GlobalSmallChunkCaches)[Kind]; if (globalCache.TryMoveGroupToLocal(state, rank)) { continue; } auto count = allocator.PullMany( chunkPtr + 1, SmallRankBatchSize[rank]); chunkPtr += count; } if constexpr(Kind == EAllocationKind::Tagged) { allocator.SetMemoryTag(result, tag); } return result; } template static Y_FORCE_INLINE void Free(void* ptr) { auto rank = PtrToSmallRank(ptr); auto size = SmallRankToSize[rank]; auto& allocator = *(*SmallArenaAllocators)[Kind][rank]; auto tag = NullMemoryTag; if constexpr(Kind == EAllocationKind::Tagged) { tag = allocator.GetAndResetMemoryTag(ptr); YTALLOC_PARANOID_ASSERT(tag != NullMemoryTag); } allocator.UpdateChunkState(ptr, ESmallChunkState::Allocated, ESmallChunkState::Freed); PoisonFreedRange(ptr, size); auto* state = TThreadManager::FindThreadState(); if (Y_UNLIKELY(!state)) { FreeGlobal(tag, ptr, rank, size); return; } StatisticsManager->IncrementTotalCounter(state, tag, EBasicCounter::BytesFreed, size); auto& localCache = state->SmallBlobCache[Kind]; while (true) { auto& chunkPtrPtr = localCache.RankToCachedChunkPtrHead[rank]; auto& chunkPtr = *(chunkPtrPtr + 1); if (Y_LIKELY(chunkPtr != reinterpret_cast(TThreadState::RightSentinel))) { chunkPtr = ptr; ++chunkPtrPtr; break; } auto& globalCache = *(*GlobalSmallChunkCaches)[Kind]; globalCache.MoveGroupToGlobal(state, rank); } } #endif static size_t GetAllocationSize(const void* ptr) { return SmallRankToSize[PtrToSmallRank(ptr)]; } static size_t GetAllocationSize(size_t size) { return SmallRankToSize[SizeToSmallRank(size)]; } static void PurgeCaches() { DoPurgeCaches(); DoPurgeCaches(); } private: template static void DoPurgeCaches() { auto* state = TThreadManager::GetThreadStateChecked(); for (size_t rank = 0; rank < SmallRankCount; ++rank) { (*GlobalSmallChunkCaches)[Kind]->MoveAllToGlobal(state, rank); } } template static void* AllocateGlobal(TMemoryTag tag, size_t rank, size_t size) { StatisticsManager->IncrementTotalCounter(tag, EBasicCounter::BytesAllocated, size); auto& allocator = *(*SmallArenaAllocators)[Kind][rank]; auto* result = allocator.Allocate(size); if constexpr(Kind == EAllocationKind::Tagged) { allocator.SetMemoryTag(result, tag); } return result; } template static void FreeGlobal(TMemoryTag tag, void* ptr, size_t rank, size_t size) { StatisticsManager->IncrementTotalCounter(tag, EBasicCounter::BytesFreed, size); auto& globalCache = *(*GlobalSmallChunkCaches)[Kind]; globalCache.MoveOneToGlobal(ptr, rank); } }; //////////////////////////////////////////////////////////////////////////////// // Large blob allocator // // Like for small chunks, large blobs are grouped into arenas, where arena K handles // blobs of size (2^{K-1},2^K]. Memory is mapped in extents of LargeExtentSize bytes. // Each extent is split into segments of size 2^K (here segment is just a memory region, which may fully consist of // unmapped pages). When a segment is actually allocated, it becomes a blob and a TLargeBlobHeader // structure is placed at its start. // // When an extent is allocated, it is sliced into segments (not blobs, since no headers are placed and // no memory is touched). These segments are put into disposed segments list. 
// // For each blob two separate sizes are maintained: BytesAcquired indicates the number of bytes // acquired via madvise(MADV_POPULATE) from the system; BytesAllocated (<= BytesAcquired) corresponds // to the number of bytes claimed by the user (including the header and page size alignment). // If BytesAllocated == 0 then this blob is spare, i.e. // was freed and remains cached for further possible reuse. // // When a new blob is being allocated, the allocator first tries to extract a spare blob. On success, // its acquired size is extended (if needed); the acquired size never shrinks on allocation. // If no spare blobs exist, a disposed segment is extracted and is turned into a blob (i.e. // its header is initialized) and the needed number of bytes is acquired. If no disposed segments // exist, then a new extent is allocated and sliced into segments. // // The above algorithm only claims memory from the system (by means of madvise(MADV_POPULATE)); // the reclaim is handled by a separate background mechanism. Two types of reclaimable memory // regions are possible: // * spare: these correspond to spare blobs; upon reclaiming this region becomes a disposed segment // * overhead: these correspond to trailing parts of allocated blobs in [BytesAllocated, BytesAcquired) byte range // // Reclaiming spare blobs is easy as these are explicitly tracked by spare blob lists. To reclaim, // we atomically extract a blob from a spare list, call madvise(MADV_FREE), and put the pointer to // the disposed segment list. // // Reclaiming overheads is more complicated since (a) allocated blobs are never tracked directly and // (b) reclaiming them may interfere with Allocate and Free. // // To overcome (a), for each extent we maintain a bitmap marking segments that are actually blobs // (i.e. contain a header). (For simplicity and efficiency this bitmap is just a vector of bytes.) // These flags are updated in Allocate/Free with appropriate memory ordering. Note that // blobs are only disposed (and are turned into segments) by the background thread; if this // thread discovers a segment that is marked as a blob, then it is safe to assume that this segment // remains a blob unless the thread disposes it. // // To overcome (b), each large blob header maintains a spin lock. When blob B is extracted // from a spare list in Allocate, an acquisition is tried. If successful, B is returned to the // user. Otherwise it is assumed that B is currently being examined by the background // reclaimer thread. Allocate then skips this blob and retries extraction; the problem is that // since the spare list is basically a stack one cannot just push B back into the spare list. // Instead, B is pushed into a special locked spare list. This list is purged by the background // thread on each tick and its items are pushed back into the usual spare list. // // A similar trick is used by Free: when invoked for blob B its spin lock acquisition is first // tried. Upon success, B is moved to the spare list. On failure, Free has to postpone this deallocation // by moving B into the freed locked list. This list, similarly, is being purged by the background thread. // // It remains to explain how the background thread computes the number of bytes to be reclaimed from // each arena. To this aim, we first compute the total number of reclaimable bytes. 
// This is the sum of spare and overhead bytes in all arenas minus the number of unreclaimable bytes // The latter grows linearly in the number of used bytes and is capped from below by a MinUnreclaimableLargeBytes; // and from above by MaxUnreclaimableLargeBytes. SetLargeUnreclaimableCoeff and Set(Min|Max)LargeUnreclaimableBytes // enable tuning these control knobs. The reclaimable bytes are being taken from arenas starting from those // with the largest spare and overhead volumes. // // The above implies that each large blob contains a fixed-size header preceeding it. // Hence ptr % PageSize == sizeof (TLargeBlobHeader) for each ptr returned by Allocate // (since large blob sizes are larger than PageSize and are divisible by PageSize). // For AllocatePageAligned, however, ptr must be divisible by PageSize. To handle such an allocation, we // artificially increase its size and align the result of Allocate up to the next page boundary. // When handling a deallocation, ptr is moved back by UnalignPtr (which is capable of dealing // with both the results of Allocate and AllocatePageAligned). // This technique applies to both large and huge blobs. enum ELargeBlobState : ui64 { Allocated = 0x6c6c61656772616cULL, // largeall Spare = 0x727073656772616cULL, // largespr LockedSpare = 0x70736c656772616cULL, // largelsp LockedFreed = 0x72666c656772616cULL // largelfr }; // Every large blob (either tagged or not) is prepended with this header. struct TLargeBlobHeader : public TFreeListItemBase { TLargeBlobHeader( TLargeBlobExtent* extent, size_t bytesAcquired, size_t bytesAllocated, TMemoryTag tag) : Extent(extent) , BytesAcquired(bytesAcquired) , Tag(tag) , BytesAllocated(bytesAllocated) , State(ELargeBlobState::Allocated) { } TLargeBlobExtent* Extent; // Number of bytes in all acquired pages. size_t BytesAcquired; std::atomic Locked = false; TMemoryTag Tag = NullMemoryTag; // For spare blobs this is zero. // For allocated blobs this is the number of bytes requested by user (not including header of any alignment). size_t BytesAllocated; ELargeBlobState State; char Padding[12]; }; CHECK_HEADER_ALIGNMENT(TLargeBlobHeader) struct TLargeBlobExtent { TLargeBlobExtent(size_t segmentCount, char* ptr) : SegmentCount(segmentCount) , Ptr(ptr) { } size_t SegmentCount; char* Ptr; TLargeBlobExtent* NextExtent = nullptr; std::atomic DisposedFlags[0]; }; // A helper node that enables storing a number of extent's segments // in a free list. Recall that segments themselves do not posses any headers. struct TDisposedSegment : public TFreeListItemBase { size_t Index; TLargeBlobExtent* Extent; }; struct TLargeArena { size_t Rank = 0; size_t SegmentSize = 0; TShardedFreeList SpareBlobs; TFreeList LockedSpareBlobs; TFreeList LockedFreedBlobs; TFreeList DisposedSegments; std::atomic FirstExtent = nullptr; TLargeBlobExtent* CurrentOverheadScanExtent = nullptr; size_t CurrentOverheadScanSegment = 0; }; template class TLargeBlobAllocator { public: TLargeBlobAllocator() : ZoneAllocator_(LargeZoneStart(Dumpable), LargeZoneEnd(Dumpable)) { for (size_t rank = 0; rank < Arenas_.size(); ++rank) { auto& arena = Arenas_[rank]; arena.Rank = rank; arena.SegmentSize = (1ULL << rank); } } void* Allocate(size_t size) { auto* state = TThreadManager::FindThreadState(); return Y_LIKELY(state) ? 
DoAllocate(state, size) : DoAllocate(GlobalState.Get(), size); } void Free(void* ptr) { auto* state = TThreadManager::FindThreadState(); if (Y_LIKELY(state)) { DoFree(state, ptr); } else { DoFree(GlobalState.Get(), ptr); } } static size_t GetAllocationSize(const void* ptr) { UnalignPtr(ptr); const auto* blob = PtrToHeader(ptr); return blob->BytesAllocated; } static size_t GetAllocationSize(size_t size) { return GetBlobAllocationSize(size); } void RunBackgroundTasks() { ReinstallLockedBlobs(); ReclaimMemory(); } void SetBacktraceProvider(TBacktraceProvider provider) { BacktraceProvider_.store(provider); } private: template void PopulateArenaPages(TState* state, TLargeArena* arena, void* ptr, size_t size) { MappedMemoryManager->Populate(ptr, size); StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::BytesPopulated, size); StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::PagesPopulated, size / PageSize); StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::BytesCommitted, size); StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::PagesCommitted, size / PageSize); } template void ReleaseArenaPages(TState* state, TLargeArena* arena, void* ptr, size_t size) { MappedMemoryManager->Release(ptr, size); StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::BytesReleased, size); StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::PagesReleased, size / PageSize); StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::BytesCommitted, -size); StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::PagesCommitted, -size / PageSize); } bool TryLockBlob(TLargeBlobHeader* blob) { bool expected = false; return blob->Locked.compare_exchange_strong(expected, true); } void UnlockBlob(TLargeBlobHeader* blob) { blob->Locked.store(false); } template void MoveBlobToSpare(TState* state, TLargeArena* arena, TLargeBlobHeader* blob, bool unlock) { auto rank = arena->Rank; auto size = blob->BytesAllocated; auto rawSize = GetRawBlobSize(size); StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesSpare, blob->BytesAcquired); StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesOverhead, -(blob->BytesAcquired - rawSize)); blob->BytesAllocated = 0; if (unlock) { UnlockBlob(blob); } else { YTALLOC_VERIFY(!blob->Locked.load()); } blob->State = ELargeBlobState::Spare; arena->SpareBlobs.Put(state, blob); } size_t GetBytesToReclaim(const std::array& arenaCounters) { size_t totalBytesAllocated = 0; size_t totalBytesFreed = 0; size_t totalBytesSpare = 0; size_t totalBytesOverhead = 0; for (size_t rank = 0; rank < Arenas_.size(); ++rank) { const auto& counters = arenaCounters[rank]; totalBytesAllocated += counters[ELargeArenaCounter::BytesAllocated]; totalBytesFreed += counters[ELargeArenaCounter::BytesFreed]; totalBytesSpare += counters[ELargeArenaCounter::BytesSpare]; totalBytesOverhead += counters[ELargeArenaCounter::BytesOverhead]; } auto totalBytesUsed = totalBytesAllocated - totalBytesFreed; auto totalBytesReclaimable = totalBytesSpare + totalBytesOverhead; auto threshold = ClampVal( static_cast(ConfigurationManager->GetLargeUnreclaimableCoeff() * totalBytesUsed), ConfigurationManager->GetMinLargeUnreclaimableBytes(), ConfigurationManager->GetMaxLargeUnreclaimableBytes()); if (totalBytesReclaimable < 
threshold) { return 0; } auto bytesToReclaim = totalBytesReclaimable - threshold; return AlignUp(bytesToReclaim, PageSize); } void ReinstallLockedSpareBlobs(TLargeArena* arena) { auto* blob = arena->LockedSpareBlobs.ExtractAll(); auto* state = TThreadManager::GetThreadStateChecked(); size_t count = 0; while (blob) { auto* nextBlob = blob->Next.load(); YTALLOC_VERIFY(!blob->Locked.load()); AssertBlobState(blob, ELargeBlobState::LockedSpare); blob->State = ELargeBlobState::Spare; arena->SpareBlobs.Put(state, blob); blob = nextBlob; ++count; } if (count > 0) { YTALLOC_LOG_DEBUG("Locked spare blobs reinstalled (Rank: %d, Blobs: %zu)", arena->Rank, count); } } void ReinstallLockedFreedBlobs(TLargeArena* arena) { auto* state = TThreadManager::GetThreadStateChecked(); auto* blob = arena->LockedFreedBlobs.ExtractAll(); size_t count = 0; while (blob) { auto* nextBlob = blob->Next.load(); AssertBlobState(blob, ELargeBlobState::LockedFreed); MoveBlobToSpare(state, arena, blob, false); ++count; blob = nextBlob; } if (count > 0) { YTALLOC_LOG_DEBUG("Locked freed blobs reinstalled (Rank: %d, Blobs: %zu)", arena->Rank, count); } } void ReclaimSpareMemory(TLargeArena* arena, ssize_t bytesToReclaim) { if (bytesToReclaim <= 0) { return; } auto rank = arena->Rank; auto* state = TThreadManager::GetThreadStateChecked(); YTALLOC_LOG_DEBUG("Started processing spare memory in arena (BytesToReclaim: %zdM, Rank: %d)", bytesToReclaim / 1_MB, rank); size_t bytesReclaimed = 0; size_t blobsReclaimed = 0; while (bytesToReclaim > 0) { auto* blob = arena->SpareBlobs.ExtractRoundRobin(state); if (!blob) { break; } AssertBlobState(blob, ELargeBlobState::Spare); YTALLOC_VERIFY(blob->BytesAllocated == 0); auto bytesAcquired = blob->BytesAcquired; StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesSpare, -bytesAcquired); bytesToReclaim -= bytesAcquired; bytesReclaimed += bytesAcquired; blobsReclaimed += 1; auto* extent = blob->Extent; auto* ptr = reinterpret_cast(blob); ReleaseArenaPages( state, arena, ptr, bytesAcquired); size_t segmentIndex = (ptr - extent->Ptr) / arena->SegmentSize; extent->DisposedFlags[segmentIndex].store(true, std::memory_order_relaxed); auto* disposedSegment = DisposedSegmentPool_.Allocate(); disposedSegment->Index = segmentIndex; disposedSegment->Extent = extent; arena->DisposedSegments.Put(disposedSegment); } StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::SpareBytesReclaimed, bytesReclaimed); YTALLOC_LOG_DEBUG("Finished processing spare memory in arena (Rank: %d, BytesReclaimed: %zdM, BlobsReclaimed: %zu)", arena->Rank, bytesReclaimed / 1_MB, blobsReclaimed); } void ReclaimOverheadMemory(TLargeArena* arena, ssize_t bytesToReclaim) { if (bytesToReclaim == 0) { return; } auto* state = TThreadManager::GetThreadStateChecked(); auto rank = arena->Rank; YTALLOC_LOG_DEBUG("Started processing overhead memory in arena (BytesToReclaim: %zdM, Rank: %d)", bytesToReclaim / 1_MB, rank); size_t extentsTraversed = 0; size_t segmentsTraversed = 0; size_t bytesReclaimed = 0; bool restartedFromFirstExtent = false; auto& currentExtent = arena->CurrentOverheadScanExtent; auto& currentSegment = arena->CurrentOverheadScanSegment; while (bytesToReclaim > 0) { if (!currentExtent) { if (restartedFromFirstExtent) { break; } currentExtent = arena->FirstExtent.load(); if (!currentExtent) { break; } restartedFromFirstExtent = true; } while (currentSegment < currentExtent->SegmentCount && bytesToReclaim > 0) { ++segmentsTraversed; if 
(!currentExtent->DisposedFlags[currentSegment].load(std::memory_order_acquire)) { auto* ptr = currentExtent->Ptr + currentSegment * arena->SegmentSize; auto* blob = reinterpret_cast(ptr); YTALLOC_PARANOID_ASSERT(blob->Extent == currentExtent); if (TryLockBlob(blob)) { if (blob->BytesAllocated > 0) { size_t rawSize = GetRawBlobSize(blob->BytesAllocated); size_t bytesToRelease = blob->BytesAcquired - rawSize; if (bytesToRelease > 0) { ReleaseArenaPages( state, arena, ptr + blob->BytesAcquired - bytesToRelease, bytesToRelease); StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesOverhead, -bytesToRelease); blob->BytesAcquired = rawSize; bytesToReclaim -= bytesToRelease; bytesReclaimed += bytesToRelease; } } UnlockBlob(blob); } } ++currentSegment; } ++extentsTraversed; currentSegment = 0; currentExtent = currentExtent->NextExtent; } StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::OverheadBytesReclaimed, bytesReclaimed); YTALLOC_LOG_DEBUG("Finished processing overhead memory in arena (Rank: %d, Extents: %zu, Segments: %zu, BytesReclaimed: %zuM)", arena->Rank, extentsTraversed, segmentsTraversed, bytesReclaimed / 1_MB); } void ReinstallLockedBlobs() { for (auto& arena : Arenas_) { ReinstallLockedSpareBlobs(&arena); ReinstallLockedFreedBlobs(&arena); } } void ReclaimMemory() { auto arenaCounters = StatisticsManager->GetLargeArenaAllocationCounters(); ssize_t bytesToReclaim = GetBytesToReclaim(arenaCounters); if (bytesToReclaim == 0) { return; } YTALLOC_LOG_DEBUG("Memory reclaim started (BytesToReclaim: %zdM)", bytesToReclaim / 1_MB); std::array bytesReclaimablePerArena; for (size_t rank = 0; rank < LargeRankCount; ++rank) { bytesReclaimablePerArena[rank * 2] = arenaCounters[rank][ELargeArenaCounter::BytesOverhead]; bytesReclaimablePerArena[rank * 2 + 1] = arenaCounters[rank][ELargeArenaCounter::BytesSpare]; } std::array bytesToReclaimPerArena{}; while (bytesToReclaim > 0) { ssize_t maxBytes = std::numeric_limits::min(); int maxIndex = -1; for (int index = 0; index < LargeRankCount * 2; ++index) { if (bytesReclaimablePerArena[index] > maxBytes) { maxBytes = bytesReclaimablePerArena[index]; maxIndex = index; } } if (maxIndex < 0) { break; } auto bytesToReclaimPerStep = std::min({bytesToReclaim, maxBytes, 4_MB}); if (bytesToReclaimPerStep < 0) { break; } bytesToReclaimPerArena[maxIndex] += bytesToReclaimPerStep; bytesReclaimablePerArena[maxIndex] -= bytesToReclaimPerStep; bytesToReclaim -= bytesToReclaimPerStep; } for (auto& arena : Arenas_) { auto rank = arena.Rank; ReclaimOverheadMemory(&arena, bytesToReclaimPerArena[rank * 2]); ReclaimSpareMemory(&arena, bytesToReclaimPerArena[rank * 2 + 1]); } YTALLOC_LOG_DEBUG("Memory reclaim finished"); } template void AllocateArenaExtent(TState* state, TLargeArena* arena) { auto rank = arena->Rank; StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::ExtentsAllocated, 1); size_t segmentCount = LargeExtentSize / arena->SegmentSize; size_t extentHeaderSize = AlignUp(sizeof (TLargeBlobExtent) + sizeof (TLargeBlobExtent::DisposedFlags[0]) * segmentCount, PageSize); size_t allocationSize = extentHeaderSize + LargeExtentSize; auto* ptr = ZoneAllocator_.Allocate(allocationSize, MAP_NORESERVE); if (!Dumpable) { MappedMemoryManager->DontDump(ptr, allocationSize); } if (auto backtraceProvider = BacktraceProvider_.load()) { std::array frames; auto frameCount = backtraceProvider( frames.data(), MaxAllocationProfilingBacktraceDepth, 3); 
MmapObservationManager->EnqueueEvent(allocationSize, frames, frameCount); } StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesMapped, allocationSize); StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::PagesMapped, allocationSize / PageSize); auto* extent = static_cast(ptr); MappedMemoryManager->Populate(ptr, extentHeaderSize); StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesPopulated, extentHeaderSize); StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::PagesPopulated, extentHeaderSize / PageSize); StatisticsManager->IncrementSystemCounter(ESystemCounter::BytesAllocated, extentHeaderSize); new (extent) TLargeBlobExtent(segmentCount, static_cast(ptr) + extentHeaderSize); for (size_t index = 0; index < segmentCount; ++index) { auto* disposedSegment = DisposedSegmentPool_.Allocate(); disposedSegment->Index = index; disposedSegment->Extent = extent; arena->DisposedSegments.Put(disposedSegment); extent->DisposedFlags[index].store(true); } auto* expectedFirstExtent = arena->FirstExtent.load(); do { extent->NextExtent = expectedFirstExtent; } while (Y_UNLIKELY(!arena->FirstExtent.compare_exchange_weak(expectedFirstExtent, extent))); } template void* DoAllocate(TState* state, size_t size) { auto rawSize = GetRawBlobSize(size); auto rank = GetLargeRank(rawSize); auto tag = ConfigurationManager->IsLargeArenaAllocationProfiled(rank) ? BacktraceManager->GetMemoryTagFromBacktrace(3) : TThreadManager::GetCurrentMemoryTag(); auto& arena = Arenas_[rank]; YTALLOC_PARANOID_ASSERT(rawSize <= arena.SegmentSize); TLargeBlobHeader* blob; while (true) { blob = arena.SpareBlobs.Extract(state); if (blob) { AssertBlobState(blob, ELargeBlobState::Spare); if (TryLockBlob(blob)) { StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesSpare, -blob->BytesAcquired); if (blob->BytesAcquired < rawSize) { PopulateArenaPages( state, &arena, reinterpret_cast(blob) + blob->BytesAcquired, rawSize - blob->BytesAcquired); blob->BytesAcquired = rawSize; } else { StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesOverhead, blob->BytesAcquired - rawSize); } YTALLOC_PARANOID_ASSERT(blob->BytesAllocated == 0); blob->BytesAllocated = size; blob->Tag = tag; blob->State = ELargeBlobState::Allocated; UnlockBlob(blob); break; } else { blob->State = ELargeBlobState::LockedSpare; arena.LockedSpareBlobs.Put(blob); } } auto* disposedSegment = arena.DisposedSegments.Extract(); if (disposedSegment) { auto index = disposedSegment->Index; auto* extent = disposedSegment->Extent; DisposedSegmentPool_.Free(disposedSegment); auto* ptr = extent->Ptr + index * arena.SegmentSize; PopulateArenaPages( state, &arena, ptr, rawSize); blob = reinterpret_cast(ptr); new (blob) TLargeBlobHeader(extent, rawSize, size, tag); extent->DisposedFlags[index].store(false, std::memory_order_release); break; } AllocateArenaExtent(state, &arena); } StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BlobsAllocated, 1); StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesAllocated, size); StatisticsManager->IncrementTotalCounter(state, tag, EBasicCounter::BytesAllocated, size); if (!Dumpable) { StatisticsManager->IncrementUndumpableCounter(state, EUndumpableCounter::BytesAllocated, size); } auto* result = HeaderToPtr(blob); YTALLOC_PARANOID_ASSERT(reinterpret_cast(result) >= LargeZoneStart(Dumpable) && reinterpret_cast(result) < 
LargeZoneEnd(Dumpable)); PoisonUninitializedRange(result, size); return result; } template void DoFree(TState* state, void* ptr) { YTALLOC_PARANOID_ASSERT(reinterpret_cast(ptr) >= LargeZoneStart(Dumpable) && reinterpret_cast(ptr) < LargeZoneEnd(Dumpable)); auto* blob = PtrToHeader(ptr); AssertBlobState(blob, ELargeBlobState::Allocated); auto size = blob->BytesAllocated; PoisonFreedRange(ptr, size); auto rawSize = GetRawBlobSize(size); auto rank = GetLargeRank(rawSize); auto& arena = Arenas_[rank]; YTALLOC_PARANOID_ASSERT(blob->BytesAcquired <= arena.SegmentSize); auto tag = blob->Tag; StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BlobsFreed, 1); StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesFreed, size); StatisticsManager->IncrementTotalCounter(state, tag, EBasicCounter::BytesFreed, size); if (!Dumpable) { StatisticsManager->IncrementUndumpableCounter(state, EUndumpableCounter::BytesFreed, size); } if (TryLockBlob(blob)) { MoveBlobToSpare(state, &arena, blob, true); } else { blob->State = ELargeBlobState::LockedFreed; arena.LockedFreedBlobs.Put(blob); } } private: TZoneAllocator ZoneAllocator_; std::array Arenas_; static constexpr size_t DisposedSegmentsBatchSize = 1024; TSystemPool DisposedSegmentPool_; std::atomic BacktraceProvider_ = nullptr; }; TExplicitlyConstructableSingleton> DumpableLargeBlobAllocator; TExplicitlyConstructableSingleton> UndumpableLargeBlobAllocator; //////////////////////////////////////////////////////////////////////////////// // Huge blob allocator // // Basically a wrapper for TZoneAllocator. // Acts as a signature to detect broken headers. enum class EHugeBlobState : ui64 { Allocated = 0x72666c656772616cULL // hugeallc }; // Every huge blob (both tagged or not) is prepended with this header. 
struct THugeBlobHeader { THugeBlobHeader(TMemoryTag tag, size_t size, bool dumpable) : Tag(tag) , Size(size) , State(EHugeBlobState::Allocated) , Dumpable(dumpable) { } TMemoryTag Tag; size_t Size; EHugeBlobState State; bool Dumpable; char Padding[7]; }; CHECK_HEADER_ALIGNMENT(THugeBlobHeader) class THugeBlobAllocator { public: THugeBlobAllocator() : ZoneAllocator_(HugeZoneStart, HugeZoneEnd) { } void* Allocate(size_t size, bool dumpable) { YTALLOC_VERIFY(size <= MaxAllocationSize); auto tag = TThreadManager::GetCurrentMemoryTag(); auto rawSize = GetRawBlobSize(size); auto* blob = static_cast(ZoneAllocator_.Allocate(rawSize, MAP_POPULATE)); if (!dumpable) { MappedMemoryManager->DontDump(blob, rawSize); } new (blob) THugeBlobHeader(tag, size, dumpable); StatisticsManager->IncrementTotalCounter(tag, EBasicCounter::BytesAllocated, size); StatisticsManager->IncrementHugeCounter(EHugeCounter::BlobsAllocated, 1); StatisticsManager->IncrementHugeCounter(EHugeCounter::BytesAllocated, size); if (!dumpable) { StatisticsManager->IncrementHugeUndumpableCounter(EUndumpableCounter::BytesAllocated, size); } auto* result = HeaderToPtr(blob); PoisonUninitializedRange(result, size); return result; } void Free(void* ptr) { auto* blob = PtrToHeader(ptr); AssertBlobState(blob, EHugeBlobState::Allocated); auto tag = blob->Tag; auto size = blob->Size; auto dumpable = blob->Dumpable; PoisonFreedRange(ptr, size); auto rawSize = GetRawBlobSize(size); ZoneAllocator_.Free(blob, rawSize); StatisticsManager->IncrementTotalCounter(tag, EBasicCounter::BytesFreed, size); StatisticsManager->IncrementHugeCounter(EHugeCounter::BlobsFreed, 1); StatisticsManager->IncrementHugeCounter(EHugeCounter::BytesFreed, size); if (!dumpable) { StatisticsManager->IncrementHugeUndumpableCounter(EUndumpableCounter::BytesFreed, size); } } static size_t GetAllocationSize(const void* ptr) { UnalignPtr(ptr); const auto* blob = PtrToHeader(ptr); return blob->Size; } static size_t GetAllocationSize(size_t size) { return GetBlobAllocationSize(size); } private: TZoneAllocator ZoneAllocator_; }; TExplicitlyConstructableSingleton HugeBlobAllocator; //////////////////////////////////////////////////////////////////////////////// // A thunk to large and huge blob allocators class TBlobAllocator { public: static void* Allocate(size_t size) { InitializeGlobals(); bool dumpable = GetCurrentMemoryZone() != EMemoryZone::Undumpable; // NB: Account for the header. Also note that we may safely ignore the alignment since // HugeAllocationSizeThreshold is already page-aligned. if (Y_LIKELY(size < HugeAllocationSizeThreshold - sizeof(TLargeBlobHeader) - RightReadableAreaSize)) { void* result = dumpable ? 
DumpableLargeBlobAllocator->Allocate(size) : UndumpableLargeBlobAllocator->Allocate(size); YTALLOC_PARANOID_ASSERT(reinterpret_cast(result) >= LargeZoneStart(dumpable) && reinterpret_cast(result) < LargeZoneEnd(dumpable)); return result; } else { auto* result = HugeBlobAllocator->Allocate(size, dumpable); YTALLOC_PARANOID_ASSERT(reinterpret_cast(result) >= HugeZoneStart && reinterpret_cast(result) < HugeZoneEnd); return result; } } static void Free(void* ptr) { InitializeGlobals(); if (reinterpret_cast(ptr) < LargeZoneEnd(true)) { YTALLOC_PARANOID_ASSERT(reinterpret_cast(ptr) >= LargeZoneStart(true) && reinterpret_cast(ptr) < LargeZoneEnd(true)); UnalignPtr(ptr); DumpableLargeBlobAllocator->Free(ptr); } else if (reinterpret_cast(ptr) < LargeZoneEnd(false)) { YTALLOC_PARANOID_ASSERT(reinterpret_cast(ptr) >= LargeZoneStart(false) && reinterpret_cast(ptr) < LargeZoneEnd(false)); UnalignPtr(ptr); UndumpableLargeBlobAllocator->Free(ptr); } else if (reinterpret_cast(ptr) < HugeZoneEnd) { YTALLOC_PARANOID_ASSERT(reinterpret_cast(ptr) >= HugeZoneStart && reinterpret_cast(ptr) < HugeZoneEnd); UnalignPtr(ptr); HugeBlobAllocator->Free(ptr); } else { YTALLOC_TRAP("Wrong ptr passed to Free"); } } }; //////////////////////////////////////////////////////////////////////////////// Y_POD_THREAD(bool) CurrentThreadIsBackground; // Base class for all background threads. template class TBackgroundThreadBase { public: TBackgroundThreadBase() : State_(new TState()) { NThreading::RegisterAtForkHandlers( [=] { BeforeFork(); }, [=] { AfterForkParent(); }, [=] { AfterForkChild(); }); } virtual ~TBackgroundThreadBase() { Stop(); } private: struct TState : public TSystemAllocatable { std::mutex StartStopMutex; std::optional Thread; std::mutex StopFlagMutex; std::condition_variable StopFlagVariable; std::chrono::system_clock::time_point LastInvocationTime; bool StopFlag = false; bool Paused = false; std::atomic ForkDepth = 0; bool RestartAfterFork = false; }; TState* State_; private: void BeforeFork() { bool stopped = Stop(); if (State_->ForkDepth++ == 0) { State_->RestartAfterFork = stopped; } } void AfterForkParent() { if (--State_->ForkDepth == 0) { if (State_->RestartAfterFork) { Start(false); } } } void AfterForkChild() { bool restart = State_->RestartAfterFork; State_ = new TState(); if (restart) { Start(false); } } virtual void ThreadMain() = 0; protected: void Start(bool fromAlloc) { std::unique_lock guard(State_->StartStopMutex, std::defer_lock); if (fromAlloc) { if (!guard.try_lock()) { return; } if (State_->Paused) { return; } } else { guard.lock(); } State_->Paused = false; if (State_->Thread) { return; } State_->StopFlag = false; State_->Thread.emplace([=] { CurrentThreadIsBackground = true; ThreadMain(); }); OnStart(); } bool Stop() { std::unique_lock guard(State_->StartStopMutex); State_->Paused = true; if (!State_->Thread) { return false; } std::unique_lock flagGuard(State_->StopFlagMutex); State_->StopFlag = true; flagGuard.unlock(); State_->StopFlagVariable.notify_one(); State_->Thread->join(); State_->Thread.reset(); OnStop(); return true; } bool IsDone(TDuration interval) { std::unique_lock flagGuard(State_->StopFlagMutex); auto result = State_->StopFlagVariable.wait_until( flagGuard, State_->LastInvocationTime + std::chrono::microseconds(interval.MicroSeconds()), [&] { return State_->StopFlag; }); State_->LastInvocationTime = std::chrono::system_clock::now(); return result; } virtual void OnStart() { } virtual void OnStop() { } }; 
////////////////////////////////////////////////////////////////////////////////

Y_POD_THREAD(bool) CurrentThreadIsBackground;

// Base class for all background threads.
// NB: The template parameter name below is a reconstruction; the original argument list was lost.
template <class T>
class TBackgroundThreadBase
{
public:
    TBackgroundThreadBase()
        : State_(new TState())
    {
        NThreading::RegisterAtForkHandlers(
            [=] { BeforeFork(); },
            [=] { AfterForkParent(); },
            [=] { AfterForkChild(); });
    }

    virtual ~TBackgroundThreadBase()
    {
        Stop();
    }

private:
    struct TState
        : public TSystemAllocatable
    {
        std::mutex StartStopMutex;
        std::optional<std::thread> Thread;

        std::mutex StopFlagMutex;
        std::condition_variable StopFlagVariable;
        std::chrono::system_clock::time_point LastInvocationTime;
        bool StopFlag = false;

        bool Paused = false;

        std::atomic<int> ForkDepth = 0;
        bool RestartAfterFork = false;
    };

    TState* State_;

private:
    void BeforeFork()
    {
        bool stopped = Stop();
        if (State_->ForkDepth++ == 0) {
            State_->RestartAfterFork = stopped;
        }
    }

    void AfterForkParent()
    {
        if (--State_->ForkDepth == 0) {
            if (State_->RestartAfterFork) {
                Start(false);
            }
        }
    }

    void AfterForkChild()
    {
        bool restart = State_->RestartAfterFork;
        State_ = new TState();
        if (restart) {
            Start(false);
        }
    }

    virtual void ThreadMain() = 0;

protected:
    void Start(bool fromAlloc)
    {
        std::unique_lock<std::mutex> guard(State_->StartStopMutex, std::defer_lock);
        if (fromAlloc) {
            if (!guard.try_lock()) {
                return;
            }
            if (State_->Paused) {
                return;
            }
        } else {
            guard.lock();
        }

        State_->Paused = false;

        if (State_->Thread) {
            return;
        }

        State_->StopFlag = false;
        State_->Thread.emplace([=] {
            CurrentThreadIsBackground = true;
            ThreadMain();
        });

        OnStart();
    }

    bool Stop()
    {
        std::unique_lock<std::mutex> guard(State_->StartStopMutex);

        State_->Paused = true;

        if (!State_->Thread) {
            return false;
        }

        std::unique_lock<std::mutex> flagGuard(State_->StopFlagMutex);
        State_->StopFlag = true;
        flagGuard.unlock();
        State_->StopFlagVariable.notify_one();

        State_->Thread->join();
        State_->Thread.reset();

        OnStop();

        return true;
    }

    bool IsDone(TDuration interval)
    {
        std::unique_lock<std::mutex> flagGuard(State_->StopFlagMutex);
        auto result = State_->StopFlagVariable.wait_until(
            flagGuard,
            State_->LastInvocationTime + std::chrono::microseconds(interval.MicroSeconds()),
            [&] { return State_->StopFlag; });
        State_->LastInvocationTime = std::chrono::system_clock::now();
        return result;
    }

    virtual void OnStart()
    { }

    virtual void OnStop()
    { }
};

////////////////////////////////////////////////////////////////////////////////

// Invokes madvise(MADV_STOCKPILE) periodically.
class TStockpileThread
    : public TBackgroundThreadBase<TStockpileThread>
{
public:
    explicit TStockpileThread(int index)
        : Index_(index)
    {
        Start(false);
    }

private:
    const int Index_;

    virtual void ThreadMain() override
    {
        TThread::SetCurrentThreadName(Sprintf("%s:%d", StockpileThreadName, Index_).c_str());

        while (!IsDone(ConfigurationManager->GetStockpileInterval())) {
            if (!MappedMemoryManager->Stockpile(ConfigurationManager->GetStockpileSize())) {
                // No point in proceeding.
                YTALLOC_LOG_INFO("Stockpile call failed; terminating stockpile thread");
                break;
            }
        }
    }
};

// Manages the set of TStockpileThreads.
class TStockpileManager
{
public:
    void SpawnIfNeeded()
    {
        if (!ConfigurationManager->IsStockpileEnabled()) {
            return;
        }

        int threadCount = ConfigurationManager->GetStockpileThreadCount();
        while (static_cast<int>(Threads_.size()) > threadCount) {
            Threads_.pop_back();
        }
        while (static_cast<int>(Threads_.size()) < threadCount) {
            Threads_.push_back(std::make_unique<TStockpileThread>(static_cast<int>(Threads_.size())));
        }
    }

private:
    std::vector<std::unique_ptr<TStockpileThread>> Threads_;
};

TExplicitlyConstructableSingleton<TStockpileManager> StockpileManager;
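// Usage sketch (illustrative, not part of the original source): stockpiling is driven by the
// public configuration API defined at the end of this file; the values below are arbitrary
// examples. TStockpileManager::SpawnIfNeeded is then invoked from the background thread and
// adjusts the number of TStockpileThreads to match the configured count.
//
//   EnableStockpile();
//   SetStockpileThreadCount(2);
//   SetStockpileInterval(TDuration::MilliSeconds(100));
//   SetStockpileSize(1_GB);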
////////////////////////////////////////////////////////////////////////////////

// Time to wait before re-spawning the thread after a fork.
static constexpr auto BackgroundThreadRespawnDelay = TDuration::Seconds(3);

// Runs basic background activities: reclaim, logging, profiling, etc.
class TBackgroundThread
    : public TBackgroundThreadBase<TBackgroundThread>
{
public:
    bool IsStarted()
    {
        return Started_.load();
    }

    void SpawnIfNeeded()
    {
        if (CurrentThreadIsBackground) {
            return;
        }
        Start(true);
    }

private:
    std::atomic<bool> Started_ = false;

private:
    virtual void ThreadMain() override
    {
        TThread::SetCurrentThreadName(BackgroundThreadName);

        TimingManager->DisableForCurrentThread();
        MmapObservationManager->DisableForCurrentThread();

        while (!IsDone(BackgroundInterval)) {
            DumpableLargeBlobAllocator->RunBackgroundTasks();
            UndumpableLargeBlobAllocator->RunBackgroundTasks();
            MappedMemoryManager->RunBackgroundTasks();
            TimingManager->RunBackgroundTasks();
            MmapObservationManager->RunBackgroundTasks();
            StockpileManager->SpawnIfNeeded();
        }
    }

    virtual void OnStart() override
    {
        DoUpdateAllThreadsControlWord(true);
    }

    virtual void OnStop() override
    {
        DoUpdateAllThreadsControlWord(false);
    }

    void DoUpdateAllThreadsControlWord(bool started)
    {
        // Update the TLS flag of every registered thread.
        ThreadManager->EnumerateThreadStatesSync(
            [&] {
                Started_.store(started);
            },
            [&] (auto* state) {
                if (state->BackgroundThreadStarted) {
                    *state->BackgroundThreadStarted = started;
                }
            });
    }
};

TExplicitlyConstructableSingleton<TBackgroundThread> BackgroundThread;

////////////////////////////////////////////////////////////////////////////////

Y_FORCE_INLINE TThreadState* TThreadManager::GetThreadStateUnchecked()
{
    YTALLOC_PARANOID_ASSERT(ThreadState_);
    return ThreadState_;
}

Y_FORCE_INLINE TThreadState* TThreadManager::FindThreadState()
{
    if (Y_LIKELY(ThreadState_)) {
        return ThreadState_;
    }

    if (ThreadStateDestroyed_) {
        return nullptr;
    }

    InitializeGlobals();

    // InitializeGlobals must not allocate.
    Y_ABORT_UNLESS(!ThreadState_);

    ThreadState_ = ThreadManager->AllocateThreadState();
    (&ThreadControlWord_)->Parts.ThreadStateValid = true;

    return ThreadState_;
}

void TThreadManager::DestroyThread(void*)
{
    TSmallAllocator::PurgeCaches();

    TThreadState* state = ThreadState_;
    ThreadState_ = nullptr;
    ThreadStateDestroyed_ = true;
    (&ThreadControlWord_)->Parts.ThreadStateValid = false;

    {
        auto guard = GuardWithTiming(ThreadManager->ThreadRegistryLock_);
        state->AllocationProfilingEnabled = nullptr;
        state->BackgroundThreadStarted = nullptr;
        ThreadManager->UnrefThreadState(state);
    }
}

void TThreadManager::DestroyThreadState(TThreadState* state)
{
    StatisticsManager->AccumulateLocalCounters(state);
    ThreadRegistry_.Remove(state);
    ThreadStatePool_.Free(state);
}

void TThreadManager::AfterFork()
{
    auto guard = GuardWithTiming(ThreadRegistryLock_);
    ThreadRegistry_.Clear();
    TThreadState* state = ThreadState_;
    if (state) {
        ThreadRegistry_.PushBack(state);
    }
}

TThreadState* TThreadManager::AllocateThreadState()
{
    auto* state = ThreadStatePool_.Allocate();
    state->AllocationProfilingEnabled = &(*&ThreadControlWord_).Parts.AllocationProfilingEnabled;
    state->BackgroundThreadStarted = &(*&ThreadControlWord_).Parts.BackgroundThreadStarted;

    {
        auto guard = GuardWithTiming(ThreadRegistryLock_);
        // NB: These flags must be initialized under ThreadRegistryLock_; see EnumerateThreadStatesSync.
        *state->AllocationProfilingEnabled = ConfigurationManager->IsAllocationProfilingEnabled();
        *state->BackgroundThreadStarted = BackgroundThread->IsStarted();
        ThreadRegistry_.PushBack(state);
    }

    // Need to pass some non-null value for DestroyThread to be called.
    pthread_setspecific(ThreadDtorKey_, (void*)-1);

    return state;
}

////////////////////////////////////////////////////////////////////////////////

void InitializeGlobals()
{
    static std::once_flag Initialized;
    std::call_once(Initialized, [] () {
        LogManager.Construct();
        BacktraceManager.Construct();
        StatisticsManager.Construct();
        MappedMemoryManager.Construct();
        ThreadManager.Construct();
        GlobalState.Construct();
        DumpableLargeBlobAllocator.Construct();
        UndumpableLargeBlobAllocator.Construct();
        HugeBlobAllocator.Construct();
        ConfigurationManager.Construct();
        SystemAllocator.Construct();
        TimingManager.Construct();
        MmapObservationManager.Construct();
        StockpileManager.Construct();
        BackgroundThread.Construct();

        SmallArenaAllocators.Construct();
        auto constructSmallArenaAllocators = [&] (EAllocationKind kind, uintptr_t zonesStart) {
            for (size_t rank = 1; rank < SmallRankCount; ++rank) {
                (*SmallArenaAllocators)[kind][rank].Construct(kind, rank, zonesStart + rank * SmallZoneSize);
            }
        };
        constructSmallArenaAllocators(EAllocationKind::Untagged, UntaggedSmallZonesStart);
        constructSmallArenaAllocators(EAllocationKind::Tagged, TaggedSmallZonesStart);

        GlobalSmallChunkCaches.Construct();
        (*GlobalSmallChunkCaches)[EAllocationKind::Tagged].Construct(EAllocationKind::Tagged);
        (*GlobalSmallChunkCaches)[EAllocationKind::Untagged].Construct(EAllocationKind::Untagged);
    });
}

////////////////////////////////////////////////////////////////////////////////

void StartBackgroundThread()
{
    InitializeGlobals();
    BackgroundThread->SpawnIfNeeded();
}
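// Illustrative note (not part of the original source): the first small allocation on a thread
// goes through TThreadManager::FindThreadState, which allocates and registers a TThreadState
// and sets ThreadStateValid in the thread control word; subsequent allocations on the fast
// path (see AllocateInline below) use GetThreadStateUnchecked without re-checking registration:
//
//   auto controlWord = TThreadManager::GetThreadControlWord();
//   if (Y_LIKELY(controlWord == TThreadManager::FastPathControlWord)) {
//       auto* state = TThreadManager::GetThreadStateUnchecked();
//       // ... allocate from the per-thread caches ...
//   }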
////////////////////////////////////////////////////////////////////////////////

template <class... Ts>
Y_FORCE_INLINE void* AllocateSmallUntagged(size_t rank, Ts... args)
{
    auto* result = TSmallAllocator::Allocate(NullMemoryTag, rank, std::forward<Ts>(args)...);
    YTALLOC_PARANOID_ASSERT(
        reinterpret_cast<uintptr_t>(result) >= MinUntaggedSmallPtr &&
        reinterpret_cast<uintptr_t>(result) < MaxUntaggedSmallPtr);
    return result;
}

template <class... Ts>
Y_FORCE_INLINE void* AllocateSmallTagged(ui64 controlWord, size_t rank, Ts... args)
{
    auto tag = Y_UNLIKELY(
        (controlWord & TThreadManager::AllocationProfilingEnabledControlWordMask) &&
        ConfigurationManager->IsSmallArenaAllocationProfiled(rank))
        ? BacktraceManager->GetMemoryTagFromBacktrace(2)
        : static_cast<TMemoryTag>(controlWord & TThreadManager::MemoryTagControlWordMask);
    auto* result = TSmallAllocator::Allocate(tag, rank, std::forward<Ts>(args)...);
    YTALLOC_PARANOID_ASSERT(
        reinterpret_cast<uintptr_t>(result) >= MinTaggedSmallPtr &&
        reinterpret_cast<uintptr_t>(result) < MaxTaggedSmallPtr);
    return result;
}

Y_FORCE_INLINE void* AllocateInline(size_t size)
{
    size_t rank;
    if (Y_LIKELY(size <= 512)) {
        rank = SizeToSmallRank1[(size + 7) >> 3];
    } else if (Y_LIKELY(size < LargeAllocationSizeThreshold)) {
        rank = SizeToSmallRank2[(size - 1) >> 8];
    } else {
        StartBackgroundThread();
        return TBlobAllocator::Allocate(size);
    }

    auto controlWord = TThreadManager::GetThreadControlWord();
    if (Y_LIKELY(controlWord == TThreadManager::FastPathControlWord)) {
        return AllocateSmallUntagged(rank, TThreadManager::GetThreadStateUnchecked());
    }

    if (Y_UNLIKELY(!(controlWord & TThreadManager::BackgroundThreadStartedControlWorkMask))) {
        StartBackgroundThread();
    }

    if (!(controlWord & (TThreadManager::MemoryTagControlWordMask | TThreadManager::AllocationProfilingEnabledControlWordMask))) {
        return AllocateSmallUntagged(rank);
    } else {
        return AllocateSmallTagged(controlWord, rank);
    }
}

Y_FORCE_INLINE void* AllocateSmallInline(size_t rank)
{
    auto controlWord = TThreadManager::GetThreadControlWord();
    if (Y_LIKELY(controlWord == TThreadManager::FastPathControlWord)) {
        return AllocateSmallUntagged(rank, TThreadManager::GetThreadStateUnchecked());
    }

    if (!(controlWord & (TThreadManager::MemoryTagControlWordMask | TThreadManager::AllocationProfilingEnabledControlWordMask))) {
        return AllocateSmallUntagged(rank);
    } else {
        return AllocateSmallTagged(controlWord, rank);
    }
}
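// Worked example (illustrative, not part of the original source) of the rank lookup in
// AllocateInline above: sizes up to 512 bytes are mapped through SizeToSmallRank1 with
// 8-byte granularity, e.g. size = 24 yields index (24 + 7) >> 3 = 3; sizes in
// (512, LargeAllocationSizeThreshold) are mapped through SizeToSmallRank2 with 256-byte
// granularity, e.g. size = 1000 yields index (1000 - 1) >> 8 = 3.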
Y_FORCE_INLINE void* AllocatePageAlignedInline(size_t size)
{
    size = std::max(AlignUp(size, PageSize), PageSize);
    void* result = size >= LargeAllocationSizeThreshold
        ? AlignUp(TBlobAllocator::Allocate(size + PageSize), PageSize)
        : Allocate(size);
    YTALLOC_ASSERT(reinterpret_cast<uintptr_t>(result) % PageSize == 0);
    return result;
}

Y_FORCE_INLINE void FreeNonNullInline(void* ptr)
{
    YTALLOC_ASSERT(ptr);
    if (Y_LIKELY(reinterpret_cast<uintptr_t>(ptr) < UntaggedSmallZonesEnd)) {
        YTALLOC_PARANOID_ASSERT(
            reinterpret_cast<uintptr_t>(ptr) >= MinUntaggedSmallPtr &&
            reinterpret_cast<uintptr_t>(ptr) < MaxUntaggedSmallPtr);
        TSmallAllocator::Free(ptr);
    } else if (Y_LIKELY(reinterpret_cast<uintptr_t>(ptr) < TaggedSmallZonesEnd)) {
        YTALLOC_PARANOID_ASSERT(
            reinterpret_cast<uintptr_t>(ptr) >= MinTaggedSmallPtr &&
            reinterpret_cast<uintptr_t>(ptr) < MaxTaggedSmallPtr);
        TSmallAllocator::Free(ptr);
    } else {
        TBlobAllocator::Free(ptr);
    }
}

Y_FORCE_INLINE void FreeInline(void* ptr)
{
    if (Y_LIKELY(ptr)) {
        FreeNonNullInline(ptr);
    }
}

Y_FORCE_INLINE size_t GetAllocationSizeInline(const void* ptr)
{
    if (Y_UNLIKELY(!ptr)) {
        return 0;
    }

    auto uintptr = reinterpret_cast<uintptr_t>(ptr);
    if (uintptr < UntaggedSmallZonesEnd) {
        YTALLOC_PARANOID_ASSERT(uintptr >= MinUntaggedSmallPtr && uintptr < MaxUntaggedSmallPtr);
        return TSmallAllocator::GetAllocationSize(ptr);
    } else if (uintptr < TaggedSmallZonesEnd) {
        YTALLOC_PARANOID_ASSERT(uintptr >= MinTaggedSmallPtr && uintptr < MaxTaggedSmallPtr);
        return TSmallAllocator::GetAllocationSize(ptr);
    } else if (uintptr < LargeZoneEnd(true)) {
        YTALLOC_PARANOID_ASSERT(uintptr >= LargeZoneStart(true) && uintptr < LargeZoneEnd(true));
        return TLargeBlobAllocator::GetAllocationSize(ptr);
    } else if (uintptr < LargeZoneEnd(false)) {
        YTALLOC_PARANOID_ASSERT(uintptr >= LargeZoneStart(false) && uintptr < LargeZoneEnd(false));
        return TLargeBlobAllocator::GetAllocationSize(ptr);
    } else if (uintptr < HugeZoneEnd) {
        YTALLOC_PARANOID_ASSERT(uintptr >= HugeZoneStart && uintptr < HugeZoneEnd);
        return THugeBlobAllocator::GetAllocationSize(ptr);
    } else {
        YTALLOC_TRAP("Wrong ptr passed to GetAllocationSizeInline");
    }
}

Y_FORCE_INLINE size_t GetAllocationSizeInline(size_t size)
{
    if (size <= LargeAllocationSizeThreshold) {
        return TSmallAllocator::GetAllocationSize(size);
    } else if (size <= HugeAllocationSizeThreshold) {
        return TLargeBlobAllocator::GetAllocationSize(size);
    } else {
        return THugeBlobAllocator::GetAllocationSize(size);
    }
}
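// Usage sketch (illustrative, not part of the original source): the inline entry points above
// form the malloc/free bridge placed in alloc.cpp; note that GetAllocationSizeInline reports
// the usable size, which may exceed the requested one due to rank rounding.
//
//   void* ptr = AllocateInline(100);
//   size_t size = GetAllocationSizeInline(ptr);   // >= 100
//   FreeInline(ptr);
//   FreeInline(nullptr);                          // no-op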
void EnableLogging(TLogHandler logHandler)
{
    InitializeGlobals();
    LogManager->EnableLogging(logHandler);
}

void SetBacktraceProvider(TBacktraceProvider provider)
{
    InitializeGlobals();
    BacktraceManager->SetBacktraceProvider(provider);
    DumpableLargeBlobAllocator->SetBacktraceProvider(provider);
    UndumpableLargeBlobAllocator->SetBacktraceProvider(provider);
}

void SetBacktraceFormatter(TBacktraceFormatter formatter)
{
    InitializeGlobals();
    MmapObservationManager->SetBacktraceFormatter(formatter);
}

void EnableStockpile()
{
    InitializeGlobals();
    ConfigurationManager->EnableStockpile();
}

void SetStockpileInterval(TDuration value)
{
    InitializeGlobals();
    ConfigurationManager->SetStockpileInterval(value);
}

void SetStockpileThreadCount(int value)
{
    InitializeGlobals();
    ConfigurationManager->SetStockpileThreadCount(value);
}

void SetStockpileSize(size_t value)
{
    InitializeGlobals();
    ConfigurationManager->SetStockpileSize(value);
}

void SetLargeUnreclaimableCoeff(double value)
{
    InitializeGlobals();
    ConfigurationManager->SetLargeUnreclaimableCoeff(value);
}

void SetTimingEventThreshold(TDuration value)
{
    InitializeGlobals();
    ConfigurationManager->SetTimingEventThreshold(value);
}

void SetMinLargeUnreclaimableBytes(size_t value)
{
    InitializeGlobals();
    ConfigurationManager->SetMinLargeUnreclaimableBytes(value);
}

void SetMaxLargeUnreclaimableBytes(size_t value)
{
    InitializeGlobals();
    ConfigurationManager->SetMaxLargeUnreclaimableBytes(value);
}

void SetAllocationProfilingEnabled(bool value)
{
    ConfigurationManager->SetAllocationProfilingEnabled(value);
}

void SetAllocationProfilingSamplingRate(double rate)
{
    ConfigurationManager->SetAllocationProfilingSamplingRate(rate);
}

void SetSmallArenaAllocationProfilingEnabled(size_t rank, bool value)
{
    ConfigurationManager->SetSmallArenaAllocationProfilingEnabled(rank, value);
}

void SetLargeArenaAllocationProfilingEnabled(size_t rank, bool value)
{
    ConfigurationManager->SetLargeArenaAllocationProfilingEnabled(rank, value);
}

void SetProfilingBacktraceDepth(int depth)
{
    ConfigurationManager->SetProfilingBacktraceDepth(depth);
}

void SetMinProfilingBytesUsedToReport(size_t size)
{
    ConfigurationManager->SetMinProfilingBytesUsedToReport(size);
}

void SetEnableEagerMemoryRelease(bool value)
{
    ConfigurationManager->SetEnableEagerMemoryRelease(value);
}

void SetEnableMadvisePopulate(bool value)
{
    ConfigurationManager->SetEnableMadvisePopulate(value);
}

TEnumIndexedArray<ETotalCounter, ssize_t> GetTotalAllocationCounters()
{
    InitializeGlobals();
    return StatisticsManager->GetTotalAllocationCounters();
}

TEnumIndexedArray<ESystemCounter, ssize_t> GetSystemAllocationCounters()
{
    InitializeGlobals();
    return StatisticsManager->GetSystemAllocationCounters();
}

TEnumIndexedArray<EUndumpableCounter, ssize_t> GetUndumpableAllocationCounters()
{
    InitializeGlobals();
    return StatisticsManager->GetUndumpableAllocationCounters();
}

TEnumIndexedArray<ESmallCounter, ssize_t> GetSmallAllocationCounters()
{
    InitializeGlobals();
    return StatisticsManager->GetSmallAllocationCounters();
}

TEnumIndexedArray<ELargeCounter, ssize_t> GetLargeAllocationCounters()
{
    InitializeGlobals();
    return StatisticsManager->GetLargeAllocationCounters();
}

std::array<TEnumIndexedArray<ESmallArenaCounter, ssize_t>, SmallRankCount> GetSmallArenaAllocationCounters()
{
    InitializeGlobals();
    return StatisticsManager->GetSmallArenaAllocationCounters();
}

std::array<TEnumIndexedArray<ELargeArenaCounter, ssize_t>, LargeRankCount> GetLargeArenaAllocationCounters()
{
    InitializeGlobals();
    return StatisticsManager->GetLargeArenaAllocationCounters();
}

TEnumIndexedArray<EHugeCounter, ssize_t> GetHugeAllocationCounters()
{
    InitializeGlobals();
    return StatisticsManager->GetHugeAllocationCounters();
}

std::vector<TProfiledAllocation> GetProfiledAllocationStatistics()
{
    InitializeGlobals();

    if (!ConfigurationManager->IsAllocationProfilingEnabled()) {
        return {};
    }

    std::vector<TMemoryTag> tags;
    tags.reserve(MaxCapturedAllocationBacktraces + 1);
    for (TMemoryTag tag = AllocationProfilingMemoryTagBase;
         tag < AllocationProfilingMemoryTagBase + MaxCapturedAllocationBacktraces;
         ++tag)
    {
        tags.push_back(tag);
    }
    tags.push_back(AllocationProfilingUnknownMemoryTag);

    std::vector<TEnumIndexedArray<EBasicCounter, ssize_t>> counters;
    counters.resize(tags.size());
    StatisticsManager->GetTaggedMemoryCounters(tags.data(), tags.size(), counters.data());

    std::vector<TProfiledAllocation> statistics;
    for (size_t index = 0; index < tags.size(); ++index) {
        if (counters[index][EBasicCounter::BytesUsed] < static_cast<ssize_t>(ConfigurationManager->GetMinProfilingBytesUsedToReport())) {
            continue;
        }
        auto tag = tags[index];
        auto optionalBacktrace = BacktraceManager->FindBacktrace(tag);
        if (!optionalBacktrace && tag != AllocationProfilingUnknownMemoryTag) {
            continue;
        }
        statistics.push_back(TProfiledAllocation{
            optionalBacktrace.value_or(TBacktrace()),
            counters[index]
        });
    }
    return statistics;
}

TEnumIndexedArray<ETimingEventType, TTimingEventCounters> GetTimingEventCounters()
{
    InitializeGlobals();
    return TimingManager->GetTimingEventCounters();
}

////////////////////////////////////////////////////////////////////////////////

} // namespace NYT::NYTAlloc
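// Usage sketch (illustrative, not part of the original source): a typical monitoring loop
// reads the counters exposed above; allocation profiling must be enabled explicitly for
// GetProfiledAllocationStatistics to return anything.
//
//   using namespace NYT::NYTAlloc;
//   SetAllocationProfilingEnabled(true);
//   auto totals = GetTotalAllocationCounters();
//   auto huge = GetHugeAllocationCounters();
//   auto profiled = GetProfiledAllocationStatistics();   // empty unless profiling is enabled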