balloc.h

#pragma once

#include <sys/mman.h>
#include <pthread.h>
#include <dlfcn.h>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <memory.h>
#include <new>

#include <util/system/defaults.h>
#include <library/cpp/malloc/api/malloc.h>
#include <library/cpp/balloc/lib/alloc_stats.h>
#include <library/cpp/balloc/setup/alloc.h>

#ifndef NDEBUG
#define DBG_FILL_MEMORY
#endif

#if defined(Y_COVER_PTR)
#define DBG_FILL_MEMORY
#endif

#if (defined(_i386_) || defined(_x86_64_)) && defined(_linux_)
#define HAVE_VDSO_GETCPU 1
#include <contrib/libs/linuxvdso/interface.h>
#endif
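
// Core of the balloc allocator: pages are obtained from the OS with mmap() and recycled
// through lock-free per-NUMA-node free lists; small allocations are carved out of a
// per-thread page by a bump pointer, larger ones get a block of their own. Reference
// counts in TBlockHeader decide when a page can be reused.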
namespace NBalloc {
#if HAVE_VDSO_GETCPU
    // glibc does not provide a wrapper around getcpu; we have to load it from the vDSO manually
    static int (*getcpu)(unsigned* cpu, unsigned* node, void* unused) = nullptr;
#endif

    static Y_FORCE_INLINE void* Advance(void* block, size_t size) {
        return (void*)((char*)block + size);
    }

    static constexpr size_t PAGE_CACHE = 16;
#if defined(_ppc64_) || defined(_arm64_)
    static constexpr size_t PAGE_ELEM = 65536;
#else
    static constexpr size_t PAGE_ELEM = 4096;
#endif
    static constexpr size_t SINGLE_ALLOC = (PAGE_ELEM / 2);
    static constexpr size_t ORDERS = 1024;
    static constexpr size_t DUMP_STAT = 0;

    static void* (*LibcMalloc)(size_t) = nullptr;
    static void (*LibcFree)(void*) = nullptr;

    static size_t Y_FORCE_INLINE Align(size_t value, size_t align) {
        return (value + align - 1) & ~(align - 1);
    }

#define RDTSC(eax, edx) __asm__ __volatile__("rdtsc" \
                                             : "=a"(eax), "=d"(edx));
#define CPUID(func, eax, ebx, ecx, edx) __asm__ __volatile__("cpuid" \
                                                             : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) \
                                                             : "a"(func));
    static int GetNumaNode() {
#if HAVE_VDSO_GETCPU
        if (Y_LIKELY(getcpu)) {
            unsigned node = 0;
            if (getcpu(nullptr, &node, nullptr)) {
                return 0;
            }
            return node;
        }
#endif
#if defined(_i386_) or defined(_x86_64_)
        int a = 0, b = 0, c = 0, d = 0;
        CPUID(0x1, a, b, c, d);
        int acpiID = (b >> 24);
        int numCPU = (b >> 16) & 255;
        if (numCPU == 0)
            return 0;
        int ret = acpiID / numCPU;
        return ret;
#else
        return 0;
#endif
    }

    static void AbortFromSystemError() {
        char buf[512] = {0};
#if defined(_freebsd_) or defined(_darwin_) or defined(_musl_) or defined(_bionic_)
        strerror_r(errno, buf, sizeof(buf));
        const char* msg = buf;
#elif defined(_linux_) or defined(_cygwin_)
        const char* msg = strerror_r(errno, buf, sizeof(buf));
#endif
        NMalloc::AbortFromCorruptedAllocator(msg);
    }

    static pthread_key_t key;
    static volatile long init = 0;
    static unsigned long long counter = 0;

    static void Destructor(void* data);

    template <class T>
    Y_FORCE_INLINE bool DoCas(T* volatile* target, T* exchange, T* compare) {
        return __sync_bool_compare_and_swap(target, compare, exchange);
    }
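
    // Lock-free LIFO free list of pages. Items freed while an Alloc() is in flight go to
    // the Pending list instead of Head, so that a concurrent DoAlloc() can never pop a
    // node whose Next pointer it has already read (the ABA hazard). Pending is folded
    // back into Head only when AllocCount shows that no other allocation is in progress.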
    class TLFAllocFreeList {
        struct TNode {
            TNode* Next;
        };

        TNode* volatile Head;
        TNode* volatile Pending;
        long long volatile PendingToFreeListCounter;
        TNode* volatile Destroyed;
        long long AllocCount;

        static Y_FORCE_INLINE void Enqueue(TNode* volatile* headPtr, TNode* n) {
            for (;;) {
                TNode* volatile prevHead = *headPtr;
                n->Next = prevHead;
                if (DoCas(headPtr, n, prevHead))
                    break;
            }
        }
        Y_FORCE_INLINE void* DoAlloc() {
            TNode* res;
            for (res = Head; res; res = Head) {
                TNode* keepNext = res->Next;
                if (DoCas(&Head, keepNext, res)) {
                    //Y_ABORT_UNLESS(keepNext == res->Next);
                    break;
                }
            }
            return res;
        }
        void FreeList(TNode* fl) {
            if (!fl)
                return;
            TNode* flTail = fl;
            while (flTail->Next)
                flTail = flTail->Next;
            for (;;) {
                TNode* volatile prevHead = Head;
                flTail->Next = prevHead;
                if (DoCas(&Head, fl, prevHead))
                    break;
            }
        }

    public:
        Y_FORCE_INLINE void Free(void* ptr) {
            TNode* newFree = (TNode*)ptr;
            if (__sync_add_and_fetch(&AllocCount, 0) == 0)
                Enqueue(&Head, newFree);
            else
                Enqueue(&Pending, newFree);
        }
        Y_FORCE_INLINE void Destroy(void* ptr, size_t length) {
            TNode* newFree = (TNode*)ptr;
            TNode* fl = nullptr;
            if (__sync_add_and_fetch(&AllocCount, 1) == 1) {
                fl = Destroyed;
                if (fl && !DoCas(&Destroyed, (TNode*)nullptr, fl)) {
                    fl = nullptr;
                }
                Enqueue(&fl, newFree);
            } else {
                Enqueue(&Destroyed, newFree);
            }
            __sync_sub_and_fetch(&AllocCount, 1);
            // TODO: try to merge blocks to minimize the number of syscalls
            while (nullptr != fl) {
                TNode* next = fl->Next;
                if (-1 == munmap(fl, length)) {
                    AbortFromSystemError();
                }
                fl = next;
            }
        }
        Y_FORCE_INLINE void* Alloc() {
            long long volatile keepCounter = __sync_add_and_fetch(&PendingToFreeListCounter, 0);
            TNode* fl = Pending;
            if (__sync_add_and_fetch(&AllocCount, 1) == 1) {
                // No other allocs in progress.
                // If (keepCounter == PendingToFreeListCounter) then Pending was not freed by other threads.
                // Hence Pending is not used in any concurrent DoAlloc() at the moment and can be safely moved to FreeList.
                if (fl &&
                    keepCounter == __sync_add_and_fetch(&PendingToFreeListCounter, 0) &&
                    DoCas(&Pending, (TNode*)nullptr, fl))
                {
                    // pick the first element from Pending and return it
                    void* res = fl;
                    fl = fl->Next;
                    // if there are other elements in the Pending list, add them to the main free list
                    FreeList(fl);
                    __sync_sub_and_fetch(&PendingToFreeListCounter, 1);
                    __sync_sub_and_fetch(&AllocCount, 1);
                    return res;
                }
            }
            void* res = DoAlloc();
            if (!res && __sync_add_and_fetch(&Pending, 0)) {
                // live-lock situation: there are no free items in the "Head" list
                // and there are free items in the "Pending" list, but those items
                // may not be handed out, to prevent ABA
                NAllocStats::IncLiveLockCounter();
            }
            __sync_sub_and_fetch(&AllocCount, 1);
            return res;
        }
    };
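
    // Global page pools: one free list per (NUMA node, size order in pages).
    // sizesGC counts pages parked in the pools (bucketed by order % 16) and feeds
    // NAllocSetup::NeedReclaim(); sizeOS tracks bytes currently held from the OS.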
    TLFAllocFreeList nodes[2][ORDERS];
    unsigned long long sizesGC[2][16];
    unsigned long long sizeOS, totalOS;

    struct TBlockHeader {
        size_t Size;
        int RefCount;
        unsigned short AllCount;
        unsigned short NumaNode;
    };
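
    // A block whose RefCount is 0 holds no live allocations; parking it in a pool flips
    // RefCount to -1, and taking it out flips it back. The CAS checks in PushPage/PopPage
    // and UnMap() turn a double free or a premature reuse into an immediate abort.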
    static bool PushPage(void* page, size_t order) {
        if (order < ORDERS) {
            int node = ((TBlockHeader*)page)->NumaNode;
            __sync_add_and_fetch(&sizesGC[node][order % 16], order);
            TBlockHeader* blockHeader = (TBlockHeader*)page;
            if (!__sync_bool_compare_and_swap(&blockHeader->RefCount, 0, -1)) {
                NMalloc::AbortFromCorruptedAllocator();
            }
            nodes[node][order].Free(page);
            return true;
        }
        return false;
    }
    static void* PopPage(size_t order) {
        if (order < ORDERS) {
            int numaNode = GetNumaNode() & 1;
            void* alloc = nodes[numaNode][order].Alloc();
            if (alloc == nullptr) {
                alloc = nodes[1 - numaNode][order].Alloc();
                if (alloc) {
                    __sync_sub_and_fetch(&sizesGC[1 - numaNode][order % 16], order);
                }
            } else {
                __sync_sub_and_fetch(&sizesGC[numaNode][order % 16], order);
            }
            if (alloc) {
                TBlockHeader* blockHeader = (TBlockHeader*)alloc;
                if (!__sync_bool_compare_and_swap(&blockHeader->RefCount, -1, 0)) {
                    NMalloc::AbortFromCorruptedAllocator();
                }
            }
            return alloc;
        }
        return nullptr;
    }

#if DUMP_STAT
    static unsigned long long TickCounter() {
        int lo = 0, hi = 0;
        RDTSC(lo, hi);
        return (((unsigned long long)hi) << 32) + (unsigned long long)lo;
    }
    struct TTimeHold {
        unsigned long long Start;
        unsigned long long Finish;
        const char* Name;
        TTimeHold(const char* name)
            : Start(TickCounter())
            , Name(name)
        {
        }
        ~TTimeHold() {
            Finish = TickCounter();
            double diff = Finish > Start ? (Finish - Start) / 1000000.0 : 0.0;
            if (diff > 20.0) {
                fprintf(stderr, "%s %f mticks\n", Name, diff);
            }
        }
    };
#endif

    long long allocs[ORDERS];

    static void Map(size_t size, void* pages[], size_t num) {
#if DUMP_STAT
        TTimeHold hold("mmap");
        size_t order = size / PAGE_ELEM;
        if (order < ORDERS) {
            __sync_add_and_fetch(&allocs[order], num);
        }
#endif
        if (!NAllocSetup::CanAlloc(__sync_add_and_fetch(&sizeOS, size * num), totalOS)) {
            NMalloc::AbortFromCorruptedAllocator();
        }
        void* map = mmap(nullptr, size * num, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
        if (map == MAP_FAILED) {
            AbortFromSystemError();
        }
        unsigned short numaNode = (GetNumaNode() & 1);
        NAllocStats::IncMmapCounter(size * num / PAGE_ELEM);
        for (size_t i = 0; i < num; ++i) {
            TBlockHeader* blockHeader = static_cast<TBlockHeader*>(map);
            blockHeader->NumaNode = numaNode;
            pages[i] = map;
            map = Advance(map, size);
        }
    }
    static void* SysAlloc(size_t& size) {
        size = Align(size, PAGE_ELEM);
        size_t order = size / PAGE_ELEM;
        void* result = PopPage(order);
        if (result) {
            return result;
        }
        void* pages[1] = {nullptr};
        Map(size, pages, 1);
        return pages[0];
    }
    static void UnMap(void* block, size_t order) {
#if DUMP_STAT
        TTimeHold hold("munmap");
        if (order < ORDERS) {
            __sync_sub_and_fetch(&allocs[order], 1);
        }
#endif
        size_t size = order * PAGE_ELEM;
        __sync_sub_and_fetch(&sizeOS, size);
        TBlockHeader* blockHeader = (TBlockHeader*)block;
        if (!__sync_bool_compare_and_swap(&blockHeader->RefCount, 0, -1)) {
            NMalloc::AbortFromCorruptedAllocator();
        }
        if (order < ORDERS) {
            int node = blockHeader->NumaNode;
            nodes[node][order].Destroy(block, size);
        } else {
            if (-1 == munmap(block, size)) {
                AbortFromSystemError();
            }
        }
    }
    static void SysClear(size_t order) {
        void* page = PopPage(order);
        if (page) {
            UnMap(page, order);
        }
    }

    static void Y_FORCE_INLINE GlobalInit() {
        if (__sync_bool_compare_and_swap(&init, 0, 1)) {
#if HAVE_VDSO_GETCPU
            getcpu = (int (*)(unsigned*, unsigned*, void*))NVdso::Function("__vdso_getcpu", "LINUX_2.6");
#endif
            LibcMalloc = (void* (*)(size_t))dlsym(RTLD_NEXT, "malloc");
            LibcFree = (void (*)(void*))dlsym(RTLD_NEXT, "free");
            pthread_key_create(&key, Destructor);
            __sync_bool_compare_and_swap(&init, 1, 2);
        }
        while (init < 2) {
        }
    }

    enum EMode {
        Empty = 0,
        Born,
        Alive,
        Disabled,
        Dead,
        ToBeEnabled
    };
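
    // Per-thread allocator lifecycle: a thread starts Empty; Init() moves it to Alive or
    // Disabled (ToBeEnabled forces enabling on the next allocation); the pthread key
    // destructor marks it Dead so that late frees bypass the per-thread caches.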
    struct TLS {
        void* PageCache[PAGE_CACHE];
        size_t Cached;
        void* Chunk;
        size_t Ptr;
        void* Block;
        int Counter;
        EMode Mode;
        unsigned char Count0;
        unsigned long Count1;

        bool NeedGC() {
            if (Count0++ != 0)
                return false;
            __sync_add_and_fetch(&totalOS, 1);
            unsigned long long count = 0;
            for (size_t i = 0; i < 16; ++i) {
                count += sizesGC[0][i];
                count += sizesGC[1][i];
            }
            return NAllocSetup::NeedReclaim(count * PAGE_ELEM, ++Count1);
        }
        void ClearCount() {
            Count1 = 0;
        }
    };

#if defined(_darwin_)
    static Y_FORCE_INLINE TLS* PthreadTls() {
        GlobalInit();
        TLS* ptls = (TLS*)pthread_getspecific(key);
        if (!ptls) {
            ptls = (TLS*)LibcMalloc(sizeof(TLS));
            if (!ptls) {
                NMalloc::AbortFromCorruptedAllocator(); // what do we do here?
            }
            memset(ptls, 0, sizeof(TLS));
            pthread_setspecific(key, ptls);
        }
        return ptls;
    }
#define tls (*PthreadTls())
#else
    __thread TLS tls;
#endif
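
    // Drops `add` references from a block. When the count reaches zero the page goes, in
    // order of preference, to the thread's small-page cache, back to the OS when the GC
    // threshold is hit, or into the global pools.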
    static void UnRefHard(void* block, int add, TLS& ltls) {
        TBlockHeader* blockHeader = (TBlockHeader*)block;
        if ((blockHeader->RefCount == add ? (blockHeader->RefCount = 0, true) : false) || __sync_sub_and_fetch(&blockHeader->RefCount, add) == 0) {
            size_t order = blockHeader->Size / PAGE_ELEM;
            if (ltls.Mode == Alive) {
                // page cache has first priority
                if (order == 1 && ltls.Cached < PAGE_CACHE) {
                    ltls.PageCache[ltls.Cached] = block;
                    ++ltls.Cached;
                    return;
                }
                if (ltls.NeedGC()) {
                    ltls.ClearCount();
                    size_t index = __sync_add_and_fetch(&counter, 1);
                    SysClear(index % ORDERS);
                    UnMap(block, order);
                    return;
                }
            }
            if (!PushPage(block, order)) {
                UnMap(block, order);
            }
        }
    }
    static void Init(TLS& ltls) {
        bool ShouldEnable = (NAllocSetup::IsEnabledByDefault() || ltls.Mode == ToBeEnabled);
        ltls.Mode = Born;
        GlobalInit();
        pthread_setspecific(key, (void*)&ltls);
        if (ShouldEnable) {
            ltls.Mode = Alive;
        } else {
            ltls.Mode = Disabled;
        }
    }
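
    // UnRef() batches reference drops: consecutive frees that hit the same block accumulate
    // in ltls.Counter and are applied with a single atomic operation in UnRefHard() once a
    // different block shows up (or when the thread exits).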
    static void Y_FORCE_INLINE UnRef(void* block, int counter, TLS& ltls) {
        if (ltls.Mode != Alive) {
            UnRefHard(block, counter, ltls);
            return;
        }
        if (ltls.Block == block) {
            ltls.Counter += counter;
        } else {
            if (ltls.Block) {
                UnRefHard(ltls.Block, ltls.Counter, ltls);
            }
            ltls.Block = block;
            ltls.Counter = counter;
        }
    }
    static void Destructor(void* data) {
        TLS& ltls = *(TLS*)data;
        ltls.Mode = Dead;
        if (ltls.Chunk) {
            TBlockHeader* blockHeader = (TBlockHeader*)ltls.Chunk;
            UnRef(ltls.Chunk, PAGE_ELEM - blockHeader->AllCount, ltls);
        }
        if (ltls.Block) {
            UnRef(ltls.Block, ltls.Counter, ltls);
        }
        for (size_t i = 0; i < ltls.Cached; ++i) {
            PushPage(ltls.PageCache[i], 1);
        }
#if defined(_darwin_)
        LibcFree(data);
#endif
    }

    using TAllocHeader = NMalloc::TAllocHeader;
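
    // Allocation fast path: requests that fit into SINGLE_ALLOC are bump-allocated downwards
    // from the thread's current PAGE_ELEM chunk. The chunk's RefCount starts at PAGE_ELEM;
    // each allocation keeps one reference, and the unclaimed remainder (PAGE_ELEM - AllCount)
    // is released when the chunk is retired, so the page is recycled only after every
    // allocation carved from it has been freed. Oversized requests, and requests made while
    // the thread is not Alive, take a dedicated SysAlloc() block instead.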
    static Y_FORCE_INLINE TAllocHeader* AllocateRaw(size_t size, size_t signature) {
        TLS& ltls = tls;
        size = Align(size, sizeof(TAllocHeader));
        if (Y_UNLIKELY(ltls.Mode == Empty || ltls.Mode == ToBeEnabled)) {
            Init(ltls);
        }
        size_t extsize = size + sizeof(TAllocHeader) + sizeof(TBlockHeader);
        if (extsize > SINGLE_ALLOC || ltls.Mode != Alive) {
            // The dlsym() call in GlobalInit() may itself call malloc(), which results in a
            // recursive call of NBalloc::Malloc(). We have to serve such allocation requests
            // via balloc even when (IsEnabledByDefault() == false), because at this point we
            // don't know yet where the libc malloc is.
            if (extsize > 64 * PAGE_ELEM) {
                extsize = Align(extsize, 16 * PAGE_ELEM);
            }
            NAllocSetup::ThrowOnError(extsize);
            void* block = SysAlloc(extsize);
            TBlockHeader* blockHeader = (TBlockHeader*)block;
            blockHeader->RefCount = 1;
            blockHeader->Size = extsize;
            blockHeader->AllCount = 0;
            TAllocHeader* allocHeader = (TAllocHeader*)Advance(block, sizeof(TBlockHeader));
            allocHeader->Encode(blockHeader, size, signature);
            if (NAllocStats::IsEnabled()) {
                NAllocStats::IncThreadAllocStats(size);
            }
#ifdef DBG_FILL_MEMORY
            memset(allocHeader + 1, 0xec, size);
#endif
            return allocHeader;
        }
        size_t ptr = ltls.Ptr;
        void* chunk = ltls.Chunk;
        if (ptr < extsize) {
            NAllocSetup::ThrowOnError(PAGE_ELEM);
            if (chunk) {
                TBlockHeader* blockHeader = (TBlockHeader*)chunk;
                UnRef(chunk, PAGE_ELEM - blockHeader->AllCount, ltls);
            }
            void* block = nullptr;
            while (1) {
                if (ltls.Cached > 0) {
                    --ltls.Cached;
                    block = ltls.PageCache[ltls.Cached];
                    break;
                }
                block = PopPage(1);
                if (block) {
                    break;
                }
                Map(PAGE_ELEM, ltls.PageCache, PAGE_CACHE);
                ltls.Cached = PAGE_CACHE;
            }
            TBlockHeader* blockHeader = (TBlockHeader*)block;
            blockHeader->RefCount = PAGE_ELEM;
            blockHeader->Size = PAGE_ELEM;
            blockHeader->AllCount = 0;
            ltls.Ptr = PAGE_ELEM;
            ltls.Chunk = block;
            ptr = ltls.Ptr;
            chunk = ltls.Chunk;
        }
        ptr = ptr - size - sizeof(TAllocHeader);
        TAllocHeader* allocHeader = (TAllocHeader*)Advance(chunk, ptr);
        allocHeader->Encode(chunk, size, signature);
        TBlockHeader* blockHeader = (TBlockHeader*)chunk;
        ++blockHeader->AllCount;
        ltls.Ptr = ptr;
        if (NAllocStats::IsEnabled()) {
            NAllocStats::IncThreadAllocStats(size);
        }
#ifdef DBG_FILL_MEMORY
        memset(allocHeader + 1, 0xec, size);
#endif
        return allocHeader;
    }
    static void Y_FORCE_INLINE FreeRaw(void* ptr) {
        UnRef(ptr, 1, tls);
    }
} // namespace NBalloc