core-inl.h

  1. #pragma once
  2. // This file contains the core parts of YTAlloc but no malloc/free-bridge.
  3. // The latter bridge is placed into alloc.cpp, which includes (sic!) core-inl.h.
  4. // This ensures that AllocateInline/FreeInline calls are properly inlined into malloc/free.
  5. // Also core-inl.h can be directly included in, e.g., benchmarks.
  6. #include <library/cpp/yt/containers/intrusive_linked_list.h>
  7. #include <library/cpp/yt/memory/memory_tag.h>
  8. #include <library/cpp/yt/threading/at_fork.h>
  9. #include <library/cpp/yt/threading/fork_aware_spin_lock.h>
  10. #include <library/cpp/yt/memory/free_list.h>
  11. #include <util/system/tls.h>
  12. #include <util/system/align.h>
  13. #include <util/system/thread.h>
  14. #include <util/string/printf.h>
  15. #include <util/generic/singleton.h>
  16. #include <util/generic/size_literals.h>
  17. #include <util/generic/utility.h>
  18. #include <util/digest/numeric.h>
  19. #include <library/cpp/ytalloc/api/ytalloc.h>
  20. #include <atomic>
  21. #include <array>
  22. #include <vector>
  23. #include <mutex>
  24. #include <thread>
  25. #include <condition_variable>
  26. #include <cstdio>
  27. #include <optional>
  28. #include <sys/mman.h>
  29. #ifdef _linux_
  30. #include <sys/utsname.h>
  31. #endif
  32. #include <errno.h>
  33. #include <pthread.h>
  34. #include <time.h>
  35. #ifndef MAP_POPULATE
  36. #define MAP_POPULATE 0x08000
  37. #endif
  38. // Like MAP_FIXED, but doesn't unmap the underlying mapping.
  39. // Linux kernels older than 4.17 silently ignore this flag.
  40. #ifndef MAP_FIXED_NOREPLACE
  41. #ifdef _linux_
  42. #define MAP_FIXED_NOREPLACE 0x100000
  43. #else
  44. #define MAP_FIXED_NOREPLACE 0
  45. #endif
  46. #endif
  47. #ifndef MADV_POPULATE
  48. #define MADV_POPULATE 0x59410003
  49. #endif
  50. #ifndef MADV_STOCKPILE
  51. #define MADV_STOCKPILE 0x59410004
  52. #endif
  53. #ifndef MADV_FREE
  54. #define MADV_FREE 8
  55. #endif
  56. #ifndef MADV_DONTDUMP
  57. #define MADV_DONTDUMP 16
  58. #endif
  59. #ifndef NDEBUG
  60. #define YTALLOC_PARANOID
  61. #endif
  62. #ifdef YTALLOC_PARANOID
  63. #define YTALLOC_NERVOUS
  64. #endif
  65. #define YTALLOC_VERIFY(condition) \
  66. do { \
  67. if (Y_UNLIKELY(!(condition))) { \
  68. ::NYT::NYTAlloc::AssertTrap("Assertion failed: " #condition, __FILE__, __LINE__); \
  69. } \
  70. } while (false)
  71. #ifdef NDEBUG
  72. #define YTALLOC_ASSERT(condition) (void)(0)
  73. #else
  74. #define YTALLOC_ASSERT(condition) YTALLOC_VERIFY(condition)
  75. #endif
  76. #ifdef YTALLOC_PARANOID
  77. #define YTALLOC_PARANOID_ASSERT(condition) YTALLOC_VERIFY(condition)
  78. #else
  79. #define YTALLOC_PARANOID_ASSERT(condition) (true || (condition))
  80. #endif
  81. #define YTALLOC_TRAP(message) ::NYT::NYTAlloc::AssertTrap(message, __FILE__, __LINE__)
  82. namespace NYT::NYTAlloc {
  83. ////////////////////////////////////////////////////////////////////////////////
  84. // Allocations are classified into three types:
  85. //
  86. // a) Small chunks (less than LargeAllocationSizeThreshold)
  87. // These are the fastest and are extensively cached (both per-thread and globally).
  88. // Memory claimed for these allocations is never reclaimed back.
  89. // Code dealing with such allocations is heavily optimized, with all hot paths
  90. // as streamlined as possible. The implementation is mostly inspired by LFAlloc.
  91. //
  92. // b) Large blobs (from LargeAllocationSizeThreshold to HugeAllocationSizeThreshold)
  93. // These are cached as well. We expect such allocations to be less frequent
  94. // than small ones but still do our best to provide good scalability.
  95. // In particular, thread-sharded concurrent data structures are used to provide access to
  96. // cached blobs. Memory is claimed via madvise(MADV_POPULATE) and reclaimed back
  97. // via madvise(MADV_FREE).
  98. //
  99. // c) Huge blobs (from HugeAllocationSizeThreshold)
  100. // These should be rare; we delegate directly to mmap and munmap for each allocation.
  101. //
  102. // We also provide a separate allocator for all system allocations (that are needed by YTAlloc itself).
  103. // These are rare and also delegate to mmap/munmap.
  104. // Period between background activity runs.
  105. constexpr auto BackgroundInterval = TDuration::Seconds(1);
  106. static_assert(LargeRankCount - MinLargeRank <= 16, "Too many large ranks");
  107. static_assert(SmallRankCount <= 32, "Too many small ranks");
  108. constexpr size_t SmallZoneSize = 1_TB;
  109. constexpr size_t LargeZoneSize = 16_TB;
  110. constexpr size_t HugeZoneSize = 1_TB;
  111. constexpr size_t SystemZoneSize = 1_TB;
  112. constexpr size_t MaxCachedChunksPerRank = 256;
  113. constexpr uintptr_t UntaggedSmallZonesStart = 0;
  114. constexpr uintptr_t UntaggedSmallZonesEnd = UntaggedSmallZonesStart + 32 * SmallZoneSize;
  115. constexpr uintptr_t MinUntaggedSmallPtr = UntaggedSmallZonesStart + SmallZoneSize * 1;
  116. constexpr uintptr_t MaxUntaggedSmallPtr = UntaggedSmallZonesStart + SmallZoneSize * SmallRankCount;
  117. constexpr uintptr_t TaggedSmallZonesStart = UntaggedSmallZonesEnd;
  118. constexpr uintptr_t TaggedSmallZonesEnd = TaggedSmallZonesStart + 32 * SmallZoneSize;
  119. constexpr uintptr_t MinTaggedSmallPtr = TaggedSmallZonesStart + SmallZoneSize * 1;
  120. constexpr uintptr_t MaxTaggedSmallPtr = TaggedSmallZonesStart + SmallZoneSize * SmallRankCount;
  121. constexpr uintptr_t DumpableLargeZoneStart = TaggedSmallZonesEnd;
  122. constexpr uintptr_t DumpableLargeZoneEnd = DumpableLargeZoneStart + LargeZoneSize;
  123. constexpr uintptr_t UndumpableLargeZoneStart = DumpableLargeZoneEnd;
  124. constexpr uintptr_t UndumpableLargeZoneEnd = UndumpableLargeZoneStart + LargeZoneSize;
  125. constexpr uintptr_t LargeZoneStart(bool dumpable)
  126. {
  127. return dumpable ? DumpableLargeZoneStart : UndumpableLargeZoneStart;
  128. }
  129. constexpr uintptr_t LargeZoneEnd(bool dumpable)
  130. {
  131. return dumpable ? DumpableLargeZoneEnd : UndumpableLargeZoneEnd;
  132. }
  133. constexpr uintptr_t HugeZoneStart = UndumpableLargeZoneEnd;
  134. constexpr uintptr_t HugeZoneEnd = HugeZoneStart + HugeZoneSize;
  135. constexpr uintptr_t SystemZoneStart = HugeZoneEnd;
  136. constexpr uintptr_t SystemZoneEnd = SystemZoneStart + SystemZoneSize;
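// Derived address-space layout (a sketch implied by the constants above; these
// are address ranges, not resident memory):
//   [ 0 TB, 32 TB)  untagged small zones, one 1 TB slot per small rank
//   [32 TB, 64 TB)  tagged small zones, same layout
//   [64 TB, 80 TB)  dumpable large zone
//   [80 TB, 96 TB)  undumpable large zone
//   [96 TB, 97 TB)  huge zone
//   [97 TB, 98 TB)  system zone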
  137. // We leave 64_KB at the end of each 256_MB extent and never use it.
  138. // That serves two purposes:
  139. // 1. SmallExtentSize % SmallSegmentSize == 0
  140. // 2. Every small object satisfies the RightReadableArea requirement.
  141. constexpr size_t SmallExtentAllocSize = 256_MB;
  142. constexpr size_t SmallExtentSize = SmallExtentAllocSize - 64_KB;
  143. constexpr size_t SmallSegmentSize = 96_KB; // LCM(SmallRankToSize)
  144. constexpr ui16 SmallRankBatchSize[SmallRankCount] = {
  145. 0, 256, 256, 256, 256, 256, 256, 256, 256, 256, 192, 128, 96, 64, 48, 32, 24, 16, 12, 8, 6, 4, 3
  146. };
  147. constexpr bool CheckSmallSizes()
  148. {
  149. for (size_t rank = 0; rank < SmallRankCount; rank++) {
  150. auto size = SmallRankToSize[rank];
  151. if (size == 0) {
  152. continue;
  153. }
  154. if (SmallSegmentSize % size != 0) {
  155. return false;
  156. }
  157. if (SmallRankBatchSize[rank] > MaxCachedChunksPerRank) {
  158. return false;
  159. }
  160. }
  161. return true;
  162. }
  163. static_assert(CheckSmallSizes());
  164. static_assert(SmallExtentSize % SmallSegmentSize == 0);
  165. static_assert(SmallSegmentSize % PageSize == 0);
  166. constexpr size_t LargeExtentSize = 1_GB;
  167. static_assert(LargeExtentSize >= LargeAllocationSizeThreshold, "LargeExtentSize < LargeAllocationSizeThreshold");
  168. constexpr const char* BackgroundThreadName = "YTAllocBack";
  169. constexpr const char* StockpileThreadName = "YTAllocStock";
  170. DEFINE_ENUM(EAllocationKind,
  171. (Untagged)
  172. (Tagged)
  173. );
  174. // Forward declarations.
  175. struct TThreadState;
  176. struct TLargeArena;
  177. struct TLargeBlobExtent;
  178. ////////////////////////////////////////////////////////////////////////////////
  179. // Traps and assertions
  180. [[noreturn]]
  181. void OomTrap()
  182. {
  183. _exit(9);
  184. }
  185. [[noreturn]]
  186. void AssertTrap(const char* message, const char* file, int line)
  187. {
  188. ::fprintf(stderr, "*** YTAlloc has detected an internal trap at %s:%d\n*** %s\n",
  189. file,
  190. line,
  191. message);
  192. __builtin_trap();
  193. }
  194. template <class T, class E>
  195. void AssertBlobState(T* header, E expectedState)
  196. {
  197. auto actualState = header->State;
  198. if (Y_UNLIKELY(actualState != expectedState)) {
  199. char message[256];
  200. sprintf(message, "Invalid blob header state at %p: expected %" PRIx64 ", actual %" PRIx64,
  201. header,
  202. static_cast<ui64>(expectedState),
  203. static_cast<ui64>(actualState));
  204. YTALLOC_TRAP(message);
  205. }
  206. }
  207. ////////////////////////////////////////////////////////////////////////////////
  208. // Provides a never-dying singleton with explicit construction.
  209. template <class T>
  210. class TExplicitlyConstructableSingleton
  211. {
  212. public:
  213. TExplicitlyConstructableSingleton()
  214. { }
  215. ~TExplicitlyConstructableSingleton()
  216. { }
  217. template <class... Ts>
  218. void Construct(Ts&&... args)
  219. {
  220. new (&Storage_) T(std::forward<Ts>(args)...);
  221. #ifndef NDEBUG
  222. Constructed_ = true;
  223. #endif
  224. }
  225. Y_FORCE_INLINE T* Get()
  226. {
  227. #ifndef NDEBUG
  228. YTALLOC_PARANOID_ASSERT(Constructed_);
  229. #endif
  230. return &Storage_;
  231. }
  232. Y_FORCE_INLINE const T* Get() const
  233. {
  234. #ifndef NDEBUG
  235. YTALLOC_PARANOID_ASSERT(Constructed_);
  236. #endif
  237. return &Storage_;
  238. }
  239. Y_FORCE_INLINE T* operator->()
  240. {
  241. return Get();
  242. }
  243. Y_FORCE_INLINE const T* operator->() const
  244. {
  245. return Get();
  246. }
  247. Y_FORCE_INLINE T& operator*()
  248. {
  249. return *Get();
  250. }
  251. Y_FORCE_INLINE const T& operator*() const
  252. {
  253. return *Get();
  254. }
  255. private:
  256. union {
  257. T Storage_;
  258. };
  259. #ifndef NDEBUG
  260. bool Constructed_;
  261. #endif
  262. };
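// Note on usage: the singletons declared below (LogManager, ConfigurationManager,
// MappedMemoryManager, ...) are namespace-scope TExplicitlyConstructableSingleton
// instances constructed explicitly from InitializeGlobals(); the anonymous union
// holding Storage_ suppresses both automatic construction and destruction, which
// is what makes these singletons never-dying.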
  263. ////////////////////////////////////////////////////////////////////////////////
  264. // Initializes all singletons.
  265. // Safe to call multiple times.
  266. // Guaranteed to not allocate.
  267. void InitializeGlobals();
  268. // Spawns the background thread, if it's time.
  269. // Safe to call multiple times.
  270. // Must be called on allocation slow path.
  271. void StartBackgroundThread();
  272. ////////////////////////////////////////////////////////////////////////////////
  273. class TLogManager
  274. {
  275. public:
  276. // Sets the handler to be invoked for each log event produced by YTAlloc.
  277. void EnableLogging(TLogHandler logHandler)
  278. {
  279. LogHandler_.store(logHandler);
  280. }
  281. // Checks (in a racy way) whether logging is enabled.
  282. bool IsLoggingEnabled()
  283. {
  284. return LogHandler_.load() != nullptr;
  285. }
  286. // Logs the message via log handler (if any).
  287. template <class... Ts>
  288. void LogMessage(ELogEventSeverity severity, const char* format, Ts&&... args)
  289. {
  290. auto logHandler = LogHandler_.load();
  291. if (!logHandler) {
  292. return;
  293. }
  294. std::array<char, 16_KB> buffer;
  295. auto len = ::snprintf(buffer.data(), buffer.size(), format, std::forward<Ts>(args)...);
  296. TLogEvent event;
  297. event.Severity = severity;
  298. event.Message = TStringBuf(buffer.data(), len);
  299. logHandler(event);
  300. }
  301. // A special case of zero args.
  302. void LogMessage(ELogEventSeverity severity, const char* message)
  303. {
  304. LogMessage(severity, "%s", message);
  305. }
  306. private:
  307. std::atomic<TLogHandler> LogHandler_ = nullptr;
  308. };
  309. TExplicitlyConstructableSingleton<TLogManager> LogManager;
  310. #define YTALLOC_LOG_EVENT(...) LogManager->LogMessage(__VA_ARGS__)
  311. #define YTALLOC_LOG_DEBUG(...) YTALLOC_LOG_EVENT(ELogEventSeverity::Debug, __VA_ARGS__)
  312. #define YTALLOC_LOG_INFO(...) YTALLOC_LOG_EVENT(ELogEventSeverity::Info, __VA_ARGS__)
  313. #define YTALLOC_LOG_WARNING(...) YTALLOC_LOG_EVENT(ELogEventSeverity::Warning, __VA_ARGS__)
  314. #define YTALLOC_LOG_ERROR(...) YTALLOC_LOG_EVENT(ELogEventSeverity::Error, __VA_ARGS__)
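// Illustrative use of the macros above (hypothetical message text):
//   YTALLOC_LOG_DEBUG("Large extent mapped (Size: %zu)", size);
// The message is formatted via snprintf into a 16 KB stack buffer and passed to
// the installed TLogHandler; if no handler is set, the event is dropped.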
  315. ////////////////////////////////////////////////////////////////////////////////
  316. Y_FORCE_INLINE size_t GetUsed(ssize_t allocated, ssize_t freed)
  317. {
  318. return allocated >= freed ? static_cast<size_t>(allocated - freed) : 0;
  319. }
  320. template <class T>
  321. Y_FORCE_INLINE void* HeaderToPtr(T* header)
  322. {
  323. return header + 1;
  324. }
  325. template <class T>
  326. Y_FORCE_INLINE T* PtrToHeader(void* ptr)
  327. {
  328. return static_cast<T*>(ptr) - 1;
  329. }
  330. template <class T>
  331. Y_FORCE_INLINE const T* PtrToHeader(const void* ptr)
  332. {
  333. return static_cast<const T*>(ptr) - 1;
  334. }
  335. Y_FORCE_INLINE size_t PtrToSmallRank(const void* ptr)
  336. {
  337. return (reinterpret_cast<uintptr_t>(ptr) >> 40) & 0x1f;
  338. }
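// The bit trick above works because each small zone spans exactly 1 TB == 2^40
// bytes and the untagged/tagged groups occupy 32 zones each: for an untagged
// pointer bits 40..44 equal its rank directly, and for a tagged pointer (offset
// by 32 TB) the same bits reduce to the rank again thanks to the & 0x1f mask.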
  339. Y_FORCE_INLINE char* AlignDownToSmallSegment(char* extent, char* ptr)
  340. {
  341. auto offset = static_cast<uintptr_t>(ptr - extent);
  342. // NB: SmallSegmentSize is a compile-time constant, so the compiler lowers this modulo to a multiplication rather than a division.
  343. offset -= (offset % SmallSegmentSize);
  344. return extent + offset;
  345. }
  346. Y_FORCE_INLINE char* AlignUpToSmallSegment(char* extent, char* ptr)
  347. {
  348. return AlignDownToSmallSegment(extent, ptr + SmallSegmentSize - 1);
  349. }
  350. template <class T>
  351. static Y_FORCE_INLINE void UnalignPtr(void*& ptr)
  352. {
  353. if (reinterpret_cast<uintptr_t>(ptr) % PageSize == 0) {
  354. reinterpret_cast<char*&>(ptr) -= PageSize - sizeof (T);
  355. }
  356. YTALLOC_PARANOID_ASSERT(reinterpret_cast<uintptr_t>(ptr) % PageSize == sizeof (T));
  357. }
  358. template <class T>
  359. static Y_FORCE_INLINE void UnalignPtr(const void*& ptr)
  360. {
  361. if (reinterpret_cast<uintptr_t>(ptr) % PageSize == 0) {
  362. reinterpret_cast<const char*&>(ptr) -= PageSize - sizeof (T);
  363. }
  364. YTALLOC_PARANOID_ASSERT(reinterpret_cast<uintptr_t>(ptr) % PageSize == sizeof (T));
  365. }
  366. template <class T>
  367. Y_FORCE_INLINE size_t GetRawBlobSize(size_t size)
  368. {
  369. return AlignUp(size + sizeof (T) + RightReadableAreaSize, PageSize);
  370. }
  371. template <class T>
  372. Y_FORCE_INLINE size_t GetBlobAllocationSize(size_t size)
  373. {
  374. size += sizeof(T);
  375. size += RightReadableAreaSize;
  376. size = AlignUp(size, PageSize);
  377. size -= sizeof(T);
  378. size -= RightReadableAreaSize;
  379. return size;
  380. }
  381. Y_FORCE_INLINE size_t GetLargeRank(size_t size)
  382. {
  383. size_t rank = 64 - __builtin_clzl(size);
  384. if (size == (1ULL << (rank - 1))) {
  385. --rank;
  386. }
  387. return rank;
  388. }
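// Worked examples for the helper above: GetLargeRank(32_KB) == 15 because
// 32_KB == 2^15 exactly, while GetLargeRank(32_KB + 1) == 16; i.e. the rank is
// ceil(log2(size)), with exact powers of two mapping to their own exponent.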
  389. Y_FORCE_INLINE void PoisonRange(void* ptr, size_t size, ui32 magic)
  390. {
  391. #ifdef YTALLOC_PARANOID
  392. size = ::AlignUp<size_t>(size, 4);
  393. std::fill(static_cast<ui32*>(ptr), static_cast<ui32*>(ptr) + size / 4, magic);
  394. #else
  395. Y_UNUSED(ptr);
  396. Y_UNUSED(size);
  397. Y_UNUSED(magic);
  398. #endif
  399. }
  400. Y_FORCE_INLINE void PoisonFreedRange(void* ptr, size_t size)
  401. {
  402. PoisonRange(ptr, size, 0xdeadbeef);
  403. }
  404. Y_FORCE_INLINE void PoisonUninitializedRange(void* ptr, size_t size)
  405. {
  406. PoisonRange(ptr, size, 0xcafebabe);
  407. }
  408. // Checks that the header size is divisible by 16 (as needed due to alignment restrictions).
  409. #define CHECK_HEADER_ALIGNMENT(T) static_assert(sizeof(T) % 16 == 0, "sizeof(" #T ") % 16 != 0");
  410. ////////////////////////////////////////////////////////////////////////////////
  411. static_assert(sizeof(TFreeList<void>) == CacheLineSize, "sizeof(TFreeList) != CacheLineSize");
  412. ////////////////////////////////////////////////////////////////////////////////
  413. constexpr size_t ShardCount = 16;
  414. std::atomic<size_t> GlobalCurrentShardIndex;
  415. // Provides a context for working with sharded data structures.
  416. // Captures the initial shard index upon construction (indicating the shard
  417. // where all insertions go). Maintains the current shard index (round-robin,
  418. // indicating the shard currently used for extraction).
  419. // May or may not be thread-safe depending on TCounter.
  420. template <class TCounter>
  421. class TShardedState
  422. {
  423. public:
  424. TShardedState()
  425. : InitialShardIndex_(GlobalCurrentShardIndex++ % ShardCount)
  426. , CurrentShardIndex_(InitialShardIndex_)
  427. { }
  428. Y_FORCE_INLINE size_t GetInitialShardIndex() const
  429. {
  430. return InitialShardIndex_;
  431. }
  432. Y_FORCE_INLINE size_t GetNextShardIndex()
  433. {
  434. return ++CurrentShardIndex_ % ShardCount;
  435. }
  436. private:
  437. const size_t InitialShardIndex_;
  438. TCounter CurrentShardIndex_;
  439. };
  440. using TLocalShardedState = TShardedState<size_t>;
  441. using TGlobalShardedState = TShardedState<std::atomic<size_t>>;
  442. // Implemented as a collection of free lists (each called a shard).
  443. // One needs TShardedState to access the sharded data structure.
  444. template <class T>
  445. class TShardedFreeList
  446. {
  447. public:
  448. // First tries to extract an item from the initial shard;
  449. // if failed then proceeds to all shards in round-robin fashion.
  450. template <class TState>
  451. T* Extract(TState* state)
  452. {
  453. if (auto* item = Shards_[state->GetInitialShardIndex()].Extract()) {
  454. return item;
  455. }
  456. return ExtractRoundRobin(state);
  457. }
  458. // Attempts to extract an item from all shards in round-robin fashion.
  459. template <class TState>
  460. T* ExtractRoundRobin(TState* state)
  461. {
  462. for (size_t index = 0; index < ShardCount; ++index) {
  463. if (auto* item = Shards_[state->GetNextShardIndex()].Extract()) {
  464. return item;
  465. }
  466. }
  467. return nullptr;
  468. }
  469. // Extracts items from all shards linking them together.
  470. T* ExtractAll()
  471. {
  472. T* head = nullptr;
  473. T* tail = nullptr;
  474. for (auto& shard : Shards_) {
  475. auto* item = shard.ExtractAll();
  476. if (!head) {
  477. head = item;
  478. }
  479. if (tail) {
  480. YTALLOC_PARANOID_ASSERT(!tail->Next);
  481. tail->Next = item;
  482. } else {
  483. tail = item;
  484. }
  485. while (tail && tail->Next) {
  486. tail = tail->Next;
  487. }
  488. }
  489. return head;
  490. }
  491. template <class TState>
  492. void Put(TState* state, T* item)
  493. {
  494. Shards_[state->GetInitialShardIndex()].Put(item);
  495. }
  496. private:
  497. std::array<TFreeList<T>, ShardCount> Shards_;
  498. };
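// Illustrative usage (a sketch with hypothetical names): a global cache can pair
// a TShardedFreeList<TCachedBlob> with a TGlobalShardedState; producers call
// Put(&state, blob), which lands in the state's initial shard, consumers call
// Extract(&state), which tries that shard first and then scans the others
// round-robin, and a background sweep can drain everything via ExtractAll().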
  499. ////////////////////////////////////////////////////////////////////////////////
  500. // Holds YTAlloc control knobs.
  501. // Thread safe.
  502. class TConfigurationManager
  503. {
  504. public:
  505. void SetLargeUnreclaimableCoeff(double value)
  506. {
  507. LargeUnreclaimableCoeff_.store(value);
  508. }
  509. double GetLargeUnreclaimableCoeff() const
  510. {
  511. return LargeUnreclaimableCoeff_.load(std::memory_order_relaxed);
  512. }
  513. void SetMinLargeUnreclaimableBytes(size_t value)
  514. {
  515. MinLargeUnreclaimableBytes_.store(value);
  516. }
  517. void SetMaxLargeUnreclaimableBytes(size_t value)
  518. {
  519. MaxLargeUnreclaimableBytes_.store(value);
  520. }
  521. size_t GetMinLargeUnreclaimableBytes() const
  522. {
  523. return MinLargeUnreclaimableBytes_.load(std::memory_order_relaxed);
  524. }
  525. size_t GetMaxLargeUnreclaimableBytes() const
  526. {
  527. return MaxLargeUnreclaimableBytes_.load(std::memory_order_relaxed);
  528. }
  529. void SetTimingEventThreshold(TDuration value)
  530. {
  531. TimingEventThresholdNs_.store(value.MicroSeconds() * 1000);
  532. }
  533. i64 GetTimingEventThresholdNs() const
  534. {
  535. return TimingEventThresholdNs_.load(std::memory_order_relaxed);
  536. }
  537. void SetAllocationProfilingEnabled(bool value);
  538. bool IsAllocationProfilingEnabled() const
  539. {
  540. return AllocationProfilingEnabled_.load();
  541. }
  542. Y_FORCE_INLINE double GetAllocationProfilingSamplingRate()
  543. {
  544. return AllocationProfilingSamplingRate_.load();
  545. }
  546. void SetAllocationProfilingSamplingRate(double rate)
  547. {
  548. if (rate < 0) {
  549. rate = 0;
  550. }
  551. if (rate > 1) {
  552. rate = 1;
  553. }
  554. i64 rateX64K = static_cast<i64>(rate * (1ULL << 16));
  555. AllocationProfilingSamplingRateX64K_.store(ClampVal<ui32>(rateX64K, 0, std::numeric_limits<ui16>::max() + 1));
  556. AllocationProfilingSamplingRate_.store(rate);
  557. }
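// Example of the fixed-point scheme above: a rate of 0.25 stores 16384 in
// AllocationProfilingSamplingRateX64K_; IsAllocationSampled() (see below)
// compares a wrapping per-thread ui16 counter against that value, so the first
// quarter of every 65536-allocation window on a thread is sampled.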
  558. Y_FORCE_INLINE bool IsSmallArenaAllocationProfilingEnabled(size_t rank)
  559. {
  560. return SmallArenaAllocationProfilingEnabled_[rank].load(std::memory_order_relaxed);
  561. }
  562. Y_FORCE_INLINE bool IsSmallArenaAllocationProfiled(size_t rank)
  563. {
  564. return IsSmallArenaAllocationProfilingEnabled(rank) && IsAllocationSampled();
  565. }
  566. void SetSmallArenaAllocationProfilingEnabled(size_t rank, bool value)
  567. {
  568. if (rank >= SmallRankCount) {
  569. return;
  570. }
  571. SmallArenaAllocationProfilingEnabled_[rank].store(value);
  572. }
  573. Y_FORCE_INLINE bool IsLargeArenaAllocationProfilingEnabled(size_t rank)
  574. {
  575. return LargeArenaAllocationProfilingEnabled_[rank].load(std::memory_order_relaxed);
  576. }
  577. Y_FORCE_INLINE bool IsLargeArenaAllocationProfiled(size_t rank)
  578. {
  579. return IsLargeArenaAllocationProfilingEnabled(rank) && IsAllocationSampled();
  580. }
  581. void SetLargeArenaAllocationProfilingEnabled(size_t rank, bool value)
  582. {
  583. if (rank >= LargeRankCount) {
  584. return;
  585. }
  586. LargeArenaAllocationProfilingEnabled_[rank].store(value);
  587. }
  588. Y_FORCE_INLINE int GetProfilingBacktraceDepth()
  589. {
  590. return ProfilingBacktraceDepth_.load();
  591. }
  592. void SetProfilingBacktraceDepth(int depth)
  593. {
  594. if (depth < 1) {
  595. return;
  596. }
  597. if (depth > MaxAllocationProfilingBacktraceDepth) {
  598. depth = MaxAllocationProfilingBacktraceDepth;
  599. }
  600. ProfilingBacktraceDepth_.store(depth);
  601. }
  602. Y_FORCE_INLINE size_t GetMinProfilingBytesUsedToReport()
  603. {
  604. return MinProfilingBytesUsedToReport_.load();
  605. }
  606. void SetMinProfilingBytesUsedToReport(size_t size)
  607. {
  608. MinProfilingBytesUsedToReport_.store(size);
  609. }
  610. void SetEnableEagerMemoryRelease(bool value)
  611. {
  612. EnableEagerMemoryRelease_.store(value);
  613. }
  614. bool GetEnableEagerMemoryRelease()
  615. {
  616. return EnableEagerMemoryRelease_.load(std::memory_order_relaxed);
  617. }
  618. void SetEnableMadvisePopulate(bool value)
  619. {
  620. EnableMadvisePopulate_.store(value);
  621. }
  622. bool GetEnableMadvisePopulate()
  623. {
  624. return EnableMadvisePopulate_.load(std::memory_order_relaxed);
  625. }
  626. void EnableStockpile()
  627. {
  628. StockpileEnabled_.store(true);
  629. }
  630. bool IsStockpileEnabled()
  631. {
  632. return StockpileEnabled_.load();
  633. }
  634. void SetStockpileInterval(TDuration value)
  635. {
  636. StockpileInterval_.store(value);
  637. }
  638. TDuration GetStockpileInterval()
  639. {
  640. return StockpileInterval_.load();
  641. }
  642. void SetStockpileThreadCount(int count)
  643. {
  644. StockpileThreadCount_.store(count);
  645. }
  646. int GetStockpileThreadCount()
  647. {
  648. return ClampVal(StockpileThreadCount_.load(), 0, MaxStockpileThreadCount);
  649. }
  650. void SetStockpileSize(size_t value)
  651. {
  652. StockpileSize_.store(value);
  653. }
  654. size_t GetStockpileSize()
  655. {
  656. return StockpileSize_.load();
  657. }
  658. private:
  659. std::atomic<double> LargeUnreclaimableCoeff_ = 0.05;
  660. std::atomic<size_t> MinLargeUnreclaimableBytes_ = 128_MB;
  661. std::atomic<size_t> MaxLargeUnreclaimableBytes_ = 10_GB;
  662. std::atomic<i64> TimingEventThresholdNs_ = 10000000; // in ns, 10 ms by default
  663. std::atomic<bool> AllocationProfilingEnabled_ = false;
  664. std::atomic<double> AllocationProfilingSamplingRate_ = 1.0;
  665. std::atomic<ui32> AllocationProfilingSamplingRateX64K_ = std::numeric_limits<ui32>::max();
  666. std::array<std::atomic<bool>, SmallRankCount> SmallArenaAllocationProfilingEnabled_ = {};
  667. std::array<std::atomic<bool>, LargeRankCount> LargeArenaAllocationProfilingEnabled_ = {};
  668. std::atomic<int> ProfilingBacktraceDepth_ = 10;
  669. std::atomic<size_t> MinProfilingBytesUsedToReport_ = 1_MB;
  670. std::atomic<bool> EnableEagerMemoryRelease_ = true;
  671. std::atomic<bool> EnableMadvisePopulate_ = false;
  672. std::atomic<bool> StockpileEnabled_ = false;
  673. std::atomic<TDuration> StockpileInterval_ = TDuration::MilliSeconds(10);
  674. static constexpr int MaxStockpileThreadCount = 8;
  675. std::atomic<int> StockpileThreadCount_ = 4;
  676. std::atomic<size_t> StockpileSize_ = 1_GB;
  677. private:
  678. bool IsAllocationSampled()
  679. {
  680. Y_POD_STATIC_THREAD(ui16) Counter;
  681. return Counter++ < AllocationProfilingSamplingRateX64K_.load();
  682. }
  683. };
  684. TExplicitlyConstructableSingleton<TConfigurationManager> ConfigurationManager;
  685. ////////////////////////////////////////////////////////////////////////////////
  686. template <class TEvent, class TManager>
  687. class TEventLogManagerBase
  688. {
  689. public:
  690. void DisableForCurrentThread()
  691. {
  692. TManager::DisabledForCurrentThread_ = true;
  693. }
  694. template <class... TArgs>
  695. void EnqueueEvent(TArgs&&... args)
  696. {
  697. if (TManager::DisabledForCurrentThread_) {
  698. return;
  699. }
  700. auto timestamp = TInstant::Now();
  701. auto fiberId = NYTAlloc::GetCurrentFiberId();
  702. auto guard = Guard(EventLock_);
  703. auto event = TEvent(args...);
  704. OnEvent(event);
  705. if (EventCount_ >= EventBufferSize) {
  706. return;
  707. }
  708. auto& enqueuedEvent = Events_[EventCount_++];
  709. enqueuedEvent = std::move(event);
  710. enqueuedEvent.Timestamp = timestamp;
  711. enqueuedEvent.FiberId = fiberId;
  712. }
  713. void RunBackgroundTasks()
  714. {
  715. if (LogManager->IsLoggingEnabled()) {
  716. for (const auto& event : PullEvents()) {
  717. ProcessEvent(event);
  718. }
  719. }
  720. }
  721. protected:
  722. NThreading::TForkAwareSpinLock EventLock_;
  723. virtual void OnEvent(const TEvent& event) = 0;
  724. virtual void ProcessEvent(const TEvent& event) = 0;
  725. private:
  726. static constexpr size_t EventBufferSize = 1000;
  727. size_t EventCount_ = 0;
  728. std::array<TEvent, EventBufferSize> Events_;
  729. std::vector<TEvent> PullEvents()
  730. {
  731. std::vector<TEvent> events;
  732. events.reserve(EventBufferSize);
  733. auto guard = Guard(EventLock_);
  734. for (size_t index = 0; index < EventCount_; ++index) {
  735. events.push_back(Events_[index]);
  736. }
  737. EventCount_ = 0;
  738. return events;
  739. }
  740. };
  741. ////////////////////////////////////////////////////////////////////////////////
  742. struct TTimingEvent
  743. {
  744. ETimingEventType Type;
  745. TDuration Duration;
  746. size_t Size;
  747. TInstant Timestamp;
  748. TFiberId FiberId;
  749. TTimingEvent()
  750. { }
  751. TTimingEvent(
  752. ETimingEventType type,
  753. TDuration duration,
  754. size_t size)
  755. : Type(type)
  756. , Duration(duration)
  757. , Size(size)
  758. { }
  759. };
  760. class TTimingManager
  761. : public TEventLogManagerBase<TTimingEvent, TTimingManager>
  762. {
  763. public:
  764. TEnumIndexedArray<ETimingEventType, TTimingEventCounters> GetTimingEventCounters()
  765. {
  766. auto guard = Guard(EventLock_);
  767. return EventCounters_;
  768. }
  769. private:
  770. TEnumIndexedArray<ETimingEventType, TTimingEventCounters> EventCounters_;
  771. Y_POD_STATIC_THREAD(bool) DisabledForCurrentThread_;
  772. friend class TEventLogManagerBase<TTimingEvent, TTimingManager>;
  773. virtual void OnEvent(const TTimingEvent& event) override
  774. {
  775. auto& counters = EventCounters_[event.Type];
  776. counters.Count += 1;
  777. counters.Size += event.Size;
  778. }
  779. virtual void ProcessEvent(const TTimingEvent& event) override
  780. {
  781. YTALLOC_LOG_DEBUG("Timing event logged (Type: %s, Duration: %s, Size: %zu, Timestamp: %s, FiberId: %" PRIu64 ")",
  782. ToString(event.Type).c_str(),
  783. ToString(event.Duration).c_str(),
  784. event.Size,
  785. ToString(event.Timestamp).c_str(),
  786. event.FiberId);
  787. }
  788. };
  789. Y_POD_THREAD(bool) TTimingManager::DisabledForCurrentThread_;
  790. TExplicitlyConstructableSingleton<TTimingManager> TimingManager;
  791. ////////////////////////////////////////////////////////////////////////////////
  792. i64 GetElapsedNs(const struct timespec& startTime, const struct timespec& endTime)
  793. {
  794. if (Y_LIKELY(startTime.tv_sec == endTime.tv_sec)) {
  795. return static_cast<i64>(endTime.tv_nsec) - static_cast<i64>(startTime.tv_nsec);
  796. }
  797. return
  798. static_cast<i64>(endTime.tv_nsec) - static_cast<i64>(startTime.tv_nsec) +
  799. (static_cast<i64>(endTime.tv_sec) - static_cast<i64>(startTime.tv_sec)) * 1000000000;
  800. }
  801. // Used to log statistics about long-running syscalls and lock acquisitions.
  802. class TTimingGuard
  803. : public TNonCopyable
  804. {
  805. public:
  806. explicit TTimingGuard(ETimingEventType eventType, size_t size = 0)
  807. : EventType_(eventType)
  808. , Size_(size)
  809. {
  810. ::clock_gettime(CLOCK_MONOTONIC, &StartTime_);
  811. }
  812. ~TTimingGuard()
  813. {
  814. auto elapsedNs = GetElapsedNs();
  815. if (elapsedNs > ConfigurationManager->GetTimingEventThresholdNs()) {
  816. TimingManager->EnqueueEvent(EventType_, TDuration::MicroSeconds(elapsedNs / 1000), Size_);
  817. }
  818. }
  819. private:
  820. const ETimingEventType EventType_;
  821. const size_t Size_;
  822. struct timespec StartTime_;
  823. i64 GetElapsedNs() const
  824. {
  825. struct timespec endTime;
  826. ::clock_gettime(CLOCK_MONOTONIC, &endTime);
  827. return NYTAlloc::GetElapsedNs(StartTime_, endTime);
  828. }
  829. };
  830. template <class T>
  831. Y_FORCE_INLINE TGuard<T> GuardWithTiming(const T& lock)
  832. {
  833. TTimingGuard timingGuard(ETimingEventType::Locking);
  834. TGuard<T> lockingGuard(lock);
  835. return lockingGuard;
  836. }
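// Illustrative usage (sketch): `auto guard = GuardWithTiming(SpinLock_);` acts
// like a plain Guard but, via TTimingGuard, enqueues an ETimingEventType::Locking
// event whenever acquiring the lock takes longer than the configured threshold.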
  837. ////////////////////////////////////////////////////////////////////////////////
  838. // A wrapper for mmap, munmap, and madvise calls.
  839. // The latter are invoked with MADV_POPULATE (if enabled) and MADV_FREE flags
  840. // and may fail if the OS support is missing. These failures are logged (once) and
  841. // handled as follows:
  842. // * if MADV_POPULATE fails then we fall back to manual per-page prefault
  843. // for all subsequent attempts;
  844. // * if MADV_FREE fails then it (and all subsequent attempts) is replaced with MADV_DONTNEED
  845. // (which is non-lazy and less efficient but always available).
  846. // Also this class mlocks all VMAs on startup to prevent page faults in our heavy binaries
  847. // from disturbing latency tails.
  848. class TMappedMemoryManager
  849. {
  850. public:
  851. void* Map(uintptr_t hint, size_t size, int flags)
  852. {
  853. TTimingGuard timingGuard(ETimingEventType::Mmap, size);
  854. auto* result = ::mmap(
  855. reinterpret_cast<void*>(hint),
  856. size,
  857. PROT_READ | PROT_WRITE,
  858. MAP_PRIVATE | MAP_ANONYMOUS | flags,
  859. -1,
  860. 0);
  861. if (result == MAP_FAILED) {
  862. auto error = errno;
  863. if (error == EEXIST && (flags & MAP_FIXED_NOREPLACE)) {
  864. // Caller must retry with different hint address.
  865. return result;
  866. }
  867. YTALLOC_VERIFY(error == ENOMEM);
  868. ::fprintf(stderr, "*** YTAlloc has received ENOMEM error while trying to mmap %zu bytes\n",
  869. size);
  870. OomTrap();
  871. }
  872. return result;
  873. }
  874. void Unmap(void* ptr, size_t size)
  875. {
  876. TTimingGuard timingGuard(ETimingEventType::Munmap, size);
  877. auto result = ::munmap(ptr, size);
  878. YTALLOC_VERIFY(result == 0);
  879. }
  880. void DontDump(void* ptr, size_t size)
  881. {
  882. auto result = ::madvise(ptr, size, MADV_DONTDUMP);
  883. // Must not fail.
  884. YTALLOC_VERIFY(result == 0);
  885. }
  886. void PopulateFile(void* ptr, size_t size)
  887. {
  888. TTimingGuard timingGuard(ETimingEventType::FilePrefault, size);
  889. auto* begin = static_cast<volatile char*>(ptr);
  890. for (auto* current = begin; current < begin + size; current += PageSize) {
  891. *current;
  892. }
  893. }
  894. void PopulateReadOnly(void* ptr, size_t size)
  895. {
  896. if (!MadvisePopulateUnavailable_.load(std::memory_order_relaxed) &&
  897. ConfigurationManager->GetEnableMadvisePopulate())
  898. {
  899. if (!TryMadvisePopulate(ptr, size)) {
  900. MadvisePopulateUnavailable_.store(true);
  901. }
  902. }
  903. }
  904. void Populate(void* ptr, size_t size)
  905. {
  906. if (MadvisePopulateUnavailable_.load(std::memory_order_relaxed) ||
  907. !ConfigurationManager->GetEnableMadvisePopulate())
  908. {
  909. DoPrefault(ptr, size);
  910. } else if (!TryMadvisePopulate(ptr, size)) {
  911. MadvisePopulateUnavailable_.store(true);
  912. DoPrefault(ptr, size);
  913. }
  914. }
  915. void Release(void* ptr, size_t size)
  916. {
  917. if (CanUseMadviseFree() && !ConfigurationManager->GetEnableEagerMemoryRelease()) {
  918. DoMadviseFree(ptr, size);
  919. } else {
  920. DoMadviseDontNeed(ptr, size);
  921. }
  922. }
  923. bool Stockpile(size_t size)
  924. {
  925. if (MadviseStockpileUnavailable_.load(std::memory_order_relaxed)) {
  926. return false;
  927. }
  928. if (!TryMadviseStockpile(size)) {
  929. MadviseStockpileUnavailable_.store(true);
  930. return false;
  931. }
  932. return true;
  933. }
  934. void RunBackgroundTasks()
  935. {
  936. if (!LogManager->IsLoggingEnabled()) {
  937. return;
  938. }
  939. if (IsBuggyKernel() && !BuggyKernelLogged_) {
  940. YTALLOC_LOG_WARNING("Kernel is buggy; see KERNEL-118");
  941. BuggyKernelLogged_ = true;
  942. }
  943. if (MadviseFreeSupported_ && !MadviseFreeSupportedLogged_) {
  944. YTALLOC_LOG_INFO("MADV_FREE is supported");
  945. MadviseFreeSupportedLogged_ = true;
  946. }
  947. if (MadviseFreeNotSupported_ && !MadviseFreeNotSupportedLogged_) {
  948. YTALLOC_LOG_WARNING("MADV_FREE is not supported");
  949. MadviseFreeNotSupportedLogged_ = true;
  950. }
  951. if (MadvisePopulateUnavailable_.load() && !MadvisePopulateUnavailableLogged_) {
  952. YTALLOC_LOG_WARNING("MADV_POPULATE is not supported");
  953. MadvisePopulateUnavailableLogged_ = true;
  954. }
  955. if (MadviseStockpileUnavailable_.load() && !MadviseStockpileUnavailableLogged_) {
  956. YTALLOC_LOG_WARNING("MADV_STOCKPILE is not supported");
  957. MadviseStockpileUnavailableLogged_ = true;
  958. }
  959. }
  960. private:
  961. bool BuggyKernelLogged_ = false;
  962. std::atomic<bool> MadviseFreeSupported_ = false;
  963. bool MadviseFreeSupportedLogged_ = false;
  964. std::atomic<bool> MadviseFreeNotSupported_ = false;
  965. bool MadviseFreeNotSupportedLogged_ = false;
  966. std::atomic<bool> MadvisePopulateUnavailable_ = false;
  967. bool MadvisePopulateUnavailableLogged_ = false;
  968. std::atomic<bool> MadviseStockpileUnavailable_ = false;
  969. bool MadviseStockpileUnavailableLogged_ = false;
  970. private:
  971. bool TryMadvisePopulate(void* ptr, size_t size)
  972. {
  973. TTimingGuard timingGuard(ETimingEventType::MadvisePopulate, size);
  974. auto result = ::madvise(ptr, size, MADV_POPULATE);
  975. if (result != 0) {
  976. auto error = errno;
  977. YTALLOC_VERIFY(error == EINVAL || error == ENOMEM);
  978. if (error == ENOMEM) {
  979. ::fprintf(stderr, "*** YTAlloc has received ENOMEM error while trying to madvise(MADV_POPULATE) %zu bytes\n",
  980. size);
  981. OomTrap();
  982. }
  983. return false;
  984. }
  985. return true;
  986. }
  987. void DoPrefault(void* ptr, size_t size)
  988. {
  989. TTimingGuard timingGuard(ETimingEventType::Prefault, size);
  990. auto* begin = static_cast<char*>(ptr);
  991. for (auto* current = begin; current < begin + size; current += PageSize) {
  992. *current = 0;
  993. }
  994. }
  995. bool CanUseMadviseFree()
  996. {
  997. if (MadviseFreeSupported_.load()) {
  998. return true;
  999. }
  1000. if (MadviseFreeNotSupported_.load()) {
  1001. return false;
  1002. }
  1003. if (IsBuggyKernel()) {
  1004. MadviseFreeNotSupported_.store(true);
  1005. } else {
  1006. auto* ptr = Map(0, PageSize, 0);
  1007. if (::madvise(ptr, PageSize, MADV_FREE) == 0) {
  1008. MadviseFreeSupported_.store(true);
  1009. } else {
  1010. MadviseFreeNotSupported_.store(true);
  1011. }
  1012. Unmap(ptr, PageSize);
  1013. }
  1014. // Recurses at most once: one of the flags above is now set.
  1015. return CanUseMadviseFree();
  1016. }
  1017. void DoMadviseDontNeed(void* ptr, size_t size)
  1018. {
  1019. TTimingGuard timingGuard(ETimingEventType::MadviseDontNeed, size);
  1020. auto result = ::madvise(ptr, size, MADV_DONTNEED);
  1021. if (result != 0) {
  1022. auto error = errno;
  1023. // Failure is possible for locked pages.
  1024. Y_ABORT_UNLESS(error == EINVAL);
  1025. }
  1026. }
  1027. void DoMadviseFree(void* ptr, size_t size)
  1028. {
  1029. TTimingGuard timingGuard(ETimingEventType::MadviseFree, size);
  1030. auto result = ::madvise(ptr, size, MADV_FREE);
  1031. if (result != 0) {
  1032. auto error = errno;
  1033. // Failure is possible for locked pages.
  1034. YTALLOC_VERIFY(error == EINVAL);
  1035. }
  1036. }
  1037. bool TryMadviseStockpile(size_t size)
  1038. {
  1039. auto result = ::madvise(nullptr, size, MADV_STOCKPILE);
  1040. if (result != 0) {
  1041. auto error = errno;
  1042. if (error == ENOMEM || error == EAGAIN || error == EINTR) {
  1043. // The call is advisory, ignore ENOMEM, EAGAIN, and EINTR.
  1044. return true;
  1045. }
  1046. YTALLOC_VERIFY(error == EINVAL);
  1047. return false;
  1048. }
  1049. return true;
  1050. }
  1051. // Some kernels are known to contain bugs in MADV_FREE; see https://st.yandex-team.ru/KERNEL-118.
  1052. bool IsBuggyKernel()
  1053. {
  1054. #ifdef _linux_
  1055. static const bool result = [] () {
  1056. struct utsname buf;
  1057. YTALLOC_VERIFY(uname(&buf) == 0);
  1058. if (strverscmp(buf.release, "4.4.1-1") >= 0 &&
  1059. strverscmp(buf.release, "4.4.96-44") < 0)
  1060. {
  1061. return true;
  1062. }
  1063. if (strverscmp(buf.release, "4.14.1-1") >= 0 &&
  1064. strverscmp(buf.release, "4.14.79-33") < 0)
  1065. {
  1066. return true;
  1067. }
  1068. return false;
  1069. }();
  1070. return result;
  1071. #else
  1072. return false;
  1073. #endif
  1074. }
  1075. };
  1076. TExplicitlyConstructableSingleton<TMappedMemoryManager> MappedMemoryManager;
  1077. ////////////////////////////////////////////////////////////////////////////////
  1078. // System allocator
// Each system allocation is prepended with the following header.
  1080. struct TSystemBlobHeader
  1081. {
  1082. explicit TSystemBlobHeader(size_t size)
  1083. : Size(size)
  1084. { }
  1085. size_t Size;
  1086. char Padding[8];
  1087. };
  1088. CHECK_HEADER_ALIGNMENT(TSystemBlobHeader)
  1089. // Used for some internal allocations.
// Delegates directly to TMappedMemoryManager.
  1091. class TSystemAllocator
  1092. {
  1093. public:
  1094. void* Allocate(size_t size);
  1095. void Free(void* ptr);
  1096. private:
  1097. std::atomic<uintptr_t> CurrentPtr_ = SystemZoneStart;
  1098. };
  1099. TExplicitlyConstructableSingleton<TSystemAllocator> SystemAllocator;
  1100. ////////////////////////////////////////////////////////////////////////////////
// Deriving from this class routes instances' operator new/delete through TSystemAllocator.
  1102. struct TSystemAllocatable
  1103. {
  1104. void* operator new(size_t size) noexcept
  1105. {
  1106. return SystemAllocator->Allocate(size);
  1107. }
  1108. void* operator new[](size_t size) noexcept
  1109. {
  1110. return SystemAllocator->Allocate(size);
  1111. }
  1112. void operator delete(void* ptr) noexcept
  1113. {
  1114. SystemAllocator->Free(ptr);
  1115. }
  1116. void operator delete[](void* ptr) noexcept
  1117. {
  1118. SystemAllocator->Free(ptr);
  1119. }
  1120. };
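// Usage sketch (illustrative only; TMyInternalObject is a hypothetical type): deriving from
// TSystemAllocatable routes operator new/delete through TSystemAllocator, bypassing the
// regular allocation path.
//
//   struct TMyInternalObject
//       : public TSystemAllocatable
//   { };
//
//   auto* obj = new TMyInternalObject(); // served by TSystemAllocator::Allocate
//   delete obj;                          // returned via TSystemAllocator::Free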
  1121. ////////////////////////////////////////////////////////////////////////////////
  1122. // Maintains a pool of objects.
  1123. // Objects are allocated in groups each containing BatchSize instances.
  1124. // The actual allocation is carried out by TSystemAllocator.
  1125. // Memory is never actually reclaimed; freed instances are put into TFreeList.
  1126. template <class T, size_t BatchSize>
  1127. class TSystemPool
  1128. {
  1129. public:
  1130. T* Allocate()
  1131. {
  1132. while (true) {
  1133. auto* obj = FreeList_.Extract();
  1134. if (Y_LIKELY(obj)) {
  1135. new (obj) T();
  1136. return obj;
  1137. }
  1138. AllocateMore();
  1139. }
  1140. }
  1141. void Free(T* obj)
  1142. {
  1143. obj->T::~T();
  1144. PoisonFreedRange(obj, sizeof(T));
  1145. FreeList_.Put(obj);
  1146. }
  1147. private:
  1148. TFreeList<T> FreeList_;
  1149. private:
  1150. void AllocateMore()
  1151. {
  1152. auto* objs = static_cast<T*>(SystemAllocator->Allocate(sizeof(T) * BatchSize));
  1153. for (size_t index = 0; index < BatchSize; ++index) {
  1154. auto* obj = objs + index;
  1155. FreeList_.Put(obj);
  1156. }
  1157. }
  1158. };
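// Usage sketch (illustrative only; TNode is a hypothetical payload type): the pool
// default-constructs objects in place on Allocate and destroys them on Free, recycling
// the memory through its TFreeList.
//
//   TSystemPool<TNode, 256> pool;
//   auto* node = pool.Allocate(); // placement-new of TNode() on a chunk from the free list
//   pool.Free(node);              // runs ~TNode(), poisons the range, returns the chunk to the free list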
// A sharded analogue of TSystemPool.
  1160. template <class T, size_t BatchSize>
  1161. class TShardedSystemPool
  1162. {
  1163. public:
  1164. template <class TState>
  1165. T* Allocate(TState* state)
  1166. {
  1167. if (auto* obj = FreeLists_[state->GetInitialShardIndex()].Extract()) {
  1168. new (obj) T();
  1169. return obj;
  1170. }
  1171. while (true) {
  1172. for (size_t index = 0; index < ShardCount; ++index) {
  1173. if (auto* obj = FreeLists_[state->GetNextShardIndex()].Extract()) {
  1174. new (obj) T();
  1175. return obj;
  1176. }
  1177. }
  1178. AllocateMore();
  1179. }
  1180. }
  1181. template <class TState>
  1182. void Free(TState* state, T* obj)
  1183. {
  1184. obj->T::~T();
  1185. PoisonFreedRange(obj, sizeof(T));
  1186. FreeLists_[state->GetInitialShardIndex()].Put(obj);
  1187. }
  1188. private:
  1189. std::array<TFreeList<T>, ShardCount> FreeLists_;
  1190. private:
  1191. void AllocateMore()
  1192. {
  1193. auto* objs = static_cast<T*>(SystemAllocator->Allocate(sizeof(T) * BatchSize));
  1194. for (size_t index = 0; index < BatchSize; ++index) {
  1195. auto* obj = objs + index;
  1196. FreeLists_[index % ShardCount].Put(obj);
  1197. }
  1198. }
  1199. };
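// Usage sketch (illustrative only; TNode is again hypothetical): the sharded variant spreads
// free lists across ShardCount shards to reduce contention. The caller passes a state object
// (e.g. TThreadState) providing GetInitialShardIndex/GetNextShardIndex.
//
//   TShardedSystemPool<TNode, 256> shardedPool;
//   auto* node = shardedPool.Allocate(threadState); // scans shards starting from the thread's initial shard
//   shardedPool.Free(threadState, node);            // returns the chunk to the thread's initial shard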
  1200. ////////////////////////////////////////////////////////////////////////////////
  1201. // Handles allocations inside a zone of memory given by its start and end pointers.
  1202. // Each allocation is a separate mapped region of memory.
// Special care is taken to guarantee that all allocated regions fall inside the zone.
  1204. class TZoneAllocator
  1205. {
  1206. public:
  1207. TZoneAllocator(uintptr_t zoneStart, uintptr_t zoneEnd)
  1208. : ZoneStart_(zoneStart)
  1209. , ZoneEnd_(zoneEnd)
  1210. , Current_(zoneStart)
  1211. {
  1212. YTALLOC_VERIFY(ZoneStart_ % PageSize == 0);
  1213. }
  1214. void* Allocate(size_t size, int flags)
  1215. {
  1216. YTALLOC_VERIFY(size % PageSize == 0);
  1217. bool restarted = false;
  1218. while (true) {
  1219. auto hint = (Current_ += size) - size;
  1220. if (reinterpret_cast<uintptr_t>(hint) + size > ZoneEnd_) {
  1221. if (restarted) {
  1222. ::fprintf(stderr, "*** YTAlloc was unable to mmap %zu bytes in zone %" PRIx64 "--%" PRIx64 "\n",
  1223. size,
  1224. ZoneStart_,
  1225. ZoneEnd_);
  1226. OomTrap();
  1227. }
  1228. restarted = true;
  1229. Current_ = ZoneStart_;
  1230. } else {
  1231. char* ptr = static_cast<char*>(MappedMemoryManager->Map(
  1232. hint,
  1233. size,
  1234. MAP_FIXED_NOREPLACE | flags));
  1235. if (reinterpret_cast<uintptr_t>(ptr) == hint) {
  1236. return ptr;
  1237. }
  1238. if (ptr != MAP_FAILED) {
  1239. MappedMemoryManager->Unmap(ptr, size);
  1240. }
  1241. }
  1242. }
  1243. }
  1244. void Free(void* ptr, size_t size)
  1245. {
  1246. MappedMemoryManager->Unmap(ptr, size);
  1247. }
  1248. private:
  1249. const uintptr_t ZoneStart_;
  1250. const uintptr_t ZoneEnd_;
  1251. std::atomic<uintptr_t> Current_;
  1252. };
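// Usage sketch (illustrative only; the zone bounds are hypothetical): allocations are
// page-aligned and guaranteed to land inside [zoneStart, zoneEnd).
//
//   TZoneAllocator zone(zoneStart, zoneEnd);
//   auto* ptr = zone.Allocate(4 * PageSize, MAP_POPULATE); // mmaps a fresh region inside the zone
//   zone.Free(ptr, 4 * PageSize);                          // munmaps it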
  1253. ////////////////////////////////////////////////////////////////////////////////
  1254. // YTAlloc supports tagged allocations.
  1255. // Since the total number of tags can be huge, a two-level scheme is employed.
  1256. // Possible tags are arranged into sets each containing TaggedCounterSetSize tags.
  1257. // There are up to MaxTaggedCounterSets in total.
  1258. // Upper 4 sets are reserved for profiled allocations.
  1259. constexpr size_t TaggedCounterSetSize = 16384;
  1260. constexpr size_t AllocationProfilingTaggedCounterSets = 4;
  1261. constexpr size_t MaxTaggedCounterSets = 256 + AllocationProfilingTaggedCounterSets;
  1262. constexpr size_t MaxCapturedAllocationBacktraces = 65000;
  1263. static_assert(
  1264. MaxCapturedAllocationBacktraces < AllocationProfilingTaggedCounterSets * TaggedCounterSetSize,
  1265. "MaxCapturedAllocationBacktraces is too big");
  1266. constexpr TMemoryTag AllocationProfilingMemoryTagBase = TaggedCounterSetSize * (MaxTaggedCounterSets - AllocationProfilingTaggedCounterSets);
  1267. constexpr TMemoryTag AllocationProfilingUnknownMemoryTag = AllocationProfilingMemoryTagBase + MaxCapturedAllocationBacktraces;
  1268. static_assert(
  1269. MaxMemoryTag == TaggedCounterSetSize * (MaxTaggedCounterSets - AllocationProfilingTaggedCounterSets) - 1,
  1270. "Wrong MaxMemoryTag");
  1271. template <class TCounter>
  1272. using TUntaggedTotalCounters = TEnumIndexedArray<EBasicCounter, TCounter>;
  1273. template <class TCounter>
  1274. struct TTaggedTotalCounterSet
  1275. : public TSystemAllocatable
  1276. {
  1277. std::array<TEnumIndexedArray<EBasicCounter, TCounter>, TaggedCounterSetSize> Counters;
  1278. };
  1279. using TLocalTaggedBasicCounterSet = TTaggedTotalCounterSet<ssize_t>;
  1280. using TGlobalTaggedBasicCounterSet = TTaggedTotalCounterSet<std::atomic<ssize_t>>;
  1281. template <class TCounter>
  1282. struct TTotalCounters
  1283. {
  1284. // The sum of counters across all tags.
  1285. TUntaggedTotalCounters<TCounter> CumulativeTaggedCounters;
  1286. // Counters for untagged allocations.
  1287. TUntaggedTotalCounters<TCounter> UntaggedCounters;
  1288. // Access to tagged counters may involve creation of a new tag set.
  1289. // For simplicity, we separate the read-side (TaggedCounterSets) and the write-side (TaggedCounterSetHolders).
  1290. // These arrays contain virtually identical data (up to std::unique_ptr and std::atomic semantic differences).
  1291. std::array<std::atomic<TTaggedTotalCounterSet<TCounter>*>, MaxTaggedCounterSets> TaggedCounterSets{};
  1292. std::array<std::unique_ptr<TTaggedTotalCounterSet<TCounter>>, MaxTaggedCounterSets> TaggedCounterSetHolders;
  1293. // Protects TaggedCounterSetHolders from concurrent updates.
  1294. NThreading::TForkAwareSpinLock TaggedCounterSetsLock;
  1295. // Returns null if the set is not yet constructed.
  1296. Y_FORCE_INLINE TTaggedTotalCounterSet<TCounter>* FindTaggedCounterSet(size_t index) const
  1297. {
  1298. return TaggedCounterSets[index].load();
  1299. }
  1300. // Constructs the set on first access.
  1301. TTaggedTotalCounterSet<TCounter>* GetOrCreateTaggedCounterSet(size_t index)
  1302. {
  1303. auto* set = TaggedCounterSets[index].load();
  1304. if (Y_LIKELY(set)) {
  1305. return set;
  1306. }
  1307. auto guard = GuardWithTiming(TaggedCounterSetsLock);
  1308. auto& setHolder = TaggedCounterSetHolders[index];
  1309. if (!setHolder) {
  1310. setHolder = std::make_unique<TTaggedTotalCounterSet<TCounter>>();
  1311. TaggedCounterSets[index] = setHolder.get();
  1312. }
  1313. return setHolder.get();
  1314. }
  1315. };
  1316. using TLocalSystemCounters = TEnumIndexedArray<ESystemCounter, ssize_t>;
  1317. using TGlobalSystemCounters = TEnumIndexedArray<ESystemCounter, std::atomic<ssize_t>>;
  1318. using TLocalSmallCounters = TEnumIndexedArray<ESmallArenaCounter, ssize_t>;
  1319. using TGlobalSmallCounters = TEnumIndexedArray<ESmallArenaCounter, std::atomic<ssize_t>>;
  1320. using TLocalLargeCounters = TEnumIndexedArray<ELargeArenaCounter, ssize_t>;
  1321. using TGlobalLargeCounters = TEnumIndexedArray<ELargeArenaCounter, std::atomic<ssize_t>>;
  1322. using TLocalHugeCounters = TEnumIndexedArray<EHugeCounter, ssize_t>;
  1323. using TGlobalHugeCounters = TEnumIndexedArray<EHugeCounter, std::atomic<ssize_t>>;
  1324. using TLocalUndumpableCounters = TEnumIndexedArray<EUndumpableCounter, ssize_t>;
  1325. using TGlobalUndumpableCounters = TEnumIndexedArray<EUndumpableCounter, std::atomic<ssize_t>>;
  1326. Y_FORCE_INLINE ssize_t LoadCounter(ssize_t counter)
  1327. {
  1328. return counter;
  1329. }
  1330. Y_FORCE_INLINE ssize_t LoadCounter(const std::atomic<ssize_t>& counter)
  1331. {
  1332. return counter.load();
  1333. }
  1334. ////////////////////////////////////////////////////////////////////////////////
  1335. struct TMmapObservationEvent
  1336. {
  1337. size_t Size;
  1338. std::array<void*, MaxAllocationProfilingBacktraceDepth> Frames;
  1339. int FrameCount;
  1340. TInstant Timestamp;
  1341. TFiberId FiberId;
  1342. TMmapObservationEvent() = default;
  1343. TMmapObservationEvent(
  1344. size_t size,
  1345. std::array<void*, MaxAllocationProfilingBacktraceDepth> frames,
  1346. int frameCount)
  1347. : Size(size)
  1348. , Frames(frames)
  1349. , FrameCount(frameCount)
  1350. { }
  1351. };
  1352. class TMmapObservationManager
  1353. : public TEventLogManagerBase<TMmapObservationEvent, TMmapObservationManager>
  1354. {
  1355. public:
  1356. void SetBacktraceFormatter(TBacktraceFormatter formatter)
  1357. {
  1358. BacktraceFormatter_.store(formatter);
  1359. }
  1360. private:
  1361. std::atomic<TBacktraceFormatter> BacktraceFormatter_ = nullptr;
  1362. Y_POD_STATIC_THREAD(bool) DisabledForCurrentThread_;
  1363. friend class TEventLogManagerBase<TMmapObservationEvent, TMmapObservationManager>;
  1364. virtual void OnEvent(const TMmapObservationEvent& /*event*/) override
  1365. { }
  1366. virtual void ProcessEvent(const TMmapObservationEvent& event) override
  1367. {
  1368. YTALLOC_LOG_DEBUG("Large arena mmap observed (Size: %zu, Timestamp: %s, FiberId: %" PRIx64 ")",
  1369. event.Size,
  1370. ToString(event.Timestamp).c_str(),
  1371. event.FiberId);
  1372. if (auto backtraceFormatter = BacktraceFormatter_.load()) {
  1373. auto backtrace = backtraceFormatter(const_cast<void**>(event.Frames.data()), event.FrameCount);
  1374. YTALLOC_LOG_DEBUG("YTAlloc stack backtrace (Stack: %s)",
  1375. backtrace.c_str());
  1376. }
  1377. }
  1378. };
  1379. Y_POD_THREAD(bool) TMmapObservationManager::DisabledForCurrentThread_;
  1380. TExplicitlyConstructableSingleton<TMmapObservationManager> MmapObservationManager;
  1381. ////////////////////////////////////////////////////////////////////////////////
  1382. // A per-thread structure containing counters, chunk caches etc.
  1383. struct TThreadState
  1384. : public TFreeListItemBase<TThreadState>
  1385. , public TLocalShardedState
  1386. {
  1387. // TThreadState instances of all alive threads are put into a double-linked intrusive list.
  1388. // This is a pair of next/prev pointers connecting an instance of TThreadState to its neighbors.
  1389. TIntrusiveLinkedListNode<TThreadState> RegistryNode;
  1390. // Pointers to the respective parts of TThreadManager::ThreadControlWord_.
  1391. // If null then the thread is already destroyed (but TThreadState may still live for a while
  1392. // due to ref-counting).
  1393. ui8* AllocationProfilingEnabled;
  1394. ui8* BackgroundThreadStarted;
  1395. // TThreadStates are ref-counted.
  1396. // TThreadManager::EnumerateThreadStates enumerates the registered states and acquires
// a temporary reference preventing these states from being destructed. This allows
// the global lock to be held for shorter periods of time.
  1399. int RefCounter = 1;
  1400. // Per-thread counters.
  1401. TTotalCounters<ssize_t> TotalCounters;
  1402. std::array<TLocalLargeCounters, LargeRankCount> LargeArenaCounters;
  1403. TLocalUndumpableCounters UndumpableCounters;
  1404. // Each thread maintains caches of small chunks.
  1405. // One cache is for tagged chunks; the other is for untagged ones.
// Each cache contains up to MaxCachedChunksPerRank chunks of each rank.
// Special sentinels are placed to mark the boundaries of the region containing
// pointers of a specific rank. This enables slightly faster in-place boundary checks.
  1409. static constexpr uintptr_t LeftSentinel = 1;
  1410. static constexpr uintptr_t RightSentinel = 2;
  1411. struct TSmallBlobCache
  1412. {
  1413. TSmallBlobCache()
  1414. {
  1415. void** chunkPtrs = CachedChunks.data();
  1416. for (size_t rank = 0; rank < SmallRankCount; ++rank) {
  1417. RankToCachedChunkPtrHead[rank] = chunkPtrs;
  1418. chunkPtrs[0] = reinterpret_cast<void*>(LeftSentinel);
  1419. chunkPtrs[MaxCachedChunksPerRank + 1] = reinterpret_cast<void*>(RightSentinel);
  1420. #ifdef YTALLOC_PARANOID
  1421. RankToCachedChunkPtrTail[rank] = chunkPtrs;
  1422. CachedChunkFull[rank] = false;
  1423. RankToCachedChunkLeftBorder[rank] = chunkPtrs;
  1424. RankToCachedChunkRightBorder[rank] = chunkPtrs + MaxCachedChunksPerRank + 1;
  1425. #endif
  1426. chunkPtrs += MaxCachedChunksPerRank + 2;
  1427. }
  1428. }
  1429. // For each rank we have a segment of pointers in CachedChunks with the following layout:
  1430. // LCC[C]........R
  1431. // Legend:
  1432. // . = garbage
  1433. // L = left sentinel
  1434. // R = right sentinel
  1435. // C = cached pointer
  1436. // [C] = current cached pointer
  1437. //
  1438. // Under YTALLOC_PARANOID the following layout is used:
  1439. // L.[T]CCC[H]...R
  1440. // Legend:
  1441. // [H] = head cached pointer, put chunks here
  1442. // [T] = tail cached pointer, take chunks from here
  1443. // +2 is for two sentinels
  1444. std::array<void*, SmallRankCount * (MaxCachedChunksPerRank + 2)> CachedChunks{};
// Pointer to [C] (or [H] under YTALLOC_PARANOID) for each rank.
  1446. std::array<void**, SmallRankCount> RankToCachedChunkPtrHead{};
  1447. #ifdef YTALLOC_PARANOID
  1448. // Pointers to [L] and [R] for each rank.
  1449. std::array<void**, SmallRankCount> RankToCachedChunkLeftBorder{};
  1450. std::array<void**, SmallRankCount> RankToCachedChunkRightBorder{};
  1451. std::array<void**, SmallRankCount> RankToCachedChunkPtrTail{};
  1452. std::array<bool, SmallRankCount> CachedChunkFull{};
  1453. #endif
  1454. };
  1455. TEnumIndexedArray<EAllocationKind, TSmallBlobCache> SmallBlobCache;
  1456. };
  1457. struct TThreadStateToRegistryNode
  1458. {
  1459. auto operator() (TThreadState* state) const
  1460. {
  1461. return &state->RegistryNode;
  1462. }
  1463. };
  1464. // Manages all registered threads and controls access to TThreadState.
  1465. class TThreadManager
  1466. {
  1467. public:
  1468. TThreadManager()
  1469. {
  1470. pthread_key_create(&ThreadDtorKey_, DestroyThread);
  1471. NThreading::RegisterAtForkHandlers(
  1472. nullptr,
  1473. nullptr,
  1474. [=] { AfterFork(); });
  1475. }
  1476. // Returns TThreadState for the current thread; the caller guarantees that this
  1477. // state is initialized and is not destroyed yet.
  1478. static TThreadState* GetThreadStateUnchecked();
  1479. // Returns TThreadState for the current thread; may return null.
  1480. static TThreadState* FindThreadState();
  1481. // Returns TThreadState for the current thread; may not return null
  1482. // (but may crash if TThreadState is already destroyed).
  1483. static TThreadState* GetThreadStateChecked()
  1484. {
  1485. auto* state = FindThreadState();
  1486. YTALLOC_VERIFY(state);
  1487. return state;
  1488. }
// Enumerates all threads and invokes the handler, passing TThreadState instances.
// The handler must not throw but can take arbitrary time; no locks are held while it executes.
  1491. template <class THandler>
  1492. void EnumerateThreadStatesAsync(const THandler& handler) noexcept
  1493. {
  1494. TMemoryTagGuard guard(NullMemoryTag);
  1495. std::vector<TThreadState*> states;
  1496. states.reserve(1024); // must be enough in most cases
  1497. auto unrefStates = [&] {
// Releasing references also requires the global lock to be held to avoid the zombie states mentioned above.
  1499. auto guard = GuardWithTiming(ThreadRegistryLock_);
  1500. for (auto* state : states) {
  1501. UnrefThreadState(state);
  1502. }
  1503. };
  1504. auto tryRefStates = [&] {
  1505. // Only hold this guard for a small period of time to reference all the states.
  1506. auto guard = GuardWithTiming(ThreadRegistryLock_);
  1507. auto* current = ThreadRegistry_.GetFront();
  1508. while (current) {
  1509. if (states.size() == states.capacity()) {
  1510. // Cannot allocate while holding ThreadRegistryLock_ due to a possible deadlock as follows:
  1511. // EnumerateThreadStatesAsync -> StartBackgroundThread -> EnumerateThreadStatesSync
  1512. // (many other scenarios are also possible).
  1513. guard.Release();
  1514. unrefStates();
  1515. states.clear();
  1516. states.reserve(states.capacity() * 2);
  1517. return false;
  1518. }
  1519. RefThreadState(current);
  1520. states.push_back(current);
  1521. current = current->RegistryNode.Next;
  1522. }
  1523. return true;
  1524. };
  1525. while (!tryRefStates()) ;
  1526. for (auto* state : states) {
  1527. handler(state);
  1528. }
  1529. unrefStates();
  1530. }
  1531. // Similar to EnumerateThreadStatesAsync but holds the global lock while enumerating the threads.
  1532. // Also invokes a given prologue functor while holding the thread registry lock.
  1533. // Handler and prologue calls must be fast and must not allocate.
  1534. template <class TPrologue, class THandler>
  1535. void EnumerateThreadStatesSync(const TPrologue& prologue, const THandler& handler) noexcept
  1536. {
  1537. auto guard = GuardWithTiming(ThreadRegistryLock_);
  1538. prologue();
  1539. auto* current = ThreadRegistry_.GetFront();
  1540. while (current) {
  1541. handler(current);
  1542. current = current->RegistryNode.Next;
  1543. }
  1544. }
  1545. // We store a special 64-bit "thread control word" in TLS encapsulating the following
  1546. // crucial per-thread parameters:
  1547. // * the current memory tag
// * a flag indicating that a valid TThreadState is known to exist
// (and can be obtained via GetThreadStateUnchecked)
// * a flag indicating that allocation profiling is enabled
// * a flag indicating that the background thread is started
  1552. // Thread control word is fetched via GetThreadControlWord and is compared
  1553. // against FastPathControlWord to see if the fast path can be taken.
  1554. // The latter happens when no memory tagging is configured, TThreadState is
  1555. // valid, allocation profiling is disabled, and background thread is started.
  1556. // The mask for extracting memory tag from thread control word.
  1557. static constexpr ui64 MemoryTagControlWordMask = 0xffffffff;
  1558. // ThreadStateValid is on.
  1559. static constexpr ui64 ThreadStateValidControlWordMask = (1ULL << 32);
  1560. // AllocationProfiling is on.
  1561. static constexpr ui64 AllocationProfilingEnabledControlWordMask = (1ULL << 40);
// All background threads are properly started.
  1563. static constexpr ui64 BackgroundThreadStartedControlWorkMask = (1ULL << 48);
// Memory tag is NullMemoryTag, thread state is valid, allocation profiling is disabled, and the background thread is started.
  1565. static constexpr ui64 FastPathControlWord =
  1566. BackgroundThreadStartedControlWorkMask |
  1567. ThreadStateValidControlWordMask |
  1568. NullMemoryTag;
  1569. Y_FORCE_INLINE static ui64 GetThreadControlWord()
  1570. {
  1571. return (&ThreadControlWord_)->Value;
  1572. }
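// An illustrative sketch of the fast-path check described above; the allocation code that
// actually performs it lives elsewhere in this file.
//
//   if (TThreadManager::GetThreadControlWord() == TThreadManager::FastPathControlWord) {
//       // Null memory tag, valid TThreadState, allocation profiling disabled, background
//       // thread started: safe to take the fast path via GetThreadStateUnchecked().
//   }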
  1573. static TMemoryTag GetCurrentMemoryTag()
  1574. {
  1575. return (&ThreadControlWord_)->Parts.MemoryTag;
  1576. }
  1577. static void SetCurrentMemoryTag(TMemoryTag tag)
  1578. {
  1579. Y_ABORT_UNLESS(tag <= MaxMemoryTag);
  1580. (&ThreadControlWord_)->Parts.MemoryTag = tag;
  1581. }
  1582. static EMemoryZone GetCurrentMemoryZone()
  1583. {
  1584. return CurrentMemoryZone_;
  1585. }
  1586. static void SetCurrentMemoryZone(EMemoryZone zone)
  1587. {
  1588. CurrentMemoryZone_ = zone;
  1589. }
  1590. static void SetCurrentFiberId(TFiberId id)
  1591. {
  1592. CurrentFiberId_ = id;
  1593. }
  1594. static TFiberId GetCurrentFiberId()
  1595. {
  1596. return CurrentFiberId_;
  1597. }
  1598. private:
  1599. static void DestroyThread(void*);
  1600. TThreadState* AllocateThreadState();
  1601. void RefThreadState(TThreadState* state)
  1602. {
  1603. auto result = ++state->RefCounter;
  1604. Y_ABORT_UNLESS(result > 1);
  1605. }
  1606. void UnrefThreadState(TThreadState* state)
  1607. {
  1608. auto result = --state->RefCounter;
  1609. Y_ABORT_UNLESS(result >= 0);
  1610. if (result == 0) {
  1611. DestroyThreadState(state);
  1612. }
  1613. }
  1614. void DestroyThreadState(TThreadState* state);
  1615. void AfterFork();
  1616. private:
  1617. // TThreadState instance for the current thread.
  1618. // Initially null, then initialized when first needed.
  1619. // TThreadState is destroyed upon thread termination (which is detected with
  1620. // the help of pthread_key_create machinery), so this pointer can become null again.
  1621. Y_POD_STATIC_THREAD(TThreadState*) ThreadState_;
// Initially false, then set to true when TThreadState is destroyed.
// If the thread requests its state afterwards, null is returned and no new state is (re-)created.
  1624. // The caller must be able to deal with it.
  1625. Y_POD_STATIC_THREAD(bool) ThreadStateDestroyed_;
  1626. union TThreadControlWord
  1627. {
  1628. ui64 __attribute__((__may_alias__)) Value;
  1629. struct TParts
  1630. {
  1631. // The current memory tag used in all allocations by this thread.
  1632. ui32 __attribute__((__may_alias__)) MemoryTag;
  1633. // Indicates if a valid TThreadState exists and can be obtained via GetThreadStateUnchecked.
  1634. ui8 __attribute__((__may_alias__)) ThreadStateValid;
  1635. // Indicates if allocation profiling is on.
  1636. ui8 __attribute__((__may_alias__)) AllocationProfilingEnabled;
  1637. // Indicates if all background threads are properly started.
  1638. ui8 __attribute__((__may_alias__)) BackgroundThreadStarted;
  1639. ui8 Padding[2];
  1640. } Parts;
  1641. };
  1642. Y_POD_STATIC_THREAD(TThreadControlWord) ThreadControlWord_;
  1643. // See memory zone API.
  1644. Y_POD_STATIC_THREAD(EMemoryZone) CurrentMemoryZone_;
  1645. // See fiber id API.
  1646. Y_POD_STATIC_THREAD(TFiberId) CurrentFiberId_;
  1647. pthread_key_t ThreadDtorKey_;
  1648. static constexpr size_t ThreadStatesBatchSize = 1;
  1649. TSystemPool<TThreadState, ThreadStatesBatchSize> ThreadStatePool_;
  1650. NThreading::TForkAwareSpinLock ThreadRegistryLock_;
  1651. TIntrusiveLinkedList<TThreadState, TThreadStateToRegistryNode> ThreadRegistry_;
  1652. };
  1653. Y_POD_THREAD(TThreadState*) TThreadManager::ThreadState_;
  1654. Y_POD_THREAD(bool) TThreadManager::ThreadStateDestroyed_;
  1655. Y_POD_THREAD(TThreadManager::TThreadControlWord) TThreadManager::ThreadControlWord_;
  1656. Y_POD_THREAD(EMemoryZone) TThreadManager::CurrentMemoryZone_;
  1657. Y_POD_THREAD(TFiberId) TThreadManager::CurrentFiberId_;
  1658. TExplicitlyConstructableSingleton<TThreadManager> ThreadManager;
  1659. ////////////////////////////////////////////////////////////////////////////////
  1660. void TConfigurationManager::SetAllocationProfilingEnabled(bool value)
  1661. {
  1662. // Update threads' TLS.
  1663. ThreadManager->EnumerateThreadStatesSync(
  1664. [&] {
  1665. AllocationProfilingEnabled_.store(value);
  1666. },
  1667. [&] (auto* state) {
  1668. if (state->AllocationProfilingEnabled) {
  1669. *state->AllocationProfilingEnabled = value;
  1670. }
  1671. });
  1672. }
  1673. ////////////////////////////////////////////////////////////////////////////////
  1674. // Backtrace Manager
  1675. //
  1676. // Captures backtraces observed during allocations and assigns memory tags to them.
  1677. // Memory tags are chosen sequentially starting from AllocationProfilingMemoryTagBase.
  1678. //
  1679. // For each backtrace we compute a 64-bit hash and use it as a key in a certain concurrent hashmap.
  1680. // This hashmap is organized into BucketCount buckets, each consisting of BucketSize slots.
  1681. //
  1682. // Backtrace hash is translated into bucket index by taking the appropriate number of
  1683. // its lower bits. For each slot, we remember a 32-bit fingerprint, which is
  1684. // just the next 32 bits of the backtrace's hash, and the (previously assigned) memory tag.
  1685. //
  1686. // Upon access to the hashtable, the bucket is first scanned optimistically, without taking
  1687. // any locks. In case of a miss, a per-bucket spinlock is acquired and the bucket is rescanned.
  1688. //
// The above scheme may produce collisions, but we consider their probability negligible.
  1690. //
  1691. // If the whole hash table overflows (i.e. a total of MaxCapturedAllocationBacktraces
  1692. // backtraces are captured) or the bucket overflows (i.e. all of its slots become occupied),
  1693. // the allocation is annotated with AllocationProfilingUnknownMemoryTag. Such allocations
  1694. // appear as having no backtrace whatsoever in the profiling reports.
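// An illustrative sketch of the hash-to-slot mapping described above (it mirrors the
// arithmetic in CaptureBacktrace below):
//
//   size_t hash = GetBacktraceHash(frames, frameCount);
//   size_t bucketIndex = hash % BucketCount;                   // lower Log2BucketCount bits
//   ui32 fingerprint = (hash >> Log2BucketCount) & 0xffffffff; // next 32 bits; 0 is remapped to 1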
  1695. class TBacktraceManager
  1696. {
  1697. public:
  1698. // Sets the provider used for collecting backtraces when allocation profiling
  1699. // is turned ON.
  1700. void SetBacktraceProvider(TBacktraceProvider provider)
  1701. {
  1702. BacktraceProvider_.store(provider);
  1703. }
  1704. // Captures the backtrace and inserts it into the hashtable.
  1705. TMemoryTag GetMemoryTagFromBacktrace(int framesToSkip)
  1706. {
  1707. std::array<void*, MaxAllocationProfilingBacktraceDepth> frames;
  1708. auto backtraceProvider = BacktraceProvider_.load();
  1709. if (!backtraceProvider) {
  1710. return NullMemoryTag;
  1711. }
  1712. auto frameCount = backtraceProvider(frames.data(), ConfigurationManager->GetProfilingBacktraceDepth(), framesToSkip);
  1713. auto hash = GetBacktraceHash(frames.data(), frameCount);
  1714. return CaptureBacktrace(hash, frames.data(), frameCount);
  1715. }
  1716. // Returns the backtrace corresponding to the given tag, if any.
  1717. std::optional<TBacktrace> FindBacktrace(TMemoryTag tag)
  1718. {
  1719. if (tag < AllocationProfilingMemoryTagBase ||
  1720. tag >= AllocationProfilingMemoryTagBase + MaxCapturedAllocationBacktraces)
  1721. {
  1722. return std::nullopt;
  1723. }
  1724. const auto& entry = Backtraces_[tag - AllocationProfilingMemoryTagBase];
  1725. if (!entry.Captured.load()) {
  1726. return std::nullopt;
  1727. }
  1728. return entry.Backtrace;
  1729. }
  1730. private:
  1731. static constexpr int Log2BucketCount = 16;
  1732. static constexpr int BucketCount = 1 << Log2BucketCount;
  1733. static constexpr int BucketSize = 8;
  1734. std::atomic<TBacktraceProvider> BacktraceProvider_ = nullptr;
std::array<std::array<std::atomic<ui32>, BucketSize>, BucketCount> Fingerprints_ = {};
  1736. std::array<std::array<std::atomic<TMemoryTag>, BucketSize>, BucketCount> MemoryTags_ = {};
  1737. std::array<NThreading::TForkAwareSpinLock, BucketCount> BucketLocks_;
  1738. std::atomic<TMemoryTag> CurrentMemoryTag_ = AllocationProfilingMemoryTagBase;
  1739. struct TBacktraceEntry
  1740. {
  1741. TBacktrace Backtrace;
  1742. std::atomic<bool> Captured = false;
  1743. };
  1744. std::array<TBacktraceEntry, MaxCapturedAllocationBacktraces> Backtraces_;
  1745. private:
  1746. static size_t GetBacktraceHash(void** frames, int frameCount)
  1747. {
  1748. size_t hash = 0;
  1749. for (int index = 0; index < frameCount; ++index) {
  1750. hash = CombineHashes(hash, THash<void*>()(frames[index]));
  1751. }
  1752. return hash;
  1753. }
  1754. TMemoryTag CaptureBacktrace(size_t hash, void** frames, int frameCount)
  1755. {
  1756. size_t bucketIndex = hash % BucketCount;
  1757. ui32 fingerprint = (hash >> Log2BucketCount) & 0xffffffff;
// A zero fingerprint indicates a free slot; adjust to ensure
// that regular fingerprints are always non-zero.
  1760. if (fingerprint == 0) {
  1761. fingerprint = 1;
  1762. }
  1763. for (int slotIndex = 0; slotIndex < BucketSize; ++slotIndex) {
  1764. auto currentFingerprint = Fingerprints_[bucketIndex][slotIndex].load(std::memory_order_relaxed);
  1765. if (currentFingerprint == fingerprint) {
  1766. return MemoryTags_[bucketIndex][slotIndex].load();
  1767. }
  1768. }
  1769. auto guard = Guard(BucketLocks_[bucketIndex]);
  1770. int spareSlotIndex = -1;
  1771. for (int slotIndex = 0; slotIndex < BucketSize; ++slotIndex) {
  1772. auto currentFingerprint = Fingerprints_[bucketIndex][slotIndex].load(std::memory_order_relaxed);
  1773. if (currentFingerprint == fingerprint) {
  1774. return MemoryTags_[bucketIndex][slotIndex];
  1775. }
  1776. if (currentFingerprint == 0) {
  1777. spareSlotIndex = slotIndex;
  1778. break;
  1779. }
  1780. }
  1781. if (spareSlotIndex < 0) {
  1782. return AllocationProfilingUnknownMemoryTag;
  1783. }
  1784. auto memoryTag = CurrentMemoryTag_++;
  1785. if (memoryTag >= AllocationProfilingMemoryTagBase + MaxCapturedAllocationBacktraces) {
  1786. return AllocationProfilingUnknownMemoryTag;
  1787. }
  1788. MemoryTags_[bucketIndex][spareSlotIndex].store(memoryTag);
  1789. Fingerprints_[bucketIndex][spareSlotIndex].store(fingerprint);
  1790. auto& entry = Backtraces_[memoryTag - AllocationProfilingMemoryTagBase];
  1791. entry.Backtrace.FrameCount = frameCount;
  1792. ::memcpy(entry.Backtrace.Frames.data(), frames, sizeof (void*) * frameCount);
  1793. entry.Captured.store(true);
  1794. return memoryTag;
  1795. }
  1796. };
  1797. TExplicitlyConstructableSingleton<TBacktraceManager> BacktraceManager;
  1798. ////////////////////////////////////////////////////////////////////////////////
  1799. // Mimics the counters of TThreadState but uses std::atomic to survive concurrent access.
  1800. struct TGlobalState
  1801. : public TGlobalShardedState
  1802. {
  1803. TTotalCounters<std::atomic<ssize_t>> TotalCounters;
  1804. std::array<TGlobalLargeCounters, LargeRankCount> LargeArenaCounters;
  1805. TGlobalUndumpableCounters UndumpableCounters;
  1806. };
  1807. TExplicitlyConstructableSingleton<TGlobalState> GlobalState;
  1808. ////////////////////////////////////////////////////////////////////////////////
  1809. // Accumulates various allocation statistics.
  1810. class TStatisticsManager
  1811. {
  1812. public:
  1813. template <EAllocationKind Kind = EAllocationKind::Tagged, class TState>
  1814. static Y_FORCE_INLINE void IncrementTotalCounter(TState* state, TMemoryTag tag, EBasicCounter counter, ssize_t delta)
  1815. {
  1816. // This branch is typically resolved at compile time.
  1817. if (Kind == EAllocationKind::Tagged && tag != NullMemoryTag) {
  1818. IncrementTaggedTotalCounter(&state->TotalCounters, tag, counter, delta);
  1819. } else {
  1820. IncrementUntaggedTotalCounter(&state->TotalCounters, counter, delta);
  1821. }
  1822. }
  1823. static Y_FORCE_INLINE void IncrementTotalCounter(TMemoryTag tag, EBasicCounter counter, ssize_t delta)
  1824. {
  1825. IncrementTotalCounter(GlobalState.Get(), tag, counter, delta);
  1826. }
  1827. void IncrementSmallArenaCounter(ESmallArenaCounter counter, size_t rank, ssize_t delta)
  1828. {
  1829. SmallArenaCounters_[rank][counter] += delta;
  1830. }
  1831. template <class TState>
  1832. static Y_FORCE_INLINE void IncrementLargeArenaCounter(TState* state, size_t rank, ELargeArenaCounter counter, ssize_t delta)
  1833. {
  1834. state->LargeArenaCounters[rank][counter] += delta;
  1835. }
  1836. template <class TState>
  1837. static Y_FORCE_INLINE void IncrementUndumpableCounter(TState* state, EUndumpableCounter counter, ssize_t delta)
  1838. {
  1839. state->UndumpableCounters[counter] += delta;
  1840. }
  1841. void IncrementHugeCounter(EHugeCounter counter, ssize_t delta)
  1842. {
  1843. HugeCounters_[counter] += delta;
  1844. }
  1845. void IncrementHugeUndumpableCounter(EUndumpableCounter counter, ssize_t delta)
  1846. {
  1847. HugeUndumpableCounters_[counter] += delta;
  1848. }
  1849. void IncrementSystemCounter(ESystemCounter counter, ssize_t delta)
  1850. {
  1851. SystemCounters_[counter] += delta;
  1852. }
  1853. // Computes memory usage for a list of tags by aggregating counters across threads.
  1854. void GetTaggedMemoryCounters(const TMemoryTag* tags, size_t count, TEnumIndexedArray<EBasicCounter, ssize_t>* counters)
  1855. {
  1856. TMemoryTagGuard guard(NullMemoryTag);
  1857. for (size_t index = 0; index < count; ++index) {
  1858. counters[index][EBasicCounter::BytesAllocated] = 0;
  1859. counters[index][EBasicCounter::BytesFreed] = 0;
  1860. }
  1861. for (size_t index = 0; index < count; ++index) {
  1862. auto tag = tags[index];
  1863. counters[index][EBasicCounter::BytesAllocated] += LoadTaggedTotalCounter(GlobalState->TotalCounters, tag, EBasicCounter::BytesAllocated);
  1864. counters[index][EBasicCounter::BytesFreed] += LoadTaggedTotalCounter(GlobalState->TotalCounters, tag, EBasicCounter::BytesFreed);
  1865. }
  1866. ThreadManager->EnumerateThreadStatesAsync(
  1867. [&] (const auto* state) {
  1868. for (size_t index = 0; index < count; ++index) {
  1869. auto tag = tags[index];
  1870. counters[index][EBasicCounter::BytesAllocated] += LoadTaggedTotalCounter(state->TotalCounters, tag, EBasicCounter::BytesAllocated);
  1871. counters[index][EBasicCounter::BytesFreed] += LoadTaggedTotalCounter(state->TotalCounters, tag, EBasicCounter::BytesFreed);
  1872. }
  1873. });
  1874. for (size_t index = 0; index < count; ++index) {
  1875. counters[index][EBasicCounter::BytesUsed] = GetUsed(counters[index][EBasicCounter::BytesAllocated], counters[index][EBasicCounter::BytesFreed]);
  1876. }
  1877. }
  1878. void GetTaggedMemoryUsage(const TMemoryTag* tags, size_t count, size_t* results)
  1879. {
  1880. TMemoryTagGuard guard(NullMemoryTag);
  1881. std::vector<TEnumIndexedArray<EBasicCounter, ssize_t>> counters;
  1882. counters.resize(count);
  1883. GetTaggedMemoryCounters(tags, count, counters.data());
  1884. for (size_t index = 0; index < count; ++index) {
  1885. results[index] = counters[index][EBasicCounter::BytesUsed];
  1886. }
  1887. }
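// Usage sketch (illustrative only): querying the bytes currently used by two tags via the
// StatisticsManager singleton defined below.
//
//   TMemoryTag tags[] = {1, 2};
//   size_t usage[2];
//   StatisticsManager->GetTaggedMemoryUsage(tags, 2, usage);
//   // usage[i] now holds BytesUsed for tags[i], aggregated across all threads.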
  1888. TEnumIndexedArray<ETotalCounter, ssize_t> GetTotalAllocationCounters()
  1889. {
  1890. TEnumIndexedArray<ETotalCounter, ssize_t> result;
  1891. auto accumulate = [&] (const auto& counters) {
  1892. result[ETotalCounter::BytesAllocated] += LoadCounter(counters[EBasicCounter::BytesAllocated]);
  1893. result[ETotalCounter::BytesFreed] += LoadCounter(counters[EBasicCounter::BytesFreed]);
  1894. };
  1895. accumulate(GlobalState->TotalCounters.UntaggedCounters);
  1896. accumulate(GlobalState->TotalCounters.CumulativeTaggedCounters);
  1897. ThreadManager->EnumerateThreadStatesAsync(
  1898. [&] (const auto* state) {
  1899. accumulate(state->TotalCounters.UntaggedCounters);
  1900. accumulate(state->TotalCounters.CumulativeTaggedCounters);
  1901. });
  1902. result[ETotalCounter::BytesUsed] = GetUsed(
  1903. result[ETotalCounter::BytesAllocated],
  1904. result[ETotalCounter::BytesFreed]);
  1905. auto systemCounters = GetSystemAllocationCounters();
  1906. result[ETotalCounter::BytesCommitted] += systemCounters[EBasicCounter::BytesUsed];
  1907. auto hugeCounters = GetHugeAllocationCounters();
  1908. result[ETotalCounter::BytesCommitted] += hugeCounters[EHugeCounter::BytesUsed];
  1909. auto smallArenaCounters = GetSmallArenaAllocationCounters();
  1910. for (size_t rank = 0; rank < SmallRankCount; ++rank) {
  1911. result[ETotalCounter::BytesCommitted] += smallArenaCounters[rank][ESmallArenaCounter::BytesCommitted];
  1912. }
  1913. auto largeArenaCounters = GetLargeArenaAllocationCounters();
  1914. for (size_t rank = 0; rank < LargeRankCount; ++rank) {
  1915. result[ETotalCounter::BytesCommitted] += largeArenaCounters[rank][ELargeArenaCounter::BytesCommitted];
  1916. }
  1917. result[ETotalCounter::BytesUnaccounted] = std::max<ssize_t>(GetProcessRss() - result[ETotalCounter::BytesCommitted], 0);
  1918. return result;
  1919. }
  1920. TEnumIndexedArray<ESmallCounter, ssize_t> GetSmallAllocationCounters()
  1921. {
  1922. TEnumIndexedArray<ESmallCounter, ssize_t> result;
  1923. auto totalCounters = GetTotalAllocationCounters();
  1924. result[ESmallCounter::BytesAllocated] = totalCounters[ETotalCounter::BytesAllocated];
  1925. result[ESmallCounter::BytesFreed] = totalCounters[ETotalCounter::BytesFreed];
  1926. result[ESmallCounter::BytesUsed] = totalCounters[ETotalCounter::BytesUsed];
  1927. auto largeArenaCounters = GetLargeArenaAllocationCounters();
  1928. for (size_t rank = 0; rank < LargeRankCount; ++rank) {
  1929. result[ESmallCounter::BytesAllocated] -= largeArenaCounters[rank][ELargeArenaCounter::BytesAllocated];
  1930. result[ESmallCounter::BytesFreed] -= largeArenaCounters[rank][ELargeArenaCounter::BytesFreed];
  1931. result[ESmallCounter::BytesUsed] -= largeArenaCounters[rank][ELargeArenaCounter::BytesUsed];
  1932. }
  1933. auto hugeCounters = GetHugeAllocationCounters();
  1934. result[ESmallCounter::BytesAllocated] -= hugeCounters[EHugeCounter::BytesAllocated];
  1935. result[ESmallCounter::BytesFreed] -= hugeCounters[EHugeCounter::BytesFreed];
  1936. result[ESmallCounter::BytesUsed] -= hugeCounters[EHugeCounter::BytesUsed];
  1937. return result;
  1938. }
  1939. std::array<TLocalSmallCounters, SmallRankCount> GetSmallArenaAllocationCounters()
  1940. {
  1941. std::array<TLocalSmallCounters, SmallRankCount> result;
  1942. for (size_t rank = 0; rank < SmallRankCount; ++rank) {
  1943. for (auto counter : TEnumTraits<ESmallArenaCounter>::GetDomainValues()) {
  1944. result[rank][counter] = SmallArenaCounters_[rank][counter].load();
  1945. }
  1946. }
  1947. return result;
  1948. }
  1949. TEnumIndexedArray<ELargeCounter, ssize_t> GetLargeAllocationCounters()
  1950. {
  1951. TEnumIndexedArray<ELargeCounter, ssize_t> result;
  1952. auto largeArenaCounters = GetLargeArenaAllocationCounters();
  1953. for (size_t rank = 0; rank < LargeRankCount; ++rank) {
  1954. result[ESmallCounter::BytesAllocated] += largeArenaCounters[rank][ELargeArenaCounter::BytesAllocated];
  1955. result[ESmallCounter::BytesFreed] += largeArenaCounters[rank][ELargeArenaCounter::BytesFreed];
  1956. result[ESmallCounter::BytesUsed] += largeArenaCounters[rank][ELargeArenaCounter::BytesUsed];
  1957. }
  1958. return result;
  1959. }
  1960. std::array<TLocalLargeCounters, LargeRankCount> GetLargeArenaAllocationCounters()
  1961. {
  1962. std::array<TLocalLargeCounters, LargeRankCount> result{};
  1963. for (size_t rank = 0; rank < LargeRankCount; ++rank) {
  1964. for (auto counter : TEnumTraits<ELargeArenaCounter>::GetDomainValues()) {
  1965. result[rank][counter] = GlobalState->LargeArenaCounters[rank][counter].load();
  1966. }
  1967. }
  1968. ThreadManager->EnumerateThreadStatesAsync(
  1969. [&] (const auto* state) {
  1970. for (size_t rank = 0; rank < LargeRankCount; ++rank) {
  1971. for (auto counter : TEnumTraits<ELargeArenaCounter>::GetDomainValues()) {
  1972. result[rank][counter] += state->LargeArenaCounters[rank][counter];
  1973. }
  1974. }
  1975. });
  1976. for (size_t rank = 0; rank < LargeRankCount; ++rank) {
  1977. result[rank][ELargeArenaCounter::BytesUsed] = GetUsed(result[rank][ELargeArenaCounter::BytesAllocated], result[rank][ELargeArenaCounter::BytesFreed]);
  1978. result[rank][ELargeArenaCounter::BlobsUsed] = GetUsed(result[rank][ELargeArenaCounter::BlobsAllocated], result[rank][ELargeArenaCounter::BlobsFreed]);
  1979. }
  1980. return result;
  1981. }
  1982. TLocalSystemCounters GetSystemAllocationCounters()
  1983. {
  1984. TLocalSystemCounters result;
  1985. for (auto counter : TEnumTraits<ESystemCounter>::GetDomainValues()) {
  1986. result[counter] = SystemCounters_[counter].load();
  1987. }
  1988. result[ESystemCounter::BytesUsed] = GetUsed(result[ESystemCounter::BytesAllocated], result[ESystemCounter::BytesFreed]);
  1989. return result;
  1990. }
  1991. TLocalHugeCounters GetHugeAllocationCounters()
  1992. {
  1993. TLocalHugeCounters result;
  1994. for (auto counter : TEnumTraits<EHugeCounter>::GetDomainValues()) {
  1995. result[counter] = HugeCounters_[counter].load();
  1996. }
  1997. result[EHugeCounter::BytesUsed] = GetUsed(result[EHugeCounter::BytesAllocated], result[EHugeCounter::BytesFreed]);
  1998. result[EHugeCounter::BlobsUsed] = GetUsed(result[EHugeCounter::BlobsAllocated], result[EHugeCounter::BlobsFreed]);
  1999. return result;
  2000. }
  2001. TLocalUndumpableCounters GetUndumpableAllocationCounters()
  2002. {
  2003. TLocalUndumpableCounters result;
  2004. for (auto counter : TEnumTraits<EUndumpableCounter>::GetDomainValues()) {
  2005. result[counter] = HugeUndumpableCounters_[counter].load();
  2006. result[counter] += GlobalState->UndumpableCounters[counter].load();
  2007. }
  2008. ThreadManager->EnumerateThreadStatesAsync(
  2009. [&] (const auto* state) {
  2010. result[EUndumpableCounter::BytesAllocated] += LoadCounter(state->UndumpableCounters[EUndumpableCounter::BytesAllocated]);
  2011. result[EUndumpableCounter::BytesFreed] += LoadCounter(state->UndumpableCounters[EUndumpableCounter::BytesFreed]);
  2012. });
  2013. result[EUndumpableCounter::BytesUsed] = GetUsed(result[EUndumpableCounter::BytesAllocated], result[EUndumpableCounter::BytesFreed]);
  2014. return result;
  2015. }
  2016. // Called before TThreadState is destroyed.
  2017. // Adds the counter values from TThreadState to the global counters.
  2018. void AccumulateLocalCounters(TThreadState* state)
  2019. {
  2020. for (auto counter : TEnumTraits<EBasicCounter>::GetDomainValues()) {
  2021. GlobalState->TotalCounters.CumulativeTaggedCounters[counter] += state->TotalCounters.CumulativeTaggedCounters[counter];
  2022. GlobalState->TotalCounters.UntaggedCounters[counter] += state->TotalCounters.UntaggedCounters[counter];
  2023. }
  2024. for (size_t index = 0; index < MaxTaggedCounterSets; ++index) {
  2025. const auto* localSet = state->TotalCounters.FindTaggedCounterSet(index);
  2026. if (!localSet) {
  2027. continue;
  2028. }
  2029. auto* globalSet = GlobalState->TotalCounters.GetOrCreateTaggedCounterSet(index);
  2030. for (size_t jndex = 0; jndex < TaggedCounterSetSize; ++jndex) {
  2031. for (auto counter : TEnumTraits<EBasicCounter>::GetDomainValues()) {
  2032. globalSet->Counters[jndex][counter] += localSet->Counters[jndex][counter];
  2033. }
  2034. }
  2035. }
  2036. for (size_t rank = 0; rank < LargeRankCount; ++rank) {
  2037. for (auto counter : TEnumTraits<ELargeArenaCounter>::GetDomainValues()) {
  2038. GlobalState->LargeArenaCounters[rank][counter] += state->LargeArenaCounters[rank][counter];
  2039. }
  2040. }
  2041. for (auto counter : TEnumTraits<EUndumpableCounter>::GetDomainValues()) {
  2042. GlobalState->UndumpableCounters[counter] += state->UndumpableCounters[counter];
  2043. }
  2044. }
  2045. private:
  2046. template <class TCounter>
  2047. static ssize_t LoadTaggedTotalCounter(const TTotalCounters<TCounter>& counters, TMemoryTag tag, EBasicCounter counter)
  2048. {
  2049. const auto* set = counters.FindTaggedCounterSet(tag / TaggedCounterSetSize);
  2050. if (Y_UNLIKELY(!set)) {
  2051. return 0;
  2052. }
  2053. return LoadCounter(set->Counters[tag % TaggedCounterSetSize][counter]);
  2054. }
  2055. template <class TCounter>
  2056. static Y_FORCE_INLINE void IncrementUntaggedTotalCounter(TTotalCounters<TCounter>* counters, EBasicCounter counter, ssize_t delta)
  2057. {
  2058. counters->UntaggedCounters[counter] += delta;
  2059. }
  2060. template <class TCounter>
  2061. static Y_FORCE_INLINE void IncrementTaggedTotalCounter(TTotalCounters<TCounter>* counters, TMemoryTag tag, EBasicCounter counter, ssize_t delta)
  2062. {
  2063. counters->CumulativeTaggedCounters[counter] += delta;
  2064. auto* set = counters->GetOrCreateTaggedCounterSet(tag / TaggedCounterSetSize);
  2065. set->Counters[tag % TaggedCounterSetSize][counter] += delta;
  2066. }
  2067. static ssize_t GetProcessRss()
  2068. {
  2069. auto* file = ::fopen("/proc/self/statm", "r");
  2070. if (!file) {
  2071. return 0;
  2072. }
  2073. ssize_t dummy;
  2074. ssize_t rssPages;
  2075. auto readResult = fscanf(file, "%zd %zd", &dummy, &rssPages);
  2076. ::fclose(file);
  2077. if (readResult != 2) {
  2078. return 0;
  2079. }
  2080. return rssPages * PageSize;
  2081. }
  2082. private:
  2083. TGlobalSystemCounters SystemCounters_;
  2084. std::array<TGlobalSmallCounters, SmallRankCount> SmallArenaCounters_;
  2085. TGlobalHugeCounters HugeCounters_;
  2086. TGlobalUndumpableCounters HugeUndumpableCounters_;
  2087. };
  2088. TExplicitlyConstructableSingleton<TStatisticsManager> StatisticsManager;
  2089. ////////////////////////////////////////////////////////////////////////////////
  2090. void* TSystemAllocator::Allocate(size_t size)
  2091. {
  2092. auto rawSize = GetRawBlobSize<TSystemBlobHeader>(size);
  2093. void* mmappedPtr;
  2094. while (true) {
  2095. auto currentPtr = CurrentPtr_.fetch_add(rawSize);
  2096. Y_ABORT_UNLESS(currentPtr + rawSize <= SystemZoneEnd);
  2097. mmappedPtr = MappedMemoryManager->Map(
  2098. currentPtr,
  2099. rawSize,
  2100. MAP_FIXED_NOREPLACE | MAP_POPULATE);
  2101. if (mmappedPtr == reinterpret_cast<void*>(currentPtr)) {
  2102. break;
  2103. }
  2104. if (mmappedPtr != MAP_FAILED) {
  2105. MappedMemoryManager->Unmap(mmappedPtr, rawSize);
  2106. }
  2107. }
  2108. auto* blob = static_cast<TSystemBlobHeader*>(mmappedPtr);
  2109. new (blob) TSystemBlobHeader(size);
  2110. auto* result = HeaderToPtr(blob);
  2111. PoisonUninitializedRange(result, size);
  2112. StatisticsManager->IncrementSystemCounter(ESystemCounter::BytesAllocated, rawSize);
  2113. return result;
  2114. }
  2115. void TSystemAllocator::Free(void* ptr)
  2116. {
  2117. auto* blob = PtrToHeader<TSystemBlobHeader>(ptr);
  2118. auto rawSize = GetRawBlobSize<TSystemBlobHeader>(blob->Size);
  2119. MappedMemoryManager->Unmap(blob, rawSize);
  2120. StatisticsManager->IncrementSystemCounter(ESystemCounter::BytesFreed, rawSize);
  2121. }
  2122. ////////////////////////////////////////////////////////////////////////////////
  2123. // Small allocator
  2124. //
// Allocations (called small chunks) are grouped by their sizes. The two most significant binary digits are
// used to determine the rank of a chunk, which guarantees at most 25% overhead in the worst case.
// A pair of helper arrays (SizeToSmallRank1 and SizeToSmallRank2) is used to compute ranks; we expect
  2128. // them to be permanently cached.
  2129. //
  2130. // Chunks of the same rank are served by a (small) arena allocator.
  2131. // In fact, there are two arenas for each rank: one is for tagged allocations and another is for untagged ones.
  2132. //
  2133. // We encode chunk's rank and whether it is tagged or not in the resulting pointer as follows:
  2134. // 0- 3: must be zero due to alignment
  2135. // 4-39: varies
  2136. // 40-44: rank
  2137. // 45: 0 for untagged allocations, 1 for tagged ones
// 46-63: zeroes
  2139. // This enables computing chunk's rank and also determining if it is tagged in constant time
  2140. // without any additional lookups. Also, one pays no space overhead for untagged allocations
  2141. // and pays 16 bytes for each tagged one.
  2142. //
  2143. // Each arena allocates extents of memory by calling mmap for each extent of SmallExtentSize bytes.
  2144. // (Recall that this memory is never reclaimed.)
  2145. // Each extent is then sliced into segments of SmallSegmentSize bytes.
  2146. // Whenever a new segment is acquired, its memory is pre-faulted by madvise(MADV_POPULATE).
  2147. // New segments are acquired in a lock-free manner.
  2148. //
  2149. // Each thread maintains a separate cache of chunks of each rank (two caches to be precise: one
  2150. // for tagged allocations and the other for untagged). These caches are fully thread-local and
  2151. // involve no atomic operations.
  2152. //
  2153. // There are also global caches (per rank, for tagged and untagged allocations).
  2154. // Instead of keeping individual chunks these work with chunk groups (collections of up to ChunksPerGroup
  2155. // arbitrary chunks).
  2156. //
  2157. // When the local cache becomes exhausted, a group of chunks is fetched from the global cache
  2158. // (if the latter is empty then the arena allocator is consulted).
  2159. // Vice versa, if the local cache overflows, a group of chunks is moved from it to the global cache.
  2160. //
  2161. // Global caches and arena allocators also take care of (rare) cases when Allocate/Free is called
  2162. // without a valid thread state (which happens during thread shutdown when TThreadState is already destroyed).
  2163. //
  2164. // Each arena allocates memory in a certain "data" zone of SmallZoneSize.
  2165. // In addition to that zone, up to two "shadow" zones are maintained.
  2166. //
  2167. // The first one contains memory tags of chunks residing in the primary zone.
  2168. // The second one (which is present if YTALLOC_NERVOUS is defined) contains
  2169. // states of chunks. These states enable some simple internal sanity checks
  2170. // (e.g. detect attempts to double-free a chunk).
  2171. //
  2172. // Addresses in the data zone are directly mapped to offsets in shadow zones.
  2173. // When a segment of a small arena zone is allocated, the relevant portions of shadow
  2174. // zones get initialized (and also accounted for as a system allocation).
  2175. //
  2176. // Shadow zones are memory-mapped with MAP_NORESERVE flag and are quite sparse.
  2177. // These zones are omitted from core dumps due to their huge size and sparsity.
// For each small rank i, SmallRankToLogSize gives the max k such that 2^k <= SmallRankToSize[i].
// A chunk pointer is mapped to its shadow image via the GetShadowOffset helper.
// Note that chunk sizes are not always powers of 2. To avoid costly integer division,
// the chunk pointer is translated by means of a bitwise shift only (leaving some bytes
// of the shadow zones unused). This array provides the needed shifts.
  2183. constexpr int SmallRankToLogSize[SmallRankCount] = {
  2184. 0,
  2185. 4, 5, 5, 6, 6, 7,
  2186. 7, 8, 8, 9, 9, 10, 10, 11,
  2187. 11, 12, 12, 13, 13, 14, 14, 15
  2188. };
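// An illustrative sketch of the shadow mapping and pointer encoding described above
// (names here are hypothetical; the actual helpers are GetShadowOffset below and the
// PtrToSmallRank-style bit extraction per the layout documented earlier):
//
//   size_t shadowOffset = (reinterpret_cast<uintptr_t>(ptr) - dataZoneStart) >> SmallRankToLogSize[rank];
//   size_t rankBits = (reinterpret_cast<uintptr_t>(ptr) >> 40) & 0x1f; // bits 40-44: rank
//   bool taggedBit  = (reinterpret_cast<uintptr_t>(ptr) >> 45) & 0x1;  // bit 45: tagged flag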
  2189. enum class ESmallChunkState : ui8
  2190. {
  2191. Spare = 0,
  2192. Allocated = 0x61, // a
  2193. Freed = 0x66 // f
  2194. };
  2195. class TSmallArenaAllocator
  2196. {
  2197. public:
  2198. TSmallArenaAllocator(EAllocationKind kind, size_t rank, uintptr_t dataZoneStart)
  2199. : Kind_(kind)
  2200. , Rank_(rank)
  2201. , LogSize_(SmallRankToLogSize[Rank_])
  2202. , ChunkSize_(SmallRankToSize[Rank_])
  2203. , DataZoneStart_(dataZoneStart)
  2204. , DataZoneAllocator_(DataZoneStart_, DataZoneStart_ + SmallZoneSize)
  2205. { }
  2206. size_t PullMany(void** batch, size_t maxCount)
  2207. {
  2208. size_t count;
  2209. while (true) {
  2210. count = TryAllocateFromCurrentExtent(batch, maxCount);
  2211. if (Y_LIKELY(count != 0)) {
  2212. break;
  2213. }
  2214. PopulateAnotherExtent();
  2215. }
  2216. return count;
  2217. }
  2218. void* Allocate(size_t size)
  2219. {
  2220. void* ptr;
  2221. auto count = PullMany(&ptr, 1);
  2222. YTALLOC_PARANOID_ASSERT(count == 1);
  2223. YTALLOC_PARANOID_ASSERT(PtrToSmallRank(ptr) == Rank_);
  2224. PoisonUninitializedRange(ptr, size);
  2225. UpdateChunkState(ptr, ESmallChunkState::Freed, ESmallChunkState::Allocated);
  2226. return ptr;
  2227. }
  2228. TMemoryTag GetAndResetMemoryTag(const void* ptr)
  2229. {
  2230. auto& tag = MemoryTagZoneStart_[GetShadowOffset(ptr)];
  2231. auto currentTag = tag;
  2232. tag = NullMemoryTag;
  2233. return currentTag;
  2234. }
  2235. void SetMemoryTag(void* ptr, TMemoryTag tag)
  2236. {
  2237. MemoryTagZoneStart_[GetShadowOffset(ptr)] = tag;
  2238. }
  2239. void UpdateChunkState(const void* ptr, ESmallChunkState expectedState, ESmallChunkState newState)
  2240. {
  2241. #ifdef YTALLOC_NERVOUS
  2242. auto& state = ChunkStateZoneStart_[GetShadowOffset(ptr)];
  2243. auto actualState = state;
  2244. if (Y_UNLIKELY(actualState != expectedState)) {
  2245. char message[256];
  2246. sprintf(message, "Invalid small chunk state at %p: expected %" PRIx8 ", actual %" PRIx8,
  2247. ptr,
  2248. static_cast<ui8>(expectedState),
  2249. static_cast<ui8>(actualState));
  2250. YTALLOC_TRAP(message);
  2251. }
  2252. state = newState;
  2253. #else
  2254. Y_UNUSED(ptr);
  2255. Y_UNUSED(expectedState);
  2256. Y_UNUSED(newState);
  2257. #endif
  2258. }
  2259. private:
  2260. size_t TryAllocateFromCurrentExtent(void** batch, size_t maxCount)
  2261. {
  2262. auto* oldPtr = CurrentPtr_.load();
  2263. if (Y_UNLIKELY(!oldPtr)) {
  2264. return 0;
  2265. }
  2266. auto* currentExtent = CurrentExtent_.load(std::memory_order_relaxed);
  2267. if (Y_UNLIKELY(!currentExtent)) {
  2268. return 0;
  2269. }
  2270. char* newPtr;
  2271. while (true) {
  2272. if (Y_UNLIKELY(oldPtr < currentExtent || oldPtr + ChunkSize_ + RightReadableAreaSize > currentExtent + SmallExtentSize)) {
  2273. return 0;
  2274. }
  2275. newPtr = std::min(
  2276. oldPtr + ChunkSize_ * maxCount,
  2277. currentExtent + SmallExtentSize);
  2278. auto* alignedNewPtr = AlignDownToSmallSegment(currentExtent, newPtr);
  2279. if (alignedNewPtr > oldPtr) {
  2280. newPtr = alignedNewPtr;
  2281. }
  2282. if (Y_LIKELY(CurrentPtr_.compare_exchange_weak(oldPtr, newPtr))) {
  2283. break;
  2284. }
  2285. }
  2286. auto* firstSegment = AlignUpToSmallSegment(currentExtent, oldPtr);
  2287. auto* nextSegment = AlignUpToSmallSegment(currentExtent, newPtr);
  2288. if (firstSegment != nextSegment) {
  2289. auto size = nextSegment - firstSegment;
  2290. MappedMemoryManager->PopulateReadOnly(firstSegment, size);
  2291. StatisticsManager->IncrementSmallArenaCounter(ESmallArenaCounter::BytesCommitted, Rank_, size);
  2292. StatisticsManager->IncrementSmallArenaCounter(ESmallArenaCounter::PagesCommitted, Rank_, size / PageSize);
  2293. if (Kind_ == EAllocationKind::Tagged) {
  2294. StatisticsManager->IncrementSystemCounter(ESystemCounter::BytesAllocated, size / ChunkSize_ * sizeof(TMemoryTag));
  2295. }
  2296. #ifdef YTALLOC_NERVOUS
  2297. StatisticsManager->IncrementSystemCounter(ESystemCounter::BytesAllocated, size / ChunkSize_ * sizeof(ESmallChunkState));
  2298. #endif
  2299. }
  2300. size_t count = 0;
  2301. while (oldPtr != newPtr) {
  2302. UpdateChunkState(oldPtr, ESmallChunkState::Spare, ESmallChunkState::Freed);
  2303. batch[count] = oldPtr;
  2304. oldPtr += ChunkSize_;
  2305. count++;
  2306. }
  2307. return count;
  2308. }
  2309. void PopulateAnotherExtent()
  2310. {
  2311. auto lockGuard = GuardWithTiming(ExtentLock_);
  2312. auto* currentPtr = CurrentPtr_.load();
  2313. auto* currentExtent = CurrentExtent_.load();
  2314. if (currentPtr && currentPtr + ChunkSize_ + RightReadableAreaSize <= currentExtent + SmallExtentSize) {
  2315. // No need for a new extent.
  2316. return;
  2317. }
  2318. auto* newExtent = static_cast<char*>(DataZoneAllocator_.Allocate(SmallExtentAllocSize, 0));
  2319. AllocateShadowZones();
  2320. YTALLOC_VERIFY(reinterpret_cast<uintptr_t>(newExtent) % SmallExtentAllocSize == 0);
  2321. CurrentPtr_ = CurrentExtent_ = newExtent;
  2322. StatisticsManager->IncrementSmallArenaCounter(ESmallArenaCounter::BytesMapped, Rank_, SmallExtentAllocSize);
  2323. StatisticsManager->IncrementSmallArenaCounter(ESmallArenaCounter::PagesMapped, Rank_, SmallExtentAllocSize / PageSize);
  2324. }
  2325. private:
  2326. const EAllocationKind Kind_;
  2327. const size_t Rank_;
  2328. const size_t LogSize_;
  2329. const size_t ChunkSize_;
  2330. const uintptr_t DataZoneStart_;
  2331. TZoneAllocator DataZoneAllocator_;
  2332. bool ShadowZonesAllocated_ = false;
  2333. TMemoryTag* MemoryTagZoneStart_;
  2334. #ifdef YTALLOC_NERVOUS
  2335. ESmallChunkState* ChunkStateZoneStart_;
  2336. #endif
  2337. NThreading::TForkAwareSpinLock ExtentLock_;
  2338. std::atomic<char*> CurrentPtr_ = nullptr;
  2339. std::atomic<char*> CurrentExtent_ = nullptr;
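// Shadow-zone bookkeeping: GetShadowOffset turns a chunk pointer into its chunk index
// within the small data zone, which is then used to address the per-chunk slots mapped
// by MapShadowZone (one slot per chunk, mapped with MAP_NORESERVE and excluded from
// core dumps).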
  2340. size_t GetShadowOffset(const void* ptr)
  2341. {
  2342. return (reinterpret_cast<uintptr_t>(ptr) - DataZoneStart_) >> LogSize_;
  2343. }
  2344. void AllocateShadowZones()
  2345. {
  2346. if (ShadowZonesAllocated_) {
  2347. return;
  2348. }
  2349. if (Kind_ == EAllocationKind::Tagged) {
  2350. MemoryTagZoneStart_ = MapShadowZone<TMemoryTag>();
  2351. }
  2352. #ifdef YTALLOC_NERVOUS
  2353. ChunkStateZoneStart_ = MapShadowZone<ESmallChunkState>();
  2354. #endif
  2355. ShadowZonesAllocated_ = true;
  2356. }
  2357. template <class T>
  2358. T* MapShadowZone()
  2359. {
  2360. auto size = AlignUp((SmallZoneSize >> LogSize_) * sizeof (T), PageSize);
  2361. auto* ptr = static_cast<T*>(MappedMemoryManager->Map(SystemZoneStart, size, MAP_NORESERVE));
  2362. MappedMemoryManager->DontDump(ptr, size);
  2363. return ptr;
  2364. }
  2365. };
  2366. TExplicitlyConstructableSingleton<TEnumIndexedArray<EAllocationKind, std::array<TExplicitlyConstructableSingleton<TSmallArenaAllocator>, SmallRankCount>>> SmallArenaAllocators;
  2367. ////////////////////////////////////////////////////////////////////////////////
  2368. constexpr size_t ChunksPerGroup = 128;
  2369. constexpr size_t GroupsBatchSize = 1024;
  2370. static_assert(ChunksPerGroup <= MaxCachedChunksPerRank, "ChunksPerGroup > MaxCachedChunksPerRank");
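// A fixed-capacity batch of up to ChunksPerGroup cached chunk pointers; groups are the
// unit of exchange between the per-thread small chunk caches and the global cache below.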
  2371. class TChunkGroup
  2372. : public TFreeListItemBase<TChunkGroup>
  2373. {
  2374. public:
  2375. bool IsEmpty() const
  2376. {
  2377. return Size_ == 0;
  2378. }
  2379. size_t ExtractAll(void** ptrs)
  2380. {
  2381. auto count = Size_;
  2382. ::memcpy(ptrs, Ptrs_.data(), count * sizeof(void*));
  2383. Size_ = 0;
  2384. return count;
  2385. }
  2386. void PutOne(void* ptr)
  2387. {
  2388. PutMany(&ptr, 1);
  2389. }
  2390. void PutMany(void** ptrs, size_t count)
  2391. {
  2392. YTALLOC_PARANOID_ASSERT(Size_ == 0);
  2393. YTALLOC_PARANOID_ASSERT(count <= ChunksPerGroup);
  2394. ::memcpy(Ptrs_.data(), ptrs, count * sizeof(void*));
  2395. Size_ = count;
  2396. }
  2397. private:
  2398. size_t Size_ = 0; // <= ChunksPerGroup
  2399. std::array<void*, ChunksPerGroup> Ptrs_;
  2400. };
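// A global cache of chunk groups with one sharded free list per small rank. Threads pull
// whole groups into their local caches when those run dry (TryMoveGroupToLocal) and push
// whole groups back when the local cache overflows or the thread terminates
// (MoveGroupToGlobal, MoveAllToGlobal); MoveOneToGlobal covers frees made by threads
// that no longer have a TThreadState.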
  2401. class TGlobalSmallChunkCache
  2402. {
  2403. public:
  2404. explicit TGlobalSmallChunkCache(EAllocationKind kind)
  2405. : Kind_(kind)
  2406. { }
  2407. #ifdef YTALLOC_PARANOID
  2408. void CanonizeChunkPtrs(TThreadState* state, size_t rank)
  2409. {
  2410. auto& chunkPtrPtr = state->SmallBlobCache[Kind_].RankToCachedChunkPtrHead[rank];
  2411. auto leftBorder = state->SmallBlobCache[Kind_].RankToCachedChunkLeftBorder[rank];
  2412. auto rightBorder = state->SmallBlobCache[Kind_].RankToCachedChunkRightBorder[rank];
  2413. state->SmallBlobCache[Kind_].CachedChunkFull[rank] = false;
  2414. if (chunkPtrPtr + 1 == rightBorder) {
  2415. chunkPtrPtr = leftBorder;
  2416. state->SmallBlobCache[Kind_].CachedChunkFull[rank] = true;
  2417. }
  2418. state->SmallBlobCache[Kind_].RankToCachedChunkPtrTail[rank] = leftBorder;
  2419. }
  2420. #endif
  2421. bool TryMoveGroupToLocal(TThreadState* state, size_t rank)
  2422. {
  2423. auto& groups = RankToChunkGroups_[rank];
  2424. auto* group = groups.Extract(state);
  2425. if (!Y_LIKELY(group)) {
  2426. return false;
  2427. }
  2428. YTALLOC_PARANOID_ASSERT(!group->IsEmpty());
  2429. auto& chunkPtrPtr = state->SmallBlobCache[Kind_].RankToCachedChunkPtrHead[rank];
  2430. #ifdef YTALLOC_PARANOID
  2431. chunkPtrPtr = state->SmallBlobCache[Kind_].RankToCachedChunkLeftBorder[rank];
  2432. state->SmallBlobCache[Kind_].RankToCachedChunkPtrTail[rank] = chunkPtrPtr;
  2433. #endif
  2434. auto chunkCount = group->ExtractAll(chunkPtrPtr + 1);
  2435. chunkPtrPtr += chunkCount;
  2436. #ifdef YTALLOC_PARANOID
  2437. CanonizeChunkPtrs(state, rank);
  2438. #endif
  2439. GroupPool_.Free(state, group);
  2440. return true;
  2441. }
  2442. void MoveGroupToGlobal(TThreadState* state, size_t rank)
  2443. {
  2444. auto* group = GroupPool_.Allocate(state);
  2445. auto& chunkPtrPtr = state->SmallBlobCache[Kind_].RankToCachedChunkPtrHead[rank];
  2446. YTALLOC_PARANOID_ASSERT(*(chunkPtrPtr + 1) == reinterpret_cast<void*>(TThreadState::RightSentinel));
  2447. group->PutMany(chunkPtrPtr - ChunksPerGroup + 1, ChunksPerGroup);
  2448. chunkPtrPtr -= ChunksPerGroup;
  2449. #ifdef YTALLOC_PARANOID
  2450. ::memset(chunkPtrPtr + 1, 0, sizeof(void*) * ChunksPerGroup);
  2451. CanonizeChunkPtrs(state, rank);
  2452. #endif
  2453. auto& groups = RankToChunkGroups_[rank];
  2454. YTALLOC_PARANOID_ASSERT(!group->IsEmpty());
  2455. groups.Put(state, group);
  2456. }
  2457. void MoveOneToGlobal(void* ptr, size_t rank)
  2458. {
  2459. auto* group = GroupPool_.Allocate(&GlobalShardedState_);
  2460. group->PutOne(ptr);
  2461. auto& groups = RankToChunkGroups_[rank];
  2462. YTALLOC_PARANOID_ASSERT(!group->IsEmpty());
  2463. groups.Put(&GlobalShardedState_, group);
  2464. }
  2465. #ifdef YTALLOC_PARANOID
  2466. void MoveAllToGlobal(TThreadState* state, size_t rank)
  2467. {
  2468. auto leftSentinelBorder = state->SmallBlobCache[Kind_].RankToCachedChunkLeftBorder[rank];
  2469. auto rightSentinelBorder = state->SmallBlobCache[Kind_].RankToCachedChunkRightBorder[rank];
  2470. auto& headPtr = state->SmallBlobCache[Kind_].RankToCachedChunkPtrHead[rank];
  2471. auto& tailPtr = state->SmallBlobCache[Kind_].RankToCachedChunkPtrTail[rank];
  2472. if (tailPtr == headPtr && !state->SmallBlobCache[Kind_].CachedChunkFull[rank]) {
  2473. headPtr = leftSentinelBorder;
  2474. return;
  2475. }
  2476. // (leftBorder, rightBorder]
  2477. auto moveIntervalToGlobal = [=] (void** leftBorder, void** rightBorder) {
  2478. while (true) {
  2479. size_t count = 0;
  2480. while (count < ChunksPerGroup && rightBorder != leftBorder) {
  2481. --rightBorder;
  2482. ++count;
  2483. }
  2484. if (count == 0) {
  2485. break;
  2486. }
  2487. auto* group = GroupPool_.Allocate(state);
  2488. group->PutMany(rightBorder + 1, count);
  2489. ::memset(rightBorder + 1, 0, sizeof(void*) * count);
  2490. auto& groups = RankToChunkGroups_[rank];
  2491. groups.Put(state, group);
  2492. }
  2493. };
  2494. if (tailPtr >= headPtr) {
  2495. moveIntervalToGlobal(tailPtr, rightSentinelBorder - 1);
  2496. moveIntervalToGlobal(leftSentinelBorder, headPtr);
  2497. } else {
  2498. moveIntervalToGlobal(tailPtr, headPtr);
  2499. }
  2500. headPtr = leftSentinelBorder;
  2501. }
  2502. #else
  2503. void MoveAllToGlobal(TThreadState* state, size_t rank)
  2504. {
  2505. auto& chunkPtrPtr = state->SmallBlobCache[Kind_].RankToCachedChunkPtrHead[rank];
  2506. while (true) {
  2507. size_t count = 0;
  2508. while (count < ChunksPerGroup && *chunkPtrPtr != reinterpret_cast<void*>(TThreadState::LeftSentinel)) {
  2509. --chunkPtrPtr;
  2510. ++count;
  2511. }
  2512. if (count == 0) {
  2513. break;
  2514. }
  2515. auto* group = GroupPool_.Allocate(state);
  2516. group->PutMany(chunkPtrPtr + 1, count);
  2517. auto& groups = RankToChunkGroups_[rank];
  2518. groups.Put(state, group);
  2519. }
  2520. }
  2521. #endif
  2522. private:
  2523. const EAllocationKind Kind_;
  2524. TGlobalShardedState GlobalShardedState_;
  2525. TShardedSystemPool<TChunkGroup, GroupsBatchSize> GroupPool_;
  2526. std::array<TShardedFreeList<TChunkGroup>, SmallRankCount> RankToChunkGroups_;
  2527. };
  2528. TExplicitlyConstructableSingleton<TEnumIndexedArray<EAllocationKind, TExplicitlyConstructableSingleton<TGlobalSmallChunkCache>>> GlobalSmallChunkCaches;
  2529. ////////////////////////////////////////////////////////////////////////////////
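// Front-end for small allocations. Allocate serves requests from the thread-local chunk
// cache, refilling it from TGlobalSmallChunkCache or, failing that, from the per-rank
// TSmallArenaAllocator; Free returns chunks to the local cache, spilling full groups
// back to the global cache. Threads without a TThreadState (e.g. after it has been
// destroyed) fall back to the *Global code paths.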
  2530. class TSmallAllocator
  2531. {
  2532. public:
  2533. template <EAllocationKind Kind>
  2534. static Y_FORCE_INLINE void* Allocate(TMemoryTag tag, size_t rank)
  2535. {
  2536. auto* state = TThreadManager::FindThreadState();
  2537. if (Y_LIKELY(state)) {
  2538. return Allocate<Kind>(tag, rank, state);
  2539. }
  2540. auto size = SmallRankToSize[rank];
  2541. return AllocateGlobal<Kind>(tag, rank, size);
  2542. }
  2543. #ifdef YTALLOC_PARANOID
  2544. template <EAllocationKind Kind>
  2545. static Y_FORCE_INLINE void* Allocate(TMemoryTag tag, size_t rank, TThreadState* state)
  2546. {
  2547. auto& localCache = state->SmallBlobCache[Kind];
  2548. auto& allocator = *(*SmallArenaAllocators)[Kind][rank];
  2549. size_t size = SmallRankToSize[rank];
  2550. StatisticsManager->IncrementTotalCounter<Kind>(state, tag, EBasicCounter::BytesAllocated, size);
  2551. auto leftBorder = localCache.RankToCachedChunkLeftBorder[rank];
  2552. auto rightBorder = localCache.RankToCachedChunkRightBorder[rank];
  2553. void* result;
  2554. while (true) {
  2555. auto& chunkHeadPtr = localCache.RankToCachedChunkPtrHead[rank];
  2556. auto& cachedHeadPtr = *(chunkHeadPtr + 1);
  2557. auto* headPtr = cachedHeadPtr;
  2558. auto& chunkTailPtr = localCache.RankToCachedChunkPtrTail[rank];
  2559. auto& cachedTailPtr = *(chunkTailPtr + 1);
  2560. auto* tailPtr = cachedTailPtr;
  2561. auto& chunkFull = localCache.CachedChunkFull[rank];
  2562. if (Y_LIKELY(chunkFull || headPtr != tailPtr)) {
  2563. YTALLOC_PARANOID_ASSERT(tailPtr);
  2564. cachedTailPtr = nullptr;
  2565. ++chunkTailPtr;
  2566. if (Y_LIKELY(chunkTailPtr + 1 == rightBorder)) {
  2567. chunkTailPtr = leftBorder;
  2568. }
  2569. chunkFull = false;
  2570. result = tailPtr;
  2571. PoisonUninitializedRange(result, size);
  2572. allocator.UpdateChunkState(result, ESmallChunkState::Freed, ESmallChunkState::Allocated);
  2573. break;
  2574. }
  2575. auto& globalCache = *(*GlobalSmallChunkCaches)[Kind];
  2576. if (!globalCache.TryMoveGroupToLocal(state, rank)) {
  2577. result = allocator.Allocate(size);
  2578. break;
  2579. }
  2580. }
  2581. if constexpr(Kind == EAllocationKind::Tagged) {
  2582. allocator.SetMemoryTag(result, tag);
  2583. }
  2584. return result;
  2585. }
  2586. template <EAllocationKind Kind>
  2587. static Y_FORCE_INLINE void Free(void* ptr)
  2588. {
  2589. auto rank = PtrToSmallRank(ptr);
  2590. auto size = SmallRankToSize[rank];
  2591. auto& allocator = *(*SmallArenaAllocators)[Kind][rank];
  2592. auto tag = NullMemoryTag;
  2593. if constexpr(Kind == EAllocationKind::Tagged) {
  2594. tag = allocator.GetAndResetMemoryTag(ptr);
  2595. YTALLOC_PARANOID_ASSERT(tag != NullMemoryTag);
  2596. }
  2597. allocator.UpdateChunkState(ptr, ESmallChunkState::Allocated, ESmallChunkState::Freed);
  2598. PoisonFreedRange(ptr, size);
  2599. auto* state = TThreadManager::FindThreadState();
  2600. if (Y_UNLIKELY(!state)) {
  2601. FreeGlobal<Kind>(tag, ptr, rank, size);
  2602. return;
  2603. }
  2604. StatisticsManager->IncrementTotalCounter<Kind>(state, tag, EBasicCounter::BytesFreed, size);
  2605. auto& localCache = state->SmallBlobCache[Kind];
  2606. auto leftBorder = localCache.RankToCachedChunkLeftBorder[rank];
  2607. auto rightBorder = localCache.RankToCachedChunkRightBorder[rank];
  2608. while (true) {
  2609. auto& chunkHeadPtr = localCache.RankToCachedChunkPtrHead[rank];
  2610. auto& headPtr = *(chunkHeadPtr + 1);
  2611. auto& chunkTailPtr = localCache.RankToCachedChunkPtrTail[rank];
  2612. auto& chunkFull = localCache.CachedChunkFull[rank];
  2613. if (Y_LIKELY(!chunkFull)) {
  2614. headPtr = ptr;
  2615. ++chunkHeadPtr;
  2616. if (Y_LIKELY(chunkHeadPtr + 1 == rightBorder)) {
  2617. chunkHeadPtr = leftBorder;
  2618. }
  2619. chunkFull = (chunkHeadPtr == chunkTailPtr);
  2620. break;
  2621. }
  2622. chunkHeadPtr = rightBorder - 1;
  2623. chunkTailPtr = leftBorder;
  2624. auto& globalCache = *(*GlobalSmallChunkCaches)[Kind];
  2625. globalCache.MoveGroupToGlobal(state, rank);
  2626. }
  2627. }
  2628. #else
  2629. template <EAllocationKind Kind>
  2630. static Y_FORCE_INLINE void* Allocate(TMemoryTag tag, size_t rank, TThreadState* state)
  2631. {
  2632. size_t size = SmallRankToSize[rank];
  2633. StatisticsManager->IncrementTotalCounter<Kind>(state, tag, EBasicCounter::BytesAllocated, size);
  2634. auto& localCache = state->SmallBlobCache[Kind];
  2635. auto& allocator = *(*SmallArenaAllocators)[Kind][rank];
  2636. void* result;
  2637. while (true) {
  2638. auto& chunkPtr = localCache.RankToCachedChunkPtrHead[rank];
  2639. auto& cachedPtr = *chunkPtr;
  2640. auto* ptr = cachedPtr;
  2641. if (Y_LIKELY(ptr != reinterpret_cast<void*>(TThreadState::LeftSentinel))) {
  2642. --chunkPtr;
  2643. result = ptr;
  2644. allocator.UpdateChunkState(result, ESmallChunkState::Freed, ESmallChunkState::Allocated);
  2645. PoisonUninitializedRange(result, size);
  2646. break;
  2647. }
  2648. auto& globalCache = *(*GlobalSmallChunkCaches)[Kind];
  2649. if (globalCache.TryMoveGroupToLocal(state, rank)) {
  2650. continue;
  2651. }
  2652. auto count = allocator.PullMany(
  2653. chunkPtr + 1,
  2654. SmallRankBatchSize[rank]);
  2655. chunkPtr += count;
  2656. }
  2657. if constexpr(Kind == EAllocationKind::Tagged) {
  2658. allocator.SetMemoryTag(result, tag);
  2659. }
  2660. return result;
  2661. }
  2662. template <EAllocationKind Kind>
  2663. static Y_FORCE_INLINE void Free(void* ptr)
  2664. {
  2665. auto rank = PtrToSmallRank(ptr);
  2666. auto size = SmallRankToSize[rank];
  2667. auto& allocator = *(*SmallArenaAllocators)[Kind][rank];
  2668. auto tag = NullMemoryTag;
  2669. if constexpr(Kind == EAllocationKind::Tagged) {
  2670. tag = allocator.GetAndResetMemoryTag(ptr);
  2671. YTALLOC_PARANOID_ASSERT(tag != NullMemoryTag);
  2672. }
  2673. allocator.UpdateChunkState(ptr, ESmallChunkState::Allocated, ESmallChunkState::Freed);
  2674. PoisonFreedRange(ptr, size);
  2675. auto* state = TThreadManager::FindThreadState();
  2676. if (Y_UNLIKELY(!state)) {
  2677. FreeGlobal<Kind>(tag, ptr, rank, size);
  2678. return;
  2679. }
  2680. StatisticsManager->IncrementTotalCounter<Kind>(state, tag, EBasicCounter::BytesFreed, size);
  2681. auto& localCache = state->SmallBlobCache[Kind];
  2682. while (true) {
  2683. auto& chunkPtrPtr = localCache.RankToCachedChunkPtrHead[rank];
  2684. auto& chunkPtr = *(chunkPtrPtr + 1);
  2685. if (Y_LIKELY(chunkPtr != reinterpret_cast<void*>(TThreadState::RightSentinel))) {
  2686. chunkPtr = ptr;
  2687. ++chunkPtrPtr;
  2688. break;
  2689. }
  2690. auto& globalCache = *(*GlobalSmallChunkCaches)[Kind];
  2691. globalCache.MoveGroupToGlobal(state, rank);
  2692. }
  2693. }
  2694. #endif
  2695. static size_t GetAllocationSize(const void* ptr)
  2696. {
  2697. return SmallRankToSize[PtrToSmallRank(ptr)];
  2698. }
  2699. static size_t GetAllocationSize(size_t size)
  2700. {
  2701. return SmallRankToSize[SizeToSmallRank(size)];
  2702. }
  2703. static void PurgeCaches()
  2704. {
  2705. DoPurgeCaches<EAllocationKind::Untagged>();
  2706. DoPurgeCaches<EAllocationKind::Tagged>();
  2707. }
  2708. private:
  2709. template <EAllocationKind Kind>
  2710. static void DoPurgeCaches()
  2711. {
  2712. auto* state = TThreadManager::GetThreadStateChecked();
  2713. for (size_t rank = 0; rank < SmallRankCount; ++rank) {
  2714. (*GlobalSmallChunkCaches)[Kind]->MoveAllToGlobal(state, rank);
  2715. }
  2716. }
  2717. template <EAllocationKind Kind>
  2718. static void* AllocateGlobal(TMemoryTag tag, size_t rank, size_t size)
  2719. {
  2720. StatisticsManager->IncrementTotalCounter(tag, EBasicCounter::BytesAllocated, size);
  2721. auto& allocator = *(*SmallArenaAllocators)[Kind][rank];
  2722. auto* result = allocator.Allocate(size);
  2723. if constexpr(Kind == EAllocationKind::Tagged) {
  2724. allocator.SetMemoryTag(result, tag);
  2725. }
  2726. return result;
  2727. }
  2728. template <EAllocationKind Kind>
  2729. static void FreeGlobal(TMemoryTag tag, void* ptr, size_t rank, size_t size)
  2730. {
  2731. StatisticsManager->IncrementTotalCounter(tag, EBasicCounter::BytesFreed, size);
  2732. auto& globalCache = *(*GlobalSmallChunkCaches)[Kind];
  2733. globalCache.MoveOneToGlobal(ptr, rank);
  2734. }
  2735. };
  2736. ////////////////////////////////////////////////////////////////////////////////
  2737. // Large blob allocator
  2738. //
  2739. // As with small chunks, large blobs are grouped into arenas, where arena K handles
  2740. // blobs of size (2^{K-1}, 2^K]. Memory is mapped in extents of LargeExtentSize bytes.
  2741. // Each extent is split into segments of size 2^K (here a segment is just a memory region, which may consist entirely of
  2742. // unmapped pages). When a segment is actually allocated, it becomes a blob and a TLargeBlobHeader
  2743. // structure is placed at its start.
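// For example (illustrative figures), a 1.5 MB user allocation is first rounded up to its
// raw size (header plus page alignment) and then served from the arena whose segment size
// is the next power of two, i.e. 2 MB; only the raw size is populated, the rest of the
// segment stays untouched.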
  2744. //
  2745. // When an extent is allocated, it is sliced into segments (not blobs, since no headers are placed and
  2746. // no memory is touched). These segments are put into the disposed segment list.
  2747. //
  2748. // For each blob two separate sizes are maintained: BytesAcquired indicates the number of bytes
  2749. // acquired via madvise(MADV_POPULATE) from the system; BytesAllocated (<= BytesAcquired) corresponds
  2750. // to the number of bytes claimed by the user (including the header and page size alignment).
  2751. // If BytesAllocated == 0 then this blob is spare, i.e.
  2752. // was freed and remains cached for further possible reuse.
  2753. //
  2754. // When a new blob is being allocated, the allocator first tries to extract a spare blob. On success,
  2755. // its acquired size is extended (if needed); the acquired size never shrinks on allocation.
  2756. // If no spare blobs exist, a disposed segment is extracted and is turned into a blob (i.e.
  2757. // its header is initialized) and the needed number of bytes is acquired. If no disposed segments
  2758. // exist, then a new extent is allocated and sliced into segments.
  2759. //
  2760. // The above algorithm only claims memory from the system (by means of madvise(MADV_POPULATE));
  2761. // the reclaim is handled by a separate background mechanism. Two types of reclaimable memory
  2762. // regions are possible:
  2763. // * spare: these correspond to spare blobs; upon reclaiming this region becomes a disposed segment
  2764. // * overhead: these correspond to trailing parts of allocated blobs in the [BytesAllocated, BytesAcquired) byte range
  2765. //
  2766. // Reclaiming spare blobs is easy as these are explicitly tracked by spare blob lists. To reclaim,
  2767. // we atomically extract a blob from a spare list, call madvise(MADV_FREE), and put the pointer to
  2768. // the disposed segment list.
  2769. //
  2770. // Reclaiming overheads is more complicated since (a) allocated blobs are never tracked directly and
  2771. // (b) reclaiming them may interfere with Allocate and Free.
  2772. //
  2773. // To overcome (a), for each extent we maintain a bitmap marking segments that are actually blobs
  2774. // (i.e. contain a header). (For simplicity and efficiency this bitmap is just a vector of bytes.)
  2775. // These flags are updated in Allocate/Free with appropriate memory ordering. Note that
  2776. // blobs are only disposed (and are turned into segments) by the background thread; if this
  2777. // thread discovers a segment that is marked as a blob, then it is safe to assume that this segment
  2778. // remains a blob unless the thread disposes it.
  2779. //
  2780. // To overcome (b), each large blob header maintains a spin lock. When blob B is extracted
  2781. // from a spare list in Allocate, an acquisition is tried. If successful, B is returned to the
  2782. // user. Otherwise it is assumed that B is currently being examined by the background
  2783. // reclaimer thread. Allocate then skips this blob and retries extraction; the problem is that
  2784. // since the spare list is basically a stack, one cannot just push B back into it.
  2785. // Instead, B is pushed into a special locked spare list. This list is purged by the background
  2786. // thread on each tick and its items are pushed back into the usual spare list.
  2787. //
  2788. // A similar trick is used by Free: when invoked for blob B, it first tries to acquire B's spin lock.
  2789. // Upon success, B is moved to the spare list. On failure, Free has to postpone the deallocation
  2790. // by moving B into the locked freed list. This list, similarly, is purged by the background thread.
  2791. //
  2792. // It remains to explain how the background thread computes the number of bytes to be reclaimed from
  2793. // each arena. To this end, we first compute the total number of reclaimable bytes.
  2794. // This is the sum of spare and overhead bytes in all arenas minus the number of unreclaimable bytes.
  2795. // The latter grows linearly in the number of used bytes and is capped from below by MinLargeUnreclaimableBytes
  2796. // and from above by MaxLargeUnreclaimableBytes. SetLargeUnreclaimableCoeff and Set(Min|Max)LargeUnreclaimableBytes
  2797. // enable tuning these control knobs. Reclaimable bytes are taken from arenas starting with those
  2798. // with the largest spare and overhead volumes.
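// An illustrative calculation (the figures below are made up, not defaults): with
// LargeUnreclaimableCoeff = 0.05, MinLargeUnreclaimableBytes = 128 MB,
// MaxLargeUnreclaimableBytes = 512 MB and 4 GB currently used, the threshold is
// clamp(0.05 * 4 GB, 128 MB, 512 MB) ~ 205 MB; if spare plus overhead across all arenas
// totals 1 GB, then about 1 GB - 205 MB ~ 819 MB (rounded up to a page multiple) is
// eligible for reclaim on this tick.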
  2799. //
  2800. // The above implies that each large blob is prepended with a fixed-size header.
  2801. // Hence ptr % PageSize == sizeof (TLargeBlobHeader) for each ptr returned by Allocate
  2802. // (since large blob sizes are larger than PageSize and are divisible by PageSize).
  2803. // For AllocatePageAligned, however, ptr must be divisible by PageSize. To handle such an allocation, we
  2804. // artificially increase its size and align the result of Allocate up to the next page boundary.
  2805. // When handling a deallocation, ptr is moved back by UnalignPtr (which is capable of dealing
  2806. // with both the results of Allocate and AllocatePageAligned).
  2807. // This technique applies to both large and huge blobs.
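// A worked example (sizes are illustrative): with a 4 KB page and a 64-byte
// TLargeBlobHeader, Allocate returns blobStart + 64, so ptr % PageSize == 64.
// AllocatePageAligned pads the requested size and rounds the Allocate result up to the
// next page boundary (blobStart + 4096); on deallocation UnalignPtr maps either kind of
// pointer back to the position right after the header before Free proceeds.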
  2808. enum ELargeBlobState : ui64
  2809. {
  2810. Allocated = 0x6c6c61656772616cULL, // largeall
  2811. Spare = 0x727073656772616cULL, // largespr
  2812. LockedSpare = 0x70736c656772616cULL, // largelsp
  2813. LockedFreed = 0x72666c656772616cULL // largelfr
  2814. };
  2815. // Every large blob (either tagged or not) is prepended with this header.
  2816. struct TLargeBlobHeader
  2817. : public TFreeListItemBase<TLargeBlobHeader>
  2818. {
  2819. TLargeBlobHeader(
  2820. TLargeBlobExtent* extent,
  2821. size_t bytesAcquired,
  2822. size_t bytesAllocated,
  2823. TMemoryTag tag)
  2824. : Extent(extent)
  2825. , BytesAcquired(bytesAcquired)
  2826. , Tag(tag)
  2827. , BytesAllocated(bytesAllocated)
  2828. , State(ELargeBlobState::Allocated)
  2829. { }
  2830. TLargeBlobExtent* Extent;
  2831. // Number of bytes in all acquired pages.
  2832. size_t BytesAcquired;
  2833. std::atomic<bool> Locked = false;
  2834. TMemoryTag Tag = NullMemoryTag;
  2835. // For spare blobs this is zero.
  2836. // For allocated blobs this is the number of bytes requested by the user (not including the header or any alignment).
  2837. size_t BytesAllocated;
  2838. ELargeBlobState State;
  2839. char Padding[12];
  2840. };
  2841. CHECK_HEADER_ALIGNMENT(TLargeBlobHeader)
  2842. struct TLargeBlobExtent
  2843. {
  2844. TLargeBlobExtent(size_t segmentCount, char* ptr)
  2845. : SegmentCount(segmentCount)
  2846. , Ptr(ptr)
  2847. { }
  2848. size_t SegmentCount;
  2849. char* Ptr;
  2850. TLargeBlobExtent* NextExtent = nullptr;
  2851. std::atomic<bool> DisposedFlags[0];
  2852. };
  2853. // A helper node that enables storing a number of an extent's segments
  2854. // in a free list. Recall that segments themselves do not possess any headers.
  2855. struct TDisposedSegment
  2856. : public TFreeListItemBase<TDisposedSegment>
  2857. {
  2858. size_t Index;
  2859. TLargeBlobExtent* Extent;
  2860. };
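// Per-rank state of the large blob allocator: the spare and locked blob lists, the list
// of disposed segments, the chain of extents, and the cursor used by the background
// thread when scanning extents for overhead bytes to reclaim.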
  2861. struct TLargeArena
  2862. {
  2863. size_t Rank = 0;
  2864. size_t SegmentSize = 0;
  2865. TShardedFreeList<TLargeBlobHeader> SpareBlobs;
  2866. TFreeList<TLargeBlobHeader> LockedSpareBlobs;
  2867. TFreeList<TLargeBlobHeader> LockedFreedBlobs;
  2868. TFreeList<TDisposedSegment> DisposedSegments;
  2869. std::atomic<TLargeBlobExtent*> FirstExtent = nullptr;
  2870. TLargeBlobExtent* CurrentOverheadScanExtent = nullptr;
  2871. size_t CurrentOverheadScanSegment = 0;
  2872. };
  2873. template <bool Dumpable>
  2874. class TLargeBlobAllocator
  2875. {
  2876. public:
  2877. TLargeBlobAllocator()
  2878. : ZoneAllocator_(LargeZoneStart(Dumpable), LargeZoneEnd(Dumpable))
  2879. {
  2880. for (size_t rank = 0; rank < Arenas_.size(); ++rank) {
  2881. auto& arena = Arenas_[rank];
  2882. arena.Rank = rank;
  2883. arena.SegmentSize = (1ULL << rank);
  2884. }
  2885. }
  2886. void* Allocate(size_t size)
  2887. {
  2888. auto* state = TThreadManager::FindThreadState();
  2889. return Y_LIKELY(state)
  2890. ? DoAllocate(state, size)
  2891. : DoAllocate(GlobalState.Get(), size);
  2892. }
  2893. void Free(void* ptr)
  2894. {
  2895. auto* state = TThreadManager::FindThreadState();
  2896. if (Y_LIKELY(state)) {
  2897. DoFree(state, ptr);
  2898. } else {
  2899. DoFree(GlobalState.Get(), ptr);
  2900. }
  2901. }
  2902. static size_t GetAllocationSize(const void* ptr)
  2903. {
  2904. UnalignPtr<TLargeBlobHeader>(ptr);
  2905. const auto* blob = PtrToHeader<TLargeBlobHeader>(ptr);
  2906. return blob->BytesAllocated;
  2907. }
  2908. static size_t GetAllocationSize(size_t size)
  2909. {
  2910. return GetBlobAllocationSize<TLargeBlobHeader>(size);
  2911. }
  2912. void RunBackgroundTasks()
  2913. {
  2914. ReinstallLockedBlobs();
  2915. ReclaimMemory();
  2916. }
  2917. void SetBacktraceProvider(TBacktraceProvider provider)
  2918. {
  2919. BacktraceProvider_.store(provider);
  2920. }
  2921. private:
  2922. template <class TState>
  2923. void PopulateArenaPages(TState* state, TLargeArena* arena, void* ptr, size_t size)
  2924. {
  2925. MappedMemoryManager->Populate(ptr, size);
  2926. StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::BytesPopulated, size);
  2927. StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::PagesPopulated, size / PageSize);
  2928. StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::BytesCommitted, size);
  2929. StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::PagesCommitted, size / PageSize);
  2930. }
  2931. template <class TState>
  2932. void ReleaseArenaPages(TState* state, TLargeArena* arena, void* ptr, size_t size)
  2933. {
  2934. MappedMemoryManager->Release(ptr, size);
  2935. StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::BytesReleased, size);
  2936. StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::PagesReleased, size / PageSize);
  2937. StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::BytesCommitted, -size);
  2938. StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::PagesCommitted, -size / PageSize);
  2939. }
  2940. bool TryLockBlob(TLargeBlobHeader* blob)
  2941. {
  2942. bool expected = false;
  2943. return blob->Locked.compare_exchange_strong(expected, true);
  2944. }
  2945. void UnlockBlob(TLargeBlobHeader* blob)
  2946. {
  2947. blob->Locked.store(false);
  2948. }
  2949. template <class TState>
  2950. void MoveBlobToSpare(TState* state, TLargeArena* arena, TLargeBlobHeader* blob, bool unlock)
  2951. {
  2952. auto rank = arena->Rank;
  2953. auto size = blob->BytesAllocated;
  2954. auto rawSize = GetRawBlobSize<TLargeBlobHeader>(size);
  2955. StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesSpare, blob->BytesAcquired);
  2956. StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesOverhead, -(blob->BytesAcquired - rawSize));
  2957. blob->BytesAllocated = 0;
  2958. if (unlock) {
  2959. UnlockBlob(blob);
  2960. } else {
  2961. YTALLOC_VERIFY(!blob->Locked.load());
  2962. }
  2963. blob->State = ELargeBlobState::Spare;
  2964. arena->SpareBlobs.Put(state, blob);
  2965. }
  2966. size_t GetBytesToReclaim(const std::array<TLocalLargeCounters, LargeRankCount>& arenaCounters)
  2967. {
  2968. size_t totalBytesAllocated = 0;
  2969. size_t totalBytesFreed = 0;
  2970. size_t totalBytesSpare = 0;
  2971. size_t totalBytesOverhead = 0;
  2972. for (size_t rank = 0; rank < Arenas_.size(); ++rank) {
  2973. const auto& counters = arenaCounters[rank];
  2974. totalBytesAllocated += counters[ELargeArenaCounter::BytesAllocated];
  2975. totalBytesFreed += counters[ELargeArenaCounter::BytesFreed];
  2976. totalBytesSpare += counters[ELargeArenaCounter::BytesSpare];
  2977. totalBytesOverhead += counters[ELargeArenaCounter::BytesOverhead];
  2978. }
  2979. auto totalBytesUsed = totalBytesAllocated - totalBytesFreed;
  2980. auto totalBytesReclaimable = totalBytesSpare + totalBytesOverhead;
  2981. auto threshold = ClampVal(
  2982. static_cast<size_t>(ConfigurationManager->GetLargeUnreclaimableCoeff() * totalBytesUsed),
  2983. ConfigurationManager->GetMinLargeUnreclaimableBytes(),
  2984. ConfigurationManager->GetMaxLargeUnreclaimableBytes());
  2985. if (totalBytesReclaimable < threshold) {
  2986. return 0;
  2987. }
  2988. auto bytesToReclaim = totalBytesReclaimable - threshold;
  2989. return AlignUp(bytesToReclaim, PageSize);
  2990. }
  2991. void ReinstallLockedSpareBlobs(TLargeArena* arena)
  2992. {
  2993. auto* blob = arena->LockedSpareBlobs.ExtractAll();
  2994. auto* state = TThreadManager::GetThreadStateChecked();
  2995. size_t count = 0;
  2996. while (blob) {
  2997. auto* nextBlob = blob->Next.load();
  2998. YTALLOC_VERIFY(!blob->Locked.load());
  2999. AssertBlobState(blob, ELargeBlobState::LockedSpare);
  3000. blob->State = ELargeBlobState::Spare;
  3001. arena->SpareBlobs.Put(state, blob);
  3002. blob = nextBlob;
  3003. ++count;
  3004. }
  3005. if (count > 0) {
  3006. YTALLOC_LOG_DEBUG("Locked spare blobs reinstalled (Rank: %d, Blobs: %zu)",
  3007. arena->Rank,
  3008. count);
  3009. }
  3010. }
  3011. void ReinstallLockedFreedBlobs(TLargeArena* arena)
  3012. {
  3013. auto* state = TThreadManager::GetThreadStateChecked();
  3014. auto* blob = arena->LockedFreedBlobs.ExtractAll();
  3015. size_t count = 0;
  3016. while (blob) {
  3017. auto* nextBlob = blob->Next.load();
  3018. AssertBlobState(blob, ELargeBlobState::LockedFreed);
  3019. MoveBlobToSpare(state, arena, blob, false);
  3020. ++count;
  3021. blob = nextBlob;
  3022. }
  3023. if (count > 0) {
  3024. YTALLOC_LOG_DEBUG("Locked freed blobs reinstalled (Rank: %d, Blobs: %zu)",
  3025. arena->Rank,
  3026. count);
  3027. }
  3028. }
  3029. void ReclaimSpareMemory(TLargeArena* arena, ssize_t bytesToReclaim)
  3030. {
  3031. if (bytesToReclaim <= 0) {
  3032. return;
  3033. }
  3034. auto rank = arena->Rank;
  3035. auto* state = TThreadManager::GetThreadStateChecked();
  3036. YTALLOC_LOG_DEBUG("Started processing spare memory in arena (BytesToReclaim: %zdM, Rank: %d)",
  3037. bytesToReclaim / 1_MB,
  3038. rank);
  3039. size_t bytesReclaimed = 0;
  3040. size_t blobsReclaimed = 0;
  3041. while (bytesToReclaim > 0) {
  3042. auto* blob = arena->SpareBlobs.ExtractRoundRobin(state);
  3043. if (!blob) {
  3044. break;
  3045. }
  3046. AssertBlobState(blob, ELargeBlobState::Spare);
  3047. YTALLOC_VERIFY(blob->BytesAllocated == 0);
  3048. auto bytesAcquired = blob->BytesAcquired;
  3049. StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesSpare, -bytesAcquired);
  3050. bytesToReclaim -= bytesAcquired;
  3051. bytesReclaimed += bytesAcquired;
  3052. blobsReclaimed += 1;
  3053. auto* extent = blob->Extent;
  3054. auto* ptr = reinterpret_cast<char*>(blob);
  3055. ReleaseArenaPages(
  3056. state,
  3057. arena,
  3058. ptr,
  3059. bytesAcquired);
  3060. size_t segmentIndex = (ptr - extent->Ptr) / arena->SegmentSize;
  3061. extent->DisposedFlags[segmentIndex].store(true, std::memory_order_relaxed);
  3062. auto* disposedSegment = DisposedSegmentPool_.Allocate();
  3063. disposedSegment->Index = segmentIndex;
  3064. disposedSegment->Extent = extent;
  3065. arena->DisposedSegments.Put(disposedSegment);
  3066. }
  3067. StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::SpareBytesReclaimed, bytesReclaimed);
  3068. YTALLOC_LOG_DEBUG("Finished processing spare memory in arena (Rank: %d, BytesReclaimed: %zdM, BlobsReclaimed: %zu)",
  3069. arena->Rank,
  3070. bytesReclaimed / 1_MB,
  3071. blobsReclaimed);
  3072. }
  3073. void ReclaimOverheadMemory(TLargeArena* arena, ssize_t bytesToReclaim)
  3074. {
  3075. if (bytesToReclaim == 0) {
  3076. return;
  3077. }
  3078. auto* state = TThreadManager::GetThreadStateChecked();
  3079. auto rank = arena->Rank;
  3080. YTALLOC_LOG_DEBUG("Started processing overhead memory in arena (BytesToReclaim: %zdM, Rank: %d)",
  3081. bytesToReclaim / 1_MB,
  3082. rank);
  3083. size_t extentsTraversed = 0;
  3084. size_t segmentsTraversed = 0;
  3085. size_t bytesReclaimed = 0;
  3086. bool restartedFromFirstExtent = false;
  3087. auto& currentExtent = arena->CurrentOverheadScanExtent;
  3088. auto& currentSegment = arena->CurrentOverheadScanSegment;
  3089. while (bytesToReclaim > 0) {
  3090. if (!currentExtent) {
  3091. if (restartedFromFirstExtent) {
  3092. break;
  3093. }
  3094. currentExtent = arena->FirstExtent.load();
  3095. if (!currentExtent) {
  3096. break;
  3097. }
  3098. restartedFromFirstExtent = true;
  3099. }
  3100. while (currentSegment < currentExtent->SegmentCount && bytesToReclaim > 0) {
  3101. ++segmentsTraversed;
  3102. if (!currentExtent->DisposedFlags[currentSegment].load(std::memory_order_acquire)) {
  3103. auto* ptr = currentExtent->Ptr + currentSegment * arena->SegmentSize;
  3104. auto* blob = reinterpret_cast<TLargeBlobHeader*>(ptr);
  3105. YTALLOC_PARANOID_ASSERT(blob->Extent == currentExtent);
  3106. if (TryLockBlob(blob)) {
  3107. if (blob->BytesAllocated > 0) {
  3108. size_t rawSize = GetRawBlobSize<TLargeBlobHeader>(blob->BytesAllocated);
  3109. size_t bytesToRelease = blob->BytesAcquired - rawSize;
  3110. if (bytesToRelease > 0) {
  3111. ReleaseArenaPages(
  3112. state,
  3113. arena,
  3114. ptr + blob->BytesAcquired - bytesToRelease,
  3115. bytesToRelease);
  3116. StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesOverhead, -bytesToRelease);
  3117. blob->BytesAcquired = rawSize;
  3118. bytesToReclaim -= bytesToRelease;
  3119. bytesReclaimed += bytesToRelease;
  3120. }
  3121. }
  3122. UnlockBlob(blob);
  3123. }
  3124. }
  3125. ++currentSegment;
  3126. }
  3127. ++extentsTraversed;
  3128. currentSegment = 0;
  3129. currentExtent = currentExtent->NextExtent;
  3130. }
  3131. StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::OverheadBytesReclaimed, bytesReclaimed);
  3132. YTALLOC_LOG_DEBUG("Finished processing overhead memory in arena (Rank: %d, Extents: %zu, Segments: %zu, BytesReclaimed: %zuM)",
  3133. arena->Rank,
  3134. extentsTraversed,
  3135. segmentsTraversed,
  3136. bytesReclaimed / 1_MB);
  3137. }
  3138. void ReinstallLockedBlobs()
  3139. {
  3140. for (auto& arena : Arenas_) {
  3141. ReinstallLockedSpareBlobs(&arena);
  3142. ReinstallLockedFreedBlobs(&arena);
  3143. }
  3144. }
  3145. void ReclaimMemory()
  3146. {
  3147. auto arenaCounters = StatisticsManager->GetLargeArenaAllocationCounters();
  3148. ssize_t bytesToReclaim = GetBytesToReclaim(arenaCounters);
  3149. if (bytesToReclaim == 0) {
  3150. return;
  3151. }
  3152. YTALLOC_LOG_DEBUG("Memory reclaim started (BytesToReclaim: %zdM)",
  3153. bytesToReclaim / 1_MB);
  3154. std::array<ssize_t, LargeRankCount * 2> bytesReclaimablePerArena;
  3155. for (size_t rank = 0; rank < LargeRankCount; ++rank) {
  3156. bytesReclaimablePerArena[rank * 2] = arenaCounters[rank][ELargeArenaCounter::BytesOverhead];
  3157. bytesReclaimablePerArena[rank * 2 + 1] = arenaCounters[rank][ELargeArenaCounter::BytesSpare];
  3158. }
  3159. std::array<ssize_t, LargeRankCount * 2> bytesToReclaimPerArena{};
  3160. while (bytesToReclaim > 0) {
  3161. ssize_t maxBytes = std::numeric_limits<ssize_t>::min();
  3162. int maxIndex = -1;
  3163. for (int index = 0; index < LargeRankCount * 2; ++index) {
  3164. if (bytesReclaimablePerArena[index] > maxBytes) {
  3165. maxBytes = bytesReclaimablePerArena[index];
  3166. maxIndex = index;
  3167. }
  3168. }
  3169. if (maxIndex < 0) {
  3170. break;
  3171. }
  3172. auto bytesToReclaimPerStep = std::min<ssize_t>({bytesToReclaim, maxBytes, 4_MB});
  3173. if (bytesToReclaimPerStep < 0) {
  3174. break;
  3175. }
  3176. bytesToReclaimPerArena[maxIndex] += bytesToReclaimPerStep;
  3177. bytesReclaimablePerArena[maxIndex] -= bytesToReclaimPerStep;
  3178. bytesToReclaim -= bytesToReclaimPerStep;
  3179. }
  3180. for (auto& arena : Arenas_) {
  3181. auto rank = arena.Rank;
  3182. ReclaimOverheadMemory(&arena, bytesToReclaimPerArena[rank * 2]);
  3183. ReclaimSpareMemory(&arena, bytesToReclaimPerArena[rank * 2 + 1]);
  3184. }
  3185. YTALLOC_LOG_DEBUG("Memory reclaim finished");
  3186. }
  3187. template <class TState>
  3188. void AllocateArenaExtent(TState* state, TLargeArena* arena)
  3189. {
  3190. auto rank = arena->Rank;
  3191. StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::ExtentsAllocated, 1);
  3192. size_t segmentCount = LargeExtentSize / arena->SegmentSize;
  3193. size_t extentHeaderSize = AlignUp(sizeof (TLargeBlobExtent) + sizeof (TLargeBlobExtent::DisposedFlags[0]) * segmentCount, PageSize);
  3194. size_t allocationSize = extentHeaderSize + LargeExtentSize;
  3195. auto* ptr = ZoneAllocator_.Allocate(allocationSize, MAP_NORESERVE);
  3196. if (!Dumpable) {
  3197. MappedMemoryManager->DontDump(ptr, allocationSize);
  3198. }
  3199. if (auto backtraceProvider = BacktraceProvider_.load()) {
  3200. std::array<void*, MaxAllocationProfilingBacktraceDepth> frames;
  3201. auto frameCount = backtraceProvider(
  3202. frames.data(),
  3203. MaxAllocationProfilingBacktraceDepth,
  3204. 3);
  3205. MmapObservationManager->EnqueueEvent(allocationSize, frames, frameCount);
  3206. }
  3207. StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesMapped, allocationSize);
  3208. StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::PagesMapped, allocationSize / PageSize);
  3209. auto* extent = static_cast<TLargeBlobExtent*>(ptr);
  3210. MappedMemoryManager->Populate(ptr, extentHeaderSize);
  3211. StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesPopulated, extentHeaderSize);
  3212. StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::PagesPopulated, extentHeaderSize / PageSize);
  3213. StatisticsManager->IncrementSystemCounter(ESystemCounter::BytesAllocated, extentHeaderSize);
  3214. new (extent) TLargeBlobExtent(segmentCount, static_cast<char*>(ptr) + extentHeaderSize);
  3215. for (size_t index = 0; index < segmentCount; ++index) {
  3216. auto* disposedSegment = DisposedSegmentPool_.Allocate();
  3217. disposedSegment->Index = index;
  3218. disposedSegment->Extent = extent;
  3219. arena->DisposedSegments.Put(disposedSegment);
  3220. extent->DisposedFlags[index].store(true);
  3221. }
  3222. auto* expectedFirstExtent = arena->FirstExtent.load();
  3223. do {
  3224. extent->NextExtent = expectedFirstExtent;
  3225. } while (Y_UNLIKELY(!arena->FirstExtent.compare_exchange_weak(expectedFirstExtent, extent)));
  3226. }
  3227. template <class TState>
  3228. void* DoAllocate(TState* state, size_t size)
  3229. {
  3230. auto rawSize = GetRawBlobSize<TLargeBlobHeader>(size);
  3231. auto rank = GetLargeRank(rawSize);
  3232. auto tag = ConfigurationManager->IsLargeArenaAllocationProfiled(rank)
  3233. ? BacktraceManager->GetMemoryTagFromBacktrace(3)
  3234. : TThreadManager::GetCurrentMemoryTag();
  3235. auto& arena = Arenas_[rank];
  3236. YTALLOC_PARANOID_ASSERT(rawSize <= arena.SegmentSize);
  3237. TLargeBlobHeader* blob;
  3238. while (true) {
  3239. blob = arena.SpareBlobs.Extract(state);
  3240. if (blob) {
  3241. AssertBlobState(blob, ELargeBlobState::Spare);
  3242. if (TryLockBlob(blob)) {
  3243. StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesSpare, -blob->BytesAcquired);
  3244. if (blob->BytesAcquired < rawSize) {
  3245. PopulateArenaPages(
  3246. state,
  3247. &arena,
  3248. reinterpret_cast<char*>(blob) + blob->BytesAcquired,
  3249. rawSize - blob->BytesAcquired);
  3250. blob->BytesAcquired = rawSize;
  3251. } else {
  3252. StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesOverhead, blob->BytesAcquired - rawSize);
  3253. }
  3254. YTALLOC_PARANOID_ASSERT(blob->BytesAllocated == 0);
  3255. blob->BytesAllocated = size;
  3256. blob->Tag = tag;
  3257. blob->State = ELargeBlobState::Allocated;
  3258. UnlockBlob(blob);
  3259. break;
  3260. } else {
  3261. blob->State = ELargeBlobState::LockedSpare;
  3262. arena.LockedSpareBlobs.Put(blob);
  3263. }
  3264. }
  3265. auto* disposedSegment = arena.DisposedSegments.Extract();
  3266. if (disposedSegment) {
  3267. auto index = disposedSegment->Index;
  3268. auto* extent = disposedSegment->Extent;
  3269. DisposedSegmentPool_.Free(disposedSegment);
  3270. auto* ptr = extent->Ptr + index * arena.SegmentSize;
  3271. PopulateArenaPages(
  3272. state,
  3273. &arena,
  3274. ptr,
  3275. rawSize);
  3276. blob = reinterpret_cast<TLargeBlobHeader*>(ptr);
  3277. new (blob) TLargeBlobHeader(extent, rawSize, size, tag);
  3278. extent->DisposedFlags[index].store(false, std::memory_order_release);
  3279. break;
  3280. }
  3281. AllocateArenaExtent(state, &arena);
  3282. }
  3283. StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BlobsAllocated, 1);
  3284. StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesAllocated, size);
  3285. StatisticsManager->IncrementTotalCounter(state, tag, EBasicCounter::BytesAllocated, size);
  3286. if (!Dumpable) {
  3287. StatisticsManager->IncrementUndumpableCounter(state, EUndumpableCounter::BytesAllocated, size);
  3288. }
  3289. auto* result = HeaderToPtr(blob);
  3290. YTALLOC_PARANOID_ASSERT(reinterpret_cast<uintptr_t>(result) >= LargeZoneStart(Dumpable) && reinterpret_cast<uintptr_t>(result) < LargeZoneEnd(Dumpable));
  3291. PoisonUninitializedRange(result, size);
  3292. return result;
  3293. }
  3294. template <class TState>
  3295. void DoFree(TState* state, void* ptr)
  3296. {
  3297. YTALLOC_PARANOID_ASSERT(reinterpret_cast<uintptr_t>(ptr) >= LargeZoneStart(Dumpable) && reinterpret_cast<uintptr_t>(ptr) < LargeZoneEnd(Dumpable));
  3298. auto* blob = PtrToHeader<TLargeBlobHeader>(ptr);
  3299. AssertBlobState(blob, ELargeBlobState::Allocated);
  3300. auto size = blob->BytesAllocated;
  3301. PoisonFreedRange(ptr, size);
  3302. auto rawSize = GetRawBlobSize<TLargeBlobHeader>(size);
  3303. auto rank = GetLargeRank(rawSize);
  3304. auto& arena = Arenas_[rank];
  3305. YTALLOC_PARANOID_ASSERT(blob->BytesAcquired <= arena.SegmentSize);
  3306. auto tag = blob->Tag;
  3307. StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BlobsFreed, 1);
  3308. StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesFreed, size);
  3309. StatisticsManager->IncrementTotalCounter(state, tag, EBasicCounter::BytesFreed, size);
  3310. if (!Dumpable) {
  3311. StatisticsManager->IncrementUndumpableCounter(state, EUndumpableCounter::BytesFreed, size);
  3312. }
  3313. if (TryLockBlob(blob)) {
  3314. MoveBlobToSpare(state, &arena, blob, true);
  3315. } else {
  3316. blob->State = ELargeBlobState::LockedFreed;
  3317. arena.LockedFreedBlobs.Put(blob);
  3318. }
  3319. }
  3320. private:
  3321. TZoneAllocator ZoneAllocator_;
  3322. std::array<TLargeArena, LargeRankCount> Arenas_;
  3323. static constexpr size_t DisposedSegmentsBatchSize = 1024;
  3324. TSystemPool<TDisposedSegment, DisposedSegmentsBatchSize> DisposedSegmentPool_;
  3325. std::atomic<TBacktraceProvider> BacktraceProvider_ = nullptr;
  3326. };
  3327. TExplicitlyConstructableSingleton<TLargeBlobAllocator<true>> DumpableLargeBlobAllocator;
  3328. TExplicitlyConstructableSingleton<TLargeBlobAllocator<false>> UndumpableLargeBlobAllocator;
  3329. ////////////////////////////////////////////////////////////////////////////////
  3330. // Huge blob allocator
  3331. //
  3332. // Basically a wrapper for TZoneAllocator.
  3333. // The state value acts as a signature to detect broken headers.
  3334. enum class EHugeBlobState : ui64
  3335. {
  3336. Allocated = 0x636c6c6165677568ULL // hugeallc
  3337. };
  3338. // Every huge blob (either tagged or not) is prepended with this header.
  3339. struct THugeBlobHeader
  3340. {
  3341. THugeBlobHeader(TMemoryTag tag, size_t size, bool dumpable)
  3342. : Tag(tag)
  3343. , Size(size)
  3344. , State(EHugeBlobState::Allocated)
  3345. , Dumpable(dumpable)
  3346. { }
  3347. TMemoryTag Tag;
  3348. size_t Size;
  3349. EHugeBlobState State;
  3350. bool Dumpable;
  3351. char Padding[7];
  3352. };
  3353. CHECK_HEADER_ALIGNMENT(THugeBlobHeader)
  3354. class THugeBlobAllocator
  3355. {
  3356. public:
  3357. THugeBlobAllocator()
  3358. : ZoneAllocator_(HugeZoneStart, HugeZoneEnd)
  3359. { }
  3360. void* Allocate(size_t size, bool dumpable)
  3361. {
  3362. YTALLOC_VERIFY(size <= MaxAllocationSize);
  3363. auto tag = TThreadManager::GetCurrentMemoryTag();
  3364. auto rawSize = GetRawBlobSize<THugeBlobHeader>(size);
  3365. auto* blob = static_cast<THugeBlobHeader*>(ZoneAllocator_.Allocate(rawSize, MAP_POPULATE));
  3366. if (!dumpable) {
  3367. MappedMemoryManager->DontDump(blob, rawSize);
  3368. }
  3369. new (blob) THugeBlobHeader(tag, size, dumpable);
  3370. StatisticsManager->IncrementTotalCounter(tag, EBasicCounter::BytesAllocated, size);
  3371. StatisticsManager->IncrementHugeCounter(EHugeCounter::BlobsAllocated, 1);
  3372. StatisticsManager->IncrementHugeCounter(EHugeCounter::BytesAllocated, size);
  3373. if (!dumpable) {
  3374. StatisticsManager->IncrementHugeUndumpableCounter(EUndumpableCounter::BytesAllocated, size);
  3375. }
  3376. auto* result = HeaderToPtr(blob);
  3377. PoisonUninitializedRange(result, size);
  3378. return result;
  3379. }
  3380. void Free(void* ptr)
  3381. {
  3382. auto* blob = PtrToHeader<THugeBlobHeader>(ptr);
  3383. AssertBlobState(blob, EHugeBlobState::Allocated);
  3384. auto tag = blob->Tag;
  3385. auto size = blob->Size;
  3386. auto dumpable = blob->Dumpable;
  3387. PoisonFreedRange(ptr, size);
  3388. auto rawSize = GetRawBlobSize<THugeBlobHeader>(size);
  3389. ZoneAllocator_.Free(blob, rawSize);
  3390. StatisticsManager->IncrementTotalCounter(tag, EBasicCounter::BytesFreed, size);
  3391. StatisticsManager->IncrementHugeCounter(EHugeCounter::BlobsFreed, 1);
  3392. StatisticsManager->IncrementHugeCounter(EHugeCounter::BytesFreed, size);
  3393. if (!dumpable) {
  3394. StatisticsManager->IncrementHugeUndumpableCounter(EUndumpableCounter::BytesFreed, size);
  3395. }
  3396. }
  3397. static size_t GetAllocationSize(const void* ptr)
  3398. {
  3399. UnalignPtr<THugeBlobHeader>(ptr);
  3400. const auto* blob = PtrToHeader<THugeBlobHeader>(ptr);
  3401. return blob->Size;
  3402. }
  3403. static size_t GetAllocationSize(size_t size)
  3404. {
  3405. return GetBlobAllocationSize<THugeBlobHeader>(size);
  3406. }
  3407. private:
  3408. TZoneAllocator ZoneAllocator_;
  3409. };
  3410. TExplicitlyConstructableSingleton<THugeBlobAllocator> HugeBlobAllocator;
  3411. ////////////////////////////////////////////////////////////////////////////////
  3412. // A thunk to large and huge blob allocators
  3413. class TBlobAllocator
  3414. {
  3415. public:
  3416. static void* Allocate(size_t size)
  3417. {
  3418. InitializeGlobals();
  3419. bool dumpable = GetCurrentMemoryZone() != EMemoryZone::Undumpable;
  3420. // NB: Account for the header. Also note that we may safely ignore the alignment since
  3421. // HugeAllocationSizeThreshold is already page-aligned.
  3422. if (Y_LIKELY(size < HugeAllocationSizeThreshold - sizeof(TLargeBlobHeader) - RightReadableAreaSize)) {
  3423. void* result = dumpable
  3424. ? DumpableLargeBlobAllocator->Allocate(size)
  3425. : UndumpableLargeBlobAllocator->Allocate(size);
  3426. YTALLOC_PARANOID_ASSERT(reinterpret_cast<uintptr_t>(result) >= LargeZoneStart(dumpable) && reinterpret_cast<uintptr_t>(result) < LargeZoneEnd(dumpable));
  3427. return result;
  3428. } else {
  3429. auto* result = HugeBlobAllocator->Allocate(size, dumpable);
  3430. YTALLOC_PARANOID_ASSERT(reinterpret_cast<uintptr_t>(result) >= HugeZoneStart && reinterpret_cast<uintptr_t>(result) < HugeZoneEnd);
  3431. return result;
  3432. }
  3433. }
  3434. static void Free(void* ptr)
  3435. {
  3436. InitializeGlobals();
  3437. if (reinterpret_cast<uintptr_t>(ptr) < LargeZoneEnd(true)) {
  3438. YTALLOC_PARANOID_ASSERT(reinterpret_cast<uintptr_t>(ptr) >= LargeZoneStart(true) && reinterpret_cast<uintptr_t>(ptr) < LargeZoneEnd(true));
  3439. UnalignPtr<TLargeBlobHeader>(ptr);
  3440. DumpableLargeBlobAllocator->Free(ptr);
  3441. } else if (reinterpret_cast<uintptr_t>(ptr) < LargeZoneEnd(false)) {
  3442. YTALLOC_PARANOID_ASSERT(reinterpret_cast<uintptr_t>(ptr) >= LargeZoneStart(false) && reinterpret_cast<uintptr_t>(ptr) < LargeZoneEnd(false));
  3443. UnalignPtr<TLargeBlobHeader>(ptr);
  3444. UndumpableLargeBlobAllocator->Free(ptr);
  3445. } else if (reinterpret_cast<uintptr_t>(ptr) < HugeZoneEnd) {
  3446. YTALLOC_PARANOID_ASSERT(reinterpret_cast<uintptr_t>(ptr) >= HugeZoneStart && reinterpret_cast<uintptr_t>(ptr) < HugeZoneEnd);
  3447. UnalignPtr<THugeBlobHeader>(ptr);
  3448. HugeBlobAllocator->Free(ptr);
  3449. } else {
  3450. YTALLOC_TRAP("Wrong ptr passed to Free");
  3451. }
  3452. }
  3453. };
  3454. ////////////////////////////////////////////////////////////////////////////////
  3455. Y_POD_THREAD(bool) CurrentThreadIsBackground;
  3456. // Base class for all background threads.
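// The thread is fork-aware: the prepare handler stops it, and if it was running it is
// restarted in the parent and in the child (the child allocates a fresh TState, since
// the thread itself does not survive fork()).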
  3457. template <class T>
  3458. class TBackgroundThreadBase
  3459. {
  3460. public:
  3461. TBackgroundThreadBase()
  3462. : State_(new TState())
  3463. {
  3464. NThreading::RegisterAtForkHandlers(
  3465. [=] { BeforeFork(); },
  3466. [=] { AfterForkParent(); },
  3467. [=] { AfterForkChild(); });
  3468. }
  3469. virtual ~TBackgroundThreadBase()
  3470. {
  3471. Stop();
  3472. }
  3473. private:
  3474. struct TState
  3475. : public TSystemAllocatable
  3476. {
  3477. std::mutex StartStopMutex;
  3478. std::optional<std::thread> Thread;
  3479. std::mutex StopFlagMutex;
  3480. std::condition_variable StopFlagVariable;
  3481. std::chrono::system_clock::time_point LastInvocationTime;
  3482. bool StopFlag = false;
  3483. bool Paused = false;
  3484. std::atomic<int> ForkDepth = 0;
  3485. bool RestartAfterFork = false;
  3486. };
  3487. TState* State_;
  3488. private:
  3489. void BeforeFork()
  3490. {
  3491. bool stopped = Stop();
  3492. if (State_->ForkDepth++ == 0) {
  3493. State_->RestartAfterFork = stopped;
  3494. }
  3495. }
  3496. void AfterForkParent()
  3497. {
  3498. if (--State_->ForkDepth == 0) {
  3499. if (State_->RestartAfterFork) {
  3500. Start(false);
  3501. }
  3502. }
  3503. }
  3504. void AfterForkChild()
  3505. {
  3506. bool restart = State_->RestartAfterFork;
  3507. State_ = new TState();
  3508. if (restart) {
  3509. Start(false);
  3510. }
  3511. }
  3512. virtual void ThreadMain() = 0;
  3513. protected:
  3514. void Start(bool fromAlloc)
  3515. {
  3516. std::unique_lock<std::mutex> guard(State_->StartStopMutex, std::defer_lock);
  3517. if (fromAlloc) {
  3518. if (!guard.try_lock()) {
  3519. return;
  3520. }
  3521. if (State_->Paused) {
  3522. return;
  3523. }
  3524. } else {
  3525. guard.lock();
  3526. }
  3527. State_->Paused = false;
  3528. if (State_->Thread) {
  3529. return;
  3530. }
  3531. State_->StopFlag = false;
  3532. State_->Thread.emplace([=] {
  3533. CurrentThreadIsBackground = true;
  3534. ThreadMain();
  3535. });
  3536. OnStart();
  3537. }
  3538. bool Stop()
  3539. {
  3540. std::unique_lock<std::mutex> guard(State_->StartStopMutex);
  3541. State_->Paused = true;
  3542. if (!State_->Thread) {
  3543. return false;
  3544. }
  3545. std::unique_lock<std::mutex> flagGuard(State_->StopFlagMutex);
  3546. State_->StopFlag = true;
  3547. flagGuard.unlock();
  3548. State_->StopFlagVariable.notify_one();
  3549. State_->Thread->join();
  3550. State_->Thread.reset();
  3551. OnStop();
  3552. return true;
  3553. }
  3554. bool IsDone(TDuration interval)
  3555. {
  3556. std::unique_lock<std::mutex> flagGuard(State_->StopFlagMutex);
  3557. auto result = State_->StopFlagVariable.wait_until(
  3558. flagGuard,
  3559. State_->LastInvocationTime + std::chrono::microseconds(interval.MicroSeconds()),
  3560. [&] { return State_->StopFlag; });
  3561. State_->LastInvocationTime = std::chrono::system_clock::now();
  3562. return result;
  3563. }
  3564. virtual void OnStart()
  3565. { }
  3566. virtual void OnStop()
  3567. { }
  3568. };
  3569. ////////////////////////////////////////////////////////////////////////////////
  3570. // Invokes madvise(MADV_STOCKPILE) periodically.
  3571. class TStockpileThread
  3572. : public TBackgroundThreadBase<TStockpileThread>
  3573. {
  3574. public:
  3575. explicit TStockpileThread(int index)
  3576. : Index_(index)
  3577. {
  3578. Start(false);
  3579. }
  3580. private:
  3581. const int Index_;
  3582. virtual void ThreadMain() override
  3583. {
  3584. TThread::SetCurrentThreadName(Sprintf("%s:%d", StockpileThreadName, Index_).c_str());
  3585. while (!IsDone(ConfigurationManager->GetStockpileInterval())) {
  3586. if (!MappedMemoryManager->Stockpile(ConfigurationManager->GetStockpileSize())) {
  3587. // No point in proceeding.
  3588. YTALLOC_LOG_INFO("Stockpile call failed; terminating stockpile thread");
  3589. break;
  3590. }
  3591. }
  3592. }
  3593. };
  3594. // Manages a bunch of TStockpileThreads.
  3595. class TStockpileManager
  3596. {
  3597. public:
    void SpawnIfNeeded()
    {
        if (!ConfigurationManager->IsStockpileEnabled()) {
            return;
        }
        int threadCount = ConfigurationManager->GetStockpileThreadCount();
        while (static_cast<int>(Threads_.size()) > threadCount) {
            Threads_.pop_back();
        }
        while (static_cast<int>(Threads_.size()) < threadCount) {
            Threads_.push_back(std::make_unique<TStockpileThread>(static_cast<int>(Threads_.size())));
        }
    }

private:
    std::vector<std::unique_ptr<TStockpileThread>> Threads_;
};

TExplicitlyConstructableSingleton<TStockpileManager> StockpileManager;

////////////////////////////////////////////////////////////////////////////////

// Time to wait before re-spawning the thread after a fork.
static constexpr auto BackgroundThreadRespawnDelay = TDuration::Seconds(3);

// Runs basic background activities: reclaim, logging, profiling etc.
class TBackgroundThread
    : public TBackgroundThreadBase<TBackgroundThread>
{
public:
    bool IsStarted()
    {
        return Started_.load();
    }

    void SpawnIfNeeded()
    {
        if (CurrentThreadIsBackground) {
            return;
        }
        Start(true);
    }

private:
    std::atomic<bool> Started_ = false;

private:
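    // Periodically runs the background tasks of the various managers and keeps the
    // stockpile thread pool sized according to the current configuration.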
    virtual void ThreadMain() override
    {
        TThread::SetCurrentThreadName(BackgroundThreadName);
        TimingManager->DisableForCurrentThread();
        MmapObservationManager->DisableForCurrentThread();
        while (!IsDone(BackgroundInterval)) {
            DumpableLargeBlobAllocator->RunBackgroundTasks();
            UndumpableLargeBlobAllocator->RunBackgroundTasks();
            MappedMemoryManager->RunBackgroundTasks();
            TimingManager->RunBackgroundTasks();
            MmapObservationManager->RunBackgroundTasks();
            StockpileManager->SpawnIfNeeded();
        }
    }

    virtual void OnStart() override
    {
        DoUpdateAllThreadsControlWord(true);
    }

    virtual void OnStop() override
    {
        DoUpdateAllThreadsControlWord(false);
    }
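
    // Propagates the "background thread started" flag into Started_ and into the control word
    // of every registered thread, so the allocation fast path can tell whether
    // StartBackgroundThread still needs to be called.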
    void DoUpdateAllThreadsControlWord(bool started)
    {
        // Update threads' TLS.
        ThreadManager->EnumerateThreadStatesSync(
            [&] {
                Started_.store(started);
            },
            [&] (auto* state) {
                if (state->BackgroundThreadStarted) {
                    *state->BackgroundThreadStarted = started;
                }
            });
    }
};

TExplicitlyConstructableSingleton<TBackgroundThread> BackgroundThread;

////////////////////////////////////////////////////////////////////////////////

Y_FORCE_INLINE TThreadState* TThreadManager::GetThreadStateUnchecked()
{
    YTALLOC_PARANOID_ASSERT(ThreadState_);
    return ThreadState_;
}
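
// Returns the calling thread's TThreadState, lazily allocating it on first use.
// Returns nullptr once the state has already been destroyed during thread shutdown.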
Y_FORCE_INLINE TThreadState* TThreadManager::FindThreadState()
{
    if (Y_LIKELY(ThreadState_)) {
        return ThreadState_;
    }
    if (ThreadStateDestroyed_) {
        return nullptr;
    }
    InitializeGlobals();
    // InitializeGlobals must not allocate.
    Y_ABORT_UNLESS(!ThreadState_);
    ThreadState_ = ThreadManager->AllocateThreadState();
    (&ThreadControlWord_)->Parts.ThreadStateValid = true;
    return ThreadState_;
}
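
// Thread-specific-key destructor: purges the per-thread small caches and detaches
// the thread state from the control word before unreferencing it.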
void TThreadManager::DestroyThread(void*)
{
    TSmallAllocator::PurgeCaches();
    TThreadState* state = ThreadState_;
    ThreadState_ = nullptr;
    ThreadStateDestroyed_ = true;
    (&ThreadControlWord_)->Parts.ThreadStateValid = false;
    {
        auto guard = GuardWithTiming(ThreadManager->ThreadRegistryLock_);
        state->AllocationProfilingEnabled = nullptr;
        state->BackgroundThreadStarted = nullptr;
        ThreadManager->UnrefThreadState(state);
    }
}

void TThreadManager::DestroyThreadState(TThreadState* state)
{
    StatisticsManager->AccumulateLocalCounters(state);
    ThreadRegistry_.Remove(state);
    ThreadStatePool_.Free(state);
}
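
// Called after fork: drops all registered thread states except the current thread's,
// since in the child process only the forking thread survives.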
void TThreadManager::AfterFork()
{
    auto guard = GuardWithTiming(ThreadRegistryLock_);
    ThreadRegistry_.Clear();
    TThreadState* state = ThreadState_;
    if (state) {
        ThreadRegistry_.PushBack(state);
    }
}

TThreadState* TThreadManager::AllocateThreadState()
{
    auto* state = ThreadStatePool_.Allocate();
    state->AllocationProfilingEnabled = &(*&ThreadControlWord_).Parts.AllocationProfilingEnabled;
    state->BackgroundThreadStarted = &(*&ThreadControlWord_).Parts.BackgroundThreadStarted;
    {
        auto guard = GuardWithTiming(ThreadRegistryLock_);
        // NB: These flags must be initialized under ThreadRegistryLock_; see EnumerateThreadStatesSync.
        *state->AllocationProfilingEnabled = ConfigurationManager->IsAllocationProfilingEnabled();
        *state->BackgroundThreadStarted = BackgroundThread->IsStarted();
        ThreadRegistry_.PushBack(state);
    }
    // Need to pass some non-null value for DestroyThread to be called.
    pthread_setspecific(ThreadDtorKey_, (void*)-1);
    return state;
}

////////////////////////////////////////////////////////////////////////////////
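
// One-shot construction of all global singletons. NB: this must not allocate via
// YTAlloc itself (see the assertion in TThreadManager::FindThreadState).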
void InitializeGlobals()
{
    static std::once_flag Initialized;
    std::call_once(Initialized, [] () {
        LogManager.Construct();
        BacktraceManager.Construct();
        StatisticsManager.Construct();
        MappedMemoryManager.Construct();
        ThreadManager.Construct();
        GlobalState.Construct();
        DumpableLargeBlobAllocator.Construct();
        UndumpableLargeBlobAllocator.Construct();
        HugeBlobAllocator.Construct();
        ConfigurationManager.Construct();
        SystemAllocator.Construct();
        TimingManager.Construct();
        MmapObservationManager.Construct();
        StockpileManager.Construct();
        BackgroundThread.Construct();
        SmallArenaAllocators.Construct();
        auto constructSmallArenaAllocators = [&] (EAllocationKind kind, uintptr_t zonesStart) {
            for (size_t rank = 1; rank < SmallRankCount; ++rank) {
                (*SmallArenaAllocators)[kind][rank].Construct(kind, rank, zonesStart + rank * SmallZoneSize);
            }
        };
        constructSmallArenaAllocators(EAllocationKind::Untagged, UntaggedSmallZonesStart);
        constructSmallArenaAllocators(EAllocationKind::Tagged, TaggedSmallZonesStart);
        GlobalSmallChunkCaches.Construct();
        (*GlobalSmallChunkCaches)[EAllocationKind::Tagged].Construct(EAllocationKind::Tagged);
        (*GlobalSmallChunkCaches)[EAllocationKind::Untagged].Construct(EAllocationKind::Untagged);
    });
}

////////////////////////////////////////////////////////////////////////////////

void StartBackgroundThread()
{
    InitializeGlobals();
    BackgroundThread->SpawnIfNeeded();
}

////////////////////////////////////////////////////////////////////////////////

template <class... Ts>
Y_FORCE_INLINE void* AllocateSmallUntagged(size_t rank, Ts... args)
{
    auto* result = TSmallAllocator::Allocate<EAllocationKind::Untagged>(NullMemoryTag, rank, std::forward<Ts>(args)...);
    YTALLOC_PARANOID_ASSERT(reinterpret_cast<uintptr_t>(result) >= MinUntaggedSmallPtr && reinterpret_cast<uintptr_t>(result) < MaxUntaggedSmallPtr);
    return result;
}

template <class... Ts>
Y_FORCE_INLINE void* AllocateSmallTagged(ui64 controlWord, size_t rank, Ts... args)
{
    auto tag = Y_UNLIKELY((controlWord & TThreadManager::AllocationProfilingEnabledControlWordMask) && ConfigurationManager->IsSmallArenaAllocationProfiled(rank))
        ? BacktraceManager->GetMemoryTagFromBacktrace(2)
        : static_cast<TMemoryTag>(controlWord & TThreadManager::MemoryTagControlWordMask);
    auto* result = TSmallAllocator::Allocate<EAllocationKind::Tagged>(tag, rank, std::forward<Ts>(args)...);
    YTALLOC_PARANOID_ASSERT(reinterpret_cast<uintptr_t>(result) >= MinTaggedSmallPtr && reinterpret_cast<uintptr_t>(result) < MaxTaggedSmallPtr);
    return result;
}
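
// Entry point for plain allocations. Sizes up to 512 bytes are mapped to a small rank via
// an 8-byte-granular table, sizes below LargeAllocationSizeThreshold via a 256-byte-granular
// table; anything larger is handed to TBlobAllocator. The fully-initialized fast path is a
// single comparison of the thread control word.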
Y_FORCE_INLINE void* AllocateInline(size_t size)
{
    size_t rank;
    if (Y_LIKELY(size <= 512)) {
        rank = SizeToSmallRank1[(size + 7) >> 3];
    } else if (Y_LIKELY(size < LargeAllocationSizeThreshold)) {
        rank = SizeToSmallRank2[(size - 1) >> 8];
    } else {
        StartBackgroundThread();
        return TBlobAllocator::Allocate(size);
    }
    auto controlWord = TThreadManager::GetThreadControlWord();
    if (Y_LIKELY(controlWord == TThreadManager::FastPathControlWord)) {
        return AllocateSmallUntagged(rank, TThreadManager::GetThreadStateUnchecked());
    }
    if (Y_UNLIKELY(!(controlWord & TThreadManager::BackgroundThreadStartedControlWorkMask))) {
        StartBackgroundThread();
    }
    if (!(controlWord & (TThreadManager::MemoryTagControlWordMask | TThreadManager::AllocationProfilingEnabledControlWordMask))) {
        return AllocateSmallUntagged(rank);
    } else {
        return AllocateSmallTagged(controlWord, rank);
    }
}

Y_FORCE_INLINE void* AllocateSmallInline(size_t rank)
{
    auto controlWord = TThreadManager::GetThreadControlWord();
    if (Y_LIKELY(controlWord == TThreadManager::FastPathControlWord)) {
        return AllocateSmallUntagged(rank, TThreadManager::GetThreadStateUnchecked());
    }
    if (!(controlWord & (TThreadManager::MemoryTagControlWordMask | TThreadManager::AllocationProfilingEnabledControlWordMask))) {
        return AllocateSmallUntagged(rank);
    } else {
        return AllocateSmallTagged(controlWord, rank);
    }
}
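
// Page-aligned allocation: large requests over-allocate by one page and align the result up;
// the trailing assertion checks that the returned pointer is indeed page-aligned.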
Y_FORCE_INLINE void* AllocatePageAlignedInline(size_t size)
{
    size = std::max(AlignUp(size, PageSize), PageSize);
    void* result = size >= LargeAllocationSizeThreshold
        ? AlignUp(TBlobAllocator::Allocate(size + PageSize), PageSize)
        : Allocate(size);
    YTALLOC_ASSERT(reinterpret_cast<uintptr_t>(result) % PageSize == 0);
    return result;
}
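
// Dispatches the pointer to the owning allocator based on the zone it falls into;
// the small (untagged, then tagged) zones are checked first as the most likely case.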
Y_FORCE_INLINE void FreeNonNullInline(void* ptr)
{
    YTALLOC_ASSERT(ptr);
    if (Y_LIKELY(reinterpret_cast<uintptr_t>(ptr) < UntaggedSmallZonesEnd)) {
        YTALLOC_PARANOID_ASSERT(reinterpret_cast<uintptr_t>(ptr) >= MinUntaggedSmallPtr && reinterpret_cast<uintptr_t>(ptr) < MaxUntaggedSmallPtr);
        TSmallAllocator::Free<EAllocationKind::Untagged>(ptr);
    } else if (Y_LIKELY(reinterpret_cast<uintptr_t>(ptr) < TaggedSmallZonesEnd)) {
        YTALLOC_PARANOID_ASSERT(reinterpret_cast<uintptr_t>(ptr) >= MinTaggedSmallPtr && reinterpret_cast<uintptr_t>(ptr) < MaxTaggedSmallPtr);
        TSmallAllocator::Free<EAllocationKind::Tagged>(ptr);
    } else {
        TBlobAllocator::Free(ptr);
    }
}

Y_FORCE_INLINE void FreeInline(void* ptr)
{
    if (Y_LIKELY(ptr)) {
        FreeNonNullInline(ptr);
    }
}
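
// Returns the usable size of an allocation by locating its zone and querying the owning
// allocator; a pointer outside all known zones triggers YTALLOC_TRAP.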
Y_FORCE_INLINE size_t GetAllocationSizeInline(const void* ptr)
{
    if (Y_UNLIKELY(!ptr)) {
        return 0;
    }
    auto uintptr = reinterpret_cast<uintptr_t>(ptr);
    if (uintptr < UntaggedSmallZonesEnd) {
        YTALLOC_PARANOID_ASSERT(uintptr >= MinUntaggedSmallPtr && uintptr < MaxUntaggedSmallPtr);
        return TSmallAllocator::GetAllocationSize(ptr);
    } else if (uintptr < TaggedSmallZonesEnd) {
        YTALLOC_PARANOID_ASSERT(uintptr >= MinTaggedSmallPtr && uintptr < MaxTaggedSmallPtr);
        return TSmallAllocator::GetAllocationSize(ptr);
    } else if (uintptr < LargeZoneEnd(true)) {
        YTALLOC_PARANOID_ASSERT(uintptr >= LargeZoneStart(true) && uintptr < LargeZoneEnd(true));
        return TLargeBlobAllocator<true>::GetAllocationSize(ptr);
    } else if (uintptr < LargeZoneEnd(false)) {
        YTALLOC_PARANOID_ASSERT(uintptr >= LargeZoneStart(false) && uintptr < LargeZoneEnd(false));
        return TLargeBlobAllocator<false>::GetAllocationSize(ptr);
    } else if (uintptr < HugeZoneEnd) {
        YTALLOC_PARANOID_ASSERT(uintptr >= HugeZoneStart && uintptr < HugeZoneEnd);
        return THugeBlobAllocator::GetAllocationSize(ptr);
    } else {
        YTALLOC_TRAP("Wrong ptr passed to GetAllocationSizeInline");
    }
}

Y_FORCE_INLINE size_t GetAllocationSizeInline(size_t size)
{
    if (size <= LargeAllocationSizeThreshold) {
        return TSmallAllocator::GetAllocationSize(size);
    } else if (size <= HugeAllocationSizeThreshold) {
        return TLargeBlobAllocator<true>::GetAllocationSize(size);
    } else {
        return THugeBlobAllocator::GetAllocationSize(size);
    }
}

void EnableLogging(TLogHandler logHandler)
{
    InitializeGlobals();
    LogManager->EnableLogging(logHandler);
}

void SetBacktraceProvider(TBacktraceProvider provider)
{
    InitializeGlobals();
    BacktraceManager->SetBacktraceProvider(provider);
    DumpableLargeBlobAllocator->SetBacktraceProvider(provider);
    UndumpableLargeBlobAllocator->SetBacktraceProvider(provider);
}

void SetBacktraceFormatter(TBacktraceFormatter provider)
{
    InitializeGlobals();
    MmapObservationManager->SetBacktraceFormatter(provider);
}

void EnableStockpile()
{
    InitializeGlobals();
    ConfigurationManager->EnableStockpile();
}

void SetStockpileInterval(TDuration value)
{
    InitializeGlobals();
    ConfigurationManager->SetStockpileInterval(value);
}

void SetStockpileThreadCount(int value)
{
    InitializeGlobals();
    ConfigurationManager->SetStockpileThreadCount(value);
}

void SetStockpileSize(size_t value)
{
    InitializeGlobals();
    ConfigurationManager->SetStockpileSize(value);
}

void SetLargeUnreclaimableCoeff(double value)
{
    InitializeGlobals();
    ConfigurationManager->SetLargeUnreclaimableCoeff(value);
}

void SetTimingEventThreshold(TDuration value)
{
    InitializeGlobals();
    ConfigurationManager->SetTimingEventThreshold(value);
}

void SetMinLargeUnreclaimableBytes(size_t value)
{
    InitializeGlobals();
    ConfigurationManager->SetMinLargeUnreclaimableBytes(value);
}

void SetMaxLargeUnreclaimableBytes(size_t value)
{
    InitializeGlobals();
    ConfigurationManager->SetMaxLargeUnreclaimableBytes(value);
}

void SetAllocationProfilingEnabled(bool value)
{
    ConfigurationManager->SetAllocationProfilingEnabled(value);
}

void SetAllocationProfilingSamplingRate(double rate)
{
    ConfigurationManager->SetAllocationProfilingSamplingRate(rate);
}

void SetSmallArenaAllocationProfilingEnabled(size_t rank, bool value)
{
    ConfigurationManager->SetSmallArenaAllocationProfilingEnabled(rank, value);
}

void SetLargeArenaAllocationProfilingEnabled(size_t rank, bool value)
{
    ConfigurationManager->SetLargeArenaAllocationProfilingEnabled(rank, value);
}

void SetProfilingBacktraceDepth(int depth)
{
    ConfigurationManager->SetProfilingBacktraceDepth(depth);
}

void SetMinProfilingBytesUsedToReport(size_t size)
{
    ConfigurationManager->SetMinProfilingBytesUsedToReport(size);
}

void SetEnableEagerMemoryRelease(bool value)
{
    ConfigurationManager->SetEnableEagerMemoryRelease(value);
}

void SetEnableMadvisePopulate(bool value)
{
    ConfigurationManager->SetEnableMadvisePopulate(value);
}

TEnumIndexedArray<ETotalCounter, ssize_t> GetTotalAllocationCounters()
{
    InitializeGlobals();
    return StatisticsManager->GetTotalAllocationCounters();
}

TEnumIndexedArray<ESystemCounter, ssize_t> GetSystemAllocationCounters()
{
    InitializeGlobals();
    return StatisticsManager->GetSystemAllocationCounters();
}

TEnumIndexedArray<ESystemCounter, ssize_t> GetUndumpableAllocationCounters()
{
    InitializeGlobals();
    return StatisticsManager->GetUndumpableAllocationCounters();
}

TEnumIndexedArray<ESmallCounter, ssize_t> GetSmallAllocationCounters()
{
    InitializeGlobals();
    return StatisticsManager->GetSmallAllocationCounters();
}

TEnumIndexedArray<ESmallCounter, ssize_t> GetLargeAllocationCounters()
{
    InitializeGlobals();
    return StatisticsManager->GetLargeAllocationCounters();
}

std::array<TEnumIndexedArray<ESmallArenaCounter, ssize_t>, SmallRankCount> GetSmallArenaAllocationCounters()
{
    InitializeGlobals();
    return StatisticsManager->GetSmallArenaAllocationCounters();
}

std::array<TEnumIndexedArray<ELargeArenaCounter, ssize_t>, LargeRankCount> GetLargeArenaAllocationCounters()
{
    InitializeGlobals();
    return StatisticsManager->GetLargeArenaAllocationCounters();
}

TEnumIndexedArray<EHugeCounter, ssize_t> GetHugeAllocationCounters()
{
    InitializeGlobals();
    return StatisticsManager->GetHugeAllocationCounters();
}
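
// Collects per-backtrace allocation counters for all profiling memory tags, skipping entries
// below the configured reporting threshold and tags without a captured backtrace
// (except the dedicated "unknown" tag).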
std::vector<TProfiledAllocation> GetProfiledAllocationStatistics()
{
    InitializeGlobals();
    if (!ConfigurationManager->IsAllocationProfilingEnabled()) {
        return {};
    }
    std::vector<TMemoryTag> tags;
    tags.reserve(MaxCapturedAllocationBacktraces + 1);
    for (TMemoryTag tag = AllocationProfilingMemoryTagBase;
         tag < AllocationProfilingMemoryTagBase + MaxCapturedAllocationBacktraces;
         ++tag)
    {
        tags.push_back(tag);
    }
    tags.push_back(AllocationProfilingUnknownMemoryTag);
    std::vector<TEnumIndexedArray<EBasicCounter, ssize_t>> counters;
    counters.resize(tags.size());
    StatisticsManager->GetTaggedMemoryCounters(tags.data(), tags.size(), counters.data());
    std::vector<TProfiledAllocation> statistics;
    for (size_t index = 0; index < tags.size(); ++index) {
        if (counters[index][EBasicCounter::BytesUsed] < static_cast<ssize_t>(ConfigurationManager->GetMinProfilingBytesUsedToReport())) {
            continue;
        }
        auto tag = tags[index];
        auto optionalBacktrace = BacktraceManager->FindBacktrace(tag);
        if (!optionalBacktrace && tag != AllocationProfilingUnknownMemoryTag) {
            continue;
        }
        statistics.push_back(TProfiledAllocation{
            optionalBacktrace.value_or(TBacktrace()),
            counters[index]
        });
    }
    return statistics;
}

TEnumIndexedArray<ETimingEventType, TTimingEventCounters> GetTimingEventCounters()
{
    InitializeGlobals();
    return TimingManager->GetTimingEventCounters();
}

////////////////////////////////////////////////////////////////////////////////

} // namespace NYT::NYTAlloc