- #pragma once
- // This file contains the core parts of YTAlloc but no malloc/free-bridge.
- // The latter bridge is placed into alloc.cpp, which includes (sic!) core-inl.h.
- // This ensures that AllocateInline/FreeInline calls are properly inlined into malloc/free.
- // Also core-inl.h can be directly included in, e.g., benchmarks.
- #include <library/cpp/yt/containers/intrusive_linked_list.h>
- #include <library/cpp/yt/memory/memory_tag.h>
- #include <library/cpp/yt/threading/at_fork.h>
- #include <library/cpp/yt/threading/fork_aware_spin_lock.h>
- #include <library/cpp/yt/memory/free_list.h>
- #include <util/system/tls.h>
- #include <util/system/align.h>
- #include <util/system/thread.h>
- #include <util/string/printf.h>
- #include <util/generic/singleton.h>
- #include <util/generic/size_literals.h>
- #include <util/generic/utility.h>
- #include <util/digest/numeric.h>
- #include <library/cpp/ytalloc/api/ytalloc.h>
- #include <atomic>
- #include <array>
- #include <vector>
- #include <mutex>
- #include <thread>
- #include <condition_variable>
- #include <cstdio>
- #include <optional>
- #include <sys/mman.h>
- #ifdef _linux_
- #include <sys/utsname.h>
- #endif
- #include <errno.h>
- #include <pthread.h>
- #include <time.h>
- #ifndef MAP_POPULATE
- #define MAP_POPULATE 0x08000
- #endif
- // Like MAP_FIXED, but doesn't unmap the underlying mapping.
- // Linux kernels older than 4.17 silently ignore this flag.
- #ifndef MAP_FIXED_NOREPLACE
- #ifdef _linux_
- #define MAP_FIXED_NOREPLACE 0x100000
- #else
- #define MAP_FIXED_NOREPLACE 0
- #endif
- #endif
- #ifndef MADV_POPULATE
- #define MADV_POPULATE 0x59410003
- #endif
- #ifndef MADV_STOCKPILE
- #define MADV_STOCKPILE 0x59410004
- #endif
- #ifndef MADV_FREE
- #define MADV_FREE 8
- #endif
- #ifndef MADV_DONTDUMP
- #define MADV_DONTDUMP 16
- #endif
- #ifndef NDEBUG
- #define YTALLOC_PARANOID
- #endif
- #ifdef YTALLOC_PARANOID
- #define YTALLOC_NERVOUS
- #endif
- #define YTALLOC_VERIFY(condition) \
- do { \
- if (Y_UNLIKELY(!(condition))) { \
- ::NYT::NYTAlloc::AssertTrap("Assertion failed: " #condition, __FILE__, __LINE__); \
- } \
- } while (false)
- #ifdef NDEBUG
- #define YTALLOC_ASSERT(condition) (true || (condition))
- #else
- #define YTALLOC_ASSERT(condition) YTALLOC_VERIFY(condition)
- #endif
- #ifdef YTALLOC_PARANOID
- #define YTALLOC_PARANOID_ASSERT(condition) YTALLOC_VERIFY(condition)
- #else
- #define YTALLOC_PARANOID_ASSERT(condition) (true || (condition))
- #endif
- #define YTALLOC_TRAP(message) ::NYT::NYTAlloc::AssertTrap(message, __FILE__, __LINE__)
- namespace NYT::NYTAlloc {
- ////////////////////////////////////////////////////////////////////////////////
- // Allocations are classified into three types:
- //
- // a) Small chunks (less than LargeAllocationSizeThreshold)
- // These are the fastest and are extensively cached (both per-thread and globally).
- // Memory claimed for these allocations is never reclaimed back.
- // Code dealing with such allocations is heavily optimized, with all hot paths
- // as streamlined as possible. The implementation is mostly inspired by LFAlloc.
- //
- // b) Large blobs (from LargeAllocationSizeThreshold to HugeAllocationSizeThreshold)
- // These are cached as well. We expect such allocations to be less frequent
- // than small ones but still do our best to provide good scalability.
- // In particular, thread-sharded concurrent data structures are used to provide access to
- // cached blobs. Memory is claimed via madvise(MADV_POPULATE) and reclaimed back
- // via madvise(MADV_FREE).
- //
- // c) Huge blobs (from HugeAllocationSizeThreshold)
- // These should be rare; we delegate directly to mmap and munmap for each allocation.
- //
- // We also provide a separate allocator for all system allocations (that are needed by YTAlloc itself).
- // These are rare and also delegate to mmap/munmap.
- // Period between background activities.
- constexpr auto BackgroundInterval = TDuration::Seconds(1);
- static_assert(LargeRankCount - MinLargeRank <= 16, "Too many large ranks");
- static_assert(SmallRankCount <= 32, "Too many small ranks");
- constexpr size_t SmallZoneSize = 1_TB;
- constexpr size_t LargeZoneSize = 16_TB;
- constexpr size_t HugeZoneSize = 1_TB;
- constexpr size_t SystemZoneSize = 1_TB;
- constexpr size_t MaxCachedChunksPerRank = 256;
- constexpr uintptr_t UntaggedSmallZonesStart = 0;
- constexpr uintptr_t UntaggedSmallZonesEnd = UntaggedSmallZonesStart + 32 * SmallZoneSize;
- constexpr uintptr_t MinUntaggedSmallPtr = UntaggedSmallZonesStart + SmallZoneSize * 1;
- constexpr uintptr_t MaxUntaggedSmallPtr = UntaggedSmallZonesStart + SmallZoneSize * SmallRankCount;
- constexpr uintptr_t TaggedSmallZonesStart = UntaggedSmallZonesEnd;
- constexpr uintptr_t TaggedSmallZonesEnd = TaggedSmallZonesStart + 32 * SmallZoneSize;
- constexpr uintptr_t MinTaggedSmallPtr = TaggedSmallZonesStart + SmallZoneSize * 1;
- constexpr uintptr_t MaxTaggedSmallPtr = TaggedSmallZonesStart + SmallZoneSize * SmallRankCount;
- constexpr uintptr_t DumpableLargeZoneStart = TaggedSmallZonesEnd;
- constexpr uintptr_t DumpableLargeZoneEnd = DumpableLargeZoneStart + LargeZoneSize;
- constexpr uintptr_t UndumpableLargeZoneStart = DumpableLargeZoneEnd;
- constexpr uintptr_t UndumpableLargeZoneEnd = UndumpableLargeZoneStart + LargeZoneSize;
- constexpr uintptr_t LargeZoneStart(bool dumpable)
- {
- return dumpable ? DumpableLargeZoneStart : UndumpableLargeZoneStart;
- }
- constexpr uintptr_t LargeZoneEnd(bool dumpable)
- {
- return dumpable ? DumpableLargeZoneEnd : UndumpableLargeZoneEnd;
- }
- constexpr uintptr_t HugeZoneStart = UndumpableLargeZoneEnd;
- constexpr uintptr_t HugeZoneEnd = HugeZoneStart + HugeZoneSize;
- constexpr uintptr_t SystemZoneStart = HugeZoneEnd;
- constexpr uintptr_t SystemZoneEnd = SystemZoneStart + SystemZoneSize;
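- // For reference, the virtual address layout implied by the constants above
- // (a worked-out summary only, not additional configuration; 1_TB == 1ULL << 40):
- //   [ 0_TB,  32_TB)  untagged small zones, one 1_TB zone per small rank
- //   [32_TB,  64_TB)  tagged small zones, one 1_TB zone per small rank
- //   [64_TB,  80_TB)  dumpable large zone
- //   [80_TB,  96_TB)  undumpable large zone
- //   [96_TB,  97_TB)  huge zone
- //   [97_TB,  98_TB)  system zone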
- // We leave 64_KB at the end of each 256_MB extent and never use it.
- // That serves two purposes:
- // 1. SmallExtentSize % SmallSegmentSize == 0
- // 2. Every small object satisfies the RightReadableArea requirement.
- constexpr size_t SmallExtentAllocSize = 256_MB;
- constexpr size_t SmallExtentSize = SmallExtentAllocSize - 64_KB;
- constexpr size_t SmallSegmentSize = 96_KB; // LCM(SmallRankToSize)
- constexpr ui16 SmallRankBatchSize[SmallRankCount] = {
- 0, 256, 256, 256, 256, 256, 256, 256, 256, 256, 192, 128, 96, 64, 48, 32, 24, 16, 12, 8, 6, 4, 3
- };
- constexpr bool CheckSmallSizes()
- {
- for (size_t rank = 0; rank < SmallRankCount; rank++) {
- auto size = SmallRankToSize[rank];
- if (size == 0) {
- continue;
- }
- if (SmallSegmentSize % size != 0) {
- return false;
- }
- if (SmallRankBatchSize[rank] > MaxCachedChunksPerRank) {
- return false;
- }
- }
- return true;
- }
- static_assert(CheckSmallSizes());
- static_assert(SmallExtentSize % SmallSegmentSize == 0);
- static_assert(SmallSegmentSize % PageSize == 0);
- constexpr size_t LargeExtentSize = 1_GB;
- static_assert(LargeExtentSize >= LargeAllocationSizeThreshold, "LargeExtentSize < LargeAllocationSizeThreshold");
- constexpr const char* BackgroundThreadName = "YTAllocBack";
- constexpr const char* StockpileThreadName = "YTAllocStock";
- DEFINE_ENUM(EAllocationKind,
- (Untagged)
- (Tagged)
- );
- // Forward declarations.
- struct TThreadState;
- struct TLargeArena;
- struct TLargeBlobExtent;
- ////////////////////////////////////////////////////////////////////////////////
- // Traps and assertions
- [[noreturn]]
- void OomTrap()
- {
- _exit(9);
- }
- [[noreturn]]
- void AssertTrap(const char* message, const char* file, int line)
- {
- ::fprintf(stderr, "*** YTAlloc has detected an internal trap at %s:%d\n*** %s\n",
- file,
- line,
- message);
- __builtin_trap();
- }
- template <class T, class E>
- void AssertBlobState(T* header, E expectedState)
- {
- auto actualState = header->State;
- if (Y_UNLIKELY(actualState != expectedState)) {
- char message[256];
- sprintf(message, "Invalid blob header state at %p: expected %" PRIx64 ", actual %" PRIx64,
- header,
- static_cast<ui64>(expectedState),
- static_cast<ui64>(actualState));
- YTALLOC_TRAP(message);
- }
- }
- ////////////////////////////////////////////////////////////////////////////////
- // Provides a never-dying singleton with explicit construction.
- template <class T>
- class TExplicitlyConstructableSingleton
- {
- public:
- TExplicitlyConstructableSingleton()
- { }
- ~TExplicitlyConstructableSingleton()
- { }
- template <class... Ts>
- void Construct(Ts&&... args)
- {
- new (&Storage_) T(std::forward<Ts>(args)...);
- #ifndef NDEBUG
- Constructed_ = true;
- #endif
- }
- Y_FORCE_INLINE T* Get()
- {
- #ifndef NDEBUG
- YTALLOC_PARANOID_ASSERT(Constructed_);
- #endif
- return &Storage_;
- }
- Y_FORCE_INLINE const T* Get() const
- {
- #ifndef NDEBUG
- YTALLOC_PARANOID_ASSERT(Constructed_);
- #endif
- return &Storage_;
- }
- Y_FORCE_INLINE T* operator->()
- {
- return Get();
- }
- Y_FORCE_INLINE const T* operator->() const
- {
- return Get();
- }
- Y_FORCE_INLINE T& operator*()
- {
- return *Get();
- }
- Y_FORCE_INLINE const T& operator*() const
- {
- return *Get();
- }
- private:
- union {
- T Storage_;
- };
- #ifndef NDEBUG
- bool Constructed_;
- #endif
- };
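- // Illustrative usage sketch (not part of the original file; TFoo is hypothetical).
- // The singletons below follow this pattern: declared at namespace scope, constructed
- // explicitly from InitializeGlobals(), then accessed via operator->:
- //
- //   TExplicitlyConstructableSingleton<TFoo> Foo;
- //
- //   void InitializeGlobals()
- //   {
- //       Foo.Construct(/* args */);
- //   }
- //
- //   void UseFoo()
- //   {
- //       Foo->DoSomething(); // asserts construction in debug builds
- //   }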
- ////////////////////////////////////////////////////////////////////////////////
- // Initializes all singletons.
- // Safe to call multiple times.
- // Guaranteed to not allocate.
- void InitializeGlobals();
- // Spawns the background thread, if it's time.
- // Safe to call multiple times.
- // Must be called on allocation slow path.
- void StartBackgroundThread();
- ////////////////////////////////////////////////////////////////////////////////
- class TLogManager
- {
- public:
- // Sets the handler to be invoked for each log event produced by YTAlloc.
- void EnableLogging(TLogHandler logHandler)
- {
- LogHandler_.store(logHandler);
- }
- // Checks (in a racy way) that logging is enabled.
- bool IsLoggingEnabled()
- {
- return LogHandler_.load() != nullptr;
- }
- // Logs the message via log handler (if any).
- template <class... Ts>
- void LogMessage(ELogEventSeverity severity, const char* format, Ts&&... args)
- {
- auto logHandler = LogHandler_.load();
- if (!logHandler) {
- return;
- }
- std::array<char, 16_KB> buffer;
- auto len = ::snprintf(buffer.data(), buffer.size(), format, std::forward<Ts>(args)...);
- if (len < 0) {
- return;
- }
- TLogEvent event;
- event.Severity = severity;
- event.Message = TStringBuf(buffer.data(), Min<size_t>(len, buffer.size() - 1));
- logHandler(event);
- }
- // A special case of zero args.
- void LogMessage(ELogEventSeverity severity, const char* message)
- {
- LogMessage(severity, "%s", message);
- }
- private:
- std::atomic<TLogHandler> LogHandler_ = nullptr;
- };
- TExplicitlyConstructableSingleton<TLogManager> LogManager;
- #define YTALLOC_LOG_EVENT(...) LogManager->LogMessage(__VA_ARGS__)
- #define YTALLOC_LOG_DEBUG(...) YTALLOC_LOG_EVENT(ELogEventSeverity::Debug, __VA_ARGS__)
- #define YTALLOC_LOG_INFO(...) YTALLOC_LOG_EVENT(ELogEventSeverity::Info, __VA_ARGS__)
- #define YTALLOC_LOG_WARNING(...) YTALLOC_LOG_EVENT(ELogEventSeverity::Warning, __VA_ARGS__)
- #define YTALLOC_LOG_ERROR(...) YTALLOC_LOG_EVENT(ELogEventSeverity::Error, __VA_ARGS__)
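- // Illustrative usage sketch (assumes TLogHandler from ytalloc.h is callable with a
- // const TLogEvent&; the public EnableLogging() entry point presumably forwards here):
- //
- //   void MyLogHandler(const TLogEvent& event)
- //   {
- //       ::fprintf(stderr, "[ytalloc] %.*s\n",
- //           static_cast<int>(event.Message.size()),
- //           event.Message.data());
- //   }
- //
- //   // During process startup:
- //   LogManager->EnableLogging(MyLogHandler);
- //
- //   // Inside the allocator, events are then emitted via the macros above:
- //   YTALLOC_LOG_DEBUG("Stockpile thread started");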
- ////////////////////////////////////////////////////////////////////////////////
- Y_FORCE_INLINE size_t GetUsed(ssize_t allocated, ssize_t freed)
- {
- return allocated >= freed ? static_cast<size_t>(allocated - freed) : 0;
- }
- template <class T>
- Y_FORCE_INLINE void* HeaderToPtr(T* header)
- {
- return header + 1;
- }
- template <class T>
- Y_FORCE_INLINE T* PtrToHeader(void* ptr)
- {
- return static_cast<T*>(ptr) - 1;
- }
- template <class T>
- Y_FORCE_INLINE const T* PtrToHeader(const void* ptr)
- {
- return static_cast<const T*>(ptr) - 1;
- }
- Y_FORCE_INLINE size_t PtrToSmallRank(const void* ptr)
- {
- return (reinterpret_cast<uintptr_t>(ptr) >> 40) & 0x1f;
- }
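- // Worked example: small zones are 1_TB == 1ULL << 40 bytes each, so bits 40..44 of a
- // small pointer encode its rank, while bit 45 (the 32_TB offset) distinguishes tagged
- // from untagged zones and is masked out. An untagged rank-3 pointer in [3_TB, 4_TB)
- // and a tagged one in [35_TB, 36_TB) both map to rank 3 here.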
- Y_FORCE_INLINE char* AlignDownToSmallSegment(char* extent, char* ptr)
- {
- auto offset = static_cast<uintptr_t>(ptr - extent);
- // NB: Since SmallSegmentSize is a compile-time constant, the compiler lowers this modulo to a multiplication.
- offset -= (offset % SmallSegmentSize);
- return extent + offset;
- }
- Y_FORCE_INLINE char* AlignUpToSmallSegment(char* extent, char* ptr)
- {
- return AlignDownToSmallSegment(extent, ptr + SmallSegmentSize - 1);
- }
- template <class T>
- static Y_FORCE_INLINE void UnalignPtr(void*& ptr)
- {
- if (reinterpret_cast<uintptr_t>(ptr) % PageSize == 0) {
- reinterpret_cast<char*&>(ptr) -= PageSize - sizeof (T);
- }
- YTALLOC_PARANOID_ASSERT(reinterpret_cast<uintptr_t>(ptr) % PageSize == sizeof (T));
- }
- template <class T>
- static Y_FORCE_INLINE void UnalignPtr(const void*& ptr)
- {
- if (reinterpret_cast<uintptr_t>(ptr) % PageSize == 0) {
- reinterpret_cast<const char*&>(ptr) -= PageSize - sizeof (T);
- }
- YTALLOC_PARANOID_ASSERT(reinterpret_cast<uintptr_t>(ptr) % PageSize == sizeof (T));
- }
- template <class T>
- Y_FORCE_INLINE size_t GetRawBlobSize(size_t size)
- {
- return AlignUp(size + sizeof (T) + RightReadableAreaSize, PageSize);
- }
- template <class T>
- Y_FORCE_INLINE size_t GetBlobAllocationSize(size_t size)
- {
- size += sizeof(T);
- size += RightReadableAreaSize;
- size = AlignUp(size, PageSize);
- size -= sizeof(T);
- size -= RightReadableAreaSize;
- return size;
- }
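- // Worked example of the two helpers above (values assumed for illustration:
- // PageSize == 4_KB, sizeof(T) == 16, RightReadableAreaSize == 16):
- // GetRawBlobSize<T>(10000) == AlignUp(10000 + 16 + 16, 4096) == 12288, i.e. 3 pages;
- // GetBlobAllocationSize<T>(10000) == 12288 - 16 - 16 == 12256, the largest payload
- // that still fits into those same 3 pages together with the header and the
- // right readable area.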
- Y_FORCE_INLINE size_t GetLargeRank(size_t size)
- {
- size_t rank = 64 - __builtin_clzl(size);
- if (size == (1ULL << (rank - 1))) {
- --rank;
- }
- return rank;
- }
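- // Worked example: GetLargeRank returns the smallest rank such that size <= (1ULL << rank),
- // i.e. ceil(log2(size)). E.g. GetLargeRank(32_KB) == 15, while GetLargeRank(32_KB + 1) == 16.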
- Y_FORCE_INLINE void PoisonRange(void* ptr, size_t size, ui32 magic)
- {
- #ifdef YTALLOC_PARANOID
- size = ::AlignUp<size_t>(size, 4);
- std::fill(static_cast<ui32*>(ptr), static_cast<ui32*>(ptr) + size / 4, magic);
- #else
- Y_UNUSED(ptr);
- Y_UNUSED(size);
- Y_UNUSED(magic);
- #endif
- }
- Y_FORCE_INLINE void PoisonFreedRange(void* ptr, size_t size)
- {
- PoisonRange(ptr, size, 0xdeadbeef);
- }
- Y_FORCE_INLINE void PoisonUninitializedRange(void* ptr, size_t size)
- {
- PoisonRange(ptr, size, 0xcafebabe);
- }
- // Checks that the header size is divisible by 16 (as needed due to alignment restrictions).
- #define CHECK_HEADER_ALIGNMENT(T) static_assert(sizeof(T) % 16 == 0, "sizeof(" #T ") % 16 != 0");
- ////////////////////////////////////////////////////////////////////////////////
- static_assert(sizeof(TFreeList<void>) == CacheLineSize, "sizeof(TFreeList) != CacheLineSize");
- ////////////////////////////////////////////////////////////////////////////////
- constexpr size_t ShardCount = 16;
- std::atomic<size_t> GlobalCurrentShardIndex;
- // Provides a context for working with sharded data structures.
- // Captures the initial shard index upon construction (indicating the shard
- // where all insertions go). Maintains the current shard index (round-robin,
- // indicating the shard currently used for extraction).
- // May or may not be thread-safe, depending on TCounter.
- template <class TCounter>
- class TShardedState
- {
- public:
- TShardedState()
- : InitialShardIndex_(GlobalCurrentShardIndex++ % ShardCount)
- , CurrentShardIndex_(InitialShardIndex_)
- { }
- Y_FORCE_INLINE size_t GetInitialShardIndex() const
- {
- return InitialShardIndex_;
- }
- Y_FORCE_INLINE size_t GetNextShardIndex()
- {
- return ++CurrentShardIndex_ % ShardCount;
- }
- private:
- const size_t InitialShardIndex_;
- TCounter CurrentShardIndex_;
- };
- using TLocalShardedState = TShardedState<size_t>;
- using TGlobalShardedState = TShardedState<std::atomic<size_t>>;
- // Implemented as a collection of free lists (each called a shard).
- // A TShardedState is needed to access the sharded data structure.
- template <class T>
- class TShardedFreeList
- {
- public:
- // First tries to extract an item from the initial shard;
- // if failed then proceeds to all shards in round-robin fashion.
- template <class TState>
- T* Extract(TState* state)
- {
- if (auto* item = Shards_[state->GetInitialShardIndex()].Extract()) {
- return item;
- }
- return ExtractRoundRobin(state);
- }
- // Attempts to extract an item from all shards in round-robin fashion.
- template <class TState>
- T* ExtractRoundRobin(TState* state)
- {
- for (size_t index = 0; index < ShardCount; ++index) {
- if (auto* item = Shards_[state->GetNextShardIndex()].Extract()) {
- return item;
- }
- }
- return nullptr;
- }
- // Extracts items from all shards, linking them together.
- T* ExtractAll()
- {
- T* head = nullptr;
- T* tail = nullptr;
- for (auto& shard : Shards_) {
- auto* item = shard.ExtractAll();
- if (!head) {
- head = item;
- }
- if (tail) {
- YTALLOC_PARANOID_ASSERT(!tail->Next);
- tail->Next = item;
- } else {
- tail = item;
- }
- while (tail && tail->Next) {
- tail = tail->Next;
- }
- }
- return head;
- }
- template <class TState>
- void Put(TState* state, T* item)
- {
- Shards_[state->GetInitialShardIndex()].Put(item);
- }
- private:
- std::array<TFreeList<T>, ShardCount> Shards_;
- };
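- // Illustrative usage sketch (not part of the original file; TItem is hypothetical,
- // its exact intrusive-hook requirements come from TFreeList in free_list.h):
- //
- //   struct TItem
- //   {
- //       TItem* Next = nullptr; // intrusive link consumed by TFreeList
- //   };
- //
- //   TGlobalShardedState State;        // thread-safe shard cursor
- //   TShardedFreeList<TItem> FreeList;
- //
- //   FreeList.Put(&State, item);              // goes into the state's initial shard
- //   auto* extracted = FreeList.Extract(&State); // initial shard first, then round-robin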
- ////////////////////////////////////////////////////////////////////////////////
- // Holds YTAlloc control knobs.
- // Thread safe.
- class TConfigurationManager
- {
- public:
- void SetLargeUnreclaimableCoeff(double value)
- {
- LargeUnreclaimableCoeff_.store(value);
- }
- double GetLargeUnreclaimableCoeff() const
- {
- return LargeUnreclaimableCoeff_.load(std::memory_order_relaxed);
- }
- void SetMinLargeUnreclaimableBytes(size_t value)
- {
- MinLargeUnreclaimableBytes_.store(value);
- }
- void SetMaxLargeUnreclaimableBytes(size_t value)
- {
- MaxLargeUnreclaimableBytes_.store(value);
- }
- size_t GetMinLargeUnreclaimableBytes() const
- {
- return MinLargeUnreclaimableBytes_.load(std::memory_order_relaxed);
- }
- size_t GetMaxLargeUnreclaimableBytes() const
- {
- return MaxLargeUnreclaimableBytes_.load(std::memory_order_relaxed);
- }
- void SetTimingEventThreshold(TDuration value)
- {
- TimingEventThresholdNs_.store(value.MicroSeconds() * 1000);
- }
- i64 GetTimingEventThresholdNs() const
- {
- return TimingEventThresholdNs_.load(std::memory_order_relaxed);
- }
- void SetAllocationProfilingEnabled(bool value);
- bool IsAllocationProfilingEnabled() const
- {
- return AllocationProfilingEnabled_.load();
- }
- Y_FORCE_INLINE double GetAllocationProfilingSamplingRate()
- {
- return AllocationProfilingSamplingRate_.load();
- }
- void SetAllocationProfilingSamplingRate(double rate)
- {
- if (rate < 0) {
- rate = 0;
- }
- if (rate > 1) {
- rate = 1;
- }
- i64 rateX64K = static_cast<i64>(rate * (1ULL << 16));
- AllocationProfilingSamplingRateX64K_.store(ClampVal<ui32>(rateX64K, 0, std::numeric_limits<ui16>::max() + 1));
- AllocationProfilingSamplingRate_.store(rate);
- }
- Y_FORCE_INLINE bool IsSmallArenaAllocationProfilingEnabled(size_t rank)
- {
- return SmallArenaAllocationProfilingEnabled_[rank].load(std::memory_order_relaxed);
- }
- Y_FORCE_INLINE bool IsSmallArenaAllocationProfiled(size_t rank)
- {
- return IsSmallArenaAllocationProfilingEnabled(rank) && IsAllocationSampled();
- }
- void SetSmallArenaAllocationProfilingEnabled(size_t rank, bool value)
- {
- if (rank >= SmallRankCount) {
- return;
- }
- SmallArenaAllocationProfilingEnabled_[rank].store(value);
- }
- Y_FORCE_INLINE bool IsLargeArenaAllocationProfilingEnabled(size_t rank)
- {
- return LargeArenaAllocationProfilingEnabled_[rank].load(std::memory_order_relaxed);
- }
- Y_FORCE_INLINE bool IsLargeArenaAllocationProfiled(size_t rank)
- {
- return IsLargeArenaAllocationProfilingEnabled(rank) && IsAllocationSampled();
- }
- void SetLargeArenaAllocationProfilingEnabled(size_t rank, bool value)
- {
- if (rank >= LargeRankCount) {
- return;
- }
- LargeArenaAllocationProfilingEnabled_[rank].store(value);
- }
- Y_FORCE_INLINE int GetProfilingBacktraceDepth()
- {
- return ProfilingBacktraceDepth_.load();
- }
- void SetProfilingBacktraceDepth(int depth)
- {
- if (depth < 1) {
- return;
- }
- if (depth > MaxAllocationProfilingBacktraceDepth) {
- depth = MaxAllocationProfilingBacktraceDepth;
- }
- ProfilingBacktraceDepth_.store(depth);
- }
- Y_FORCE_INLINE size_t GetMinProfilingBytesUsedToReport()
- {
- return MinProfilingBytesUsedToReport_.load();
- }
- void SetMinProfilingBytesUsedToReport(size_t size)
- {
- MinProfilingBytesUsedToReport_.store(size);
- }
- void SetEnableEagerMemoryRelease(bool value)
- {
- EnableEagerMemoryRelease_.store(value);
- }
- bool GetEnableEagerMemoryRelease()
- {
- return EnableEagerMemoryRelease_.load(std::memory_order_relaxed);
- }
- void SetEnableMadvisePopulate(bool value)
- {
- EnableMadvisePopulate_.store(value);
- }
- bool GetEnableMadvisePopulate()
- {
- return EnableMadvisePopulate_.load(std::memory_order_relaxed);
- }
- void EnableStockpile()
- {
- StockpileEnabled_.store(true);
- }
- bool IsStockpileEnabled()
- {
- return StockpileEnabled_.load();
- }
- void SetStockpileInterval(TDuration value)
- {
- StockpileInterval_.store(value);
- }
- TDuration GetStockpileInterval()
- {
- return StockpileInterval_.load();
- }
- void SetStockpileThreadCount(int count)
- {
- StockpileThreadCount_.store(count);
- }
- int GetStockpileThreadCount()
- {
- return ClampVal(StockpileThreadCount_.load(), 0, MaxStockpileThreadCount);
- }
- void SetStockpileSize(size_t value)
- {
- StockpileSize_.store(value);
- }
- size_t GetStockpileSize()
- {
- return StockpileSize_.load();
- }
- private:
- std::atomic<double> LargeUnreclaimableCoeff_ = 0.05;
- std::atomic<size_t> MinLargeUnreclaimableBytes_ = 128_MB;
- std::atomic<size_t> MaxLargeUnreclaimableBytes_ = 10_GB;
- std::atomic<i64> TimingEventThresholdNs_ = 10000000; // in ns, 10 ms by default
- std::atomic<bool> AllocationProfilingEnabled_ = false;
- std::atomic<double> AllocationProfilingSamplingRate_ = 1.0;
- std::atomic<ui32> AllocationProfilingSamplingRateX64K_ = std::numeric_limits<ui32>::max();
- std::array<std::atomic<bool>, SmallRankCount> SmallArenaAllocationProfilingEnabled_ = {};
- std::array<std::atomic<bool>, LargeRankCount> LargeArenaAllocationProfilingEnabled_ = {};
- std::atomic<int> ProfilingBacktraceDepth_ = 10;
- std::atomic<size_t> MinProfilingBytesUsedToReport_ = 1_MB;
- std::atomic<bool> EnableEagerMemoryRelease_ = true;
- std::atomic<bool> EnableMadvisePopulate_ = false;
- std::atomic<bool> StockpileEnabled_ = false;
- std::atomic<TDuration> StockpileInterval_ = TDuration::MilliSeconds(10);
- static constexpr int MaxStockpileThreadCount = 8;
- std::atomic<int> StockpileThreadCount_ = 4;
- std::atomic<size_t> StockpileSize_ = 1_GB;
- private:
- bool IsAllocationSampled()
- {
- Y_POD_STATIC_THREAD(ui16) Counter;
- return Counter++ < AllocationProfilingSamplingRateX64K_.load();
- }
- };
- TExplicitlyConstructableSingleton<TConfigurationManager> ConfigurationManager;
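- // Illustrative configuration sketch: the public NYT::NYTAlloc knobs are expected to
- // forward to this singleton, e.g.:
- //
- //   ConfigurationManager->SetLargeUnreclaimableCoeff(0.1);
- //   ConfigurationManager->SetTimingEventThreshold(TDuration::MilliSeconds(50));
- //   ConfigurationManager->EnableStockpile();
- //   ConfigurationManager->SetStockpileThreadCount(2);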
- ////////////////////////////////////////////////////////////////////////////////
- template <class TEvent, class TManager>
- class TEventLogManagerBase
- {
- public:
- void DisableForCurrentThread()
- {
- TManager::DisabledForCurrentThread_ = true;
- }
- template <class... TArgs>
- void EnqueueEvent(TArgs&&... args)
- {
- if (TManager::DisabledForCurrentThread_) {
- return;
- }
- auto timestamp = TInstant::Now();
- auto fiberId = NYTAlloc::GetCurrentFiberId();
- auto guard = Guard(EventLock_);
- auto event = TEvent(std::forward<TArgs>(args)...);
- OnEvent(event);
- if (EventCount_ >= EventBufferSize) {
- return;
- }
- auto& enqueuedEvent = Events_[EventCount_++];
- enqueuedEvent = std::move(event);
- enqueuedEvent.Timestamp = timestamp;
- enqueuedEvent.FiberId = fiberId;
- }
- void RunBackgroundTasks()
- {
- if (LogManager->IsLoggingEnabled()) {
- for (const auto& event : PullEvents()) {
- ProcessEvent(event);
- }
- }
- }
- protected:
- NThreading::TForkAwareSpinLock EventLock_;
- virtual void OnEvent(const TEvent& event) = 0;
- virtual void ProcessEvent(const TEvent& event) = 0;
- private:
- static constexpr size_t EventBufferSize = 1000;
- size_t EventCount_ = 0;
- std::array<TEvent, EventBufferSize> Events_;
- std::vector<TEvent> PullEvents()
- {
- std::vector<TEvent> events;
- events.reserve(EventBufferSize);
- auto guard = Guard(EventLock_);
- for (size_t index = 0; index < EventCount_; ++index) {
- events.push_back(Events_[index]);
- }
- EventCount_ = 0;
- return events;
- }
- };
- ////////////////////////////////////////////////////////////////////////////////
- struct TTimingEvent
- {
- ETimingEventType Type;
- TDuration Duration;
- size_t Size;
- TInstant Timestamp;
- TFiberId FiberId;
- TTimingEvent()
- { }
- TTimingEvent(
- ETimingEventType type,
- TDuration duration,
- size_t size)
- : Type(type)
- , Duration(duration)
- , Size(size)
- { }
- };
- class TTimingManager
- : public TEventLogManagerBase<TTimingEvent, TTimingManager>
- {
- public:
- TEnumIndexedArray<ETimingEventType, TTimingEventCounters> GetTimingEventCounters()
- {
- auto guard = Guard(EventLock_);
- return EventCounters_;
- }
- private:
- TEnumIndexedArray<ETimingEventType, TTimingEventCounters> EventCounters_;
- Y_POD_STATIC_THREAD(bool) DisabledForCurrentThread_;
- friend class TEventLogManagerBase<TTimingEvent, TTimingManager>;
- virtual void OnEvent(const TTimingEvent& event) override
- {
- auto& counters = EventCounters_[event.Type];
- counters.Count += 1;
- counters.Size += event.Size;
- }
- virtual void ProcessEvent(const TTimingEvent& event) override
- {
- YTALLOC_LOG_DEBUG("Timing event logged (Type: %s, Duration: %s, Size: %zu, Timestamp: %s, FiberId: %" PRIu64 ")",
- ToString(event.Type).c_str(),
- ToString(event.Duration).c_str(),
- event.Size,
- ToString(event.Timestamp).c_str(),
- event.FiberId);
- }
- };
- Y_POD_THREAD(bool) TTimingManager::DisabledForCurrentThread_;
- TExplicitlyConstructableSingleton<TTimingManager> TimingManager;
- ////////////////////////////////////////////////////////////////////////////////
- i64 GetElapsedNs(const struct timespec& startTime, const struct timespec& endTime)
- {
- if (Y_LIKELY(startTime.tv_sec == endTime.tv_sec)) {
- return static_cast<i64>(endTime.tv_nsec) - static_cast<i64>(startTime.tv_nsec);
- }
- return
- static_cast<i64>(endTime.tv_nsec) - static_cast<i64>(startTime.tv_nsec) +
- (static_cast<i64>(endTime.tv_sec) - static_cast<i64>(startTime.tv_sec)) * 1000000000;
- }
- // Used to log statistics about long-running syscalls and lock acquisitions.
- class TTimingGuard
- : public TNonCopyable
- {
- public:
- explicit TTimingGuard(ETimingEventType eventType, size_t size = 0)
- : EventType_(eventType)
- , Size_(size)
- {
- ::clock_gettime(CLOCK_MONOTONIC, &StartTime_);
- }
- ~TTimingGuard()
- {
- auto elapsedNs = GetElapsedNs();
- if (elapsedNs > ConfigurationManager->GetTimingEventThresholdNs()) {
- TimingManager->EnqueueEvent(EventType_, TDuration::MicroSeconds(elapsedNs / 1000), Size_);
- }
- }
- private:
- const ETimingEventType EventType_;
- const size_t Size_;
- struct timespec StartTime_;
- i64 GetElapsedNs() const
- {
- struct timespec endTime;
- ::clock_gettime(CLOCK_MONOTONIC, &endTime);
- return NYTAlloc::GetElapsedNs(StartTime_, endTime);
- }
- };
- template <class T>
- Y_FORCE_INLINE TGuard<T> GuardWithTiming(const T& lock)
- {
- TTimingGuard timingGuard(ETimingEventType::Locking);
- TGuard<T> lockingGuard(lock);
- return lockingGuard;
- }
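- // Illustrative usage sketch: wrap a potentially long syscall or lock acquisition so
- // that it gets reported to TimingManager once it exceeds the configured threshold
- // (SomeLock is hypothetical):
- //
- //   {
- //       TTimingGuard timingGuard(ETimingEventType::Munmap, size);
- //       ::munmap(ptr, size);
- //   } // the destructor enqueues a timing event if the call took too long
- //
- //   auto guard = GuardWithTiming(SomeLock); // same idea for lock acquisitions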
- ////////////////////////////////////////////////////////////////////////////////
- // A wrapper for mmap, munmap, and madvise calls.
- // The latter are invoked with MADV_POPULATE (if enabled) and MADV_FREE flags
- // and may fail if the OS support is missing. These failures are logged (once) and
- // handled as follows:
- // * if MADV_POPULATE fails then we fallback to manual per-page prefault
- // for all subsequent attempts;
- // * if MADV_FREE fails then it (and all subsequent attempts) is replaced with MADV_DONTNEED
- // (which is non-lazy and less efficient but works everywhere).
- // Also this class mlocks all VMAs on startup to prevent pagefaults in our heavy binaries
- // from disturbing latency tails.
- class TMappedMemoryManager
- {
- public:
- void* Map(uintptr_t hint, size_t size, int flags)
- {
- TTimingGuard timingGuard(ETimingEventType::Mmap, size);
- auto* result = ::mmap(
- reinterpret_cast<void*>(hint),
- size,
- PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANONYMOUS | flags,
- -1,
- 0);
- if (result == MAP_FAILED) {
- auto error = errno;
- if (error == EEXIST && (flags & MAP_FIXED_NOREPLACE)) {
- // Caller must retry with different hint address.
- return result;
- }
- YTALLOC_VERIFY(error == ENOMEM);
- ::fprintf(stderr, "*** YTAlloc has received ENOMEM error while trying to mmap %zu bytes\n",
- size);
- OomTrap();
- }
- return result;
- }
- void Unmap(void* ptr, size_t size)
- {
- TTimingGuard timingGuard(ETimingEventType::Munmap, size);
- auto result = ::munmap(ptr, size);
- YTALLOC_VERIFY(result == 0);
- }
- void DontDump(void* ptr, size_t size)
- {
- auto result = ::madvise(ptr, size, MADV_DONTDUMP);
- // Must not fail.
- YTALLOC_VERIFY(result == 0);
- }
- void PopulateFile(void* ptr, size_t size)
- {
- TTimingGuard timingGuard(ETimingEventType::FilePrefault, size);
- auto* begin = static_cast<volatile char*>(ptr);
- for (auto* current = begin; current < begin + size; current += PageSize) {
- *current;
- }
- }
- void PopulateReadOnly(void* ptr, size_t size)
- {
- if (!MadvisePopulateUnavailable_.load(std::memory_order_relaxed) &&
- ConfigurationManager->GetEnableMadvisePopulate())
- {
- if (!TryMadvisePopulate(ptr, size)) {
- MadvisePopulateUnavailable_.store(true);
- }
- }
- }
- void Populate(void* ptr, size_t size)
- {
- if (MadvisePopulateUnavailable_.load(std::memory_order_relaxed) ||
- !ConfigurationManager->GetEnableMadvisePopulate())
- {
- DoPrefault(ptr, size);
- } else if (!TryMadvisePopulate(ptr, size)) {
- MadvisePopulateUnavailable_.store(true);
- DoPrefault(ptr, size);
- }
- }
- void Release(void* ptr, size_t size)
- {
- if (CanUseMadviseFree() && !ConfigurationManager->GetEnableEagerMemoryRelease()) {
- DoMadviseFree(ptr, size);
- } else {
- DoMadviseDontNeed(ptr, size);
- }
- }
- bool Stockpile(size_t size)
- {
- if (MadviseStockpileUnavailable_.load(std::memory_order_relaxed)) {
- return false;
- }
- if (!TryMadviseStockpile(size)) {
- MadviseStockpileUnavailable_.store(true);
- return false;
- }
- return true;
- }
- void RunBackgroundTasks()
- {
- if (!LogManager->IsLoggingEnabled()) {
- return;
- }
- if (IsBuggyKernel() && !BuggyKernelLogged_) {
- YTALLOC_LOG_WARNING("Kernel is buggy; see KERNEL-118");
- BuggyKernelLogged_ = true;
- }
- if (MadviseFreeSupported_ && !MadviseFreeSupportedLogged_) {
- YTALLOC_LOG_INFO("MADV_FREE is supported");
- MadviseFreeSupportedLogged_ = true;
- }
- if (MadviseFreeNotSupported_ && !MadviseFreeNotSupportedLogged_) {
- YTALLOC_LOG_WARNING("MADV_FREE is not supported");
- MadviseFreeNotSupportedLogged_ = true;
- }
- if (MadvisePopulateUnavailable_.load() && !MadvisePopulateUnavailableLogged_) {
- YTALLOC_LOG_WARNING("MADV_POPULATE is not supported");
- MadvisePopulateUnavailableLogged_ = true;
- }
- if (MadviseStockpileUnavailable_.load() && !MadviseStockpileUnavailableLogged_) {
- YTALLOC_LOG_WARNING("MADV_STOCKPILE is not supported");
- MadviseStockpileUnavailableLogged_ = true;
- }
- }
- private:
- bool BuggyKernelLogged_ = false;
- std::atomic<bool> MadviseFreeSupported_ = false;
- bool MadviseFreeSupportedLogged_ = false;
- std::atomic<bool> MadviseFreeNotSupported_ = false;
- bool MadviseFreeNotSupportedLogged_ = false;
- std::atomic<bool> MadvisePopulateUnavailable_ = false;
- bool MadvisePopulateUnavailableLogged_ = false;
- std::atomic<bool> MadviseStockpileUnavailable_ = false;
- bool MadviseStockpileUnavailableLogged_ = false;
- private:
- bool TryMadvisePopulate(void* ptr, size_t size)
- {
- TTimingGuard timingGuard(ETimingEventType::MadvisePopulate, size);
- auto result = ::madvise(ptr, size, MADV_POPULATE);
- if (result != 0) {
- auto error = errno;
- YTALLOC_VERIFY(error == EINVAL || error == ENOMEM);
- if (error == ENOMEM) {
- ::fprintf(stderr, "*** YTAlloc has received ENOMEM error while trying to madvise(MADV_POPULATE) %zu bytes\n",
- size);
- OomTrap();
- }
- return false;
- }
- return true;
- }
- void DoPrefault(void* ptr, size_t size)
- {
- TTimingGuard timingGuard(ETimingEventType::Prefault, size);
- auto* begin = static_cast<char*>(ptr);
- for (auto* current = begin; current < begin + size; current += PageSize) {
- *current = 0;
- }
- }
- bool CanUseMadviseFree()
- {
- if (MadviseFreeSupported_.load()) {
- return true;
- }
- if (MadviseFreeNotSupported_.load()) {
- return false;
- }
- if (IsBuggyKernel()) {
- MadviseFreeNotSupported_.store(true);
- } else {
- auto* ptr = Map(0, PageSize, 0);
- if (::madvise(ptr, PageSize, MADV_FREE) == 0) {
- MadviseFreeSupported_.store(true);
- } else {
- MadviseFreeNotSupported_.store(true);
- }
- Unmap(ptr, PageSize);
- }
- // Recurses at most once: one of the flags is set by now.
- return CanUseMadviseFree();
- }
- void DoMadviseDontNeed(void* ptr, size_t size)
- {
- TTimingGuard timingGuard(ETimingEventType::MadviseDontNeed, size);
- auto result = ::madvise(ptr, size, MADV_DONTNEED);
- if (result != 0) {
- auto error = errno;
- // Failure is possible for locked pages.
- Y_ABORT_UNLESS(error == EINVAL);
- }
- }
- void DoMadviseFree(void* ptr, size_t size)
- {
- TTimingGuard timingGuard(ETimingEventType::MadviseFree, size);
- auto result = ::madvise(ptr, size, MADV_FREE);
- if (result != 0) {
- auto error = errno;
- // Failure is possible for locked pages.
- YTALLOC_VERIFY(error == EINVAL);
- }
- }
- bool TryMadviseStockpile(size_t size)
- {
- auto result = ::madvise(nullptr, size, MADV_STOCKPILE);
- if (result != 0) {
- auto error = errno;
- if (error == ENOMEM || error == EAGAIN || error == EINTR) {
- // The call is advisory, ignore ENOMEM, EAGAIN, and EINTR.
- return true;
- }
- YTALLOC_VERIFY(error == EINVAL);
- return false;
- }
- return true;
- }
- // Some kernels are known to contain bugs in MADV_FREE; see https://st.yandex-team.ru/KERNEL-118.
- bool IsBuggyKernel()
- {
- #ifdef _linux_
- static const bool result = [] () {
- struct utsname buf;
- YTALLOC_VERIFY(uname(&buf) == 0);
- if (strverscmp(buf.release, "4.4.1-1") >= 0 &&
- strverscmp(buf.release, "4.4.96-44") < 0)
- {
- return true;
- }
- if (strverscmp(buf.release, "4.14.1-1") >= 0 &&
- strverscmp(buf.release, "4.14.79-33") < 0)
- {
- return true;
- }
- return false;
- }();
- return result;
- #else
- return false;
- #endif
- }
- };
- TExplicitlyConstructableSingleton<TMappedMemoryManager> MappedMemoryManager;
- ////////////////////////////////////////////////////////////////////////////////
- // System allocator
- // Each system allocation is prepended with such a header.
- struct TSystemBlobHeader
- {
- explicit TSystemBlobHeader(size_t size)
- : Size(size)
- { }
- size_t Size;
- char Padding[8];
- };
- CHECK_HEADER_ALIGNMENT(TSystemBlobHeader)
- // Used for some internal allocations.
- // Delegates directly to TMappedMemoryManager.
- class TSystemAllocator
- {
- public:
- void* Allocate(size_t size);
- void Free(void* ptr);
- private:
- std::atomic<uintptr_t> CurrentPtr_ = SystemZoneStart;
- };
- TExplicitlyConstructableSingleton<TSystemAllocator> SystemAllocator;
- ////////////////////////////////////////////////////////////////////////////////
- // Deriving from this class makes instances bound to TSystemAllocator.
- struct TSystemAllocatable
- {
- void* operator new(size_t size) noexcept
- {
- return SystemAllocator->Allocate(size);
- }
- void* operator new[](size_t size) noexcept
- {
- return SystemAllocator->Allocate(size);
- }
- void operator delete(void* ptr) noexcept
- {
- SystemAllocator->Free(ptr);
- }
- void operator delete[](void* ptr) noexcept
- {
- SystemAllocator->Free(ptr);
- }
- };
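- // Illustrative usage sketch: internal bookkeeping structures derive from
- // TSystemAllocatable so that their dynamic allocations bypass YTAlloc proper
- // (cf. TTaggedTotalCounterSet below). THypotheticalNode is made up:
- //
- //   struct THypotheticalNode
- //       : public TSystemAllocatable
- //   {
- //       size_t Value = 0;
- //   };
- //
- //   auto* node = new THypotheticalNode(); // served by TSystemAllocator
- //   delete node;                          // returned to TSystemAllocator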
- ////////////////////////////////////////////////////////////////////////////////
- // Maintains a pool of objects.
- // Objects are allocated in groups each containing BatchSize instances.
- // The actual allocation is carried out by TSystemAllocator.
- // Memory is never actually reclaimed; freed instances are put into TFreeList.
- template <class T, size_t BatchSize>
- class TSystemPool
- {
- public:
- T* Allocate()
- {
- while (true) {
- auto* obj = FreeList_.Extract();
- if (Y_LIKELY(obj)) {
- new (obj) T();
- return obj;
- }
- AllocateMore();
- }
- }
- void Free(T* obj)
- {
- obj->T::~T();
- PoisonFreedRange(obj, sizeof(T));
- FreeList_.Put(obj);
- }
- private:
- TFreeList<T> FreeList_;
- private:
- void AllocateMore()
- {
- auto* objs = static_cast<T*>(SystemAllocator->Allocate(sizeof(T) * BatchSize));
- for (size_t index = 0; index < BatchSize; ++index) {
- auto* obj = objs + index;
- FreeList_.Put(obj);
- }
- }
- };
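- // Illustrative usage sketch (TThingy and the batch size are hypothetical):
- //
- //   TSystemPool<TThingy, 64> ThingyPool;
- //
- //   auto* thingy = ThingyPool.Allocate(); // default-constructed, recycled via the free list
- //   ...
- //   ThingyPool.Free(thingy);              // destroyed, poisoned in paranoid builds, kept for reuse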
- // A sharded analogue of TSystemPool.
- template <class T, size_t BatchSize>
- class TShardedSystemPool
- {
- public:
- template <class TState>
- T* Allocate(TState* state)
- {
- if (auto* obj = FreeLists_[state->GetInitialShardIndex()].Extract()) {
- new (obj) T();
- return obj;
- }
- while (true) {
- for (size_t index = 0; index < ShardCount; ++index) {
- if (auto* obj = FreeLists_[state->GetNextShardIndex()].Extract()) {
- new (obj) T();
- return obj;
- }
- }
- AllocateMore();
- }
- }
- template <class TState>
- void Free(TState* state, T* obj)
- {
- obj->T::~T();
- PoisonFreedRange(obj, sizeof(T));
- FreeLists_[state->GetInitialShardIndex()].Put(obj);
- }
- private:
- std::array<TFreeList<T>, ShardCount> FreeLists_;
- private:
- void AllocateMore()
- {
- auto* objs = static_cast<T*>(SystemAllocator->Allocate(sizeof(T) * BatchSize));
- for (size_t index = 0; index < BatchSize; ++index) {
- auto* obj = objs + index;
- FreeLists_[index % ShardCount].Put(obj);
- }
- }
- };
- ////////////////////////////////////////////////////////////////////////////////
- // Handles allocations inside a zone of memory given by its start and end pointers.
- // Each allocation is a separate mapped region of memory.
- // Special care is taken to guarantee that all allocated regions fall inside the zone.
- class TZoneAllocator
- {
- public:
- TZoneAllocator(uintptr_t zoneStart, uintptr_t zoneEnd)
- : ZoneStart_(zoneStart)
- , ZoneEnd_(zoneEnd)
- , Current_(zoneStart)
- {
- YTALLOC_VERIFY(ZoneStart_ % PageSize == 0);
- }
- void* Allocate(size_t size, int flags)
- {
- YTALLOC_VERIFY(size % PageSize == 0);
- bool restarted = false;
- while (true) {
- auto hint = (Current_ += size) - size;
- if (hint + size > ZoneEnd_) {
- if (restarted) {
- ::fprintf(stderr, "*** YTAlloc was unable to mmap %zu bytes in zone %" PRIx64 "--%" PRIx64 "\n",
- size,
- ZoneStart_,
- ZoneEnd_);
- OomTrap();
- }
- restarted = true;
- Current_ = ZoneStart_;
- } else {
- char* ptr = static_cast<char*>(MappedMemoryManager->Map(
- hint,
- size,
- MAP_FIXED_NOREPLACE | flags));
- if (reinterpret_cast<uintptr_t>(ptr) == hint) {
- return ptr;
- }
- if (ptr != MAP_FAILED) {
- MappedMemoryManager->Unmap(ptr, size);
- }
- }
- }
- }
- void Free(void* ptr, size_t size)
- {
- MappedMemoryManager->Unmap(ptr, size);
- }
- private:
- const uintptr_t ZoneStart_;
- const uintptr_t ZoneEnd_;
- std::atomic<uintptr_t> Current_;
- };
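- // Illustrative usage sketch (not part of the allocator); ExampleZoneStart and ExampleZoneEnd are
- // hypothetical page-aligned zone bounds introduced only for this example.
- //
- //   TZoneAllocator zone(ExampleZoneStart, ExampleZoneEnd);
- //   void* region = zone.Allocate(4 * PageSize, MAP_POPULATE);  // mapped strictly inside the zone
- //   zone.Free(region, 4 * PageSize);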
- ////////////////////////////////////////////////////////////////////////////////
- // YTAlloc supports tagged allocations.
- // Since the total number of tags can be huge, a two-level scheme is employed.
- // Possible tags are arranged into sets each containing TaggedCounterSetSize tags.
- // There are up to MaxTaggedCounterSets in total.
- // Upper 4 sets are reserved for profiled allocations.
- constexpr size_t TaggedCounterSetSize = 16384;
- constexpr size_t AllocationProfilingTaggedCounterSets = 4;
- constexpr size_t MaxTaggedCounterSets = 256 + AllocationProfilingTaggedCounterSets;
- constexpr size_t MaxCapturedAllocationBacktraces = 65000;
- static_assert(
- MaxCapturedAllocationBacktraces < AllocationProfilingTaggedCounterSets * TaggedCounterSetSize,
- "MaxCapturedAllocationBacktraces is too big");
- constexpr TMemoryTag AllocationProfilingMemoryTagBase = TaggedCounterSetSize * (MaxTaggedCounterSets - AllocationProfilingTaggedCounterSets);
- constexpr TMemoryTag AllocationProfilingUnknownMemoryTag = AllocationProfilingMemoryTagBase + MaxCapturedAllocationBacktraces;
- static_assert(
- MaxMemoryTag == TaggedCounterSetSize * (MaxTaggedCounterSets - AllocationProfilingTaggedCounterSets) - 1,
- "Wrong MaxMemoryTag");
- template <class TCounter>
- using TUntaggedTotalCounters = TEnumIndexedArray<EBasicCounter, TCounter>;
- template <class TCounter>
- struct TTaggedTotalCounterSet
- : public TSystemAllocatable
- {
- std::array<TEnumIndexedArray<EBasicCounter, TCounter>, TaggedCounterSetSize> Counters;
- };
- using TLocalTaggedBasicCounterSet = TTaggedTotalCounterSet<ssize_t>;
- using TGlobalTaggedBasicCounterSet = TTaggedTotalCounterSet<std::atomic<ssize_t>>;
- template <class TCounter>
- struct TTotalCounters
- {
- // The sum of counters across all tags.
- TUntaggedTotalCounters<TCounter> CumulativeTaggedCounters;
- // Counters for untagged allocations.
- TUntaggedTotalCounters<TCounter> UntaggedCounters;
- // Access to tagged counters may involve creation of a new tag set.
- // For simplicity, we separate the read-side (TaggedCounterSets) and the write-side (TaggedCounterSetHolders).
- // These arrays contain virtually identical data (up to std::unique_ptr and std::atomic semantic differences).
- std::array<std::atomic<TTaggedTotalCounterSet<TCounter>*>, MaxTaggedCounterSets> TaggedCounterSets{};
- std::array<std::unique_ptr<TTaggedTotalCounterSet<TCounter>>, MaxTaggedCounterSets> TaggedCounterSetHolders;
- // Protects TaggedCounterSetHolders from concurrent updates.
- NThreading::TForkAwareSpinLock TaggedCounterSetsLock;
- // Returns null if the set is not yet constructed.
- Y_FORCE_INLINE TTaggedTotalCounterSet<TCounter>* FindTaggedCounterSet(size_t index) const
- {
- return TaggedCounterSets[index].load();
- }
- // Constructs the set on first access.
- TTaggedTotalCounterSet<TCounter>* GetOrCreateTaggedCounterSet(size_t index)
- {
- auto* set = TaggedCounterSets[index].load();
- if (Y_LIKELY(set)) {
- return set;
- }
- auto guard = GuardWithTiming(TaggedCounterSetsLock);
- auto& setHolder = TaggedCounterSetHolders[index];
- if (!setHolder) {
- setHolder = std::make_unique<TTaggedTotalCounterSet<TCounter>>();
- TaggedCounterSets[index] = setHolder.get();
- }
- return setHolder.get();
- }
- };
- using TLocalSystemCounters = TEnumIndexedArray<ESystemCounter, ssize_t>;
- using TGlobalSystemCounters = TEnumIndexedArray<ESystemCounter, std::atomic<ssize_t>>;
- using TLocalSmallCounters = TEnumIndexedArray<ESmallArenaCounter, ssize_t>;
- using TGlobalSmallCounters = TEnumIndexedArray<ESmallArenaCounter, std::atomic<ssize_t>>;
- using TLocalLargeCounters = TEnumIndexedArray<ELargeArenaCounter, ssize_t>;
- using TGlobalLargeCounters = TEnumIndexedArray<ELargeArenaCounter, std::atomic<ssize_t>>;
- using TLocalHugeCounters = TEnumIndexedArray<EHugeCounter, ssize_t>;
- using TGlobalHugeCounters = TEnumIndexedArray<EHugeCounter, std::atomic<ssize_t>>;
- using TLocalUndumpableCounters = TEnumIndexedArray<EUndumpableCounter, ssize_t>;
- using TGlobalUndumpableCounters = TEnumIndexedArray<EUndumpableCounter, std::atomic<ssize_t>>;
- Y_FORCE_INLINE ssize_t LoadCounter(ssize_t counter)
- {
- return counter;
- }
- Y_FORCE_INLINE ssize_t LoadCounter(const std::atomic<ssize_t>& counter)
- {
- return counter.load();
- }
- ////////////////////////////////////////////////////////////////////////////////
- struct TMmapObservationEvent
- {
- size_t Size;
- std::array<void*, MaxAllocationProfilingBacktraceDepth> Frames;
- int FrameCount;
- TInstant Timestamp;
- TFiberId FiberId;
- TMmapObservationEvent() = default;
- TMmapObservationEvent(
- size_t size,
- std::array<void*, MaxAllocationProfilingBacktraceDepth> frames,
- int frameCount)
- : Size(size)
- , Frames(frames)
- , FrameCount(frameCount)
- { }
- };
- class TMmapObservationManager
- : public TEventLogManagerBase<TMmapObservationEvent, TMmapObservationManager>
- {
- public:
- void SetBacktraceFormatter(TBacktraceFormatter formatter)
- {
- BacktraceFormatter_.store(formatter);
- }
- private:
- std::atomic<TBacktraceFormatter> BacktraceFormatter_ = nullptr;
- Y_POD_STATIC_THREAD(bool) DisabledForCurrentThread_;
- friend class TEventLogManagerBase<TMmapObservationEvent, TMmapObservationManager>;
- virtual void OnEvent(const TMmapObservationEvent& /*event*/) override
- { }
- virtual void ProcessEvent(const TMmapObservationEvent& event) override
- {
- YTALLOC_LOG_DEBUG("Large arena mmap observed (Size: %zu, Timestamp: %s, FiberId: %" PRIx64 ")",
- event.Size,
- ToString(event.Timestamp).c_str(),
- event.FiberId);
- if (auto backtraceFormatter = BacktraceFormatter_.load()) {
- auto backtrace = backtraceFormatter(const_cast<void**>(event.Frames.data()), event.FrameCount);
- YTALLOC_LOG_DEBUG("YTAlloc stack backtrace (Stack: %s)",
- backtrace.c_str());
- }
- }
- };
- Y_POD_THREAD(bool) TMmapObservationManager::DisabledForCurrentThread_;
- TExplicitlyConstructableSingleton<TMmapObservationManager> MmapObservationManager;
- ////////////////////////////////////////////////////////////////////////////////
- // A per-thread structure containing counters, chunk caches etc.
- struct TThreadState
- : public TFreeListItemBase<TThreadState>
- , public TLocalShardedState
- {
- // TThreadState instances of all alive threads are put into a double-linked intrusive list.
- // This is a pair of next/prev pointers connecting an instance of TThreadState to its neighbors.
- TIntrusiveLinkedListNode<TThreadState> RegistryNode;
- // Pointers to the respective parts of TThreadManager::ThreadControlWord_.
- // If null then the thread is already destroyed (but TThreadState may still live for a while
- // due to ref-counting).
- ui8* AllocationProfilingEnabled;
- ui8* BackgroundThreadStarted;
- // TThreadStates are ref-counted.
- // TThreadManager::EnumerateThreadStates enumerates the registered states and acquires
- // a temporary reference preventing these states from being destructed. This allows
- // the global lock to be held for shorter periods of time.
- int RefCounter = 1;
- // Per-thread counters.
- TTotalCounters<ssize_t> TotalCounters;
- std::array<TLocalLargeCounters, LargeRankCount> LargeArenaCounters;
- TLocalUndumpableCounters UndumpableCounters;
- // Each thread maintains caches of small chunks.
- // One cache is for tagged chunks; the other is for untagged ones.
- // Each cache contains up to MaxCachedChunksPerRank chunks per any rank.
- // Special sentinels are placed to mark the boundaries of the region containing
- // pointers of a specific rank. This enables slightly faster in-place boundary checks.
- static constexpr uintptr_t LeftSentinel = 1;
- static constexpr uintptr_t RightSentinel = 2;
- struct TSmallBlobCache
- {
- TSmallBlobCache()
- {
- void** chunkPtrs = CachedChunks.data();
- for (size_t rank = 0; rank < SmallRankCount; ++rank) {
- RankToCachedChunkPtrHead[rank] = chunkPtrs;
- chunkPtrs[0] = reinterpret_cast<void*>(LeftSentinel);
- chunkPtrs[MaxCachedChunksPerRank + 1] = reinterpret_cast<void*>(RightSentinel);
- #ifdef YTALLOC_PARANOID
- RankToCachedChunkPtrTail[rank] = chunkPtrs;
- CachedChunkFull[rank] = false;
- RankToCachedChunkLeftBorder[rank] = chunkPtrs;
- RankToCachedChunkRightBorder[rank] = chunkPtrs + MaxCachedChunksPerRank + 1;
- #endif
- chunkPtrs += MaxCachedChunksPerRank + 2;
- }
- }
- // For each rank we have a segment of pointers in CachedChunks with the following layout:
- // LCC[C]........R
- // Legend:
- // . = garbage
- // L = left sentinel
- // R = right sentinel
- // C = cached pointer
- // [C] = current cached pointer
- //
- // Under YTALLOC_PARANOID the following layout is used:
- // L.[T]CCC[H]...R
- // Legend:
- // [H] = head cached pointer, put chunks here
- // [T] = tail cached pointer, take chunks from here
- // +2 is for two sentinels
- std::array<void*, SmallRankCount * (MaxCachedChunksPerRank + 2)> CachedChunks{};
- // Pointer to [C] (the current cached pointer; [H] under YTALLOC_PARANOID) for each rank.
- std::array<void**, SmallRankCount> RankToCachedChunkPtrHead{};
- #ifdef YTALLOC_PARANOID
- // Pointers to [L] and [R] for each rank.
- std::array<void**, SmallRankCount> RankToCachedChunkLeftBorder{};
- std::array<void**, SmallRankCount> RankToCachedChunkRightBorder{};
- std::array<void**, SmallRankCount> RankToCachedChunkPtrTail{};
- std::array<bool, SmallRankCount> CachedChunkFull{};
- #endif
- };
- TEnumIndexedArray<EAllocationKind, TSmallBlobCache> SmallBlobCache;
- };
- struct TThreadStateToRegistryNode
- {
- auto operator() (TThreadState* state) const
- {
- return &state->RegistryNode;
- }
- };
- // Manages all registered threads and controls access to TThreadState.
- class TThreadManager
- {
- public:
- TThreadManager()
- {
- pthread_key_create(&ThreadDtorKey_, DestroyThread);
- NThreading::RegisterAtForkHandlers(
- nullptr,
- nullptr,
- [=] { AfterFork(); });
- }
- // Returns TThreadState for the current thread; the caller guarantees that this
- // state is initialized and is not destroyed yet.
- static TThreadState* GetThreadStateUnchecked();
- // Returns TThreadState for the current thread; may return null.
- static TThreadState* FindThreadState();
- // Returns TThreadState for the current thread; may not return null
- // (but may crash if TThreadState is already destroyed).
- static TThreadState* GetThreadStateChecked()
- {
- auto* state = FindThreadState();
- YTALLOC_VERIFY(state);
- return state;
- }
- // Enumerates all threads and invokes handler passing TThreadState instances.
- // handler must not throw but can take arbitrary time; no locks are held while it executes.
- template <class THandler>
- void EnumerateThreadStatesAsync(const THandler& handler) noexcept
- {
- TMemoryTagGuard guard(NullMemoryTag);
- std::vector<TThreadState*> states;
- states.reserve(1024); // must be enough in most cases
- auto unrefStates = [&] {
- // Releasing references also requires global lock to be held to avoid getting zombies above.
- auto guard = GuardWithTiming(ThreadRegistryLock_);
- for (auto* state : states) {
- UnrefThreadState(state);
- }
- };
- auto tryRefStates = [&] {
- // Only hold this guard for a small period of time to reference all the states.
- auto guard = GuardWithTiming(ThreadRegistryLock_);
- auto* current = ThreadRegistry_.GetFront();
- while (current) {
- if (states.size() == states.capacity()) {
- // Cannot allocate while holding ThreadRegistryLock_ due to a possible deadlock as follows:
- // EnumerateThreadStatesAsync -> StartBackgroundThread -> EnumerateThreadStatesSync
- // (many other scenarios are also possible).
- guard.Release();
- unrefStates();
- states.clear();
- states.reserve(states.capacity() * 2);
- return false;
- }
- RefThreadState(current);
- states.push_back(current);
- current = current->RegistryNode.Next;
- }
- return true;
- };
- while (!tryRefStates()) ;
- for (auto* state : states) {
- handler(state);
- }
- unrefStates();
- }
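- // Illustrative usage sketch (not part of the allocator): aggregating a per-thread counter,
- // in the spirit of what TStatisticsManager does later in this file.
- //
- //   ssize_t bytesAllocated = 0;
- //   ThreadManager->EnumerateThreadStatesAsync(
- //       [&] (const auto* state) {
- //           bytesAllocated += LoadCounter(state->TotalCounters.UntaggedCounters[EBasicCounter::BytesAllocated]);
- //       });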
- // Similar to EnumerateThreadStatesAsync but holds the global lock while enumerating the threads.
- // Also invokes a given prologue functor while holding the thread registry lock.
- // Handler and prologue calls must be fast and must not allocate.
- template <class TPrologue, class THandler>
- void EnumerateThreadStatesSync(const TPrologue& prologue, const THandler& handler) noexcept
- {
- auto guard = GuardWithTiming(ThreadRegistryLock_);
- prologue();
- auto* current = ThreadRegistry_.GetFront();
- while (current) {
- handler(current);
- current = current->RegistryNode.Next;
- }
- }
- // We store a special 64-bit "thread control word" in TLS encapsulating the following
- // crucial per-thread parameters:
- // * the current memory tag
- // * a flag indicating that a valid TThreadState is known to exist
- // (and can be obtained via GetThreadStateUnchecked)
- // * a flag indicating that allocation profiling is enabled
- // * a flag indicating that the background thread is started
- // Thread control word is fetched via GetThreadControlWord and is compared
- // against FastPathControlWord to see if the fast path can be taken.
- // The latter happens when no memory tagging is configured, TThreadState is
- // valid, allocation profiling is disabled, and the background thread is started.
- // The mask for extracting memory tag from thread control word.
- static constexpr ui64 MemoryTagControlWordMask = 0xffffffff;
- // ThreadStateValid is on.
- static constexpr ui64 ThreadStateValidControlWordMask = (1ULL << 32);
- // AllocationProfiling is on.
- static constexpr ui64 AllocationProfilingEnabledControlWordMask = (1ULL << 40);
- // All background threads are properly started.
- static constexpr ui64 BackgroundThreadStartedControlWorkMask = (1ULL << 48);
- // Memory tag is NullMemoryTag; thread state is valid; background threads are started.
- static constexpr ui64 FastPathControlWord =
- BackgroundThreadStartedControlWorkMask |
- ThreadStateValidControlWordMask |
- NullMemoryTag;
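- // Illustrative sketch of the fast-path check these masks are designed for (a sketch of the
- // intended pattern, not the actual entry-point code):
- //
- //   if (Y_LIKELY(GetThreadControlWord() == FastPathControlWord)) {
- //       // untagged allocation, valid TThreadState, no profiling: take the fast path
- //   } else {
- //       // decode the parts and take the slow path
- //   }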
- Y_FORCE_INLINE static ui64 GetThreadControlWord()
- {
- return (&ThreadControlWord_)->Value;
- }
- static TMemoryTag GetCurrentMemoryTag()
- {
- return (&ThreadControlWord_)->Parts.MemoryTag;
- }
- static void SetCurrentMemoryTag(TMemoryTag tag)
- {
- Y_ABORT_UNLESS(tag <= MaxMemoryTag);
- (&ThreadControlWord_)->Parts.MemoryTag = tag;
- }
- static EMemoryZone GetCurrentMemoryZone()
- {
- return CurrentMemoryZone_;
- }
- static void SetCurrentMemoryZone(EMemoryZone zone)
- {
- CurrentMemoryZone_ = zone;
- }
- static void SetCurrentFiberId(TFiberId id)
- {
- CurrentFiberId_ = id;
- }
- static TFiberId GetCurrentFiberId()
- {
- return CurrentFiberId_;
- }
- private:
- static void DestroyThread(void*);
- TThreadState* AllocateThreadState();
- void RefThreadState(TThreadState* state)
- {
- auto result = ++state->RefCounter;
- Y_ABORT_UNLESS(result > 1);
- }
- void UnrefThreadState(TThreadState* state)
- {
- auto result = --state->RefCounter;
- Y_ABORT_UNLESS(result >= 0);
- if (result == 0) {
- DestroyThreadState(state);
- }
- }
- void DestroyThreadState(TThreadState* state);
- void AfterFork();
- private:
- // TThreadState instance for the current thread.
- // Initially null, then initialized when first needed.
- // TThreadState is destroyed upon thread termination (which is detected with
- // the help of pthread_key_create machinery), so this pointer can become null again.
- Y_POD_STATIC_THREAD(TThreadState*) ThreadState_;
- // Initially false, then set to true when TThreadState is destroyed.
- // If the thread requests its state afterwards, null is returned and no new state is (re-)created.
- // The caller must be able to deal with it.
- Y_POD_STATIC_THREAD(bool) ThreadStateDestroyed_;
- union TThreadControlWord
- {
- ui64 __attribute__((__may_alias__)) Value;
- struct TParts
- {
- // The current memory tag used in all allocations by this thread.
- ui32 __attribute__((__may_alias__)) MemoryTag;
- // Indicates if a valid TThreadState exists and can be obtained via GetThreadStateUnchecked.
- ui8 __attribute__((__may_alias__)) ThreadStateValid;
- // Indicates if allocation profiling is on.
- ui8 __attribute__((__may_alias__)) AllocationProfilingEnabled;
- // Indicates if all background threads are properly started.
- ui8 __attribute__((__may_alias__)) BackgroundThreadStarted;
- ui8 Padding[2];
- } Parts;
- };
- Y_POD_STATIC_THREAD(TThreadControlWord) ThreadControlWord_;
- // See memory zone API.
- Y_POD_STATIC_THREAD(EMemoryZone) CurrentMemoryZone_;
- // See fiber id API.
- Y_POD_STATIC_THREAD(TFiberId) CurrentFiberId_;
- pthread_key_t ThreadDtorKey_;
- static constexpr size_t ThreadStatesBatchSize = 1;
- TSystemPool<TThreadState, ThreadStatesBatchSize> ThreadStatePool_;
- NThreading::TForkAwareSpinLock ThreadRegistryLock_;
- TIntrusiveLinkedList<TThreadState, TThreadStateToRegistryNode> ThreadRegistry_;
- };
- Y_POD_THREAD(TThreadState*) TThreadManager::ThreadState_;
- Y_POD_THREAD(bool) TThreadManager::ThreadStateDestroyed_;
- Y_POD_THREAD(TThreadManager::TThreadControlWord) TThreadManager::ThreadControlWord_;
- Y_POD_THREAD(EMemoryZone) TThreadManager::CurrentMemoryZone_;
- Y_POD_THREAD(TFiberId) TThreadManager::CurrentFiberId_;
- TExplicitlyConstructableSingleton<TThreadManager> ThreadManager;
- ////////////////////////////////////////////////////////////////////////////////
- void TConfigurationManager::SetAllocationProfilingEnabled(bool value)
- {
- // Update threads' TLS.
- ThreadManager->EnumerateThreadStatesSync(
- [&] {
- AllocationProfilingEnabled_.store(value);
- },
- [&] (auto* state) {
- if (state->AllocationProfilingEnabled) {
- *state->AllocationProfilingEnabled = value;
- }
- });
- }
- ////////////////////////////////////////////////////////////////////////////////
- // Backtrace Manager
- //
- // Captures backtraces observed during allocations and assigns memory tags to them.
- // Memory tags are chosen sequentially starting from AllocationProfilingMemoryTagBase.
- //
- // For each backtrace we compute a 64-bit hash and use it as a key in a certain concurrent hashmap.
- // This hashmap is organized into BucketCount buckets, each consisting of BucketSize slots.
- //
- // Backtrace hash is translated into bucket index by taking the appropriate number of
- // its lower bits. For each slot, we remember a 32-bit fingerprint, which is
- // just the next 32 bits of the backtrace's hash, and the (previously assigned) memory tag.
- //
- // Upon access to the hashtable, the bucket is first scanned optimistically, without taking
- // any locks. In case of a miss, a per-bucket spinlock is acquired and the bucket is rescanned.
- //
- // The above scheme may involve collisions but we neglect their probability.
- //
- // If the whole hash table overflows (i.e. a total of MaxCapturedAllocationBacktraces
- // backtraces are captured) or the bucket overflows (i.e. all of its slots become occupied),
- // the allocation is annotated with AllocationProfilingUnknownMemoryTag. Such allocations
- // appear as having no backtrace whatsoever in the profiling reports.
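- // A sketch of the hash-to-slot mapping described above (the actual logic is in CaptureBacktrace below):
- //
- //   size_t bucketIndex = hash % BucketCount;                    // lower Log2BucketCount bits
- //   ui32 fingerprint = (hash >> Log2BucketCount) & 0xffffffff;  // next 32 bits
- //   // fingerprint 0 marks a free slot, so a computed fingerprint of 0 is bumped to 1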
- class TBacktraceManager
- {
- public:
- // Sets the provider used for collecting backtraces when allocation profiling
- // is turned ON.
- void SetBacktraceProvider(TBacktraceProvider provider)
- {
- BacktraceProvider_.store(provider);
- }
- // Captures the backtrace and inserts it into the hashtable.
- TMemoryTag GetMemoryTagFromBacktrace(int framesToSkip)
- {
- std::array<void*, MaxAllocationProfilingBacktraceDepth> frames;
- auto backtraceProvider = BacktraceProvider_.load();
- if (!backtraceProvider) {
- return NullMemoryTag;
- }
- auto frameCount = backtraceProvider(frames.data(), ConfigurationManager->GetProfilingBacktraceDepth(), framesToSkip);
- auto hash = GetBacktraceHash(frames.data(), frameCount);
- return CaptureBacktrace(hash, frames.data(), frameCount);
- }
- // Returns the backtrace corresponding to the given tag, if any.
- std::optional<TBacktrace> FindBacktrace(TMemoryTag tag)
- {
- if (tag < AllocationProfilingMemoryTagBase ||
- tag >= AllocationProfilingMemoryTagBase + MaxCapturedAllocationBacktraces)
- {
- return std::nullopt;
- }
- const auto& entry = Backtraces_[tag - AllocationProfilingMemoryTagBase];
- if (!entry.Captured.load()) {
- return std::nullopt;
- }
- return entry.Backtrace;
- }
- private:
- static constexpr int Log2BucketCount = 16;
- static constexpr int BucketCount = 1 << Log2BucketCount;
- static constexpr int BucketSize = 8;
- std::atomic<TBacktraceProvider> BacktraceProvider_ = nullptr;
- std::array<std::array<std::atomic<ui32>, BucketSize>, BucketCount> Fingerprints_ = {};
- std::array<std::array<std::atomic<TMemoryTag>, BucketSize>, BucketCount> MemoryTags_ = {};
- std::array<NThreading::TForkAwareSpinLock, BucketCount> BucketLocks_;
- std::atomic<TMemoryTag> CurrentMemoryTag_ = AllocationProfilingMemoryTagBase;
- struct TBacktraceEntry
- {
- TBacktrace Backtrace;
- std::atomic<bool> Captured = false;
- };
- std::array<TBacktraceEntry, MaxCapturedAllocationBacktraces> Backtraces_;
- private:
- static size_t GetBacktraceHash(void** frames, int frameCount)
- {
- size_t hash = 0;
- for (int index = 0; index < frameCount; ++index) {
- hash = CombineHashes(hash, THash<void*>()(frames[index]));
- }
- return hash;
- }
- TMemoryTag CaptureBacktrace(size_t hash, void** frames, int frameCount)
- {
- size_t bucketIndex = hash % BucketCount;
- ui32 fingerprint = (hash >> Log2BucketCount) & 0xffffffff;
- // Zero fingerprint indicates the slot is free; check and adjust to ensure
- // that regular fingerprints are non-zero.
- if (fingerprint == 0) {
- fingerprint = 1;
- }
- for (int slotIndex = 0; slotIndex < BucketSize; ++slotIndex) {
- auto currentFingerprint = Fingerprints_[bucketIndex][slotIndex].load(std::memory_order_relaxed);
- if (currentFingerprint == fingerprint) {
- return MemoryTags_[bucketIndex][slotIndex].load();
- }
- }
- auto guard = Guard(BucketLocks_[bucketIndex]);
- int spareSlotIndex = -1;
- for (int slotIndex = 0; slotIndex < BucketSize; ++slotIndex) {
- auto currentFingerprint = Fingerprints_[bucketIndex][slotIndex].load(std::memory_order_relaxed);
- if (currentFingerprint == fingerprint) {
- return MemoryTags_[bucketIndex][slotIndex];
- }
- if (currentFingerprint == 0) {
- spareSlotIndex = slotIndex;
- break;
- }
- }
- if (spareSlotIndex < 0) {
- return AllocationProfilingUnknownMemoryTag;
- }
- auto memoryTag = CurrentMemoryTag_++;
- if (memoryTag >= AllocationProfilingMemoryTagBase + MaxCapturedAllocationBacktraces) {
- return AllocationProfilingUnknownMemoryTag;
- }
- MemoryTags_[bucketIndex][spareSlotIndex].store(memoryTag);
- Fingerprints_[bucketIndex][spareSlotIndex].store(fingerprint);
- auto& entry = Backtraces_[memoryTag - AllocationProfilingMemoryTagBase];
- entry.Backtrace.FrameCount = frameCount;
- ::memcpy(entry.Backtrace.Frames.data(), frames, sizeof (void*) * frameCount);
- entry.Captured.store(true);
- return memoryTag;
- }
- };
- TExplicitlyConstructableSingleton<TBacktraceManager> BacktraceManager;
- ////////////////////////////////////////////////////////////////////////////////
- // Mimics the counters of TThreadState but uses std::atomic to survive concurrent access.
- struct TGlobalState
- : public TGlobalShardedState
- {
- TTotalCounters<std::atomic<ssize_t>> TotalCounters;
- std::array<TGlobalLargeCounters, LargeRankCount> LargeArenaCounters;
- TGlobalUndumpableCounters UndumpableCounters;
- };
- TExplicitlyConstructableSingleton<TGlobalState> GlobalState;
- ////////////////////////////////////////////////////////////////////////////////
- // Accumulates various allocation statistics.
- class TStatisticsManager
- {
- public:
- template <EAllocationKind Kind = EAllocationKind::Tagged, class TState>
- static Y_FORCE_INLINE void IncrementTotalCounter(TState* state, TMemoryTag tag, EBasicCounter counter, ssize_t delta)
- {
- // This branch is typically resolved at compile time.
- if (Kind == EAllocationKind::Tagged && tag != NullMemoryTag) {
- IncrementTaggedTotalCounter(&state->TotalCounters, tag, counter, delta);
- } else {
- IncrementUntaggedTotalCounter(&state->TotalCounters, counter, delta);
- }
- }
- static Y_FORCE_INLINE void IncrementTotalCounter(TMemoryTag tag, EBasicCounter counter, ssize_t delta)
- {
- IncrementTotalCounter(GlobalState.Get(), tag, counter, delta);
- }
- void IncrementSmallArenaCounter(ESmallArenaCounter counter, size_t rank, ssize_t delta)
- {
- SmallArenaCounters_[rank][counter] += delta;
- }
- template <class TState>
- static Y_FORCE_INLINE void IncrementLargeArenaCounter(TState* state, size_t rank, ELargeArenaCounter counter, ssize_t delta)
- {
- state->LargeArenaCounters[rank][counter] += delta;
- }
- template <class TState>
- static Y_FORCE_INLINE void IncrementUndumpableCounter(TState* state, EUndumpableCounter counter, ssize_t delta)
- {
- state->UndumpableCounters[counter] += delta;
- }
- void IncrementHugeCounter(EHugeCounter counter, ssize_t delta)
- {
- HugeCounters_[counter] += delta;
- }
- void IncrementHugeUndumpableCounter(EUndumpableCounter counter, ssize_t delta)
- {
- HugeUndumpableCounters_[counter] += delta;
- }
- void IncrementSystemCounter(ESystemCounter counter, ssize_t delta)
- {
- SystemCounters_[counter] += delta;
- }
- // Computes memory usage for a list of tags by aggregating counters across threads.
- void GetTaggedMemoryCounters(const TMemoryTag* tags, size_t count, TEnumIndexedArray<EBasicCounter, ssize_t>* counters)
- {
- TMemoryTagGuard guard(NullMemoryTag);
- for (size_t index = 0; index < count; ++index) {
- counters[index][EBasicCounter::BytesAllocated] = 0;
- counters[index][EBasicCounter::BytesFreed] = 0;
- }
- for (size_t index = 0; index < count; ++index) {
- auto tag = tags[index];
- counters[index][EBasicCounter::BytesAllocated] += LoadTaggedTotalCounter(GlobalState->TotalCounters, tag, EBasicCounter::BytesAllocated);
- counters[index][EBasicCounter::BytesFreed] += LoadTaggedTotalCounter(GlobalState->TotalCounters, tag, EBasicCounter::BytesFreed);
- }
- ThreadManager->EnumerateThreadStatesAsync(
- [&] (const auto* state) {
- for (size_t index = 0; index < count; ++index) {
- auto tag = tags[index];
- counters[index][EBasicCounter::BytesAllocated] += LoadTaggedTotalCounter(state->TotalCounters, tag, EBasicCounter::BytesAllocated);
- counters[index][EBasicCounter::BytesFreed] += LoadTaggedTotalCounter(state->TotalCounters, tag, EBasicCounter::BytesFreed);
- }
- });
- for (size_t index = 0; index < count; ++index) {
- counters[index][EBasicCounter::BytesUsed] = GetUsed(counters[index][EBasicCounter::BytesAllocated], counters[index][EBasicCounter::BytesFreed]);
- }
- }
- void GetTaggedMemoryUsage(const TMemoryTag* tags, size_t count, size_t* results)
- {
- TMemoryTagGuard guard(NullMemoryTag);
- std::vector<TEnumIndexedArray<EBasicCounter, ssize_t>> counters;
- counters.resize(count);
- GetTaggedMemoryCounters(tags, count, counters.data());
- for (size_t index = 0; index < count; ++index) {
- results[index] = counters[index][EBasicCounter::BytesUsed];
- }
- }
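- // Illustrative usage sketch (not part of the allocator): querying usage for a couple of
- // hypothetical tags.
- //
- //   std::array<TMemoryTag, 2> tags{1, 2};
- //   std::array<size_t, 2> usage{};
- //   StatisticsManager->GetTaggedMemoryUsage(tags.data(), tags.size(), usage.data());
- //   // usage[i] now holds BytesUsed aggregated across all threads for tags[i]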
- TEnumIndexedArray<ETotalCounter, ssize_t> GetTotalAllocationCounters()
- {
- TEnumIndexedArray<ETotalCounter, ssize_t> result;
- auto accumulate = [&] (const auto& counters) {
- result[ETotalCounter::BytesAllocated] += LoadCounter(counters[EBasicCounter::BytesAllocated]);
- result[ETotalCounter::BytesFreed] += LoadCounter(counters[EBasicCounter::BytesFreed]);
- };
- accumulate(GlobalState->TotalCounters.UntaggedCounters);
- accumulate(GlobalState->TotalCounters.CumulativeTaggedCounters);
- ThreadManager->EnumerateThreadStatesAsync(
- [&] (const auto* state) {
- accumulate(state->TotalCounters.UntaggedCounters);
- accumulate(state->TotalCounters.CumulativeTaggedCounters);
- });
- result[ETotalCounter::BytesUsed] = GetUsed(
- result[ETotalCounter::BytesAllocated],
- result[ETotalCounter::BytesFreed]);
- auto systemCounters = GetSystemAllocationCounters();
- result[ETotalCounter::BytesCommitted] += systemCounters[EBasicCounter::BytesUsed];
- auto hugeCounters = GetHugeAllocationCounters();
- result[ETotalCounter::BytesCommitted] += hugeCounters[EHugeCounter::BytesUsed];
- auto smallArenaCounters = GetSmallArenaAllocationCounters();
- for (size_t rank = 0; rank < SmallRankCount; ++rank) {
- result[ETotalCounter::BytesCommitted] += smallArenaCounters[rank][ESmallArenaCounter::BytesCommitted];
- }
- auto largeArenaCounters = GetLargeArenaAllocationCounters();
- for (size_t rank = 0; rank < LargeRankCount; ++rank) {
- result[ETotalCounter::BytesCommitted] += largeArenaCounters[rank][ELargeArenaCounter::BytesCommitted];
- }
- result[ETotalCounter::BytesUnaccounted] = std::max<ssize_t>(GetProcessRss() - result[ETotalCounter::BytesCommitted], 0);
- return result;
- }
- TEnumIndexedArray<ESmallCounter, ssize_t> GetSmallAllocationCounters()
- {
- TEnumIndexedArray<ESmallCounter, ssize_t> result;
- auto totalCounters = GetTotalAllocationCounters();
- result[ESmallCounter::BytesAllocated] = totalCounters[ETotalCounter::BytesAllocated];
- result[ESmallCounter::BytesFreed] = totalCounters[ETotalCounter::BytesFreed];
- result[ESmallCounter::BytesUsed] = totalCounters[ETotalCounter::BytesUsed];
- auto largeArenaCounters = GetLargeArenaAllocationCounters();
- for (size_t rank = 0; rank < LargeRankCount; ++rank) {
- result[ESmallCounter::BytesAllocated] -= largeArenaCounters[rank][ELargeArenaCounter::BytesAllocated];
- result[ESmallCounter::BytesFreed] -= largeArenaCounters[rank][ELargeArenaCounter::BytesFreed];
- result[ESmallCounter::BytesUsed] -= largeArenaCounters[rank][ELargeArenaCounter::BytesUsed];
- }
- auto hugeCounters = GetHugeAllocationCounters();
- result[ESmallCounter::BytesAllocated] -= hugeCounters[EHugeCounter::BytesAllocated];
- result[ESmallCounter::BytesFreed] -= hugeCounters[EHugeCounter::BytesFreed];
- result[ESmallCounter::BytesUsed] -= hugeCounters[EHugeCounter::BytesUsed];
- return result;
- }
- std::array<TLocalSmallCounters, SmallRankCount> GetSmallArenaAllocationCounters()
- {
- std::array<TLocalSmallCounters, SmallRankCount> result;
- for (size_t rank = 0; rank < SmallRankCount; ++rank) {
- for (auto counter : TEnumTraits<ESmallArenaCounter>::GetDomainValues()) {
- result[rank][counter] = SmallArenaCounters_[rank][counter].load();
- }
- }
- return result;
- }
- TEnumIndexedArray<ELargeCounter, ssize_t> GetLargeAllocationCounters()
- {
- TEnumIndexedArray<ELargeCounter, ssize_t> result;
- auto largeArenaCounters = GetLargeArenaAllocationCounters();
- for (size_t rank = 0; rank < LargeRankCount; ++rank) {
- result[ESmallCounter::BytesAllocated] += largeArenaCounters[rank][ELargeArenaCounter::BytesAllocated];
- result[ESmallCounter::BytesFreed] += largeArenaCounters[rank][ELargeArenaCounter::BytesFreed];
- result[ESmallCounter::BytesUsed] += largeArenaCounters[rank][ELargeArenaCounter::BytesUsed];
- }
- return result;
- }
- std::array<TLocalLargeCounters, LargeRankCount> GetLargeArenaAllocationCounters()
- {
- std::array<TLocalLargeCounters, LargeRankCount> result{};
- for (size_t rank = 0; rank < LargeRankCount; ++rank) {
- for (auto counter : TEnumTraits<ELargeArenaCounter>::GetDomainValues()) {
- result[rank][counter] = GlobalState->LargeArenaCounters[rank][counter].load();
- }
- }
- ThreadManager->EnumerateThreadStatesAsync(
- [&] (const auto* state) {
- for (size_t rank = 0; rank < LargeRankCount; ++rank) {
- for (auto counter : TEnumTraits<ELargeArenaCounter>::GetDomainValues()) {
- result[rank][counter] += state->LargeArenaCounters[rank][counter];
- }
- }
- });
- for (size_t rank = 0; rank < LargeRankCount; ++rank) {
- result[rank][ELargeArenaCounter::BytesUsed] = GetUsed(result[rank][ELargeArenaCounter::BytesAllocated], result[rank][ELargeArenaCounter::BytesFreed]);
- result[rank][ELargeArenaCounter::BlobsUsed] = GetUsed(result[rank][ELargeArenaCounter::BlobsAllocated], result[rank][ELargeArenaCounter::BlobsFreed]);
- }
- return result;
- }
- TLocalSystemCounters GetSystemAllocationCounters()
- {
- TLocalSystemCounters result;
- for (auto counter : TEnumTraits<ESystemCounter>::GetDomainValues()) {
- result[counter] = SystemCounters_[counter].load();
- }
- result[ESystemCounter::BytesUsed] = GetUsed(result[ESystemCounter::BytesAllocated], result[ESystemCounter::BytesFreed]);
- return result;
- }
- TLocalHugeCounters GetHugeAllocationCounters()
- {
- TLocalHugeCounters result;
- for (auto counter : TEnumTraits<EHugeCounter>::GetDomainValues()) {
- result[counter] = HugeCounters_[counter].load();
- }
- result[EHugeCounter::BytesUsed] = GetUsed(result[EHugeCounter::BytesAllocated], result[EHugeCounter::BytesFreed]);
- result[EHugeCounter::BlobsUsed] = GetUsed(result[EHugeCounter::BlobsAllocated], result[EHugeCounter::BlobsFreed]);
- return result;
- }
- TLocalUndumpableCounters GetUndumpableAllocationCounters()
- {
- TLocalUndumpableCounters result;
- for (auto counter : TEnumTraits<EUndumpableCounter>::GetDomainValues()) {
- result[counter] = HugeUndumpableCounters_[counter].load();
- result[counter] += GlobalState->UndumpableCounters[counter].load();
- }
- ThreadManager->EnumerateThreadStatesAsync(
- [&] (const auto* state) {
- result[EUndumpableCounter::BytesAllocated] += LoadCounter(state->UndumpableCounters[EUndumpableCounter::BytesAllocated]);
- result[EUndumpableCounter::BytesFreed] += LoadCounter(state->UndumpableCounters[EUndumpableCounter::BytesFreed]);
- });
- result[EUndumpableCounter::BytesUsed] = GetUsed(result[EUndumpableCounter::BytesAllocated], result[EUndumpableCounter::BytesFreed]);
- return result;
- }
- // Called before TThreadState is destroyed.
- // Adds the counter values from TThreadState to the global counters.
- void AccumulateLocalCounters(TThreadState* state)
- {
- for (auto counter : TEnumTraits<EBasicCounter>::GetDomainValues()) {
- GlobalState->TotalCounters.CumulativeTaggedCounters[counter] += state->TotalCounters.CumulativeTaggedCounters[counter];
- GlobalState->TotalCounters.UntaggedCounters[counter] += state->TotalCounters.UntaggedCounters[counter];
- }
- for (size_t index = 0; index < MaxTaggedCounterSets; ++index) {
- const auto* localSet = state->TotalCounters.FindTaggedCounterSet(index);
- if (!localSet) {
- continue;
- }
- auto* globalSet = GlobalState->TotalCounters.GetOrCreateTaggedCounterSet(index);
- for (size_t jndex = 0; jndex < TaggedCounterSetSize; ++jndex) {
- for (auto counter : TEnumTraits<EBasicCounter>::GetDomainValues()) {
- globalSet->Counters[jndex][counter] += localSet->Counters[jndex][counter];
- }
- }
- }
- for (size_t rank = 0; rank < LargeRankCount; ++rank) {
- for (auto counter : TEnumTraits<ELargeArenaCounter>::GetDomainValues()) {
- GlobalState->LargeArenaCounters[rank][counter] += state->LargeArenaCounters[rank][counter];
- }
- }
- for (auto counter : TEnumTraits<EUndumpableCounter>::GetDomainValues()) {
- GlobalState->UndumpableCounters[counter] += state->UndumpableCounters[counter];
- }
- }
- private:
- template <class TCounter>
- static ssize_t LoadTaggedTotalCounter(const TTotalCounters<TCounter>& counters, TMemoryTag tag, EBasicCounter counter)
- {
- const auto* set = counters.FindTaggedCounterSet(tag / TaggedCounterSetSize);
- if (Y_UNLIKELY(!set)) {
- return 0;
- }
- return LoadCounter(set->Counters[tag % TaggedCounterSetSize][counter]);
- }
- template <class TCounter>
- static Y_FORCE_INLINE void IncrementUntaggedTotalCounter(TTotalCounters<TCounter>* counters, EBasicCounter counter, ssize_t delta)
- {
- counters->UntaggedCounters[counter] += delta;
- }
- template <class TCounter>
- static Y_FORCE_INLINE void IncrementTaggedTotalCounter(TTotalCounters<TCounter>* counters, TMemoryTag tag, EBasicCounter counter, ssize_t delta)
- {
- counters->CumulativeTaggedCounters[counter] += delta;
- auto* set = counters->GetOrCreateTaggedCounterSet(tag / TaggedCounterSetSize);
- set->Counters[tag % TaggedCounterSetSize][counter] += delta;
- }
- static ssize_t GetProcessRss()
- {
- auto* file = ::fopen("/proc/self/statm", "r");
- if (!file) {
- return 0;
- }
- ssize_t dummy;
- ssize_t rssPages;
- auto readResult = fscanf(file, "%zd %zd", &dummy, &rssPages);
- ::fclose(file);
- if (readResult != 2) {
- return 0;
- }
- return rssPages * PageSize;
- }
- private:
- TGlobalSystemCounters SystemCounters_;
- std::array<TGlobalSmallCounters, SmallRankCount> SmallArenaCounters_;
- TGlobalHugeCounters HugeCounters_;
- TGlobalUndumpableCounters HugeUndumpableCounters_;
- };
- TExplicitlyConstructableSingleton<TStatisticsManager> StatisticsManager;
- ////////////////////////////////////////////////////////////////////////////////
- void* TSystemAllocator::Allocate(size_t size)
- {
- auto rawSize = GetRawBlobSize<TSystemBlobHeader>(size);
- void* mmappedPtr;
- while (true) {
- auto currentPtr = CurrentPtr_.fetch_add(rawSize);
- Y_ABORT_UNLESS(currentPtr + rawSize <= SystemZoneEnd);
- mmappedPtr = MappedMemoryManager->Map(
- currentPtr,
- rawSize,
- MAP_FIXED_NOREPLACE | MAP_POPULATE);
- if (mmappedPtr == reinterpret_cast<void*>(currentPtr)) {
- break;
- }
- if (mmappedPtr != MAP_FAILED) {
- MappedMemoryManager->Unmap(mmappedPtr, rawSize);
- }
- }
- auto* blob = static_cast<TSystemBlobHeader*>(mmappedPtr);
- new (blob) TSystemBlobHeader(size);
- auto* result = HeaderToPtr(blob);
- PoisonUninitializedRange(result, size);
- StatisticsManager->IncrementSystemCounter(ESystemCounter::BytesAllocated, rawSize);
- return result;
- }
- void TSystemAllocator::Free(void* ptr)
- {
- auto* blob = PtrToHeader<TSystemBlobHeader>(ptr);
- auto rawSize = GetRawBlobSize<TSystemBlobHeader>(blob->Size);
- MappedMemoryManager->Unmap(blob, rawSize);
- StatisticsManager->IncrementSystemCounter(ESystemCounter::BytesFreed, rawSize);
- }
- ////////////////////////////////////////////////////////////////////////////////
- // Small allocator
- //
- // Allocations (called small chunks) are grouped by their sizes. The two most significant binary digits
- // are used to determine the rank of a chunk, which guarantees at most 25% overhead in the worst case.
- // A pair of helper arrays (SizeToSmallRank1 and SizeToSmallRank2) is used to compute ranks; we expect
- // them to be permanently cached.
- //
- // Chunks of the same rank are served by a (small) arena allocator.
- // In fact, there are two arenas for each rank: one is for tagged allocations and another is for untagged ones.
- //
- // We encode chunk's rank and whether it is tagged or not in the resulting pointer as follows:
- // 0- 3: must be zero due to alignment
- // 4-39: varies
- // 40-44: rank
- // 45: 0 for untagged allocations, 1 for tagged ones
- // 46-63: zeroes
- // This enables computing chunk's rank and also determining if it is tagged in constant time
- // without any additional lookups. Also, one pays no space overhead for untagged allocations
- // and pays 16 bytes for each tagged one.
- //
- // Each arena allocates extents of memory by calling mmap for each extent of SmallExtentSize bytes.
- // (Recall that this memory is never reclaimed.)
- // Each extent is then sliced into segments of SmallSegmentSize bytes.
- // Whenever a new segment is acquired, its memory is pre-faulted by madvise(MADV_POPULATE).
- // New segments are acquired in a lock-free manner.
- //
- // Each thread maintains a separate cache of chunks of each rank (two caches to be precise: one
- // for tagged allocations and the other for untagged). These caches are fully thread-local and
- // involve no atomic operations.
- //
- // There are also global caches (per rank, for tagged and untagged allocations).
- // Instead of keeping individual chunks these work with chunk groups (collections of up to ChunksPerGroup
- // arbitrary chunks).
- //
- // When the local cache becomes exhausted, a group of chunks is fetched from the global cache
- // (if the latter is empty then the arena allocator is consulted).
- // Vice versa, if the local cache overflows, a group of chunks is moved from it to the global cache.
- //
- // Global caches and arena allocators also take care of (rare) cases when Allocate/Free is called
- // without a valid thread state (which happens during thread shutdown when TThreadState is already destroyed).
- //
- // Each arena allocates memory in a certain "data" zone of SmallZoneSize.
- // In addition to that zone, up to two "shadow" zones are maintained.
- //
- // The first one contains memory tags of chunks residing in the primary zone.
- // The second one (which is present if YTALLOC_NERVOUS is defined) contains
- // states of chunks. These states enable some simple internal sanity checks
- // (e.g. detect attempts to double-free a chunk).
- //
- // Addresses in the data zone are directly mapped to offsets in shadow zones.
- // When a segment of a small arena zone is allocated, the relevant portions of shadow
- // zones get initialized (and also accounted for as a system allocation).
- //
- // Shadow zones are memory-mapped with MAP_NORESERVE flag and are quite sparse.
- // These zones are omitted from core dumps due to their huge size and sparsity.
- //
- // For each small rank i, gives max K such that 2^K <= SmallRankToSize[i].
- // Chunk pointer is mapped to its shadow image via GetShadowOffset helper.
- // Note that chunk size is not always a power of 2. To avoid costly integer division,
- // chunk pointer is translated by means of bitwise shift only (leaving some bytes
- // of shadow zones unused). This array provides the needed shifts.
- constexpr int SmallRankToLogSize[SmallRankCount] = {
- 0,
- 4, 5, 5, 6, 6, 7,
- 7, 8, 8, 9, 9, 10, 10, 11,
- 11, 12, 12, 13, 13, 14, 14, 15
- };
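- // A sketch of the constant-time decoding implied by the pointer layout above (the real helpers,
- // e.g. PtrToSmallRank, are defined elsewhere in this file; the exact masks here are illustrative):
- //
- //   auto raw = reinterpret_cast<uintptr_t>(ptr);
- //   size_t rank = (raw >> 40) & 0x1f;   // bits 40-44
- //   bool tagged = (raw >> 45) & 0x1;    // bit 45
- //
- // and a chunk's shadow-zone slot is found by a shift only:
- //
- //   size_t shadowOffset = (raw - DataZoneStart_) >> SmallRankToLogSize[rank];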
- enum class ESmallChunkState : ui8
- {
- Spare = 0,
- Allocated = 0x61, // a
- Freed = 0x66 // f
- };
- class TSmallArenaAllocator
- {
- public:
- TSmallArenaAllocator(EAllocationKind kind, size_t rank, uintptr_t dataZoneStart)
- : Kind_(kind)
- , Rank_(rank)
- , LogSize_(SmallRankToLogSize[Rank_])
- , ChunkSize_(SmallRankToSize[Rank_])
- , DataZoneStart_(dataZoneStart)
- , DataZoneAllocator_(DataZoneStart_, DataZoneStart_ + SmallZoneSize)
- { }
- size_t PullMany(void** batch, size_t maxCount)
- {
- size_t count;
- while (true) {
- count = TryAllocateFromCurrentExtent(batch, maxCount);
- if (Y_LIKELY(count != 0)) {
- break;
- }
- PopulateAnotherExtent();
- }
- return count;
- }
- void* Allocate(size_t size)
- {
- void* ptr;
- auto count = PullMany(&ptr, 1);
- YTALLOC_PARANOID_ASSERT(count == 1);
- YTALLOC_PARANOID_ASSERT(PtrToSmallRank(ptr) == Rank_);
- PoisonUninitializedRange(ptr, size);
- UpdateChunkState(ptr, ESmallChunkState::Freed, ESmallChunkState::Allocated);
- return ptr;
- }
- TMemoryTag GetAndResetMemoryTag(const void* ptr)
- {
- auto& tag = MemoryTagZoneStart_[GetShadowOffset(ptr)];
- auto currentTag = tag;
- tag = NullMemoryTag;
- return currentTag;
- }
- void SetMemoryTag(void* ptr, TMemoryTag tag)
- {
- MemoryTagZoneStart_[GetShadowOffset(ptr)] = tag;
- }
- void UpdateChunkState(const void* ptr, ESmallChunkState expectedState, ESmallChunkState newState)
- {
- #ifdef YTALLOC_NERVOUS
- auto& state = ChunkStateZoneStart_[GetShadowOffset(ptr)];
- auto actualState = state;
- if (Y_UNLIKELY(actualState != expectedState)) {
- char message[256];
- sprintf(message, "Invalid small chunk state at %p: expected %" PRIx8 ", actual %" PRIx8,
- ptr,
- static_cast<ui8>(expectedState),
- static_cast<ui8>(actualState));
- YTALLOC_TRAP(message);
- }
- state = newState;
- #else
- Y_UNUSED(ptr);
- Y_UNUSED(expectedState);
- Y_UNUSED(newState);
- #endif
- }
- private:
- size_t TryAllocateFromCurrentExtent(void** batch, size_t maxCount)
- {
- auto* oldPtr = CurrentPtr_.load();
- if (Y_UNLIKELY(!oldPtr)) {
- return 0;
- }
- auto* currentExtent = CurrentExtent_.load(std::memory_order_relaxed);
- if (Y_UNLIKELY(!currentExtent)) {
- return 0;
- }
- char* newPtr;
- while (true) {
- if (Y_UNLIKELY(oldPtr < currentExtent || oldPtr + ChunkSize_ + RightReadableAreaSize > currentExtent + SmallExtentSize)) {
- return 0;
- }
- newPtr = std::min(
- oldPtr + ChunkSize_ * maxCount,
- currentExtent + SmallExtentSize);
- auto* alignedNewPtr = AlignDownToSmallSegment(currentExtent, newPtr);
- if (alignedNewPtr > oldPtr) {
- newPtr = alignedNewPtr;
- }
- if (Y_LIKELY(CurrentPtr_.compare_exchange_weak(oldPtr, newPtr))) {
- break;
- }
- }
- auto* firstSegment = AlignUpToSmallSegment(currentExtent, oldPtr);
- auto* nextSegment = AlignUpToSmallSegment(currentExtent, newPtr);
- if (firstSegment != nextSegment) {
- auto size = nextSegment - firstSegment;
- MappedMemoryManager->PopulateReadOnly(firstSegment, size);
- StatisticsManager->IncrementSmallArenaCounter(ESmallArenaCounter::BytesCommitted, Rank_, size);
- StatisticsManager->IncrementSmallArenaCounter(ESmallArenaCounter::PagesCommitted, Rank_, size / PageSize);
- if (Kind_ == EAllocationKind::Tagged) {
- StatisticsManager->IncrementSystemCounter(ESystemCounter::BytesAllocated, size / ChunkSize_ * sizeof(TMemoryTag));
- }
- #ifdef YTALLOC_NERVOUS
- StatisticsManager->IncrementSystemCounter(ESystemCounter::BytesAllocated, size / ChunkSize_ * sizeof(ESmallChunkState));
- #endif
- }
- size_t count = 0;
- while (oldPtr != newPtr) {
- UpdateChunkState(oldPtr, ESmallChunkState::Spare, ESmallChunkState::Freed);
- batch[count] = oldPtr;
- oldPtr += ChunkSize_;
- count++;
- }
- return count;
- }
- void PopulateAnotherExtent()
- {
- auto lockGuard = GuardWithTiming(ExtentLock_);
- auto* currentPtr = CurrentPtr_.load();
- auto* currentExtent = CurrentExtent_.load();
- if (currentPtr && currentPtr + ChunkSize_ + RightReadableAreaSize <= currentExtent + SmallExtentSize) {
- // No need for a new extent.
- return;
- }
- auto* newExtent = static_cast<char*>(DataZoneAllocator_.Allocate(SmallExtentAllocSize, 0));
- AllocateShadowZones();
- YTALLOC_VERIFY(reinterpret_cast<uintptr_t>(newExtent) % SmallExtentAllocSize == 0);
- CurrentPtr_ = CurrentExtent_ = newExtent;
- StatisticsManager->IncrementSmallArenaCounter(ESmallArenaCounter::BytesMapped, Rank_, SmallExtentAllocSize);
- StatisticsManager->IncrementSmallArenaCounter(ESmallArenaCounter::PagesMapped, Rank_, SmallExtentAllocSize / PageSize);
- }
- private:
- const EAllocationKind Kind_;
- const size_t Rank_;
- const size_t LogSize_;
- const size_t ChunkSize_;
- const uintptr_t DataZoneStart_;
- TZoneAllocator DataZoneAllocator_;
- bool ShadowZonesAllocated_ = false;
- TMemoryTag* MemoryTagZoneStart_;
- #ifdef YTALLOC_NERVOUS
- ESmallChunkState* ChunkStateZoneStart_;
- #endif
- NThreading::TForkAwareSpinLock ExtentLock_;
- std::atomic<char*> CurrentPtr_ = nullptr;
- std::atomic<char*> CurrentExtent_ = nullptr;
- size_t GetShadowOffset(const void* ptr)
- {
- return (reinterpret_cast<uintptr_t>(ptr) - DataZoneStart_) >> LogSize_;
- }
- void AllocateShadowZones()
- {
- if (ShadowZonesAllocated_) {
- return;
- }
- if (Kind_ == EAllocationKind::Tagged) {
- MemoryTagZoneStart_ = MapShadowZone<TMemoryTag>();
- }
- #ifdef YTALLOC_NERVOUS
- ChunkStateZoneStart_ = MapShadowZone<ESmallChunkState>();
- #endif
- ShadowZonesAllocated_ = true;
- }
- template <class T>
- T* MapShadowZone()
- {
- auto size = AlignUp((SmallZoneSize >> LogSize_) * sizeof (T), PageSize);
- auto* ptr = static_cast<T*>(MappedMemoryManager->Map(SystemZoneStart, size, MAP_NORESERVE));
- MappedMemoryManager->DontDump(ptr, size);
- return ptr;
- }
- };
- TExplicitlyConstructableSingleton<TEnumIndexedArray<EAllocationKind, std::array<TExplicitlyConstructableSingleton<TSmallArenaAllocator>, SmallRankCount>>> SmallArenaAllocators;
- ////////////////////////////////////////////////////////////////////////////////
- constexpr size_t ChunksPerGroup = 128;
- constexpr size_t GroupsBatchSize = 1024;
- static_assert(ChunksPerGroup <= MaxCachedChunksPerRank, "ChunksPerGroup > MaxCachedChunksPerRank");
- class TChunkGroup
- : public TFreeListItemBase<TChunkGroup>
- {
- public:
- bool IsEmpty() const
- {
- return Size_ == 0;
- }
- size_t ExtractAll(void** ptrs)
- {
- auto count = Size_;
- ::memcpy(ptrs, Ptrs_.data(), count * sizeof(void*));
- Size_ = 0;
- return count;
- }
- void PutOne(void* ptr)
- {
- PutMany(&ptr, 1);
- }
- void PutMany(void** ptrs, size_t count)
- {
- YTALLOC_PARANOID_ASSERT(Size_ == 0);
- YTALLOC_PARANOID_ASSERT(count <= ChunksPerGroup);
- ::memcpy(Ptrs_.data(), ptrs, count * sizeof(void*));
- Size_ = count;
- }
- private:
- size_t Size_ = 0; // <= ChunksPerGroup
- std::array<void*, ChunksPerGroup> Ptrs_;
- };
- class TGlobalSmallChunkCache
- {
- public:
- explicit TGlobalSmallChunkCache(EAllocationKind kind)
- : Kind_(kind)
- { }
- #ifdef YTALLOC_PARANOID
- void CanonizeChunkPtrs(TThreadState* state, size_t rank)
- {
- auto& chunkPtrPtr = state->SmallBlobCache[Kind_].RankToCachedChunkPtrHead[rank];
- auto leftBorder = state->SmallBlobCache[Kind_].RankToCachedChunkLeftBorder[rank];
- auto rightBorder = state->SmallBlobCache[Kind_].RankToCachedChunkRightBorder[rank];
- state->SmallBlobCache[Kind_].CachedChunkFull[rank] = false;
- if (chunkPtrPtr + 1 == rightBorder) {
- chunkPtrPtr = leftBorder;
- state->SmallBlobCache[Kind_].CachedChunkFull[rank] = true;
- }
- state->SmallBlobCache[Kind_].RankToCachedChunkPtrTail[rank] = leftBorder;
- }
- #endif
- bool TryMoveGroupToLocal(TThreadState* state, size_t rank)
- {
- auto& groups = RankToChunkGroups_[rank];
- auto* group = groups.Extract(state);
- if (!Y_LIKELY(group)) {
- return false;
- }
- YTALLOC_PARANOID_ASSERT(!group->IsEmpty());
- auto& chunkPtrPtr = state->SmallBlobCache[Kind_].RankToCachedChunkPtrHead[rank];
- #ifdef YTALLOC_PARANOID
- chunkPtrPtr = state->SmallBlobCache[Kind_].RankToCachedChunkLeftBorder[rank];
- state->SmallBlobCache[Kind_].RankToCachedChunkPtrTail[rank] = chunkPtrPtr;
- #endif
- auto chunkCount = group->ExtractAll(chunkPtrPtr + 1);
- chunkPtrPtr += chunkCount;
- #ifdef YTALLOC_PARANOID
- CanonizeChunkPtrs(state, rank);
- #endif
- GroupPool_.Free(state, group);
- return true;
- }
- void MoveGroupToGlobal(TThreadState* state, size_t rank)
- {
- auto* group = GroupPool_.Allocate(state);
- auto& chunkPtrPtr = state->SmallBlobCache[Kind_].RankToCachedChunkPtrHead[rank];
- YTALLOC_PARANOID_ASSERT(*(chunkPtrPtr + 1) == reinterpret_cast<void*>(TThreadState::RightSentinel));
- group->PutMany(chunkPtrPtr - ChunksPerGroup + 1, ChunksPerGroup);
- chunkPtrPtr -= ChunksPerGroup;
- #ifdef YTALLOC_PARANOID
- ::memset(chunkPtrPtr + 1, 0, sizeof(void*) * ChunksPerGroup);
- CanonizeChunkPtrs(state, rank);
- #endif
- auto& groups = RankToChunkGroups_[rank];
- YTALLOC_PARANOID_ASSERT(!group->IsEmpty());
- groups.Put(state, group);
- }
- void MoveOneToGlobal(void* ptr, size_t rank)
- {
- auto* group = GroupPool_.Allocate(&GlobalShardedState_);
- group->PutOne(ptr);
- auto& groups = RankToChunkGroups_[rank];
- YTALLOC_PARANOID_ASSERT(!group->IsEmpty());
- groups.Put(&GlobalShardedState_, group);
- }
- #ifdef YTALLOC_PARANOID
- void MoveAllToGlobal(TThreadState* state, size_t rank)
- {
- auto leftSentinelBorder = state->SmallBlobCache[Kind_].RankToCachedChunkLeftBorder[rank];
- auto rightSentinelBorder = state->SmallBlobCache[Kind_].RankToCachedChunkRightBorder[rank];
- auto& headPtr = state->SmallBlobCache[Kind_].RankToCachedChunkPtrHead[rank];
- auto& tailPtr = state->SmallBlobCache[Kind_].RankToCachedChunkPtrTail[rank];
- if (tailPtr == headPtr && !state->SmallBlobCache[Kind_].CachedChunkFull[rank]) {
- headPtr = leftSentinelBorder;
- return;
- }
- // (leftBorder, rightBorder]
- auto moveIntervalToGlobal = [=] (void** leftBorder, void** rightBorder) {
- while (true) {
- size_t count = 0;
- while (count < ChunksPerGroup && rightBorder != leftBorder) {
- --rightBorder;
- ++count;
- }
- if (count == 0) {
- break;
- }
- auto* group = GroupPool_.Allocate(state);
- group->PutMany(rightBorder + 1, count);
- ::memset(rightBorder + 1, 0, sizeof(void*) * count);
- auto& groups = RankToChunkGroups_[rank];
- groups.Put(state, group);
- }
- };
- if (tailPtr >= headPtr) {
- moveIntervalToGlobal(tailPtr, rightSentinelBorder - 1);
- moveIntervalToGlobal(leftSentinelBorder, headPtr);
- } else {
- moveIntervalToGlobal(tailPtr, headPtr);
- }
- headPtr = leftSentinelBorder;
- }
- #else
- void MoveAllToGlobal(TThreadState* state, size_t rank)
- {
- auto& chunkPtrPtr = state->SmallBlobCache[Kind_].RankToCachedChunkPtrHead[rank];
- while (true) {
- size_t count = 0;
- while (count < ChunksPerGroup && *chunkPtrPtr != reinterpret_cast<void*>(TThreadState::LeftSentinel)) {
- --chunkPtrPtr;
- ++count;
- }
- if (count == 0) {
- break;
- }
- auto* group = GroupPool_.Allocate(state);
- group->PutMany(chunkPtrPtr + 1, count);
- auto& groups = RankToChunkGroups_[rank];
- groups.Put(state, group);
- }
- }
- #endif
- private:
- const EAllocationKind Kind_;
- TGlobalShardedState GlobalShardedState_;
- TShardedSystemPool<TChunkGroup, GroupsBatchSize> GroupPool_;
- std::array<TShardedFreeList<TChunkGroup>, SmallRankCount> RankToChunkGroups_;
- };
- TExplicitlyConstructableSingleton<TEnumIndexedArray<EAllocationKind, TExplicitlyConstructableSingleton<TGlobalSmallChunkCache>>> GlobalSmallChunkCaches;
- ////////////////////////////////////////////////////////////////////////////////
- class TSmallAllocator
- {
- public:
- template <EAllocationKind Kind>
- static Y_FORCE_INLINE void* Allocate(TMemoryTag tag, size_t rank)
- {
- auto* state = TThreadManager::FindThreadState();
- if (Y_LIKELY(state)) {
- return Allocate<Kind>(tag, rank, state);
- }
- auto size = SmallRankToSize[rank];
- return AllocateGlobal<Kind>(tag, rank, size);
- }
- #ifdef YTALLOC_PARANOID
- template <EAllocationKind Kind>
- static Y_FORCE_INLINE void* Allocate(TMemoryTag tag, size_t rank, TThreadState* state)
- {
- auto& localCache = state->SmallBlobCache[Kind];
- auto& allocator = *(*SmallArenaAllocators)[Kind][rank];
- size_t size = SmallRankToSize[rank];
- StatisticsManager->IncrementTotalCounter<Kind>(state, tag, EBasicCounter::BytesAllocated, size);
- auto leftBorder = localCache.RankToCachedChunkLeftBorder[rank];
- auto rightBorder = localCache.RankToCachedChunkRightBorder[rank];
- void* result;
- while (true) {
- auto& chunkHeadPtr = localCache.RankToCachedChunkPtrHead[rank];
- auto& cachedHeadPtr = *(chunkHeadPtr + 1);
- auto* headPtr = cachedHeadPtr;
- auto& chunkTailPtr = localCache.RankToCachedChunkPtrTail[rank];
- auto& cachedTailPtr = *(chunkTailPtr + 1);
- auto* tailPtr = cachedTailPtr;
- auto& chunkFull = localCache.CachedChunkFull[rank];
- if (Y_LIKELY(chunkFull || headPtr != tailPtr)) {
- YTALLOC_PARANOID_ASSERT(tailPtr);
- cachedTailPtr = nullptr;
- ++chunkTailPtr;
- if (Y_LIKELY(chunkTailPtr + 1 == rightBorder)) {
- chunkTailPtr = leftBorder;
- }
- chunkFull = false;
- result = tailPtr;
- PoisonUninitializedRange(result, size);
- allocator.UpdateChunkState(result, ESmallChunkState::Freed, ESmallChunkState::Allocated);
- break;
- }
- auto& globalCache = *(*GlobalSmallChunkCaches)[Kind];
- if (!globalCache.TryMoveGroupToLocal(state, rank)) {
- result = allocator.Allocate(size);
- break;
- }
- }
- if constexpr(Kind == EAllocationKind::Tagged) {
- allocator.SetMemoryTag(result, tag);
- }
- return result;
- }
- template <EAllocationKind Kind>
- static Y_FORCE_INLINE void Free(void* ptr)
- {
- auto rank = PtrToSmallRank(ptr);
- auto size = SmallRankToSize[rank];
- auto& allocator = *(*SmallArenaAllocators)[Kind][rank];
- auto tag = NullMemoryTag;
- if constexpr(Kind == EAllocationKind::Tagged) {
- tag = allocator.GetAndResetMemoryTag(ptr);
- YTALLOC_PARANOID_ASSERT(tag != NullMemoryTag);
- }
- allocator.UpdateChunkState(ptr, ESmallChunkState::Allocated, ESmallChunkState::Freed);
- PoisonFreedRange(ptr, size);
- auto* state = TThreadManager::FindThreadState();
- if (Y_UNLIKELY(!state)) {
- FreeGlobal<Kind>(tag, ptr, rank, size);
- return;
- }
- StatisticsManager->IncrementTotalCounter<Kind>(state, tag, EBasicCounter::BytesFreed, size);
- auto& localCache = state->SmallBlobCache[Kind];
- auto leftBorder = localCache.RankToCachedChunkLeftBorder[rank];
- auto rightBorder = localCache.RankToCachedChunkRightBorder[rank];
- while (true) {
- auto& chunkHeadPtr = localCache.RankToCachedChunkPtrHead[rank];
- auto& headPtr = *(chunkHeadPtr + 1);
- auto& chunkTailPtr = localCache.RankToCachedChunkPtrTail[rank];
- auto& chunkFull = localCache.CachedChunkFull[rank];
- if (Y_LIKELY(!chunkFull)) {
- headPtr = ptr;
- ++chunkHeadPtr;
- if (Y_LIKELY(chunkHeadPtr + 1 == rightBorder)) {
- chunkHeadPtr = leftBorder;
- }
- chunkFull = (chunkHeadPtr == chunkTailPtr);
- break;
- }
- chunkHeadPtr = rightBorder - 1;
- chunkTailPtr = leftBorder;
- auto& globalCache = *(*GlobalSmallChunkCaches)[Kind];
- globalCache.MoveGroupToGlobal(state, rank);
- }
- }
- #else
- template <EAllocationKind Kind>
- static Y_FORCE_INLINE void* Allocate(TMemoryTag tag, size_t rank, TThreadState* state)
- {
- size_t size = SmallRankToSize[rank];
- StatisticsManager->IncrementTotalCounter<Kind>(state, tag, EBasicCounter::BytesAllocated, size);
- auto& localCache = state->SmallBlobCache[Kind];
- auto& allocator = *(*SmallArenaAllocators)[Kind][rank];
- void* result;
- while (true) {
- auto& chunkPtr = localCache.RankToCachedChunkPtrHead[rank];
- auto& cachedPtr = *chunkPtr;
- auto* ptr = cachedPtr;
- if (Y_LIKELY(ptr != reinterpret_cast<void*>(TThreadState::LeftSentinel))) {
- --chunkPtr;
- result = ptr;
- allocator.UpdateChunkState(result, ESmallChunkState::Freed, ESmallChunkState::Allocated);
- PoisonUninitializedRange(result, size);
- break;
- }
- auto& globalCache = *(*GlobalSmallChunkCaches)[Kind];
- if (globalCache.TryMoveGroupToLocal(state, rank)) {
- continue;
- }
- auto count = allocator.PullMany(
- chunkPtr + 1,
- SmallRankBatchSize[rank]);
- chunkPtr += count;
- }
- if constexpr(Kind == EAllocationKind::Tagged) {
- allocator.SetMemoryTag(result, tag);
- }
- return result;
- }
- template <EAllocationKind Kind>
- static Y_FORCE_INLINE void Free(void* ptr)
- {
- auto rank = PtrToSmallRank(ptr);
- auto size = SmallRankToSize[rank];
- auto& allocator = *(*SmallArenaAllocators)[Kind][rank];
- auto tag = NullMemoryTag;
- if constexpr(Kind == EAllocationKind::Tagged) {
- tag = allocator.GetAndResetMemoryTag(ptr);
- YTALLOC_PARANOID_ASSERT(tag != NullMemoryTag);
- }
- allocator.UpdateChunkState(ptr, ESmallChunkState::Allocated, ESmallChunkState::Freed);
- PoisonFreedRange(ptr, size);
- auto* state = TThreadManager::FindThreadState();
- if (Y_UNLIKELY(!state)) {
- FreeGlobal<Kind>(tag, ptr, rank, size);
- return;
- }
- StatisticsManager->IncrementTotalCounter<Kind>(state, tag, EBasicCounter::BytesFreed, size);
- auto& localCache = state->SmallBlobCache[Kind];
- while (true) {
- auto& chunkPtrPtr = localCache.RankToCachedChunkPtrHead[rank];
- auto& chunkPtr = *(chunkPtrPtr + 1);
- if (Y_LIKELY(chunkPtr != reinterpret_cast<void*>(TThreadState::RightSentinel))) {
- chunkPtr = ptr;
- ++chunkPtrPtr;
- break;
- }
- auto& globalCache = *(*GlobalSmallChunkCaches)[Kind];
- globalCache.MoveGroupToGlobal(state, rank);
- }
- }
- #endif
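- // NB: The two Allocate/Free variants above differ only in how the per-thread chunk cache is
- // organized: the first keeps a bounded ring buffer (left/right borders plus a "full" flag),
- // while the second keeps a sentinel-terminated stack of cached pointers that is refilled in
- // batches via PullMany.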
- static size_t GetAllocationSize(const void* ptr)
- {
- return SmallRankToSize[PtrToSmallRank(ptr)];
- }
- static size_t GetAllocationSize(size_t size)
- {
- return SmallRankToSize[SizeToSmallRank(size)];
- }
- static void PurgeCaches()
- {
- DoPurgeCaches<EAllocationKind::Untagged>();
- DoPurgeCaches<EAllocationKind::Tagged>();
- }
- private:
- template <EAllocationKind Kind>
- static void DoPurgeCaches()
- {
- auto* state = TThreadManager::GetThreadStateChecked();
- for (size_t rank = 0; rank < SmallRankCount; ++rank) {
- (*GlobalSmallChunkCaches)[Kind]->MoveAllToGlobal(state, rank);
- }
- }
- template <EAllocationKind Kind>
- static void* AllocateGlobal(TMemoryTag tag, size_t rank, size_t size)
- {
- StatisticsManager->IncrementTotalCounter(tag, EBasicCounter::BytesAllocated, size);
- auto& allocator = *(*SmallArenaAllocators)[Kind][rank];
- auto* result = allocator.Allocate(size);
- if constexpr(Kind == EAllocationKind::Tagged) {
- allocator.SetMemoryTag(result, tag);
- }
- return result;
- }
- template <EAllocationKind Kind>
- static void FreeGlobal(TMemoryTag tag, void* ptr, size_t rank, size_t size)
- {
- StatisticsManager->IncrementTotalCounter(tag, EBasicCounter::BytesFreed, size);
- auto& globalCache = *(*GlobalSmallChunkCaches)[Kind];
- globalCache.MoveOneToGlobal(ptr, rank);
- }
- };
- ////////////////////////////////////////////////////////////////////////////////
- // Large blob allocator
- //
- // As with small chunks, large blobs are grouped into arenas; arena K handles
- // blobs of size (2^{K-1},2^K]. Memory is mapped in extents of LargeExtentSize bytes.
- // Each extent is split into segments of size 2^K (here a segment is just a memory region, which may consist
- // entirely of unmapped pages). When a segment is actually allocated, it becomes a blob and a TLargeBlobHeader
- // structure is placed at its start.
- //
- // When an extent is allocated, it is sliced into segments (not blobs, since no headers are placed and
- // no memory is touched). These segments are put into the disposed segment list.
- //
- // For each blob two separate sizes are maintained: BytesAcquired indicates the number of bytes
- // acquired via madvise(MADV_POPULATE) from the system; BytesAllocated (<= BytesAcquired) corresponds
- // to the number of bytes claimed by the user (including the header and page size alignment).
- // If BytesAllocated == 0 then this blob is spare, i.e.
- // was freed and remains cached for further possible reuse.
- //
- // When a new blob is being allocated, the allocator first tries to extract a spare blob. On success,
- // its acquired size is extended (if needed); the acquired size never shrinks on allocation.
- // If no spare blobs exist, a disposed segment is extracted and is turned into a blob (i.e.
- // its header is initialized) and the needed number of bytes is acquired. If no disposed segments
- // exist, then a new extent is allocated and sliced into segments.
- //
- // The above algorithm only claims memory from the system (by means of madvise(MADV_POPULATE));
- // reclamation is handled by a separate background mechanism. Two types of reclaimable memory
- // regions are possible:
- // * spare: these correspond to spare blobs; upon reclaiming, such a region becomes a disposed segment
- // * overhead: these correspond to trailing parts of allocated blobs in the [BytesAllocated, BytesAcquired) byte range
- //
- // Reclaiming spare blobs is easy as these are explicitly tracked by spare blob lists. To reclaim,
- // we atomically extract a blob from a spare list, call madvise(MADV_FREE), and put the pointer into
- // the disposed segment list.
- //
- // Reclaiming overheads is more complicated since (a) allocated blobs are never tracked directly and
- // (b) reclaiming them may interfere with Allocate and Free.
- //
- // To overcome (a), for each extent we maintain a bitmap marking segments that are actually blobs
- // (i.e. contain a header). (For simplicity and efficiency this bitmap is just a vector of bytes.)
- // These flags are updated in Allocate/Free with appropriate memory ordering. Note that
- // blobs are only disposed (and are turned into segments) by the background thread; if this
- // thread discovers a segment that is marked as a blob, then it is safe to assume that this segment
- // remains a blob unless the thread disposes it.
- //
- // To overcome (b), each large blob header maintains a spin lock. When blob B is extracted
- // from a spare list in Allocate, acquisition of this lock is attempted. If successful, B is returned to the
- // user. Otherwise it is assumed that B is currently being examined by the background
- // reclaimer thread. Allocate then skips this blob and retries extraction. Since the spare list is
- // basically a stack, B cannot simply be pushed back into it; instead, B is pushed into a special
- // locked spare list. This list is purged by the background thread on each tick and its items are
- // pushed back into the usual spare list.
- //
- // A similar trick is used by Free: when invoked for blob B, acquisition of its spin lock is attempted
- // first. Upon success, B is moved to the spare list. On failure, Free postpones the deallocation
- // by moving B into the locked freed list, which is likewise purged by the background thread.
- //
- // It remains to explain how the background thread computes the number of bytes to be reclaimed from
- // each arena. To this end, we first compute the total number of bytes to reclaim: the sum of spare
- // and overhead bytes in all arenas minus the number of unreclaimable bytes. The latter grows
- // linearly with the number of used bytes and is capped from below by MinLargeUnreclaimableBytes
- // and from above by MaxLargeUnreclaimableBytes. SetLargeUnreclaimableCoeff and Set(Min|Max)LargeUnreclaimableBytes
- // enable tuning these control knobs. Reclaimable bytes are taken from arenas starting with those
- // having the largest spare and overhead volumes (see the worked example at the end of this comment).
- //
- // The above implies that each large blob contains a fixed-size header preceding it.
- // Hence ptr % PageSize == sizeof (TLargeBlobHeader) for each ptr returned by Allocate
- // (since large blob sizes are larger than PageSize and are divisible by PageSize).
- // For AllocatePageAligned, however, ptr must be divisible by PageSize. To handle such an allocation, we
- // artificially increase its size and align the result of Allocate up to the next page boundary.
- // When handling a deallocation, ptr is moved back by UnalignPtr (which is capable of dealing
- // with both the results of Allocate and AllocatePageAligned).
- // This technique applies to both large and huge blobs.
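- //
- // As an illustrative sketch of the reclaim target computation (the numbers are made up and are
- // not defaults): with 10 GB used, a coefficient of 0.05, and the unreclaimable amount clamped to
- // [256 MB, 1 GB], the unreclaimable threshold is clamp(0.05 * 10 GB, 256 MB, 1 GB) = 512 MB; if
- // spare plus overhead bytes across all arenas total 2 GB, then 2 GB - 512 MB = 1.5 GB becomes the
- // reclaim target (rounded up to page granularity), distributed over arenas starting with those
- // having the largest spare and overhead volumes.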
- enum ELargeBlobState : ui64
- {
- Allocated = 0x6c6c61656772616cULL, // largeall
- Spare = 0x727073656772616cULL, // largespr
- LockedSpare = 0x70736c656772616cULL, // largelsp
- LockedFreed = 0x72666c656772616cULL // largelfr
- };
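- // NB: Read as little-endian ASCII, these constants spell out the signatures quoted in the
- // comments above ("largeall", "largespr", ...), which makes blob states easy to spot in memory dumps.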
- // Every large blob (either tagged or not) is prepended with this header.
- struct TLargeBlobHeader
- : public TFreeListItemBase<TLargeBlobHeader>
- {
- TLargeBlobHeader(
- TLargeBlobExtent* extent,
- size_t bytesAcquired,
- size_t bytesAllocated,
- TMemoryTag tag)
- : Extent(extent)
- , BytesAcquired(bytesAcquired)
- , Tag(tag)
- , BytesAllocated(bytesAllocated)
- , State(ELargeBlobState::Allocated)
- { }
- TLargeBlobExtent* Extent;
- // Number of bytes in all acquired pages.
- size_t BytesAcquired;
- std::atomic<bool> Locked = false;
- TMemoryTag Tag = NullMemoryTag;
- // For spare blobs this is zero.
- // For allocated blobs this is the number of bytes requested by the user (not including the header or any alignment).
- size_t BytesAllocated;
- ELargeBlobState State;
- char Padding[12];
- };
- CHECK_HEADER_ALIGNMENT(TLargeBlobHeader)
- struct TLargeBlobExtent
- {
- TLargeBlobExtent(size_t segmentCount, char* ptr)
- : SegmentCount(segmentCount)
- , Ptr(ptr)
- { }
- size_t SegmentCount;
- char* Ptr;
- TLargeBlobExtent* NextExtent = nullptr;
- std::atomic<bool> DisposedFlags[0];
- };
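- // NB: DisposedFlags is a trailing zero-length array; AllocateArenaExtent sizes the extent header
- // so that it has room for SegmentCount such flags.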
- // A helper node that allows an extent's segments to be stored in a free list.
- // Recall that segments themselves do not possess any headers.
- struct TDisposedSegment
- : public TFreeListItemBase<TDisposedSegment>
- {
- size_t Index;
- TLargeBlobExtent* Extent;
- };
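- // Per-rank arena bookkeeping: spare/locked/disposed free lists plus the cursor used by the
- // background overhead scan.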
- struct TLargeArena
- {
- size_t Rank = 0;
- size_t SegmentSize = 0;
- TShardedFreeList<TLargeBlobHeader> SpareBlobs;
- TFreeList<TLargeBlobHeader> LockedSpareBlobs;
- TFreeList<TLargeBlobHeader> LockedFreedBlobs;
- TFreeList<TDisposedSegment> DisposedSegments;
- std::atomic<TLargeBlobExtent*> FirstExtent = nullptr;
- TLargeBlobExtent* CurrentOverheadScanExtent = nullptr;
- size_t CurrentOverheadScanSegment = 0;
- };
- template <bool Dumpable>
- class TLargeBlobAllocator
- {
- public:
- TLargeBlobAllocator()
- : ZoneAllocator_(LargeZoneStart(Dumpable), LargeZoneEnd(Dumpable))
- {
- for (size_t rank = 0; rank < Arenas_.size(); ++rank) {
- auto& arena = Arenas_[rank];
- arena.Rank = rank;
- arena.SegmentSize = (1ULL << rank);
- }
- }
- void* Allocate(size_t size)
- {
- auto* state = TThreadManager::FindThreadState();
- return Y_LIKELY(state)
- ? DoAllocate(state, size)
- : DoAllocate(GlobalState.Get(), size);
- }
- void Free(void* ptr)
- {
- auto* state = TThreadManager::FindThreadState();
- if (Y_LIKELY(state)) {
- DoFree(state, ptr);
- } else {
- DoFree(GlobalState.Get(), ptr);
- }
- }
- static size_t GetAllocationSize(const void* ptr)
- {
- UnalignPtr<TLargeBlobHeader>(ptr);
- const auto* blob = PtrToHeader<TLargeBlobHeader>(ptr);
- return blob->BytesAllocated;
- }
- static size_t GetAllocationSize(size_t size)
- {
- return GetBlobAllocationSize<TLargeBlobHeader>(size);
- }
- void RunBackgroundTasks()
- {
- ReinstallLockedBlobs();
- ReclaimMemory();
- }
- void SetBacktraceProvider(TBacktraceProvider provider)
- {
- BacktraceProvider_.store(provider);
- }
- private:
- template <class TState>
- void PopulateArenaPages(TState* state, TLargeArena* arena, void* ptr, size_t size)
- {
- MappedMemoryManager->Populate(ptr, size);
- StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::BytesPopulated, size);
- StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::PagesPopulated, size / PageSize);
- StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::BytesCommitted, size);
- StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::PagesCommitted, size / PageSize);
- }
- template <class TState>
- void ReleaseArenaPages(TState* state, TLargeArena* arena, void* ptr, size_t size)
- {
- MappedMemoryManager->Release(ptr, size);
- StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::BytesReleased, size);
- StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::PagesReleased, size / PageSize);
- StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::BytesCommitted, -size);
- StatisticsManager->IncrementLargeArenaCounter(state, arena->Rank, ELargeArenaCounter::PagesCommitted, -size / PageSize);
- }
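- // Tries to acquire the per-blob spin lock described in the section comment above; returns false
- // if the blob is currently locked (e.g. by the background reclaimer), in which case the caller
- // defers its operation via the corresponding locked list.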
- bool TryLockBlob(TLargeBlobHeader* blob)
- {
- bool expected = false;
- return blob->Locked.compare_exchange_strong(expected, true);
- }
- void UnlockBlob(TLargeBlobHeader* blob)
- {
- blob->Locked.store(false);
- }
- template <class TState>
- void MoveBlobToSpare(TState* state, TLargeArena* arena, TLargeBlobHeader* blob, bool unlock)
- {
- auto rank = arena->Rank;
- auto size = blob->BytesAllocated;
- auto rawSize = GetRawBlobSize<TLargeBlobHeader>(size);
- StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesSpare, blob->BytesAcquired);
- StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesOverhead, -(blob->BytesAcquired - rawSize));
- blob->BytesAllocated = 0;
- if (unlock) {
- UnlockBlob(blob);
- } else {
- YTALLOC_VERIFY(!blob->Locked.load());
- }
- blob->State = ELargeBlobState::Spare;
- arena->SpareBlobs.Put(state, blob);
- }
- size_t GetBytesToReclaim(const std::array<TLocalLargeCounters, LargeRankCount>& arenaCounters)
- {
- size_t totalBytesAllocated = 0;
- size_t totalBytesFreed = 0;
- size_t totalBytesSpare = 0;
- size_t totalBytesOverhead = 0;
- for (size_t rank = 0; rank < Arenas_.size(); ++rank) {
- const auto& counters = arenaCounters[rank];
- totalBytesAllocated += counters[ELargeArenaCounter::BytesAllocated];
- totalBytesFreed += counters[ELargeArenaCounter::BytesFreed];
- totalBytesSpare += counters[ELargeArenaCounter::BytesSpare];
- totalBytesOverhead += counters[ELargeArenaCounter::BytesOverhead];
- }
- auto totalBytesUsed = totalBytesAllocated - totalBytesFreed;
- auto totalBytesReclaimable = totalBytesSpare + totalBytesOverhead;
- auto threshold = ClampVal(
- static_cast<size_t>(ConfigurationManager->GetLargeUnreclaimableCoeff() * totalBytesUsed),
- ConfigurationManager->GetMinLargeUnreclaimableBytes(),
- ConfigurationManager->GetMaxLargeUnreclaimableBytes());
- if (totalBytesReclaimable < threshold) {
- return 0;
- }
- auto bytesToReclaim = totalBytesReclaimable - threshold;
- return AlignUp(bytesToReclaim, PageSize);
- }
- void ReinstallLockedSpareBlobs(TLargeArena* arena)
- {
- auto* blob = arena->LockedSpareBlobs.ExtractAll();
- auto* state = TThreadManager::GetThreadStateChecked();
- size_t count = 0;
- while (blob) {
- auto* nextBlob = blob->Next.load();
- YTALLOC_VERIFY(!blob->Locked.load());
- AssertBlobState(blob, ELargeBlobState::LockedSpare);
- blob->State = ELargeBlobState::Spare;
- arena->SpareBlobs.Put(state, blob);
- blob = nextBlob;
- ++count;
- }
- if (count > 0) {
- YTALLOC_LOG_DEBUG("Locked spare blobs reinstalled (Rank: %d, Blobs: %zu)",
- arena->Rank,
- count);
- }
- }
- void ReinstallLockedFreedBlobs(TLargeArena* arena)
- {
- auto* state = TThreadManager::GetThreadStateChecked();
- auto* blob = arena->LockedFreedBlobs.ExtractAll();
- size_t count = 0;
- while (blob) {
- auto* nextBlob = blob->Next.load();
- AssertBlobState(blob, ELargeBlobState::LockedFreed);
- MoveBlobToSpare(state, arena, blob, false);
- ++count;
- blob = nextBlob;
- }
- if (count > 0) {
- YTALLOC_LOG_DEBUG("Locked freed blobs reinstalled (Rank: %d, Blobs: %zu)",
- arena->Rank,
- count);
- }
- }
- void ReclaimSpareMemory(TLargeArena* arena, ssize_t bytesToReclaim)
- {
- if (bytesToReclaim <= 0) {
- return;
- }
- auto rank = arena->Rank;
- auto* state = TThreadManager::GetThreadStateChecked();
- YTALLOC_LOG_DEBUG("Started processing spare memory in arena (BytesToReclaim: %zdM, Rank: %d)",
- bytesToReclaim / 1_MB,
- rank);
- size_t bytesReclaimed = 0;
- size_t blobsReclaimed = 0;
- while (bytesToReclaim > 0) {
- auto* blob = arena->SpareBlobs.ExtractRoundRobin(state);
- if (!blob) {
- break;
- }
- AssertBlobState(blob, ELargeBlobState::Spare);
- YTALLOC_VERIFY(blob->BytesAllocated == 0);
- auto bytesAcquired = blob->BytesAcquired;
- StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesSpare, -bytesAcquired);
- bytesToReclaim -= bytesAcquired;
- bytesReclaimed += bytesAcquired;
- blobsReclaimed += 1;
- auto* extent = blob->Extent;
- auto* ptr = reinterpret_cast<char*>(blob);
- ReleaseArenaPages(
- state,
- arena,
- ptr,
- bytesAcquired);
- size_t segmentIndex = (ptr - extent->Ptr) / arena->SegmentSize;
- extent->DisposedFlags[segmentIndex].store(true, std::memory_order_relaxed);
- auto* disposedSegment = DisposedSegmentPool_.Allocate();
- disposedSegment->Index = segmentIndex;
- disposedSegment->Extent = extent;
- arena->DisposedSegments.Put(disposedSegment);
- }
- StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::SpareBytesReclaimed, bytesReclaimed);
- YTALLOC_LOG_DEBUG("Finished processing spare memory in arena (Rank: %d, BytesReclaimed: %zdM, BlobsReclaimed: %zu)",
- arena->Rank,
- bytesReclaimed / 1_MB,
- blobsReclaimed);
- }
- void ReclaimOverheadMemory(TLargeArena* arena, ssize_t bytesToReclaim)
- {
- if (bytesToReclaim == 0) {
- return;
- }
- auto* state = TThreadManager::GetThreadStateChecked();
- auto rank = arena->Rank;
- YTALLOC_LOG_DEBUG("Started processing overhead memory in arena (BytesToReclaim: %zdM, Rank: %d)",
- bytesToReclaim / 1_MB,
- rank);
- size_t extentsTraversed = 0;
- size_t segmentsTraversed = 0;
- size_t bytesReclaimed = 0;
- bool restartedFromFirstExtent = false;
- auto& currentExtent = arena->CurrentOverheadScanExtent;
- auto& currentSegment = arena->CurrentOverheadScanSegment;
- while (bytesToReclaim > 0) {
- if (!currentExtent) {
- if (restartedFromFirstExtent) {
- break;
- }
- currentExtent = arena->FirstExtent.load();
- if (!currentExtent) {
- break;
- }
- restartedFromFirstExtent = true;
- }
- while (currentSegment < currentExtent->SegmentCount && bytesToReclaim > 0) {
- ++segmentsTraversed;
- if (!currentExtent->DisposedFlags[currentSegment].load(std::memory_order_acquire)) {
- auto* ptr = currentExtent->Ptr + currentSegment * arena->SegmentSize;
- auto* blob = reinterpret_cast<TLargeBlobHeader*>(ptr);
- YTALLOC_PARANOID_ASSERT(blob->Extent == currentExtent);
- if (TryLockBlob(blob)) {
- if (blob->BytesAllocated > 0) {
- size_t rawSize = GetRawBlobSize<TLargeBlobHeader>(blob->BytesAllocated);
- size_t bytesToRelease = blob->BytesAcquired - rawSize;
- if (bytesToRelease > 0) {
- ReleaseArenaPages(
- state,
- arena,
- ptr + blob->BytesAcquired - bytesToRelease,
- bytesToRelease);
- StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesOverhead, -bytesToRelease);
- blob->BytesAcquired = rawSize;
- bytesToReclaim -= bytesToRelease;
- bytesReclaimed += bytesToRelease;
- }
- }
- UnlockBlob(blob);
- }
- }
- ++currentSegment;
- }
- ++extentsTraversed;
- currentSegment = 0;
- currentExtent = currentExtent->NextExtent;
- }
- StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::OverheadBytesReclaimed, bytesReclaimed);
- YTALLOC_LOG_DEBUG("Finished processing overhead memory in arena (Rank: %d, Extents: %zu, Segments: %zu, BytesReclaimed: %zuM)",
- arena->Rank,
- extentsTraversed,
- segmentsTraversed,
- bytesReclaimed / 1_MB);
- }
- void ReinstallLockedBlobs()
- {
- for (auto& arena : Arenas_) {
- ReinstallLockedSpareBlobs(&arena);
- ReinstallLockedFreedBlobs(&arena);
- }
- }
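- // Distributes the reclaim target across the arenas' overhead and spare pools greedily: each step
- // charges the pool with the most reclaimable bytes with up to 4 MB until the target is met.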
- void ReclaimMemory()
- {
- auto arenaCounters = StatisticsManager->GetLargeArenaAllocationCounters();
- ssize_t bytesToReclaim = GetBytesToReclaim(arenaCounters);
- if (bytesToReclaim == 0) {
- return;
- }
- YTALLOC_LOG_DEBUG("Memory reclaim started (BytesToReclaim: %zdM)",
- bytesToReclaim / 1_MB);
- std::array<ssize_t, LargeRankCount * 2> bytesReclaimablePerArena;
- for (size_t rank = 0; rank < LargeRankCount; ++rank) {
- bytesReclaimablePerArena[rank * 2] = arenaCounters[rank][ELargeArenaCounter::BytesOverhead];
- bytesReclaimablePerArena[rank * 2 + 1] = arenaCounters[rank][ELargeArenaCounter::BytesSpare];
- }
- std::array<ssize_t, LargeRankCount * 2> bytesToReclaimPerArena{};
- while (bytesToReclaim > 0) {
- ssize_t maxBytes = std::numeric_limits<ssize_t>::min();
- int maxIndex = -1;
- for (int index = 0; index < LargeRankCount * 2; ++index) {
- if (bytesReclaimablePerArena[index] > maxBytes) {
- maxBytes = bytesReclaimablePerArena[index];
- maxIndex = index;
- }
- }
- if (maxIndex < 0) {
- break;
- }
- auto bytesToReclaimPerStep = std::min<ssize_t>({bytesToReclaim, maxBytes, 4_MB});
- if (bytesToReclaimPerStep < 0) {
- break;
- }
- bytesToReclaimPerArena[maxIndex] += bytesToReclaimPerStep;
- bytesReclaimablePerArena[maxIndex] -= bytesToReclaimPerStep;
- bytesToReclaim -= bytesToReclaimPerStep;
- }
- for (auto& arena : Arenas_) {
- auto rank = arena.Rank;
- ReclaimOverheadMemory(&arena, bytesToReclaimPerArena[rank * 2]);
- ReclaimSpareMemory(&arena, bytesToReclaimPerArena[rank * 2 + 1]);
- }
- YTALLOC_LOG_DEBUG("Memory reclaim finished");
- }
- template <class TState>
- void AllocateArenaExtent(TState* state, TLargeArena* arena)
- {
- auto rank = arena->Rank;
- StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::ExtentsAllocated, 1);
- size_t segmentCount = LargeExtentSize / arena->SegmentSize;
- size_t extentHeaderSize = AlignUp(sizeof (TLargeBlobExtent) + sizeof (TLargeBlobExtent::DisposedFlags[0]) * segmentCount, PageSize);
- size_t allocationSize = extentHeaderSize + LargeExtentSize;
- auto* ptr = ZoneAllocator_.Allocate(allocationSize, MAP_NORESERVE);
- if (!Dumpable) {
- MappedMemoryManager->DontDump(ptr, allocationSize);
- }
- if (auto backtraceProvider = BacktraceProvider_.load()) {
- std::array<void*, MaxAllocationProfilingBacktraceDepth> frames;
- auto frameCount = backtraceProvider(
- frames.data(),
- MaxAllocationProfilingBacktraceDepth,
- 3);
- MmapObservationManager->EnqueueEvent(allocationSize, frames, frameCount);
- }
- StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesMapped, allocationSize);
- StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::PagesMapped, allocationSize / PageSize);
- auto* extent = static_cast<TLargeBlobExtent*>(ptr);
- MappedMemoryManager->Populate(ptr, extentHeaderSize);
- StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesPopulated, extentHeaderSize);
- StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::PagesPopulated, extentHeaderSize / PageSize);
- StatisticsManager->IncrementSystemCounter(ESystemCounter::BytesAllocated, extentHeaderSize);
- new (extent) TLargeBlobExtent(segmentCount, static_cast<char*>(ptr) + extentHeaderSize);
- for (size_t index = 0; index < segmentCount; ++index) {
- auto* disposedSegment = DisposedSegmentPool_.Allocate();
- disposedSegment->Index = index;
- disposedSegment->Extent = extent;
- arena->DisposedSegments.Put(disposedSegment);
- extent->DisposedFlags[index].store(true);
- }
- auto* expectedFirstExtent = arena->FirstExtent.load();
- do {
- extent->NextExtent = expectedFirstExtent;
- } while (Y_UNLIKELY(!arena->FirstExtent.compare_exchange_weak(expectedFirstExtent, extent)));
- }
- template <class TState>
- void* DoAllocate(TState* state, size_t size)
- {
- auto rawSize = GetRawBlobSize<TLargeBlobHeader>(size);
- auto rank = GetLargeRank(rawSize);
- auto tag = ConfigurationManager->IsLargeArenaAllocationProfiled(rank)
- ? BacktraceManager->GetMemoryTagFromBacktrace(3)
- : TThreadManager::GetCurrentMemoryTag();
- auto& arena = Arenas_[rank];
- YTALLOC_PARANOID_ASSERT(rawSize <= arena.SegmentSize);
- TLargeBlobHeader* blob;
- while (true) {
- blob = arena.SpareBlobs.Extract(state);
- if (blob) {
- AssertBlobState(blob, ELargeBlobState::Spare);
- if (TryLockBlob(blob)) {
- StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesSpare, -blob->BytesAcquired);
- if (blob->BytesAcquired < rawSize) {
- PopulateArenaPages(
- state,
- &arena,
- reinterpret_cast<char*>(blob) + blob->BytesAcquired,
- rawSize - blob->BytesAcquired);
- blob->BytesAcquired = rawSize;
- } else {
- StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesOverhead, blob->BytesAcquired - rawSize);
- }
- YTALLOC_PARANOID_ASSERT(blob->BytesAllocated == 0);
- blob->BytesAllocated = size;
- blob->Tag = tag;
- blob->State = ELargeBlobState::Allocated;
- UnlockBlob(blob);
- break;
- } else {
- blob->State = ELargeBlobState::LockedSpare;
- arena.LockedSpareBlobs.Put(blob);
- }
- }
- auto* disposedSegment = arena.DisposedSegments.Extract();
- if (disposedSegment) {
- auto index = disposedSegment->Index;
- auto* extent = disposedSegment->Extent;
- DisposedSegmentPool_.Free(disposedSegment);
- auto* ptr = extent->Ptr + index * arena.SegmentSize;
- PopulateArenaPages(
- state,
- &arena,
- ptr,
- rawSize);
- blob = reinterpret_cast<TLargeBlobHeader*>(ptr);
- new (blob) TLargeBlobHeader(extent, rawSize, size, tag);
- extent->DisposedFlags[index].store(false, std::memory_order_release);
- break;
- }
- AllocateArenaExtent(state, &arena);
- }
- StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BlobsAllocated, 1);
- StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesAllocated, size);
- StatisticsManager->IncrementTotalCounter(state, tag, EBasicCounter::BytesAllocated, size);
- if (!Dumpable) {
- StatisticsManager->IncrementUndumpableCounter(state, EUndumpableCounter::BytesAllocated, size);
- }
- auto* result = HeaderToPtr(blob);
- YTALLOC_PARANOID_ASSERT(reinterpret_cast<uintptr_t>(result) >= LargeZoneStart(Dumpable) && reinterpret_cast<uintptr_t>(result) < LargeZoneEnd(Dumpable));
- PoisonUninitializedRange(result, size);
- return result;
- }
- template <class TState>
- void DoFree(TState* state, void* ptr)
- {
- YTALLOC_PARANOID_ASSERT(reinterpret_cast<uintptr_t>(ptr) >= LargeZoneStart(Dumpable) && reinterpret_cast<uintptr_t>(ptr) < LargeZoneEnd(Dumpable));
- auto* blob = PtrToHeader<TLargeBlobHeader>(ptr);
- AssertBlobState(blob, ELargeBlobState::Allocated);
- auto size = blob->BytesAllocated;
- PoisonFreedRange(ptr, size);
- auto rawSize = GetRawBlobSize<TLargeBlobHeader>(size);
- auto rank = GetLargeRank(rawSize);
- auto& arena = Arenas_[rank];
- YTALLOC_PARANOID_ASSERT(blob->BytesAcquired <= arena.SegmentSize);
- auto tag = blob->Tag;
- StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BlobsFreed, 1);
- StatisticsManager->IncrementLargeArenaCounter(state, rank, ELargeArenaCounter::BytesFreed, size);
- StatisticsManager->IncrementTotalCounter(state, tag, EBasicCounter::BytesFreed, size);
- if (!Dumpable) {
- StatisticsManager->IncrementUndumpableCounter(state, EUndumpableCounter::BytesFreed, size);
- }
- if (TryLockBlob(blob)) {
- MoveBlobToSpare(state, &arena, blob, true);
- } else {
- blob->State = ELargeBlobState::LockedFreed;
- arena.LockedFreedBlobs.Put(blob);
- }
- }
- private:
- TZoneAllocator ZoneAllocator_;
- std::array<TLargeArena, LargeRankCount> Arenas_;
- static constexpr size_t DisposedSegmentsBatchSize = 1024;
- TSystemPool<TDisposedSegment, DisposedSegmentsBatchSize> DisposedSegmentPool_;
- std::atomic<TBacktraceProvider> BacktraceProvider_ = nullptr;
- };
- TExplicitlyConstructableSingleton<TLargeBlobAllocator<true>> DumpableLargeBlobAllocator;
- TExplicitlyConstructableSingleton<TLargeBlobAllocator<false>> UndumpableLargeBlobAllocator;
- ////////////////////////////////////////////////////////////////////////////////
- // Huge blob allocator
- //
- // Basically a wrapper around TZoneAllocator.
- // The state value below acts as a signature to detect broken headers.
- enum class EHugeBlobState : ui64
- {
- Allocated = 0x636c6c6165677568ULL // hugeallc
- };
- // Every huge blob (tagged or not) is prepended with this header.
- struct THugeBlobHeader
- {
- THugeBlobHeader(TMemoryTag tag, size_t size, bool dumpable)
- : Tag(tag)
- , Size(size)
- , State(EHugeBlobState::Allocated)
- , Dumpable(dumpable)
- { }
- TMemoryTag Tag;
- size_t Size;
- EHugeBlobState State;
- bool Dumpable;
- char Padding[7];
- };
- CHECK_HEADER_ALIGNMENT(THugeBlobHeader)
- class THugeBlobAllocator
- {
- public:
- THugeBlobAllocator()
- : ZoneAllocator_(HugeZoneStart, HugeZoneEnd)
- { }
- void* Allocate(size_t size, bool dumpable)
- {
- YTALLOC_VERIFY(size <= MaxAllocationSize);
- auto tag = TThreadManager::GetCurrentMemoryTag();
- auto rawSize = GetRawBlobSize<THugeBlobHeader>(size);
- auto* blob = static_cast<THugeBlobHeader*>(ZoneAllocator_.Allocate(rawSize, MAP_POPULATE));
- if (!dumpable) {
- MappedMemoryManager->DontDump(blob, rawSize);
- }
- new (blob) THugeBlobHeader(tag, size, dumpable);
- StatisticsManager->IncrementTotalCounter(tag, EBasicCounter::BytesAllocated, size);
- StatisticsManager->IncrementHugeCounter(EHugeCounter::BlobsAllocated, 1);
- StatisticsManager->IncrementHugeCounter(EHugeCounter::BytesAllocated, size);
- if (!dumpable) {
- StatisticsManager->IncrementHugeUndumpableCounter(EUndumpableCounter::BytesAllocated, size);
- }
- auto* result = HeaderToPtr(blob);
- PoisonUninitializedRange(result, size);
- return result;
- }
- void Free(void* ptr)
- {
- auto* blob = PtrToHeader<THugeBlobHeader>(ptr);
- AssertBlobState(blob, EHugeBlobState::Allocated);
- auto tag = blob->Tag;
- auto size = blob->Size;
- auto dumpable = blob->Dumpable;
- PoisonFreedRange(ptr, size);
- auto rawSize = GetRawBlobSize<THugeBlobHeader>(size);
- ZoneAllocator_.Free(blob, rawSize);
- StatisticsManager->IncrementTotalCounter(tag, EBasicCounter::BytesFreed, size);
- StatisticsManager->IncrementHugeCounter(EHugeCounter::BlobsFreed, 1);
- StatisticsManager->IncrementHugeCounter(EHugeCounter::BytesFreed, size);
- if (!dumpable) {
- StatisticsManager->IncrementHugeUndumpableCounter(EUndumpableCounter::BytesFreed, size);
- }
- }
- static size_t GetAllocationSize(const void* ptr)
- {
- UnalignPtr<THugeBlobHeader>(ptr);
- const auto* blob = PtrToHeader<THugeBlobHeader>(ptr);
- return blob->Size;
- }
- static size_t GetAllocationSize(size_t size)
- {
- return GetBlobAllocationSize<THugeBlobHeader>(size);
- }
- private:
- TZoneAllocator ZoneAllocator_;
- };
- TExplicitlyConstructableSingleton<THugeBlobAllocator> HugeBlobAllocator;
- ////////////////////////////////////////////////////////////////////////////////
- // A thunk to large and huge blob allocators
- class TBlobAllocator
- {
- public:
- static void* Allocate(size_t size)
- {
- InitializeGlobals();
- bool dumpable = GetCurrentMemoryZone() != EMemoryZone::Undumpable;
- // NB: Account for the header. Also note that we may safely ignore the alignment since
- // HugeAllocationSizeThreshold is already page-aligned.
- if (Y_LIKELY(size < HugeAllocationSizeThreshold - sizeof(TLargeBlobHeader) - RightReadableAreaSize)) {
- void* result = dumpable
- ? DumpableLargeBlobAllocator->Allocate(size)
- : UndumpableLargeBlobAllocator->Allocate(size);
- YTALLOC_PARANOID_ASSERT(reinterpret_cast<uintptr_t>(result) >= LargeZoneStart(dumpable) && reinterpret_cast<uintptr_t>(result) < LargeZoneEnd(dumpable));
- return result;
- } else {
- auto* result = HugeBlobAllocator->Allocate(size, dumpable);
- YTALLOC_PARANOID_ASSERT(reinterpret_cast<uintptr_t>(result) >= HugeZoneStart && reinterpret_cast<uintptr_t>(result) < HugeZoneEnd);
- return result;
- }
- }
- static void Free(void* ptr)
- {
- InitializeGlobals();
- if (reinterpret_cast<uintptr_t>(ptr) < LargeZoneEnd(true)) {
- YTALLOC_PARANOID_ASSERT(reinterpret_cast<uintptr_t>(ptr) >= LargeZoneStart(true) && reinterpret_cast<uintptr_t>(ptr) < LargeZoneEnd(true));
- UnalignPtr<TLargeBlobHeader>(ptr);
- DumpableLargeBlobAllocator->Free(ptr);
- } else if (reinterpret_cast<uintptr_t>(ptr) < LargeZoneEnd(false)) {
- YTALLOC_PARANOID_ASSERT(reinterpret_cast<uintptr_t>(ptr) >= LargeZoneStart(false) && reinterpret_cast<uintptr_t>(ptr) < LargeZoneEnd(false));
- UnalignPtr<TLargeBlobHeader>(ptr);
- UndumpableLargeBlobAllocator->Free(ptr);
- } else if (reinterpret_cast<uintptr_t>(ptr) < HugeZoneEnd) {
- YTALLOC_PARANOID_ASSERT(reinterpret_cast<uintptr_t>(ptr) >= HugeZoneStart && reinterpret_cast<uintptr_t>(ptr) < HugeZoneEnd);
- UnalignPtr<THugeBlobHeader>(ptr);
- HugeBlobAllocator->Free(ptr);
- } else {
- YTALLOC_TRAP("Wrong ptr passed to Free");
- }
- }
- };
- ////////////////////////////////////////////////////////////////////////////////
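- // Set in the worker lambda of every background thread (see TBackgroundThreadBase::Start) so that
- // TBackgroundThread::SpawnIfNeeded never tries to spawn from a background thread itself.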
- Y_POD_THREAD(bool) CurrentThreadIsBackground;
- // Base class for all background threads.
- template <class T>
- class TBackgroundThreadBase
- {
- public:
- TBackgroundThreadBase()
- : State_(new TState())
- {
- NThreading::RegisterAtForkHandlers(
- [=] { BeforeFork(); },
- [=] { AfterForkParent(); },
- [=] { AfterForkChild(); });
- }
- virtual ~TBackgroundThreadBase()
- {
- Stop();
- }
- private:
- struct TState
- : public TSystemAllocatable
- {
- std::mutex StartStopMutex;
- std::optional<std::thread> Thread;
- std::mutex StopFlagMutex;
- std::condition_variable StopFlagVariable;
- std::chrono::system_clock::time_point LastInvocationTime;
- bool StopFlag = false;
- bool Paused = false;
- std::atomic<int> ForkDepth = 0;
- bool RestartAfterFork = false;
- };
- TState* State_;
- private:
- void BeforeFork()
- {
- bool stopped = Stop();
- if (State_->ForkDepth++ == 0) {
- State_->RestartAfterFork = stopped;
- }
- }
- void AfterForkParent()
- {
- if (--State_->ForkDepth == 0) {
- if (State_->RestartAfterFork) {
- Start(false);
- }
- }
- }
- void AfterForkChild()
- {
- bool restart = State_->RestartAfterFork;
- State_ = new TState();
- if (restart) {
- Start(false);
- }
- }
- virtual void ThreadMain() = 0;
- protected:
- void Start(bool fromAlloc)
- {
- std::unique_lock<std::mutex> guard(State_->StartStopMutex, std::defer_lock);
- if (fromAlloc) {
- if (!guard.try_lock()) {
- return;
- }
- if (State_->Paused) {
- return;
- }
- } else {
- guard.lock();
- }
- State_->Paused = false;
- if (State_->Thread) {
- return;
- }
- State_->StopFlag = false;
- State_->Thread.emplace([=] {
- CurrentThreadIsBackground = true;
- ThreadMain();
- });
- OnStart();
- }
- bool Stop()
- {
- std::unique_lock<std::mutex> guard(State_->StartStopMutex);
- State_->Paused = true;
- if (!State_->Thread) {
- return false;
- }
- std::unique_lock<std::mutex> flagGuard(State_->StopFlagMutex);
- State_->StopFlag = true;
- flagGuard.unlock();
- State_->StopFlagVariable.notify_one();
- State_->Thread->join();
- State_->Thread.reset();
- OnStop();
- return true;
- }
- bool IsDone(TDuration interval)
- {
- std::unique_lock<std::mutex> flagGuard(State_->StopFlagMutex);
- auto result = State_->StopFlagVariable.wait_until(
- flagGuard,
- State_->LastInvocationTime + std::chrono::microseconds(interval.MicroSeconds()),
- [&] { return State_->StopFlag; });
- State_->LastInvocationTime = std::chrono::system_clock::now();
- return result;
- }
- virtual void OnStart()
- { }
- virtual void OnStop()
- { }
- };
- ////////////////////////////////////////////////////////////////////////////////
- // Invokes madvise(MADV_STOCKPILE) periodically.
- class TStockpileThread
- : public TBackgroundThreadBase<TStockpileThread>
- {
- public:
- explicit TStockpileThread(int index)
- : Index_(index)
- {
- Start(false);
- }
- private:
- const int Index_;
- virtual void ThreadMain() override
- {
- TThread::SetCurrentThreadName(Sprintf("%s:%d", StockpileThreadName, Index_).c_str());
- while (!IsDone(ConfigurationManager->GetStockpileInterval())) {
- if (!MappedMemoryManager->Stockpile(ConfigurationManager->GetStockpileSize())) {
- // No point in proceeding.
- YTALLOC_LOG_INFO("Stockpile call failed; terminating stockpile thread");
- break;
- }
- }
- }
- };
- // Manages a bunch of TStockpileThreads.
- class TStockpileManager
- {
- public:
- void SpawnIfNeeded()
- {
- if (!ConfigurationManager->IsStockpileEnabled()) {
- return;
- }
- int threadCount = ConfigurationManager->GetStockpileThreadCount();
- while (static_cast<int>(Threads_.size()) > threadCount) {
- Threads_.pop_back();
- }
- while (static_cast<int>(Threads_.size()) < threadCount) {
- Threads_.push_back(std::make_unique<TStockpileThread>(static_cast<int>(Threads_.size())));
- }
- }
- private:
- std::vector<std::unique_ptr<TStockpileThread>> Threads_;
- };
- TExplicitlyConstructableSingleton<TStockpileManager> StockpileManager;
- ////////////////////////////////////////////////////////////////////////////////
- // Time to wait before re-spawning the thread after a fork.
- static constexpr auto BackgroundThreadRespawnDelay = TDuration::Seconds(3);
- // Runs basic background activities: reclaim, logging, profiling etc.
- class TBackgroundThread
- : public TBackgroundThreadBase<TBackgroundThread>
- {
- public:
- bool IsStarted()
- {
- return Started_.load();
- }
- void SpawnIfNeeded()
- {
- if (CurrentThreadIsBackground) {
- return;
- }
- Start(true);
- }
- private:
- std::atomic<bool> Started_ = false;
- private:
- virtual void ThreadMain() override
- {
- TThread::SetCurrentThreadName(BackgroundThreadName);
- TimingManager->DisableForCurrentThread();
- MmapObservationManager->DisableForCurrentThread();
- while (!IsDone(BackgroundInterval)) {
- DumpableLargeBlobAllocator->RunBackgroundTasks();
- UndumpableLargeBlobAllocator->RunBackgroundTasks();
- MappedMemoryManager->RunBackgroundTasks();
- TimingManager->RunBackgroundTasks();
- MmapObservationManager->RunBackgroundTasks();
- StockpileManager->SpawnIfNeeded();
- }
- }
- virtual void OnStart() override
- {
- DoUpdateAllThreadsControlWord(true);
- }
- virtual void OnStop() override
- {
- DoUpdateAllThreadsControlWord(false);
- }
- void DoUpdateAllThreadsControlWord(bool started)
- {
- // Update threads' TLS.
- ThreadManager->EnumerateThreadStatesSync(
- [&] {
- Started_.store(started);
- },
- [&] (auto* state) {
- if (state->BackgroundThreadStarted) {
- *state->BackgroundThreadStarted = started;
- }
- });
- }
- };
- TExplicitlyConstructableSingleton<TBackgroundThread> BackgroundThread;
- ////////////////////////////////////////////////////////////////////////////////
- Y_FORCE_INLINE TThreadState* TThreadManager::GetThreadStateUnchecked()
- {
- YTALLOC_PARANOID_ASSERT(ThreadState_);
- return ThreadState_;
- }
- Y_FORCE_INLINE TThreadState* TThreadManager::FindThreadState()
- {
- if (Y_LIKELY(ThreadState_)) {
- return ThreadState_;
- }
- if (ThreadStateDestroyed_) {
- return nullptr;
- }
- InitializeGlobals();
- // InitializeGlobals must not allocate.
- Y_ABORT_UNLESS(!ThreadState_);
- ThreadState_ = ThreadManager->AllocateThreadState();
- (&ThreadControlWord_)->Parts.ThreadStateValid = true;
- return ThreadState_;
- }
- void TThreadManager::DestroyThread(void*)
- {
- TSmallAllocator::PurgeCaches();
- TThreadState* state = ThreadState_;
- ThreadState_ = nullptr;
- ThreadStateDestroyed_ = true;
- (&ThreadControlWord_)->Parts.ThreadStateValid = false;
- {
- auto guard = GuardWithTiming(ThreadManager->ThreadRegistryLock_);
- state->AllocationProfilingEnabled = nullptr;
- state->BackgroundThreadStarted = nullptr;
- ThreadManager->UnrefThreadState(state);
- }
- }
- void TThreadManager::DestroyThreadState(TThreadState* state)
- {
- StatisticsManager->AccumulateLocalCounters(state);
- ThreadRegistry_.Remove(state);
- ThreadStatePool_.Free(state);
- }
- void TThreadManager::AfterFork()
- {
- auto guard = GuardWithTiming(ThreadRegistryLock_);
- ThreadRegistry_.Clear();
- TThreadState* state = ThreadState_;
- if (state) {
- ThreadRegistry_.PushBack(state);
- }
- }
- TThreadState* TThreadManager::AllocateThreadState()
- {
- auto* state = ThreadStatePool_.Allocate();
- state->AllocationProfilingEnabled = &(*&ThreadControlWord_).Parts.AllocationProfilingEnabled;
- state->BackgroundThreadStarted = &(*&ThreadControlWord_).Parts.BackgroundThreadStarted;
- {
- auto guard = GuardWithTiming(ThreadRegistryLock_);
- // NB: These flags must be initialized under ThreadRegistryLock_; see EnumerateThreadStatesSync.
- *state->AllocationProfilingEnabled = ConfigurationManager->IsAllocationProfilingEnabled();
- *state->BackgroundThreadStarted = BackgroundThread->IsStarted();
- ThreadRegistry_.PushBack(state);
- }
- // Need to pass some non-null value for DestroyThread to be called.
- pthread_setspecific(ThreadDtorKey_, (void*)-1);
- return state;
- }
- ////////////////////////////////////////////////////////////////////////////////
- void InitializeGlobals()
- {
- static std::once_flag Initialized;
- std::call_once(Initialized, [] () {
- LogManager.Construct();
- BacktraceManager.Construct();
- StatisticsManager.Construct();
- MappedMemoryManager.Construct();
- ThreadManager.Construct();
- GlobalState.Construct();
- DumpableLargeBlobAllocator.Construct();
- UndumpableLargeBlobAllocator.Construct();
- HugeBlobAllocator.Construct();
- ConfigurationManager.Construct();
- SystemAllocator.Construct();
- TimingManager.Construct();
- MmapObservationManager.Construct();
- StockpileManager.Construct();
- BackgroundThread.Construct();
- SmallArenaAllocators.Construct();
- auto constructSmallArenaAllocators = [&] (EAllocationKind kind, uintptr_t zonesStart) {
- for (size_t rank = 1; rank < SmallRankCount; ++rank) {
- (*SmallArenaAllocators)[kind][rank].Construct(kind, rank, zonesStart + rank * SmallZoneSize);
- }
- };
- constructSmallArenaAllocators(EAllocationKind::Untagged, UntaggedSmallZonesStart);
- constructSmallArenaAllocators(EAllocationKind::Tagged, TaggedSmallZonesStart);
- GlobalSmallChunkCaches.Construct();
- (*GlobalSmallChunkCaches)[EAllocationKind::Tagged].Construct(EAllocationKind::Tagged);
- (*GlobalSmallChunkCaches)[EAllocationKind::Untagged].Construct(EAllocationKind::Untagged);
- });
- }
- ////////////////////////////////////////////////////////////////////////////////
- void StartBackgroundThread()
- {
- InitializeGlobals();
- BackgroundThread->SpawnIfNeeded();
- }
- ////////////////////////////////////////////////////////////////////////////////
- template <class... Ts>
- Y_FORCE_INLINE void* AllocateSmallUntagged(size_t rank, Ts... args)
- {
- auto* result = TSmallAllocator::Allocate<EAllocationKind::Untagged>(NullMemoryTag, rank, std::forward<Ts>(args)...);
- YTALLOC_PARANOID_ASSERT(reinterpret_cast<uintptr_t>(result) >= MinUntaggedSmallPtr && reinterpret_cast<uintptr_t>(result) < MaxUntaggedSmallPtr);
- return result;
- }
- template <class... Ts>
- Y_FORCE_INLINE void* AllocateSmallTagged(ui64 controlWord, size_t rank, Ts... args)
- {
- auto tag = Y_UNLIKELY((controlWord & TThreadManager::AllocationProfilingEnabledControlWordMask) && ConfigurationManager->IsSmallArenaAllocationProfiled(rank))
- ? BacktraceManager->GetMemoryTagFromBacktrace(2)
- : static_cast<TMemoryTag>(controlWord & TThreadManager::MemoryTagControlWordMask);
- auto* result = TSmallAllocator::Allocate<EAllocationKind::Tagged>(tag, rank, std::forward<Ts>(args)...);
- YTALLOC_PARANOID_ASSERT(reinterpret_cast<uintptr_t>(result) >= MinTaggedSmallPtr && reinterpret_cast<uintptr_t>(result) < MaxTaggedSmallPtr);
- return result;
- }
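- // Size-to-rank translation in AllocateInline below relies on two lookup tables: SizeToSmallRank1
- // covers sizes up to 512 bytes at 8-byte granularity, and SizeToSmallRank2 covers the remaining
- // small sizes at 256-byte granularity.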
- Y_FORCE_INLINE void* AllocateInline(size_t size)
- {
- size_t rank;
- if (Y_LIKELY(size <= 512)) {
- rank = SizeToSmallRank1[(size + 7) >> 3];
- } else if (Y_LIKELY(size < LargeAllocationSizeThreshold)) {
- rank = SizeToSmallRank2[(size - 1) >> 8];
- } else {
- StartBackgroundThread();
- return TBlobAllocator::Allocate(size);
- }
- auto controlWord = TThreadManager::GetThreadControlWord();
- if (Y_LIKELY(controlWord == TThreadManager::FastPathControlWord)) {
- return AllocateSmallUntagged(rank, TThreadManager::GetThreadStateUnchecked());
- }
- if (Y_UNLIKELY(!(controlWord & TThreadManager::BackgroundThreadStartedControlWorkMask))) {
- StartBackgroundThread();
- }
- if (!(controlWord & (TThreadManager::MemoryTagControlWordMask | TThreadManager::AllocationProfilingEnabledControlWordMask))) {
- return AllocateSmallUntagged(rank);
- } else {
- return AllocateSmallTagged(controlWord, rank);
- }
- }
- Y_FORCE_INLINE void* AllocateSmallInline(size_t rank)
- {
- auto controlWord = TThreadManager::GetThreadControlWord();
- if (Y_LIKELY(controlWord == TThreadManager::FastPathControlWord)) {
- return AllocateSmallUntagged(rank, TThreadManager::GetThreadStateUnchecked());
- }
- if (!(controlWord & (TThreadManager::MemoryTagControlWordMask | TThreadManager::AllocationProfilingEnabledControlWordMask))) {
- return AllocateSmallUntagged(rank);
- } else {
- return AllocateSmallTagged(controlWord, rank);
- }
- }
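- // For sizes handled by the blob allocators, page alignment is achieved by over-allocating one
- // extra page and aligning the result up; UnalignPtr on the Free path recovers the original header
- // position (see the large blob allocator comment).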
- Y_FORCE_INLINE void* AllocatePageAlignedInline(size_t size)
- {
- size = std::max(AlignUp(size, PageSize), PageSize);
- void* result = size >= LargeAllocationSizeThreshold
- ? AlignUp(TBlobAllocator::Allocate(size + PageSize), PageSize)
- : Allocate(size);
- YTALLOC_ASSERT(reinterpret_cast<uintptr_t>(result) % PageSize == 0);
- return result;
- }
- Y_FORCE_INLINE void FreeNonNullInline(void* ptr)
- {
- YTALLOC_ASSERT(ptr);
- if (Y_LIKELY(reinterpret_cast<uintptr_t>(ptr) < UntaggedSmallZonesEnd)) {
- YTALLOC_PARANOID_ASSERT(reinterpret_cast<uintptr_t>(ptr) >= MinUntaggedSmallPtr && reinterpret_cast<uintptr_t>(ptr) < MaxUntaggedSmallPtr);
- TSmallAllocator::Free<EAllocationKind::Untagged>(ptr);
- } else if (Y_LIKELY(reinterpret_cast<uintptr_t>(ptr) < TaggedSmallZonesEnd)) {
- YTALLOC_PARANOID_ASSERT(reinterpret_cast<uintptr_t>(ptr) >= MinTaggedSmallPtr && reinterpret_cast<uintptr_t>(ptr) < MaxTaggedSmallPtr);
- TSmallAllocator::Free<EAllocationKind::Tagged>(ptr);
- } else {
- TBlobAllocator::Free(ptr);
- }
- }
- Y_FORCE_INLINE void FreeInline(void* ptr)
- {
- if (Y_LIKELY(ptr)) {
- FreeNonNullInline(ptr);
- }
- }
- Y_FORCE_INLINE size_t GetAllocationSizeInline(const void* ptr)
- {
- if (Y_UNLIKELY(!ptr)) {
- return 0;
- }
- auto uintptr = reinterpret_cast<uintptr_t>(ptr);
- if (uintptr < UntaggedSmallZonesEnd) {
- YTALLOC_PARANOID_ASSERT(uintptr >= MinUntaggedSmallPtr && uintptr < MaxUntaggedSmallPtr);
- return TSmallAllocator::GetAllocationSize(ptr);
- } else if (uintptr < TaggedSmallZonesEnd) {
- YTALLOC_PARANOID_ASSERT(uintptr >= MinTaggedSmallPtr && uintptr < MaxTaggedSmallPtr);
- return TSmallAllocator::GetAllocationSize(ptr);
- } else if (uintptr < LargeZoneEnd(true)) {
- YTALLOC_PARANOID_ASSERT(uintptr >= LargeZoneStart(true) && uintptr < LargeZoneEnd(true));
- return TLargeBlobAllocator<true>::GetAllocationSize(ptr);
- } else if (uintptr < LargeZoneEnd(false)) {
- YTALLOC_PARANOID_ASSERT(uintptr >= LargeZoneStart(false) && uintptr < LargeZoneEnd(false));
- return TLargeBlobAllocator<false>::GetAllocationSize(ptr);
- } else if (uintptr < HugeZoneEnd) {
- YTALLOC_PARANOID_ASSERT(uintptr >= HugeZoneStart && uintptr < HugeZoneEnd);
- return THugeBlobAllocator::GetAllocationSize(ptr);
- } else {
- YTALLOC_TRAP("Wrong ptr passed to GetAllocationSizeInline");
- }
- }
- Y_FORCE_INLINE size_t GetAllocationSizeInline(size_t size)
- {
- if (size <= LargeAllocationSizeThreshold) {
- return TSmallAllocator::GetAllocationSize(size);
- } else if (size <= HugeAllocationSizeThreshold) {
- return TLargeBlobAllocator<true>::GetAllocationSize(size);
- } else {
- return THugeBlobAllocator::GetAllocationSize(size);
- }
- }
- void EnableLogging(TLogHandler logHandler)
- {
- InitializeGlobals();
- LogManager->EnableLogging(logHandler);
- }
- void SetBacktraceProvider(TBacktraceProvider provider)
- {
- InitializeGlobals();
- BacktraceManager->SetBacktraceProvider(provider);
- DumpableLargeBlobAllocator->SetBacktraceProvider(provider);
- UndumpableLargeBlobAllocator->SetBacktraceProvider(provider);
- }
- void SetBacktraceFormatter(TBacktraceFormatter provider)
- {
- InitializeGlobals();
- MmapObservationManager->SetBacktraceFormatter(provider);
- }
- void EnableStockpile()
- {
- InitializeGlobals();
- ConfigurationManager->EnableStockpile();
- }
- void SetStockpileInterval(TDuration value)
- {
- InitializeGlobals();
- ConfigurationManager->SetStockpileInterval(value);
- }
- void SetStockpileThreadCount(int value)
- {
- InitializeGlobals();
- ConfigurationManager->SetStockpileThreadCount(value);
- }
- void SetStockpileSize(size_t value)
- {
- InitializeGlobals();
- ConfigurationManager->SetStockpileSize(value);
- }
- void SetLargeUnreclaimableCoeff(double value)
- {
- InitializeGlobals();
- ConfigurationManager->SetLargeUnreclaimableCoeff(value);
- }
- void SetTimingEventThreshold(TDuration value)
- {
- InitializeGlobals();
- ConfigurationManager->SetTimingEventThreshold(value);
- }
- void SetMinLargeUnreclaimableBytes(size_t value)
- {
- InitializeGlobals();
- ConfigurationManager->SetMinLargeUnreclaimableBytes(value);
- }
- void SetMaxLargeUnreclaimableBytes(size_t value)
- {
- InitializeGlobals();
- ConfigurationManager->SetMaxLargeUnreclaimableBytes(value);
- }
- void SetAllocationProfilingEnabled(bool value)
- {
- ConfigurationManager->SetAllocationProfilingEnabled(value);
- }
- void SetAllocationProfilingSamplingRate(double rate)
- {
- ConfigurationManager->SetAllocationProfilingSamplingRate(rate);
- }
- void SetSmallArenaAllocationProfilingEnabled(size_t rank, bool value)
- {
- ConfigurationManager->SetSmallArenaAllocationProfilingEnabled(rank, value);
- }
- void SetLargeArenaAllocationProfilingEnabled(size_t rank, bool value)
- {
- ConfigurationManager->SetLargeArenaAllocationProfilingEnabled(rank, value);
- }
- void SetProfilingBacktraceDepth(int depth)
- {
- ConfigurationManager->SetProfilingBacktraceDepth(depth);
- }
- void SetMinProfilingBytesUsedToReport(size_t size)
- {
- ConfigurationManager->SetMinProfilingBytesUsedToReport(size);
- }
- void SetEnableEagerMemoryRelease(bool value)
- {
- ConfigurationManager->SetEnableEagerMemoryRelease(value);
- }
- void SetEnableMadvisePopulate(bool value)
- {
- ConfigurationManager->SetEnableMadvisePopulate(value);
- }
- TEnumIndexedArray<ETotalCounter, ssize_t> GetTotalAllocationCounters()
- {
- InitializeGlobals();
- return StatisticsManager->GetTotalAllocationCounters();
- }
- TEnumIndexedArray<ESystemCounter, ssize_t> GetSystemAllocationCounters()
- {
- InitializeGlobals();
- return StatisticsManager->GetSystemAllocationCounters();
- }
- TEnumIndexedArray<ESystemCounter, ssize_t> GetUndumpableAllocationCounters()
- {
- InitializeGlobals();
- return StatisticsManager->GetUndumpableAllocationCounters();
- }
- TEnumIndexedArray<ESmallCounter, ssize_t> GetSmallAllocationCounters()
- {
- InitializeGlobals();
- return StatisticsManager->GetSmallAllocationCounters();
- }
- TEnumIndexedArray<ESmallCounter, ssize_t> GetLargeAllocationCounters()
- {
- InitializeGlobals();
- return StatisticsManager->GetLargeAllocationCounters();
- }
- std::array<TEnumIndexedArray<ESmallArenaCounter, ssize_t>, SmallRankCount> GetSmallArenaAllocationCounters()
- {
- InitializeGlobals();
- return StatisticsManager->GetSmallArenaAllocationCounters();
- }
- std::array<TEnumIndexedArray<ELargeArenaCounter, ssize_t>, LargeRankCount> GetLargeArenaAllocationCounters()
- {
- InitializeGlobals();
- return StatisticsManager->GetLargeArenaAllocationCounters();
- }
- TEnumIndexedArray<EHugeCounter, ssize_t> GetHugeAllocationCounters()
- {
- InitializeGlobals();
- return StatisticsManager->GetHugeAllocationCounters();
- }
- std::vector<TProfiledAllocation> GetProfiledAllocationStatistics()
- {
- InitializeGlobals();
- if (!ConfigurationManager->IsAllocationProfilingEnabled()) {
- return {};
- }
- std::vector<TMemoryTag> tags;
- tags.reserve(MaxCapturedAllocationBacktraces + 1);
- for (TMemoryTag tag = AllocationProfilingMemoryTagBase;
- tag < AllocationProfilingMemoryTagBase + MaxCapturedAllocationBacktraces;
- ++tag)
- {
- tags.push_back(tag);
- }
- tags.push_back(AllocationProfilingUnknownMemoryTag);
- std::vector<TEnumIndexedArray<EBasicCounter, ssize_t>> counters;
- counters.resize(tags.size());
- StatisticsManager->GetTaggedMemoryCounters(tags.data(), tags.size(), counters.data());
- std::vector<TProfiledAllocation> statistics;
- for (size_t index = 0; index < tags.size(); ++index) {
- if (counters[index][EBasicCounter::BytesUsed] < static_cast<ssize_t>(ConfigurationManager->GetMinProfilingBytesUsedToReport())) {
- continue;
- }
- auto tag = tags[index];
- auto optionalBacktrace = BacktraceManager->FindBacktrace(tag);
- if (!optionalBacktrace && tag != AllocationProfilingUnknownMemoryTag) {
- continue;
- }
- statistics.push_back(TProfiledAllocation{
- optionalBacktrace.value_or(TBacktrace()),
- counters[index]
- });
- }
- return statistics;
- }
- TEnumIndexedArray<ETimingEventType, TTimingEventCounters> GetTimingEventCounters()
- {
- InitializeGlobals();
- return TimingManager->GetTimingEventCounters();
- }
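- // A minimal usage sketch of the knobs exposed above, as seen from client code (the values are
- // illustrative only, not recommended defaults):
- //
- //   NYT::NYTAlloc::EnableStockpile();
- //   NYT::NYTAlloc::SetStockpileInterval(TDuration::Seconds(1));
- //   NYT::NYTAlloc::SetLargeUnreclaimableCoeff(0.05);
- //   NYT::NYTAlloc::SetMinLargeUnreclaimableBytes(256_MB);
- //   NYT::NYTAlloc::SetMaxLargeUnreclaimableBytes(1024_MB);
- //   auto totalCounters = NYT::NYTAlloc::GetTotalAllocationCounters();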
- ////////////////////////////////////////////////////////////////////////////////
- } // namespace NYT::NYTAlloc