- /*
- * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
- */
- //===----------------------------------------------------------------------===//
- //
- // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- // See https://llvm.org/LICENSE.txt for license information.
- // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- //
- //===----------------------------------------------------------------------===//
- /* Dynamic scheduling initialization and dispatch.
- *
- * NOTE: __kmp_nth is constant inside any dispatch loop, but it may change
- * between parallel regions. __kmp_max_nth is the largest value __kmp_nth
- * may take; 1 is the smallest.
- */
- #include "kmp.h"
- #include "kmp_error.h"
- #include "kmp_i18n.h"
- #include "kmp_itt.h"
- #include "kmp_stats.h"
- #include "kmp_str.h"
- #if KMP_USE_X87CONTROL
- #include <float.h>
- #endif
- #include "kmp_lock.h"
- #include "kmp_dispatch.h"
- #if KMP_USE_HIER_SCHED
- #include "kmp_dispatch_hier.h"
- #endif
- #if OMPT_SUPPORT
- #include "ompt-specific.h"
- #endif
- /* ------------------------------------------------------------------------ */
- /* ------------------------------------------------------------------------ */
- void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
- kmp_info_t *th;
- KMP_DEBUG_ASSERT(gtid_ref);
- if (__kmp_env_consistency_check) {
- th = __kmp_threads[*gtid_ref];
- if (th->th.th_root->r.r_active &&
- (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
- #if KMP_USE_DYNAMIC_LOCK
- __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
- #else
- __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
- #endif
- }
- }
- }
- void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
- kmp_info_t *th;
- if (__kmp_env_consistency_check) {
- th = __kmp_threads[*gtid_ref];
- if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
- __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
- }
- }
- }
- // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
- static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
- bool use_hier = false) {
- // Pick up the nonmonotonic/monotonic bits from the scheduling type
- // Nonmonotonic as default for dynamic schedule when no modifier is specified
- int monotonicity = SCHEDULE_NONMONOTONIC;
- // Let the default be monotonic for executables compiled by
- // OpenMP* 4.5 (or earlier) compilers
- if (loc != NULL && loc->get_openmp_version() < 50)
- monotonicity = SCHEDULE_MONOTONIC;
- if (use_hier || __kmp_force_monotonic)
- monotonicity = SCHEDULE_MONOTONIC;
- else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
- monotonicity = SCHEDULE_NONMONOTONIC;
- else if (SCHEDULE_HAS_MONOTONIC(schedule))
- monotonicity = SCHEDULE_MONOTONIC;
- return monotonicity;
- }
- #if KMP_STATIC_STEAL_ENABLED
- enum { // values for steal_flag (possible states of private per-loop buffer)
- UNUSED = 0,
- CLAIMED = 1, // owner thread started initialization
- READY = 2, // available for stealing
- THIEF = 3 // finished by owner, or claimed by thief
- // possible state changes:
- // 0 -> 1 owner only, sync
- // 0 -> 3 thief only, sync
- // 1 -> 2 owner only, async
- // 2 -> 3 owner only, async
- // 3 -> 2 owner only, async
- // 3 -> 0 last thread finishing the loop, async
- };
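- // Typical lifecycle, as implied by the transitions above and the steal code
- // below: the owner CASes UNUSED->CLAIMED, fills in its own range of chunks,
- // then publishes READY so other threads may steal from it. Once the owner
- // runs out of chunks and starts stealing it marks its buffer THIEF, and it
- // may re-publish READY if a successful steal refills the buffer. A thief may
- // also claim a still-UNUSED victim buffer wholesale (0 -> 3). The last thread
- // to finish the loop resets the flag to UNUSED for reuse.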
- #endif
- // Initialize a dispatch_private_info_template<T> buffer for a particular
- // type of schedule and chunk. The loop description is found in lb (lower bound),
- // ub (upper bound), and st (stride). nproc is the number of threads relevant
- // to the scheduling (often the number of threads in a team, but not always if
- // hierarchical scheduling is used). tid is the id of the thread calling
- // the function within the group of nproc threads. It will have a value
- // between 0 and nproc - 1. This is often just the thread id within a team, but
- // is not necessarily the case when using hierarchical scheduling.
- // loc is the source file location of the corresponding loop
- // gtid is the global thread id
- template <typename T>
- void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
- dispatch_private_info_template<T> *pr,
- enum sched_type schedule, T lb, T ub,
- typename traits_t<T>::signed_t st,
- #if USE_ITT_BUILD
- kmp_uint64 *cur_chunk,
- #endif
- typename traits_t<T>::signed_t chunk,
- T nproc, T tid) {
- typedef typename traits_t<T>::unsigned_t UT;
- typedef typename traits_t<T>::floating_t DBL;
- int active;
- T tc;
- kmp_info_t *th;
- kmp_team_t *team;
- int monotonicity;
- bool use_hier;
- #ifdef KMP_DEBUG
- typedef typename traits_t<T>::signed_t ST;
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
- "pr:%%p lb:%%%s ub:%%%s st:%%%s "
- "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
- traits_t<T>::spec, traits_t<T>::spec,
- traits_t<ST>::spec, traits_t<ST>::spec,
- traits_t<T>::spec, traits_t<T>::spec);
- KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
- __kmp_str_free(&buff);
- }
- #endif
- /* setup data */
- th = __kmp_threads[gtid];
- team = th->th.th_team;
- active = !team->t.t_serialized;
- #if USE_ITT_BUILD
- int itt_need_metadata_reporting =
- __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
- KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
- team->t.t_active_level == 1;
- #endif
- #if KMP_USE_HIER_SCHED
- use_hier = pr->flags.use_hier;
- #else
- use_hier = false;
- #endif
- /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
- monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
- schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
- /* Pick up the nomerge/ordered bits from the scheduling type */
- if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
- pr->flags.nomerge = TRUE;
- schedule =
- (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
- } else {
- pr->flags.nomerge = FALSE;
- }
- pr->type_size = traits_t<T>::type_size; // remember the size of variables
- if (kmp_ord_lower & schedule) {
- pr->flags.ordered = TRUE;
- schedule =
- (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
- } else {
- pr->flags.ordered = FALSE;
- }
- // Ordered overrides nonmonotonic
- if (pr->flags.ordered) {
- monotonicity = SCHEDULE_MONOTONIC;
- }
- if (schedule == kmp_sch_static) {
- schedule = __kmp_static;
- } else {
- if (schedule == kmp_sch_runtime) {
- // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
- // not specified)
- schedule = team->t.t_sched.r_sched_type;
- monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
- schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
- if (pr->flags.ordered) // correct monotonicity for ordered loop if needed
- monotonicity = SCHEDULE_MONOTONIC;
- // Detail the schedule if needed (global controls are differentiated
- // appropriately)
- if (schedule == kmp_sch_guided_chunked) {
- schedule = __kmp_guided;
- } else if (schedule == kmp_sch_static) {
- schedule = __kmp_static;
- }
- // Use the chunk size specified by OMP_SCHEDULE (or default if not
- // specified)
- chunk = team->t.t_sched.chunk;
- #if USE_ITT_BUILD
- if (cur_chunk)
- *cur_chunk = chunk;
- #endif
- #ifdef KMP_DEBUG
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
- "schedule:%%d chunk:%%%s\n",
- traits_t<ST>::spec);
- KD_TRACE(10, (buff, gtid, schedule, chunk));
- __kmp_str_free(&buff);
- }
- #endif
- } else {
- if (schedule == kmp_sch_guided_chunked) {
- schedule = __kmp_guided;
- }
- if (chunk <= 0) {
- chunk = KMP_DEFAULT_CHUNK;
- }
- }
- if (schedule == kmp_sch_auto) {
- // mapping and differentiation: in the __kmp_do_serial_initialize()
- schedule = __kmp_auto;
- #ifdef KMP_DEBUG
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format(
- "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
- "schedule:%%d chunk:%%%s\n",
- traits_t<ST>::spec);
- KD_TRACE(10, (buff, gtid, schedule, chunk));
- __kmp_str_free(&buff);
- }
- #endif
- }
- #if KMP_STATIC_STEAL_ENABLED
- // map nonmonotonic:dynamic to static steal
- if (schedule == kmp_sch_dynamic_chunked) {
- if (monotonicity == SCHEDULE_NONMONOTONIC)
- schedule = kmp_sch_static_steal;
- }
- #endif
- /* guided analytical not safe for too many threads */
- if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
- schedule = kmp_sch_guided_iterative_chunked;
- KMP_WARNING(DispatchManyThreads);
- }
- if (schedule == kmp_sch_runtime_simd) {
- // compiler provides simd_width in the chunk parameter
- schedule = team->t.t_sched.r_sched_type;
- monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
- schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
- // Detail the schedule if needed (global controls are differentiated
- // appropriately)
- if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
- schedule == __kmp_static) {
- schedule = kmp_sch_static_balanced_chunked;
- } else {
- if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
- schedule = kmp_sch_guided_simd;
- }
- chunk = team->t.t_sched.chunk * chunk;
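- // e.g., with OMP_SCHEDULE specifying a chunk of 8 and the compiler passing a
- // simd width of 4 in the chunk argument, the effective chunk becomes 32.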
- }
- #if USE_ITT_BUILD
- if (cur_chunk)
- *cur_chunk = chunk;
- #endif
- #ifdef KMP_DEBUG
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format(
- "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
- " chunk:%%%s\n",
- traits_t<ST>::spec);
- KD_TRACE(10, (buff, gtid, schedule, chunk));
- __kmp_str_free(&buff);
- }
- #endif
- }
- pr->u.p.parm1 = chunk;
- }
- KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
- "unknown scheduling type");
- pr->u.p.count = 0;
- if (__kmp_env_consistency_check) {
- if (st == 0) {
- __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
- (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
- }
- }
- // compute trip count
- if (st == 1) { // most common case
- if (ub >= lb) {
- tc = ub - lb + 1;
- } else { // ub < lb
- tc = 0; // zero-trip
- }
- } else if (st < 0) {
- if (lb >= ub) {
- // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
- // where the division needs to be unsigned regardless of the result type
- tc = (UT)(lb - ub) / (-st) + 1;
- } else { // lb < ub
- tc = 0; // zero-trip
- }
- } else { // st > 0
- if (ub >= lb) {
- // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
- // where the division needs to be unsigned regardless of the result type
- tc = (UT)(ub - lb) / st + 1;
- } else { // ub < lb
- tc = 0; // zero-trip
- }
- }
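- // Worked examples of the trip count formula:
- //   lb=0,  ub=9,  st=1  -> tc = 10
- //   lb=1,  ub=10, st=2  -> tc = (10 - 1) / 2 + 1 = 5 (iterations 1,3,5,7,9)
- //   lb=10, ub=1,  st=-3 -> tc = (10 - 1) / 3 + 1 = 4 (iterations 10,7,4,1)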
- #if KMP_STATS_ENABLED
- if (KMP_MASTER_GTID(gtid)) {
- KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
- }
- #endif
- pr->u.p.lb = lb;
- pr->u.p.ub = ub;
- pr->u.p.st = st;
- pr->u.p.tc = tc;
- #if KMP_OS_WINDOWS
- pr->u.p.last_upper = ub + st;
- #endif /* KMP_OS_WINDOWS */
- /* NOTE: only the active parallel region(s) has active ordered sections */
- if (active) {
- if (pr->flags.ordered) {
- pr->ordered_bumped = 0;
- pr->u.p.ordered_lower = 1;
- pr->u.p.ordered_upper = 0;
- }
- }
- switch (schedule) {
- #if KMP_STATIC_STEAL_ENABLED
- case kmp_sch_static_steal: {
- T ntc, init;
- KD_TRACE(100,
- ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
- gtid));
- ntc = (tc % chunk ? 1 : 0) + tc / chunk;
- if (nproc > 1 && ntc >= nproc) {
- KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
- T id = tid;
- T small_chunk, extras;
- kmp_uint32 old = UNUSED;
- int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
- if (traits_t<T>::type_size > 4) {
- // AC: TODO: check if 16-byte CAS available and use it to
- // improve performance (probably wait for explicit request
- // before spending time on this).
- // For now use dynamically allocated per-private-buffer lock,
- // free memory in __kmp_dispatch_next when status==0.
- pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
- __kmp_init_lock(pr->u.p.steal_lock);
- }
- small_chunk = ntc / nproc;
- extras = ntc % nproc;
- init = id * small_chunk + (id < extras ? id : extras);
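- // For illustration: ntc=10 chunks and nproc=4 give small_chunk=2, extras=2,
- // so the initial chunk ranges are T0:[0,3), T1:[3,6), T2:[6,8), T3:[8,10)
- // (the first 'extras' threads get one extra chunk each).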
- pr->u.p.count = init;
- if (claimed) { // did we succeed in claiming our own buffer?
- pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
- // Other threads will inspect steal_flag when searching for a victim.
- // READY means other threads may steal from this thread from now on.
- KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
- } else {
- // another thread has stolen our whole range
- KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);
- pr->u.p.ub = init; // mark that there are no iterations to work on
- }
- pr->u.p.parm2 = ntc; // save number of chunks
- // parm3 is the number of times to attempt stealing, which is nproc
- // (just a heuristic; could be optimized later on).
- pr->u.p.parm3 = nproc;
- pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
- break;
- } else {
- /* too few chunks: switching to kmp_sch_dynamic_chunked */
- schedule = kmp_sch_dynamic_chunked;
- KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
- "kmp_sch_dynamic_chunked\n",
- gtid));
- goto dynamic_init;
- break;
- } // if
- } // case
- #endif
- case kmp_sch_static_balanced: {
- T init, limit;
- KD_TRACE(
- 100,
- ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
- gtid));
- if (nproc > 1) {
- T id = tid;
- if (tc < nproc) {
- if (id < tc) {
- init = id;
- limit = id;
- pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
- } else {
- pr->u.p.count = 1; /* means no more chunks to execute */
- pr->u.p.parm1 = FALSE;
- break;
- }
- } else {
- T small_chunk = tc / nproc;
- T extras = tc % nproc;
- init = id * small_chunk + (id < extras ? id : extras);
- limit = init + small_chunk - (id < extras ? 0 : 1);
- pr->u.p.parm1 = (id == nproc - 1);
- }
- } else {
- if (tc > 0) {
- init = 0;
- limit = tc - 1;
- pr->u.p.parm1 = TRUE;
- } else {
- // zero trip count
- pr->u.p.count = 1; /* means no more chunks to execute */
- pr->u.p.parm1 = FALSE;
- break;
- }
- }
- #if USE_ITT_BUILD
- // Calculate chunk for metadata report
- if (itt_need_metadata_reporting)
- if (cur_chunk)
- *cur_chunk = limit - init + 1;
- #endif
- if (st == 1) {
- pr->u.p.lb = lb + init;
- pr->u.p.ub = lb + limit;
- } else {
- // ub_tmp is the calculated upper bound; "ub" is the user-defined upper bound
- T ub_tmp = lb + limit * st;
- pr->u.p.lb = lb + init * st;
- // adjust upper bound to "ub" if needed, so that MS lastprivate will match
- // it exactly
- if (st > 0) {
- pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
- } else {
- pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
- }
- }
- if (pr->flags.ordered) {
- pr->u.p.ordered_lower = init;
- pr->u.p.ordered_upper = limit;
- }
- break;
- } // case
- case kmp_sch_static_balanced_chunked: {
- // similar to balanced, but chunk adjusted to multiple of simd width
- T nth = nproc;
- KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
- " -> falling-through to static_greedy\n",
- gtid));
- schedule = kmp_sch_static_greedy;
- if (nth > 1)
- pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
- else
- pr->u.p.parm1 = tc;
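- // e.g., tc=100, nth=3, chunk=8 (simd width, a power of two): ceil(100/3)=34,
- // rounded up to the next multiple of 8, gives parm1=40 iterations per thread.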
- break;
- } // case
- case kmp_sch_guided_simd:
- case kmp_sch_guided_iterative_chunked: {
- KD_TRACE(
- 100,
- ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
- " case\n",
- gtid));
- if (nproc > 1) {
- if ((2L * chunk + 1) * nproc >= tc) {
- /* chunk size too large, switch to dynamic */
- schedule = kmp_sch_dynamic_chunked;
- goto dynamic_init;
- } else {
- // when remaining iters become less than parm2 - switch to dynamic
- pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
- *(double *)&pr->u.p.parm3 =
- guided_flt_param / (double)nproc; // may occupy parm3 and parm4
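- // i.e., parm3/parm4 hold the per-grab factor (guided_flt_param / nproc) as a
- // double, and once fewer than parm2 iterations remain the schedule degrades
- // to plain dynamic chunks.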
- }
- } else {
- KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
- "kmp_sch_static_greedy\n",
- gtid));
- schedule = kmp_sch_static_greedy;
- /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
- KD_TRACE(
- 100,
- ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
- gtid));
- pr->u.p.parm1 = tc;
- } // if
- } // case
- break;
- case kmp_sch_guided_analytical_chunked: {
- KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
- "kmp_sch_guided_analytical_chunked case\n",
- gtid));
- if (nproc > 1) {
- if ((2L * chunk + 1) * nproc >= tc) {
- /* chunk size too large, switch to dynamic */
- schedule = kmp_sch_dynamic_chunked;
- goto dynamic_init;
- } else {
- /* commonly used term: (2 nproc - 1)/(2 nproc) */
- DBL x;
- #if KMP_USE_X87CONTROL
- /* Linux* OS already has 64-bit computation by default for long double,
- and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
- Windows* OS on IA-32 architecture, we need to set precision to 64-bit
- instead of the default 53-bit. Even though long double doesn't work
- on Windows* OS on Intel(R) 64, the resulting lack of precision is not
- expected to impact the correctness of the algorithm, but this has not
- been mathematically proven. */
- // save original FPCW and set precision to 64-bit, as
- // Windows* OS on IA-32 architecture defaults to 53-bit
- unsigned int oldFpcw = _control87(0, 0);
- _control87(_PC_64, _MCW_PC); // 0,0x30000
- #endif
- /* value used for comparison in solver for cross-over point */
- KMP_ASSERT(tc > 0);
- long double target = ((long double)chunk * 2 + 1) * nproc / tc;
- /* crossover point--chunk indexes equal to or greater than
- this point switch to dynamic-style scheduling */
- UT cross;
- /* commonly used term: (2 nproc - 1)/(2 nproc) */
- x = 1.0 - 0.5 / (double)nproc;
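- // e.g., nproc=8 gives x = 15/16 = 0.9375; the iterations remaining after k
- // guided chunks shrink roughly like x^k, which is what the solver below
- // inverts to find the crossover chunk index.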
- #ifdef KMP_DEBUG
- { // test natural alignment
- struct _test_a {
- char a;
- union {
- char b;
- DBL d;
- };
- } t;
- ptrdiff_t natural_alignment =
- (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
- //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
- // long)natural_alignment );
- KMP_DEBUG_ASSERT(
- (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
- }
- #endif // KMP_DEBUG
- /* save the term in thread private dispatch structure */
- *(DBL *)&pr->u.p.parm3 = x;
- /* solve for the crossover point to the nearest integer i for which C_i
- <= chunk */
- {
- UT left, right, mid;
- long double p;
- /* estimate initial upper and lower bound */
- /* doesn't matter what value right is as long as it is positive, but
- it affects performance of the solver */
- right = 229;
- p = __kmp_pow<UT>(x, right);
- if (p > target) {
- do {
- p *= p;
- right <<= 1;
- } while (p > target && right < (1 << 27));
- /* lower bound is previous (failed) estimate of upper bound */
- left = right >> 1;
- } else {
- left = 0;
- }
- /* bisection root-finding method */
- while (left + 1 < right) {
- mid = (left + right) / 2;
- if (__kmp_pow<UT>(x, mid) > target) {
- left = mid;
- } else {
- right = mid;
- }
- } // while
- cross = right;
- }
- /* assert sanity of computed crossover point */
- KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
- __kmp_pow<UT>(x, cross) <= target);
- /* save the crossover point in thread private dispatch structure */
- pr->u.p.parm2 = cross;
- // C75803
- #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
- #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
- #else
- #define GUIDED_ANALYTICAL_WORKAROUND (x)
- #endif
- /* dynamic-style scheduling offset */
- pr->u.p.count = tc -
- __kmp_dispatch_guided_remaining(
- tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
- cross * chunk;
- #if KMP_USE_X87CONTROL
- // restore FPCW
- _control87(oldFpcw, _MCW_PC);
- #endif
- } // if
- } else {
- KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
- "kmp_sch_static_greedy\n",
- gtid));
- schedule = kmp_sch_static_greedy;
- /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
- pr->u.p.parm1 = tc;
- } // if
- } // case
- break;
- case kmp_sch_static_greedy:
- KD_TRACE(
- 100,
- ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
- gtid));
- pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
- break;
- case kmp_sch_static_chunked:
- case kmp_sch_dynamic_chunked:
- dynamic_init:
- if (tc == 0)
- break;
- if (pr->u.p.parm1 <= 0)
- pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
- else if (pr->u.p.parm1 > tc)
- pr->u.p.parm1 = tc;
- // Store the total number of chunks to prevent integer overflow during
- // bounds calculations in the get next chunk routine.
- pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0);
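- // e.g., tc=10 iterations with parm1=3 gives parm2 = 4 chunks (3+3+3+1).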
- KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
- "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
- gtid));
- break;
- case kmp_sch_trapezoidal: {
- /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
- T parm1, parm2, parm3, parm4;
- KD_TRACE(100,
- ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
- gtid));
- parm1 = chunk;
- /* F : size of the first cycle */
- parm2 = (tc / (2 * nproc));
- if (parm2 < 1) {
- parm2 = 1;
- }
- /* L : size of the last cycle. Make sure the last cycle is not larger
- than the first cycle. */
- if (parm1 < 1) {
- parm1 = 1;
- } else if (parm1 > parm2) {
- parm1 = parm2;
- }
- /* N : number of cycles */
- parm3 = (parm2 + parm1);
- parm3 = (2 * tc + parm3 - 1) / parm3;
- if (parm3 < 2) {
- parm3 = 2;
- }
- /* sigma : decreasing incr of the trapezoid */
- parm4 = (parm3 - 1);
- parm4 = (parm2 - parm1) / parm4;
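- // For illustration: tc=1000, nproc=4, chunk=1 gives parm2=125 (first chunk),
- // parm1=1 (last chunk), parm3=16 cycles and parm4=8, i.e. chunk sizes
- // 125, 117, 109, ... decreasing by 8 each cycle.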
- // pointless check, because parm4 >= 0 always
- // if ( parm4 < 0 ) {
- // parm4 = 0;
- //}
- pr->u.p.parm1 = parm1;
- pr->u.p.parm2 = parm2;
- pr->u.p.parm3 = parm3;
- pr->u.p.parm4 = parm4;
- } // case
- break;
- default: {
- __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
- KMP_HNT(GetNewerLibrary), // Hint
- __kmp_msg_null // Variadic argument list terminator
- );
- } break;
- } // switch
- pr->schedule = schedule;
- }
- #if KMP_USE_HIER_SCHED
- template <typename T>
- inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
- typename traits_t<T>::signed_t st);
- template <>
- inline void
- __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
- kmp_int32 ub, kmp_int32 st) {
- __kmp_dispatch_init_hierarchy<kmp_int32>(
- loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
- __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
- }
- template <>
- inline void
- __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
- kmp_uint32 ub, kmp_int32 st) {
- __kmp_dispatch_init_hierarchy<kmp_uint32>(
- loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
- __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
- }
- template <>
- inline void
- __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
- kmp_int64 ub, kmp_int64 st) {
- __kmp_dispatch_init_hierarchy<kmp_int64>(
- loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
- __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
- }
- template <>
- inline void
- __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
- kmp_uint64 ub, kmp_int64 st) {
- __kmp_dispatch_init_hierarchy<kmp_uint64>(
- loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
- __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
- }
- // free all the hierarchy scheduling memory associated with the team
- void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
- int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
- for (int i = 0; i < num_disp_buff; ++i) {
- // type does not matter here so use kmp_int32
- auto sh =
- reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
- &team->t.t_disp_buffer[i]);
- if (sh->hier) {
- sh->hier->deallocate();
- __kmp_free(sh->hier);
- }
- }
- }
- #endif
- // UT - unsigned flavor of T, ST - signed flavor of T,
- // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
- template <typename T>
- static void
- __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
- T ub, typename traits_t<T>::signed_t st,
- typename traits_t<T>::signed_t chunk, int push_ws) {
- typedef typename traits_t<T>::unsigned_t UT;
- int active;
- kmp_info_t *th;
- kmp_team_t *team;
- kmp_uint32 my_buffer_index;
- dispatch_private_info_template<T> *pr;
- dispatch_shared_info_template<T> volatile *sh;
- KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
- sizeof(dispatch_private_info));
- KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
- sizeof(dispatch_shared_info));
- __kmp_assert_valid_gtid(gtid);
- if (!TCR_4(__kmp_init_parallel))
- __kmp_parallel_initialize();
- __kmp_resume_if_soft_paused();
- #if INCLUDE_SSC_MARKS
- SSC_MARK_DISPATCH_INIT();
- #endif
- #ifdef KMP_DEBUG
- typedef typename traits_t<T>::signed_t ST;
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
- "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
- traits_t<ST>::spec, traits_t<T>::spec,
- traits_t<T>::spec, traits_t<ST>::spec);
- KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
- __kmp_str_free(&buff);
- }
- #endif
- /* setup data */
- th = __kmp_threads[gtid];
- team = th->th.th_team;
- active = !team->t.t_serialized;
- th->th.th_ident = loc;
- // Any half-decent optimizer will remove this test when the blocks are empty
- // since the macros expand to nothing
- // when statistics are disabled.
- if (schedule == __kmp_static) {
- KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
- } else {
- KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
- }
- #if KMP_USE_HIER_SCHED
- // Initialize the scheduling hierarchy if requested by the OMP_SCHEDULE
- // environment variable. Hierarchical scheduling does not work with ordered,
- // so if ordered is detected, revert to threaded scheduling.
- bool ordered;
- enum sched_type my_sched = schedule;
- my_buffer_index = th->th.th_dispatch->th_disp_index;
- pr = reinterpret_cast<dispatch_private_info_template<T> *>(
- &th->th.th_dispatch
- ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
- my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
- if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
- my_sched =
- (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
- ordered = (kmp_ord_lower & my_sched);
- if (pr->flags.use_hier) {
- if (ordered) {
- KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
- "Disabling hierarchical scheduling.\n",
- gtid));
- pr->flags.use_hier = FALSE;
- }
- }
- if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
- // Don't use hierarchical for ordered parallel loops and don't
- // use the runtime hierarchy if one was specified in the program
- if (!ordered && !pr->flags.use_hier)
- __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
- }
- #endif // KMP_USE_HIER_SCHED
- #if USE_ITT_BUILD
- kmp_uint64 cur_chunk = chunk;
- int itt_need_metadata_reporting =
- __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
- KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
- team->t.t_active_level == 1;
- #endif
- if (!active) {
- pr = reinterpret_cast<dispatch_private_info_template<T> *>(
- th->th.th_dispatch->th_disp_buffer); /* top of the stack */
- } else {
- KMP_DEBUG_ASSERT(th->th.th_dispatch ==
- &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
- my_buffer_index = th->th.th_dispatch->th_disp_index++;
- /* What happens when number of threads changes, need to resize buffer? */
- pr = reinterpret_cast<dispatch_private_info_template<T> *>(
- &th->th.th_dispatch
- ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
- sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
- &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
- KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
- my_buffer_index));
- if (sh->buffer_index != my_buffer_index) { // too many loops in progress?
- KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d"
- " sh->buffer_index:%d\n",
- gtid, my_buffer_index, sh->buffer_index));
- __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
- __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
- // Note: KMP_WAIT() cannot be used here: buffer index and
- // my_buffer_index are *always* 32-bit integers.
- KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
- "sh->buffer_index:%d\n",
- gtid, my_buffer_index, sh->buffer_index));
- }
- }
- __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
- #if USE_ITT_BUILD
- &cur_chunk,
- #endif
- chunk, (T)th->th.th_team_nproc,
- (T)th->th.th_info.ds.ds_tid);
- if (active) {
- if (pr->flags.ordered == 0) {
- th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
- th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
- } else {
- th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
- th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
- }
- th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
- th->th.th_dispatch->th_dispatch_sh_current =
- CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
- #if USE_ITT_BUILD
- if (pr->flags.ordered) {
- __kmp_itt_ordered_init(gtid);
- }
- // Report loop metadata
- if (itt_need_metadata_reporting) {
- // Only report metadata by primary thread of active team at level 1
- kmp_uint64 schedtype = 0;
- switch (schedule) {
- case kmp_sch_static_chunked:
- case kmp_sch_static_balanced: // Chunk is calculated in the switch above
- break;
- case kmp_sch_static_greedy:
- cur_chunk = pr->u.p.parm1;
- break;
- case kmp_sch_dynamic_chunked:
- schedtype = 1;
- break;
- case kmp_sch_guided_iterative_chunked:
- case kmp_sch_guided_analytical_chunked:
- case kmp_sch_guided_simd:
- schedtype = 2;
- break;
- default:
- // Should we put this case under "static"?
- // case kmp_sch_static_steal:
- schedtype = 3;
- break;
- }
- __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
- }
- #if KMP_USE_HIER_SCHED
- if (pr->flags.use_hier) {
- pr->u.p.count = 0;
- pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
- }
- #endif // KMP_USE_HIER_SCHED
- #endif /* USE_ITT_BUILD */
- }
- #ifdef KMP_DEBUG
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format(
- "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
- "lb:%%%s ub:%%%s"
- " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
- " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
- traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
- traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
- traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
- traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
- KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
- pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
- pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
- pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
- __kmp_str_free(&buff);
- }
- #endif
- #if OMPT_SUPPORT && OMPT_OPTIONAL
- if (ompt_enabled.ompt_callback_work) {
- ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
- ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
- ompt_callbacks.ompt_callback(ompt_callback_work)(
- ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
- &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
- }
- #endif
- KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
- }
- /* For ordered loops, either __kmp_dispatch_finish() should be called after
- * every iteration, or __kmp_dispatch_finish_chunk() should be called after
- * every chunk of iterations. If the ordered section(s) were not executed
- * for this iteration (or every iteration in this chunk), we need to set the
- * ordered iteration counters so that the next thread can proceed. */
- template <typename UT>
- static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
- typedef typename traits_t<UT>::signed_t ST;
- __kmp_assert_valid_gtid(gtid);
- kmp_info_t *th = __kmp_threads[gtid];
- KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
- if (!th->th.th_team->t.t_serialized) {
- dispatch_private_info_template<UT> *pr =
- reinterpret_cast<dispatch_private_info_template<UT> *>(
- th->th.th_dispatch->th_dispatch_pr_current);
- dispatch_shared_info_template<UT> volatile *sh =
- reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
- th->th.th_dispatch->th_dispatch_sh_current);
- KMP_DEBUG_ASSERT(pr);
- KMP_DEBUG_ASSERT(sh);
- KMP_DEBUG_ASSERT(th->th.th_dispatch ==
- &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
- if (pr->ordered_bumped) {
- KD_TRACE(
- 1000,
- ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
- gtid));
- pr->ordered_bumped = 0;
- } else {
- UT lower = pr->u.p.ordered_lower;
- #ifdef KMP_DEBUG
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
- "ordered_iteration:%%%s lower:%%%s\n",
- traits_t<UT>::spec, traits_t<UT>::spec);
- KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
- __kmp_str_free(&buff);
- }
- #endif
- __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
- __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
- KMP_MB(); /* is this necessary? */
- #ifdef KMP_DEBUG
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
- "ordered_iteration:%%%s lower:%%%s\n",
- traits_t<UT>::spec, traits_t<UT>::spec);
- KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
- __kmp_str_free(&buff);
- }
- #endif
- test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
- } // if
- } // if
- KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
- }
- #ifdef KMP_GOMP_COMPAT
- template <typename UT>
- static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
- typedef typename traits_t<UT>::signed_t ST;
- __kmp_assert_valid_gtid(gtid);
- kmp_info_t *th = __kmp_threads[gtid];
- KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
- if (!th->th.th_team->t.t_serialized) {
- dispatch_private_info_template<UT> *pr =
- reinterpret_cast<dispatch_private_info_template<UT> *>(
- th->th.th_dispatch->th_dispatch_pr_current);
- dispatch_shared_info_template<UT> volatile *sh =
- reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
- th->th.th_dispatch->th_dispatch_sh_current);
- KMP_DEBUG_ASSERT(pr);
- KMP_DEBUG_ASSERT(sh);
- KMP_DEBUG_ASSERT(th->th.th_dispatch ==
- &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
- UT lower = pr->u.p.ordered_lower;
- UT upper = pr->u.p.ordered_upper;
- UT inc = upper - lower + 1;
- if (pr->ordered_bumped == inc) {
- KD_TRACE(
- 1000,
- ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
- gtid));
- pr->ordered_bumped = 0;
- } else {
- inc -= pr->ordered_bumped;
- #ifdef KMP_DEBUG
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format(
- "__kmp_dispatch_finish_chunk: T#%%d before wait: "
- "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
- traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
- KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
- __kmp_str_free(&buff);
- }
- #endif
- __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
- __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
- KMP_MB(); /* is this necessary? */
- KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
- "ordered_bumped to zero\n",
- gtid));
- pr->ordered_bumped = 0;
- //!!!!! TODO check if the inc should be unsigned, or signed???
- #ifdef KMP_DEBUG
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format(
- "__kmp_dispatch_finish_chunk: T#%%d after wait: "
- "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
- traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
- traits_t<UT>::spec);
- KD_TRACE(1000,
- (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
- __kmp_str_free(&buff);
- }
- #endif
- test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
- }
- // }
- }
- KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
- }
- #endif /* KMP_GOMP_COMPAT */
- template <typename T>
- int __kmp_dispatch_next_algorithm(int gtid,
- dispatch_private_info_template<T> *pr,
- dispatch_shared_info_template<T> volatile *sh,
- kmp_int32 *p_last, T *p_lb, T *p_ub,
- typename traits_t<T>::signed_t *p_st, T nproc,
- T tid) {
- typedef typename traits_t<T>::unsigned_t UT;
- typedef typename traits_t<T>::signed_t ST;
- typedef typename traits_t<T>::floating_t DBL;
- int status = 0;
- bool last = false;
- T start;
- ST incr;
- UT limit, trip, init;
- kmp_info_t *th = __kmp_threads[gtid];
- kmp_team_t *team = th->th.th_team;
- KMP_DEBUG_ASSERT(th->th.th_dispatch ==
- &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
- KMP_DEBUG_ASSERT(pr);
- KMP_DEBUG_ASSERT(sh);
- KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
- #ifdef KMP_DEBUG
- {
- char *buff;
- // create format specifiers before the debug output
- buff =
- __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
- "sh:%%p nproc:%%%s tid:%%%s\n",
- traits_t<T>::spec, traits_t<T>::spec);
- KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
- __kmp_str_free(&buff);
- }
- #endif
- // zero trip count
- if (pr->u.p.tc == 0) {
- KD_TRACE(10,
- ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
- "zero status:%d\n",
- gtid, status));
- return 0;
- }
- switch (pr->schedule) {
- #if KMP_STATIC_STEAL_ENABLED
- case kmp_sch_static_steal: {
- T chunk = pr->u.p.parm1;
- UT nchunks = pr->u.p.parm2;
- KD_TRACE(100,
- ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
- gtid));
- trip = pr->u.p.tc - 1;
- if (traits_t<T>::type_size > 4) {
- // use lock for 8-byte induction variable.
- // TODO (optional): check presence and use 16-byte CAS
- kmp_lock_t *lck = pr->u.p.steal_lock;
- KMP_DEBUG_ASSERT(lck != NULL);
- if (pr->u.p.count < (UT)pr->u.p.ub) {
- KMP_DEBUG_ASSERT(pr->steal_flag == READY);
- __kmp_acquire_lock(lck, gtid);
- // try to get own chunk of iterations
- init = (pr->u.p.count)++;
- status = (init < (UT)pr->u.p.ub);
- __kmp_release_lock(lck, gtid);
- } else {
- status = 0; // no own chunks
- }
- if (!status) { // try to steal
- kmp_lock_t *lckv; // victim buffer's lock
- T while_limit = pr->u.p.parm3;
- T while_index = 0;
- int idx = (th->th.th_dispatch->th_disp_index - 1) %
- __kmp_dispatch_num_buffers; // current loop index
- // note: victim thread can potentially execute another loop
- KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
- while ((!status) && (while_limit != ++while_index)) {
- dispatch_private_info_template<T> *v;
- T remaining;
- T victimId = pr->u.p.parm4;
- T oldVictimId = victimId ? victimId - 1 : nproc - 1;
- v = reinterpret_cast<dispatch_private_info_template<T> *>(
- &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
- KMP_DEBUG_ASSERT(v);
- while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
- oldVictimId != victimId) {
- victimId = (victimId + 1) % nproc;
- v = reinterpret_cast<dispatch_private_info_template<T> *>(
- &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
- KMP_DEBUG_ASSERT(v);
- }
- if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
- continue; // try once more (nproc attempts in total)
- }
- if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
- kmp_uint32 old = UNUSED;
- // try to steal whole range from inactive victim
- status = v->steal_flag.compare_exchange_strong(old, THIEF);
- if (status) {
- // initialize self buffer with victim's whole range of chunks
- T id = victimId;
- T small_chunk, extras;
- small_chunk = nchunks / nproc; // chunks per thread
- extras = nchunks % nproc;
- init = id * small_chunk + (id < extras ? id : extras);
- __kmp_acquire_lock(lck, gtid);
- pr->u.p.count = init + 1; // exclude one we execute immediately
- pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
- __kmp_release_lock(lck, gtid);
- pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
- // no need to reinitialize other thread invariants: lb, st, etc.
- #ifdef KMP_DEBUG
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format(
- "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
- "count:%%%s ub:%%%s\n",
- traits_t<UT>::spec, traits_t<T>::spec);
- KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
- __kmp_str_free(&buff);
- }
- #endif
- // activate non-empty buffer and let others steal from us
- if (pr->u.p.count < (UT)pr->u.p.ub)
- KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
- break;
- }
- }
- if (KMP_ATOMIC_LD_RLX(&v->steal_flag) != READY ||
- v->u.p.count >= (UT)v->u.p.ub) {
- pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
- continue; // no chunks to steal, try next victim
- }
- lckv = v->u.p.steal_lock;
- KMP_ASSERT(lckv != NULL);
- __kmp_acquire_lock(lckv, gtid);
- limit = v->u.p.ub; // keep initial ub
- if (v->u.p.count >= limit) {
- __kmp_release_lock(lckv, gtid);
- pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
- continue; // no chunks to steal, try next victim
- }
- // stealing succeeded, reduce victim's ub by 1/4 of undone chunks
- // TODO: is this heuristic good enough??
- remaining = limit - v->u.p.count;
- if (remaining > 7) {
- // steal 1/4 of remaining
- KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
- init = (v->u.p.ub -= (remaining >> 2));
- } else {
- // steal 1 chunk of 1..7 remaining
- KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
- init = (v->u.p.ub -= 1);
- }
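- // e.g., if the victim still has remaining=20 undone chunks, the thief takes
- // remaining>>2 = 5 chunks off the tail (executing the first one itself and
- // keeping the other 4 available for further stealing); with 1..7 remaining
- // it takes exactly one chunk.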
- __kmp_release_lock(lckv, gtid);
- #ifdef KMP_DEBUG
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format(
- "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
- "count:%%%s ub:%%%s\n",
- traits_t<UT>::spec, traits_t<UT>::spec);
- KD_TRACE(10, (buff, gtid, victimId, init, limit));
- __kmp_str_free(&buff);
- }
- #endif
- KMP_DEBUG_ASSERT(init + 1 <= limit);
- pr->u.p.parm4 = victimId; // remember victim to steal from
- status = 1;
- // now update own count and ub with stolen range excluding init chunk
- __kmp_acquire_lock(lck, gtid);
- pr->u.p.count = init + 1;
- pr->u.p.ub = limit;
- __kmp_release_lock(lck, gtid);
- // activate non-empty buffer and let others steal from us
- if (init + 1 < limit)
- KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
- } // while (search for victim)
- } // if (try to find victim and steal)
- } else {
- // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
- // as all operations on pair (count, ub) must be done atomically
- typedef union {
- struct {
- UT count;
- T ub;
- } p;
- kmp_int64 b;
- } union_i4;
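- // The pair (count, ub) is packed into one 64-bit word: the owner advances
- // count from the head of its range while thieves pull ub back from the tail,
- // and a single 64-bit CAS keeps both fields consistent. This packing is only
- // possible here because T is 4 bytes wide.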
- union_i4 vold, vnew;
- if (pr->u.p.count < (UT)pr->u.p.ub) {
- KMP_DEBUG_ASSERT(pr->steal_flag == READY);
- vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
- vnew.b = vold.b;
- vnew.p.count++; // get chunk from head of self range
- while (!KMP_COMPARE_AND_STORE_REL64(
- (volatile kmp_int64 *)&pr->u.p.count,
- *VOLATILE_CAST(kmp_int64 *) & vold.b,
- *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
- KMP_CPU_PAUSE();
- vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
- vnew.b = vold.b;
- vnew.p.count++;
- }
- init = vold.p.count;
- status = (init < (UT)vold.p.ub);
- } else {
- status = 0; // no own chunks
- }
- if (!status) { // try to steal
- T while_limit = pr->u.p.parm3;
- T while_index = 0;
- int idx = (th->th.th_dispatch->th_disp_index - 1) %
- __kmp_dispatch_num_buffers; // current loop index
- // note: victim thread can potentially execute another loop
- KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
- while ((!status) && (while_limit != ++while_index)) {
- dispatch_private_info_template<T> *v;
- T remaining;
- T victimId = pr->u.p.parm4;
- T oldVictimId = victimId ? victimId - 1 : nproc - 1;
- v = reinterpret_cast<dispatch_private_info_template<T> *>(
- &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
- KMP_DEBUG_ASSERT(v);
- while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
- oldVictimId != victimId) {
- victimId = (victimId + 1) % nproc;
- v = reinterpret_cast<dispatch_private_info_template<T> *>(
- &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
- KMP_DEBUG_ASSERT(v);
- }
- if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
- continue; // try once more (nproc attempts in total)
- }
- if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
- kmp_uint32 old = UNUSED;
- // try to steal whole range from inactive victim
- status = v->steal_flag.compare_exchange_strong(old, THIEF);
- if (status) {
- // initialize self buffer with victim's whole range of chunks
- T id = victimId;
- T small_chunk, extras;
- small_chunk = nchunks / nproc; // chunks per thread
- extras = nchunks % nproc;
- init = id * small_chunk + (id < extras ? id : extras);
- vnew.p.count = init + 1;
- vnew.p.ub = init + small_chunk + (id < extras ? 1 : 0);
- // write pair (count, ub) at once atomically
- #if KMP_ARCH_X86
- KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
- #else
- *(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;
- #endif
- pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
- // no need to initialize other thread invariants: lb, st, etc.
- #ifdef KMP_DEBUG
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format(
- "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
- "count:%%%s ub:%%%s\n",
- traits_t<UT>::spec, traits_t<T>::spec);
- KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
- __kmp_str_free(&buff);
- }
- #endif
- // activate non-empty buffer and let others steal from us
- if (pr->u.p.count < (UT)pr->u.p.ub)
- KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
- break;
- }
- }
- while (1) { // CAS loop with check if victim still has enough chunks
- // many threads may be stealing concurrently from same victim
- vold.b = *(volatile kmp_int64 *)(&v->u.p.count);
- if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
- vold.p.count >= (UT)vold.p.ub) {
- pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id
- break; // no chunks to steal, try next victim
- }
- vnew.b = vold.b;
- remaining = vold.p.ub - vold.p.count;
- // try to steal 1/4 of remaining
- // TODO: is this heuristic good enough?
- if (remaining > 7) {
- vnew.p.ub -= remaining >> 2; // steal from tail of victim's range
- } else {
- vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining
- }
- KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);
- if (KMP_COMPARE_AND_STORE_REL64(
- (volatile kmp_int64 *)&v->u.p.count,
- *VOLATILE_CAST(kmp_int64 *) & vold.b,
- *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
- // stealing succeeded
- #ifdef KMP_DEBUG
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format(
- "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
- "count:%%%s ub:%%%s\n",
- traits_t<T>::spec, traits_t<T>::spec);
- KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));
- __kmp_str_free(&buff);
- }
- #endif
- KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
- vold.p.ub - vnew.p.ub);
- status = 1;
- pr->u.p.parm4 = victimId; // keep victim id
- // now update own count and ub
- init = vnew.p.ub;
- vold.p.count = init + 1;
- #if KMP_ARCH_X86
- KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
- #else
- *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
- #endif
- // activate non-empty buffer and let others steal from us
- if (vold.p.count < (UT)vold.p.ub)
- KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
- break;
- } // if (check CAS result)
- KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
- } // while (try to steal from particular victim)
- } // while (search for victim)
- } // if (try to find victim and steal)
- } // if (4-byte induction variable)
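- // A minimal standalone sketch (not the runtime's code) of the packed-pair CAS
- // idea used above: count and ub live in one 64-bit word so a single
- // compare-and-swap updates both consistently. The names and the
- // std::atomic-based formulation are illustrative assumptions; the runtime uses
- // its own macros and the union type-punning shown in union_i4.
- //
- //   #include <atomic>
- //   #include <cstdint>
- //   union Packed {
- //     struct { uint32_t count, ub; } p; // pair packed into one 64-bit word
- //     uint64_t b;
- //   };
- //   // Claim one chunk from the head of [count, ub); false if none are left.
- //   static bool claim_chunk(std::atomic<uint64_t> &word, uint32_t &claimed) {
- //     Packed oldv, newv;
- //     oldv.b = word.load(std::memory_order_relaxed);
- //     do {
- //       if (oldv.p.count >= oldv.p.ub)
- //         return false;        // no chunks remaining
- //       newv.b = oldv.b;
- //       newv.p.count += 1;     // take chunk oldv.p.count
- //     } while (!word.compare_exchange_weak(oldv.b, newv.b,
- //                                          std::memory_order_acq_rel,
- //                                          std::memory_order_relaxed));
- //     claimed = oldv.p.count;
- //     return true;
- //   }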
- if (!status) {
- *p_lb = 0;
- *p_ub = 0;
- if (p_st != NULL)
- *p_st = 0;
- } else {
- start = pr->u.p.lb;
- init *= chunk;
- limit = chunk + init - 1;
- incr = pr->u.p.st;
- KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
- KMP_DEBUG_ASSERT(init <= trip);
- // keep track of done chunks for possible early exit from stealing
- // TODO: count executed chunks locally with rare update of shared location
- // test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
- if ((last = (limit >= trip)) != 0)
- limit = trip;
- if (p_st != NULL)
- *p_st = incr;
- if (incr == 1) {
- *p_lb = start + init;
- *p_ub = start + limit;
- } else {
- *p_lb = start + init * incr;
- *p_ub = start + limit * incr;
- }
- } // if
- break;
- } // case
- #endif // KMP_STATIC_STEAL_ENABLED
- case kmp_sch_static_balanced: {
- KD_TRACE(
- 10,
- ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
- gtid));
- /* check if thread has any iteration to do */
- if ((status = !pr->u.p.count) != 0) {
- pr->u.p.count = 1;
- *p_lb = pr->u.p.lb;
- *p_ub = pr->u.p.ub;
- last = (pr->u.p.parm1 != 0);
- if (p_st != NULL)
- *p_st = pr->u.p.st;
- } else { /* no iterations to do */
- pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
- }
- } // case
- break;
- case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
- merged here */
- case kmp_sch_static_chunked: {
- T parm1;
- KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
- "kmp_sch_static_[affinity|chunked] case\n",
- gtid));
- parm1 = pr->u.p.parm1;
- trip = pr->u.p.tc - 1;
- init = parm1 * (pr->u.p.count + tid);
- if ((status = (init <= trip)) != 0) {
- start = pr->u.p.lb;
- incr = pr->u.p.st;
- limit = parm1 + init - 1;
- if ((last = (limit >= trip)) != 0)
- limit = trip;
- if (p_st != NULL)
- *p_st = incr;
- pr->u.p.count += nproc;
- if (incr == 1) {
- *p_lb = start + init;
- *p_ub = start + limit;
- } else {
- *p_lb = start + init * incr;
- *p_ub = start + limit * incr;
- }
- if (pr->flags.ordered) {
- pr->u.p.ordered_lower = init;
- pr->u.p.ordered_upper = limit;
- } // if
- } // if
- } // case
- break;
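- // Worked example for the static_chunked mapping above (illustrative numbers):
- // with chunk parm1 = 5 and nproc = 4, thread tid = 1 takes chunks starting at
- // init = 5*(0+1) = 5, then 5*(4+1) = 25, then 5*(8+1) = 45, ... i.e. every
- // nproc-th chunk in round-robin order, since count advances by nproc each time.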
- case kmp_sch_dynamic_chunked: {
- UT chunk_number;
- UT chunk_size = pr->u.p.parm1;
- UT nchunks = pr->u.p.parm2;
- KD_TRACE(
- 100,
- ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
- gtid));
- chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
- status = (chunk_number < nchunks);
- if (!status) {
- *p_lb = 0;
- *p_ub = 0;
- if (p_st != NULL)
- *p_st = 0;
- } else {
- init = chunk_size * chunk_number;
- trip = pr->u.p.tc - 1;
- start = pr->u.p.lb;
- incr = pr->u.p.st;
- if ((last = (trip - init < (UT)chunk_size)))
- limit = trip;
- else
- limit = chunk_size + init - 1;
- if (p_st != NULL)
- *p_st = incr;
- if (incr == 1) {
- *p_lb = start + init;
- *p_ub = start + limit;
- } else {
- *p_lb = start + init * incr;
- *p_ub = start + limit * incr;
- }
- if (pr->flags.ordered) {
- pr->u.p.ordered_lower = init;
- pr->u.p.ordered_upper = limit;
- } // if
- } // if
- } // case
- break;
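- // A hedged sketch (not the runtime's code) of the dynamic_chunked idea above:
- // threads claim chunk numbers from a shared counter with an atomic fetch-add
- // and derive their bounds from the chunk number. Names are illustrative, and a
- // stride of 1 is assumed for brevity.
- //
- //   #include <atomic>
- //   #include <cstdint>
- //   // Returns true and fills [lb, ub] if a chunk remains; false otherwise.
- //   static bool grab_chunk(std::atomic<uint64_t> &next_chunk, uint64_t nchunks,
- //                          uint64_t chunk_size, int64_t lower, uint64_t tc,
- //                          int64_t *lb, int64_t *ub) {
- //     uint64_t c = next_chunk.fetch_add(1, std::memory_order_acq_rel);
- //     if (c >= nchunks)
- //       return false;                      // all chunks already handed out
- //     uint64_t init = c * chunk_size;
- //     uint64_t trip = tc - 1;              // tc = trip count, as above
- //     uint64_t limit =
- //         (trip - init < chunk_size) ? trip : init + chunk_size - 1;
- //     *lb = lower + (int64_t)init;
- //     *ub = lower + (int64_t)limit;
- //     return true;
- //   }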
- case kmp_sch_guided_iterative_chunked: {
- T chunkspec = pr->u.p.parm1;
- KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
- "iterative case\n",
- gtid));
- trip = pr->u.p.tc;
- // Start atomic part of calculations
- while (1) {
- ST remaining; // signed, because can be < 0
- init = sh->u.s.iteration; // shared value
- remaining = trip - init;
- if (remaining <= 0) { // AC: need to compare with 0 first
- // nothing to do, don't try atomic op
- status = 0;
- break;
- }
- if ((T)remaining <
- pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
- // use dynamic-style schedule
- // atomically increment iterations, get old value
- init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
- (ST)chunkspec);
- remaining = trip - init;
- if (remaining <= 0) {
- status = 0; // all iterations got by other threads
- } else {
- // got some iterations to work on
- status = 1;
- if ((T)remaining > chunkspec) {
- limit = init + chunkspec - 1;
- } else {
- last = true; // the last chunk
- limit = init + remaining - 1;
- } // if
- } // if
- break;
- } // if
- limit = init + (UT)((double)remaining *
- *(double *)&pr->u.p.parm3); // divide by K*nproc
- if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
- (ST)init, (ST)limit)) {
- // CAS was successful, chunk obtained
- status = 1;
- --limit;
- break;
- } // if
- } // while
- if (status != 0) {
- start = pr->u.p.lb;
- incr = pr->u.p.st;
- if (p_st != NULL)
- *p_st = incr;
- *p_lb = start + init * incr;
- *p_ub = start + limit * incr;
- if (pr->flags.ordered) {
- pr->u.p.ordered_lower = init;
- pr->u.p.ordered_upper = limit;
- } // if
- } else {
- *p_lb = 0;
- *p_ub = 0;
- if (p_st != NULL)
- *p_st = 0;
- } // if
- } // case
- break;
- case kmp_sch_guided_simd: {
- // same as iterative but curr-chunk adjusted to be multiple of given
- // chunk
- T chunk = pr->u.p.parm1;
- KD_TRACE(100,
- ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
- gtid));
- trip = pr->u.p.tc;
- // Start atomic part of calculations
- while (1) {
- ST remaining; // signed, because can be < 0
- init = sh->u.s.iteration; // shared value
- remaining = trip - init;
- if (remaining <= 0) { // AC: need to compare with 0 first
- status = 0; // nothing to do, don't try atomic op
- break;
- }
- KMP_DEBUG_ASSERT(chunk && init % chunk == 0);
- // compare with K*nproc*(chunk+1), K=2 by default
- if ((T)remaining < pr->u.p.parm2) {
- // use dynamic-style schedule
- // atomically increment iterations, get old value
- init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
- (ST)chunk);
- remaining = trip - init;
- if (remaining <= 0) {
- status = 0; // all iterations got by other threads
- } else {
- // got some iterations to work on
- status = 1;
- if ((T)remaining > chunk) {
- limit = init + chunk - 1;
- } else {
- last = true; // the last chunk
- limit = init + remaining - 1;
- } // if
- } // if
- break;
- } // if
- // divide by K*nproc
- UT span;
- __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
- &span);
- UT rem = span % chunk;
- if (rem) // adjust so that span%chunk == 0
- span += chunk - rem;
- limit = init + span;
- if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
- (ST)init, (ST)limit)) {
- // CAS was successful, chunk obtained
- status = 1;
- --limit;
- break;
- } // if
- } // while
- if (status != 0) {
- start = pr->u.p.lb;
- incr = pr->u.p.st;
- if (p_st != NULL)
- *p_st = incr;
- *p_lb = start + init * incr;
- *p_ub = start + limit * incr;
- if (pr->flags.ordered) {
- pr->u.p.ordered_lower = init;
- pr->u.p.ordered_upper = limit;
- } // if
- } else {
- *p_lb = 0;
- *p_ub = 0;
- if (p_st != NULL)
- *p_st = 0;
- } // if
- } // case
- break;
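- // A hedged sketch (not the runtime's code) of the guided chunk sizing used by
- // the two guided cases above: take roughly a 1/(K*nproc) fraction of the
- // remaining iterations (K = 2 by default, cached as a double in parm3), and
- // for the simd variant round the span up to a multiple of the chunk. Names are
- // illustrative.
- //
- //   #include <cstdint>
- //   static uint64_t guided_simd_span(uint64_t remaining, unsigned nproc,
- //                                    uint64_t chunk, unsigned K = 2) {
- //     double frac = 1.0 / (K * (double)nproc);
- //     uint64_t span = (uint64_t)((double)remaining * frac);
- //     uint64_t rem = span % chunk;       // chunk is asserted non-zero above
- //     if (rem)                           // round up so span % chunk == 0
- //       span += chunk - rem;
- //     return span;
- //   }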
- case kmp_sch_guided_analytical_chunked: {
- T chunkspec = pr->u.p.parm1;
- UT chunkIdx;
- #if KMP_USE_X87CONTROL
- /* for storing the original FPCW value for Windows* OS on
- IA-32 architecture (8-byte version) */
- unsigned int oldFpcw;
- unsigned int fpcwSet = 0;
- #endif
- KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
- "kmp_sch_guided_analytical_chunked case\n",
- gtid));
- trip = pr->u.p.tc;
- KMP_DEBUG_ASSERT(nproc > 1);
- KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
- while (1) { /* this while loop is a safeguard against unexpected zero
- chunk sizes */
- chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
- if (chunkIdx >= (UT)pr->u.p.parm2) {
- --trip;
- /* use dynamic-style scheduling */
- init = chunkIdx * chunkspec + pr->u.p.count;
- /* need to verify init > 0 in case of overflow in the above
- * calculation */
- if ((status = (init > 0 && init <= trip)) != 0) {
- limit = init + chunkspec - 1;
- if ((last = (limit >= trip)) != 0)
- limit = trip;
- }
- break;
- } else {
- /* use exponential-style scheduling */
- /* The following check works around the lack of long double precision on
- Windows* OS: without it, init may end up != 0 for chunkIdx == 0.
- */
- #if KMP_USE_X87CONTROL
- /* If we haven't already done so, save original
- FPCW and set precision to 64-bit, as Windows* OS
- on IA-32 architecture defaults to 53-bit */
- if (!fpcwSet) {
- oldFpcw = _control87(0, 0);
- _control87(_PC_64, _MCW_PC);
- fpcwSet = 0x30000;
- }
- #endif
- if (chunkIdx) {
- init = __kmp_dispatch_guided_remaining<T>(
- trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
- KMP_DEBUG_ASSERT(init);
- init = trip - init;
- } else
- init = 0;
- limit = trip - __kmp_dispatch_guided_remaining<T>(
- trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
- KMP_ASSERT(init <= limit);
- if (init < limit) {
- KMP_DEBUG_ASSERT(limit <= trip);
- --limit;
- status = 1;
- break;
- } // if
- } // if
- } // while (1)
- #if KMP_USE_X87CONTROL
- /* restore FPCW if necessary
- AC: check fpcwSet flag first because oldFpcw can be uninitialized here
- */
- if (fpcwSet && (oldFpcw & fpcwSet))
- _control87(oldFpcw, _MCW_PC);
- #endif
- if (status != 0) {
- start = pr->u.p.lb;
- incr = pr->u.p.st;
- if (p_st != NULL)
- *p_st = incr;
- *p_lb = start + init * incr;
- *p_ub = start + limit * incr;
- if (pr->flags.ordered) {
- pr->u.p.ordered_lower = init;
- pr->u.p.ordered_upper = limit;
- }
- } else {
- *p_lb = 0;
- *p_ub = 0;
- if (p_st != NULL)
- *p_st = 0;
- }
- } // case
- break;
- case kmp_sch_trapezoidal: {
- UT index;
- T parm2 = pr->u.p.parm2;
- T parm3 = pr->u.p.parm3;
- T parm4 = pr->u.p.parm4;
- KD_TRACE(100,
- ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
- gtid));
- index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
- init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
- trip = pr->u.p.tc - 1;
- if ((status = ((T)index < parm3 && init <= trip)) == 0) {
- *p_lb = 0;
- *p_ub = 0;
- if (p_st != NULL)
- *p_st = 0;
- } else {
- start = pr->u.p.lb;
- limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
- incr = pr->u.p.st;
- if ((last = (limit >= trip)) != 0)
- limit = trip;
- if (p_st != NULL)
- *p_st = incr;
- if (incr == 1) {
- *p_lb = start + init;
- *p_ub = start + limit;
- } else {
- *p_lb = start + init * incr;
- *p_ub = start + limit * incr;
- }
- if (pr->flags.ordered) {
- pr->u.p.ordered_lower = init;
- pr->u.p.ordered_upper = limit;
- } // if
- } // if
- } // case
- break;
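- // A hedged sketch (not the runtime's code) of the trapezoid chunk bounds used
- // above: chunk 'index' starts where the arithmetic series of the preceding
- // chunk sizes ends (first, first-dec, first-2*dec, ...). Names are
- // illustrative; parm2 plays the role of 'first' and parm4 of 'dec'.
- //
- //   #include <cstdint>
- //   static void trapezoid_chunk(uint64_t index, uint64_t first, uint64_t dec,
- //                               uint64_t *start, uint64_t *end) {
- //     *start = (index * (2 * first - (index - 1) * dec)) / 2;
- //     *end = ((index + 1) * (2 * first - index * dec)) / 2 - 1;
- //   }
- //   // Example: first = 8, dec = 2 gives chunk sizes 8, 6, 4, 2; then
- //   // trapezoid_chunk(2, 8, 2, ...) yields start = 14, end = 17.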
- default: {
- status = 0; // to avoid complaints on uninitialized variable use
- __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
- KMP_HNT(GetNewerLibrary), // Hint
- __kmp_msg_null // Variadic argument list terminator
- );
- } break;
- } // switch
- if (p_last)
- *p_last = last;
- #ifdef KMP_DEBUG
- if (pr->flags.ordered) {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
- "ordered_lower:%%%s ordered_upper:%%%s\n",
- traits_t<UT>::spec, traits_t<UT>::spec);
- KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
- __kmp_str_free(&buff);
- }
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format(
- "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
- "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
- traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
- KMP_DEBUG_ASSERT(p_last);
- KMP_DEBUG_ASSERT(p_st);
- KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
- __kmp_str_free(&buff);
- }
- #endif
- return status;
- }
- /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
- work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
- is not called. */
- #if OMPT_SUPPORT && OMPT_OPTIONAL
- #define OMPT_LOOP_END \
- if (status == 0) { \
- if (ompt_enabled.ompt_callback_work) { \
- ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
- ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
- ompt_callbacks.ompt_callback(ompt_callback_work)( \
- ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \
- &(task_info->task_data), 0, codeptr); \
- } \
- }
- #define OMPT_LOOP_DISPATCH(lb, ub, st, status) \
- if (ompt_enabled.ompt_callback_dispatch && status) { \
- ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
- ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
- ompt_dispatch_chunk_t chunk; \
- ompt_data_t instance = ompt_data_none; \
- OMPT_GET_DISPATCH_CHUNK(chunk, lb, ub, st); \
- instance.ptr = &chunk; \
- ompt_callbacks.ompt_callback(ompt_callback_dispatch)( \
- &(team_info->parallel_data), &(task_info->task_data), \
- ompt_dispatch_ws_loop_chunk, instance); \
- }
- // TODO: implement count
- #else
- #define OMPT_LOOP_END // no-op
- #define OMPT_LOOP_DISPATCH(lb, ub, st, status) // no-op
- #endif
- #if KMP_STATS_ENABLED
- #define KMP_STATS_LOOP_END \
- { \
- kmp_int64 u, l, t, i; \
- l = (kmp_int64)(*p_lb); \
- u = (kmp_int64)(*p_ub); \
- i = (kmp_int64)(pr->u.p.st); \
- if (status == 0) { \
- t = 0; \
- KMP_POP_PARTITIONED_TIMER(); \
- } else if (i == 1) { \
- if (u >= l) \
- t = u - l + 1; \
- else \
- t = 0; \
- } else if (i < 0) { \
- if (l >= u) \
- t = (l - u) / (-i) + 1; \
- else \
- t = 0; \
- } else { \
- if (u >= l) \
- t = (u - l) / i + 1; \
- else \
- t = 0; \
- } \
- KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \
- }
- #else
- #define KMP_STATS_LOOP_END /* Nothing */
- #endif
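- // Worked example for the trip-count computation in KMP_STATS_LOOP_END above:
- // with l = 0, u = 9, i = 3 the positive-stride branch gives
- // t = (u - l) / i + 1 = 9 / 3 + 1 = 4, i.e. iterations 0, 3, 6, 9.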
- template <typename T>
- static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
- T *p_lb, T *p_ub,
- typename traits_t<T>::signed_t *p_st
- #if OMPT_SUPPORT && OMPT_OPTIONAL
- ,
- void *codeptr
- #endif
- ) {
- typedef typename traits_t<T>::unsigned_t UT;
- typedef typename traits_t<T>::signed_t ST;
- // This is potentially slightly misleading, schedule(runtime) will appear here
- // even if the actual runtime schedule is static. (Which points out a
- // disadvantage of schedule(runtime): even when static scheduling is used it
- // costs more than a compile time choice to use static scheduling would.)
- KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
- int status;
- dispatch_private_info_template<T> *pr;
- __kmp_assert_valid_gtid(gtid);
- kmp_info_t *th = __kmp_threads[gtid];
- kmp_team_t *team = th->th.th_team;
- KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
- KD_TRACE(
- 1000,
- ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
- gtid, p_lb, p_ub, p_st, p_last));
- if (team->t.t_serialized) {
- /* NOTE: serialize this dispatch because we are not at the active level */
- pr = reinterpret_cast<dispatch_private_info_template<T> *>(
- th->th.th_dispatch->th_disp_buffer); /* top of the stack */
- KMP_DEBUG_ASSERT(pr);
- if ((status = (pr->u.p.tc != 0)) == 0) {
- *p_lb = 0;
- *p_ub = 0;
- // if ( p_last != NULL )
- // *p_last = 0;
- if (p_st != NULL)
- *p_st = 0;
- if (__kmp_env_consistency_check) {
- if (pr->pushed_ws != ct_none) {
- pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
- }
- }
- } else if (pr->flags.nomerge) {
- kmp_int32 last;
- T start;
- UT limit, trip, init;
- ST incr;
- T chunk = pr->u.p.parm1;
- KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
- gtid));
- init = chunk * pr->u.p.count++;
- trip = pr->u.p.tc - 1;
- if ((status = (init <= trip)) == 0) {
- *p_lb = 0;
- *p_ub = 0;
- // if ( p_last != NULL )
- // *p_last = 0;
- if (p_st != NULL)
- *p_st = 0;
- if (__kmp_env_consistency_check) {
- if (pr->pushed_ws != ct_none) {
- pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
- }
- }
- } else {
- start = pr->u.p.lb;
- limit = chunk + init - 1;
- incr = pr->u.p.st;
- if ((last = (limit >= trip)) != 0) {
- limit = trip;
- #if KMP_OS_WINDOWS
- pr->u.p.last_upper = pr->u.p.ub;
- #endif /* KMP_OS_WINDOWS */
- }
- if (p_last != NULL)
- *p_last = last;
- if (p_st != NULL)
- *p_st = incr;
- if (incr == 1) {
- *p_lb = start + init;
- *p_ub = start + limit;
- } else {
- *p_lb = start + init * incr;
- *p_ub = start + limit * incr;
- }
- if (pr->flags.ordered) {
- pr->u.p.ordered_lower = init;
- pr->u.p.ordered_upper = limit;
- #ifdef KMP_DEBUG
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
- "ordered_lower:%%%s ordered_upper:%%%s\n",
- traits_t<UT>::spec, traits_t<UT>::spec);
- KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
- pr->u.p.ordered_upper));
- __kmp_str_free(&buff);
- }
- #endif
- } // if
- } // if
- } else {
- pr->u.p.tc = 0;
- *p_lb = pr->u.p.lb;
- *p_ub = pr->u.p.ub;
- #if KMP_OS_WINDOWS
- pr->u.p.last_upper = *p_ub;
- #endif /* KMP_OS_WINDOWS */
- if (p_last != NULL)
- *p_last = TRUE;
- if (p_st != NULL)
- *p_st = pr->u.p.st;
- } // if
- #ifdef KMP_DEBUG
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format(
- "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
- "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
- traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
- KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
- (p_last ? *p_last : 0), status));
- __kmp_str_free(&buff);
- }
- #endif
- #if INCLUDE_SSC_MARKS
- SSC_MARK_DISPATCH_NEXT();
- #endif
- OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
- OMPT_LOOP_END;
- KMP_STATS_LOOP_END;
- return status;
- } else {
- kmp_int32 last = 0;
- dispatch_shared_info_template<T> volatile *sh;
- KMP_DEBUG_ASSERT(th->th.th_dispatch ==
- &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
- pr = reinterpret_cast<dispatch_private_info_template<T> *>(
- th->th.th_dispatch->th_dispatch_pr_current);
- KMP_DEBUG_ASSERT(pr);
- sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
- th->th.th_dispatch->th_dispatch_sh_current);
- KMP_DEBUG_ASSERT(sh);
- #if KMP_USE_HIER_SCHED
- if (pr->flags.use_hier)
- status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
- else
- #endif // KMP_USE_HIER_SCHED
- status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
- p_st, th->th.th_team_nproc,
- th->th.th_info.ds.ds_tid);
- // status == 0: no more iterations to execute
- if (status == 0) {
- ST num_done;
- num_done = test_then_inc<ST>(&sh->u.s.num_done);
- #ifdef KMP_DEBUG
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format(
- "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
- traits_t<ST>::spec);
- KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
- __kmp_str_free(&buff);
- }
- #endif
- #if KMP_USE_HIER_SCHED
- pr->flags.use_hier = FALSE;
- #endif
- if (num_done == th->th.th_team_nproc - 1) {
- #if KMP_STATIC_STEAL_ENABLED
- if (pr->schedule == kmp_sch_static_steal) {
- int i;
- int idx = (th->th.th_dispatch->th_disp_index - 1) %
- __kmp_dispatch_num_buffers; // current loop index
- // loop complete, safe to destroy locks used for stealing
- for (i = 0; i < th->th.th_team_nproc; ++i) {
- dispatch_private_info_template<T> *buf =
- reinterpret_cast<dispatch_private_info_template<T> *>(
- &team->t.t_dispatch[i].th_disp_buffer[idx]);
- KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive
- KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED);
- if (traits_t<T>::type_size > 4) {
- // destroy locks used for stealing
- kmp_lock_t *lck = buf->u.p.steal_lock;
- KMP_ASSERT(lck != NULL);
- __kmp_destroy_lock(lck);
- __kmp_free(lck);
- buf->u.p.steal_lock = NULL;
- }
- }
- }
- #endif
- /* NOTE: release shared buffer to be reused */
- KMP_MB(); /* Flush all pending memory write invalidates. */
- sh->u.s.num_done = 0;
- sh->u.s.iteration = 0;
- /* TODO replace with general release procedure? */
- if (pr->flags.ordered) {
- sh->u.s.ordered_iteration = 0;
- }
- sh->buffer_index += __kmp_dispatch_num_buffers;
- KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
- gtid, sh->buffer_index));
- KMP_MB(); /* Flush all pending memory write invalidates. */
- } // if
- if (__kmp_env_consistency_check) {
- if (pr->pushed_ws != ct_none) {
- pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
- }
- }
- th->th.th_dispatch->th_deo_fcn = NULL;
- th->th.th_dispatch->th_dxo_fcn = NULL;
- th->th.th_dispatch->th_dispatch_sh_current = NULL;
- th->th.th_dispatch->th_dispatch_pr_current = NULL;
- } // if (status == 0)
- #if KMP_OS_WINDOWS
- else if (last) {
- pr->u.p.last_upper = pr->u.p.ub;
- }
- #endif /* KMP_OS_WINDOWS */
- if (p_last != NULL && status != 0)
- *p_last = last;
- } // if
- #ifdef KMP_DEBUG
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format(
- "__kmp_dispatch_next: T#%%d normal case: "
- "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
- traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
- KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
- (p_last ? *p_last : 0), status));
- __kmp_str_free(&buff);
- }
- #endif
- #if INCLUDE_SSC_MARKS
- SSC_MARK_DISPATCH_NEXT();
- #endif
- OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
- OMPT_LOOP_END;
- KMP_STATS_LOOP_END;
- return status;
- }
- /*!
- @ingroup WORK_SHARING
- @param loc source location information
- @param global_tid global thread number
- @return Zero if the parallel region is not active and this thread should execute
- all sections, non-zero otherwise.
- Beginning of sections construct.
- There are no implicit barriers in the "sections" calls; rather, the compiler
- should introduce an explicit barrier if one is required.
- This implementation is based on __kmp_dispatch_init and uses the same constructs
- for shared data (sections cannot be nested directly in an omp for loop; there
- must be a parallel region in between).
- */
- kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 gtid) {
- int active;
- kmp_info_t *th;
- kmp_team_t *team;
- kmp_uint32 my_buffer_index;
- dispatch_shared_info_template<kmp_int32> volatile *sh;
- KMP_DEBUG_ASSERT(__kmp_init_serial);
- if (!TCR_4(__kmp_init_parallel))
- __kmp_parallel_initialize();
- __kmp_resume_if_soft_paused();
- /* setup data */
- th = __kmp_threads[gtid];
- team = th->th.th_team;
- active = !team->t.t_serialized;
- th->th.th_ident = loc;
- KMP_COUNT_BLOCK(OMP_SECTIONS);
- KD_TRACE(10, ("__kmpc_sections: called by T#%d\n", gtid));
- if (active) {
- // Set up sections in the same way as dynamically scheduled loops.
- // We need one piece of shared data: which section is to be executed next.
- // (If the parallel region is not active, all sections are executed on the
- // same thread.)
- KMP_DEBUG_ASSERT(th->th.th_dispatch ==
- &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
- my_buffer_index = th->th.th_dispatch->th_disp_index++;
- // reuse shared data structures from dynamic sched loops:
- sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
- &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
- KD_TRACE(10, ("__kmpc_sections_init: T#%d my_buffer_index:%d\n", gtid,
- my_buffer_index));
- th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
- th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
- KD_TRACE(100, ("__kmpc_sections_init: T#%d before wait: my_buffer_index:%d "
- "sh->buffer_index:%d\n",
- gtid, my_buffer_index, sh->buffer_index));
- __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
- __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
- // Note: KMP_WAIT() cannot be used here because buffer_index and
- // my_buffer_index are *always* 32-bit integers.
- KMP_MB();
- KD_TRACE(100, ("__kmpc_sections_init: T#%d after wait: my_buffer_index:%d "
- "sh->buffer_index:%d\n",
- gtid, my_buffer_index, sh->buffer_index));
- th->th.th_dispatch->th_dispatch_pr_current =
- nullptr; // sections construct doesn't need private data
- th->th.th_dispatch->th_dispatch_sh_current =
- CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
- }
- #if OMPT_SUPPORT && OMPT_OPTIONAL
- if (ompt_enabled.ompt_callback_work) {
- ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
- ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
- ompt_callbacks.ompt_callback(ompt_callback_work)(
- ompt_work_sections, ompt_scope_begin, &(team_info->parallel_data),
- &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
- }
- #endif
- KMP_PUSH_PARTITIONED_TIMER(OMP_sections);
- return active;
- }
- /*!
- @ingroup WORK_SHARING
- @param loc source location information
- @param global_tid global thread number
- @param numberOfSections number of sections in the 'sections' construct
- @return unsigned value in [0, n): the id of the section to execute next on this
- thread; n (or any other value out of range) means there is nothing left to
- execute on this thread.
- */
- kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 gtid,
- kmp_int32 numberOfSections) {
- KMP_TIME_PARTITIONED_BLOCK(OMP_sections);
- kmp_info_t *th = __kmp_threads[gtid];
- #ifdef KMP_DEBUG
- kmp_team_t *team = th->th.th_team;
- #endif
- KD_TRACE(1000, ("__kmp_dispatch_next: T#%d; number of sections:%d\n", gtid,
- numberOfSections));
- // For the serialized case this function should not be called:
- KMP_DEBUG_ASSERT(!team->t.t_serialized);
- dispatch_shared_info_template<kmp_int32> volatile *sh;
- KMP_DEBUG_ASSERT(th->th.th_dispatch ==
- &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
- KMP_DEBUG_ASSERT(!(th->th.th_dispatch->th_dispatch_pr_current));
- sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
- th->th.th_dispatch->th_dispatch_sh_current);
- KMP_DEBUG_ASSERT(sh);
- kmp_int32 sectionIndex = 0;
- bool moreSectionsToExecute = true;
- // Find section to execute:
- sectionIndex = test_then_inc<kmp_int32>((kmp_int32 *)&sh->u.s.iteration);
- if (sectionIndex >= numberOfSections) {
- moreSectionsToExecute = false;
- }
- // status == 0: no more sections to execute;
- // OMPTODO: __kmpc_end_sections could be bypassed?
- if (!moreSectionsToExecute) {
- kmp_int32 num_done;
- num_done = test_then_inc<kmp_int32>((kmp_int32 *)(&sh->u.s.num_done));
- if (num_done == th->th.th_team_nproc - 1) {
- /* NOTE: release this buffer to be reused */
- KMP_MB(); /* Flush all pending memory write invalidates. */
- sh->u.s.num_done = 0;
- sh->u.s.iteration = 0;
- KMP_MB(); /* Flush all pending memory write invalidates. */
- sh->buffer_index += __kmp_dispatch_num_buffers;
- KD_TRACE(100, ("__kmpc_next_section: T#%d change buffer_index:%d\n", gtid,
- sh->buffer_index));
- KMP_MB(); /* Flush all pending memory write invalidates. */
- } // if
- th->th.th_dispatch->th_deo_fcn = NULL;
- th->th.th_dispatch->th_dxo_fcn = NULL;
- th->th.th_dispatch->th_dispatch_sh_current = NULL;
- th->th.th_dispatch->th_dispatch_pr_current = NULL;
- #if OMPT_SUPPORT && OMPT_OPTIONAL
- if (ompt_enabled.ompt_callback_dispatch) {
- ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
- ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
- ompt_data_t instance = ompt_data_none;
- instance.ptr = OMPT_GET_RETURN_ADDRESS(0);
- ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
- &(team_info->parallel_data), &(task_info->task_data),
- ompt_dispatch_section, instance);
- }
- #endif
- KMP_POP_PARTITIONED_TIMER();
- }
- return sectionIndex;
- }
- /*!
- @ingroup WORK_SHARING
- @param loc source location information
- @param global_tid global thread number
- End of "sections" construct.
- Don't need to wait here: barrier is added separately when needed.
- */
- void __kmpc_end_sections(ident_t *loc, kmp_int32 gtid) {
- kmp_info_t *th = __kmp_threads[gtid];
- int active = !th->th.th_team->t.t_serialized;
- KD_TRACE(100, ("__kmpc_end_sections: T#%d called\n", gtid));
- if (!active) {
- // In the active case, finalization is done in __kmpc_next_section
- #if OMPT_SUPPORT && OMPT_OPTIONAL
- if (ompt_enabled.ompt_callback_work) {
- ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
- ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
- ompt_callbacks.ompt_callback(ompt_callback_work)(
- ompt_work_sections, ompt_scope_end, &(team_info->parallel_data),
- &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
- }
- #endif
- KMP_POP_PARTITIONED_TIMER();
- }
- KD_TRACE(100, ("__kmpc_end_sections: T#%d returned\n", gtid));
- }
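- // A hedged usage sketch (not part of the runtime): one plausible pattern a
- // compiler could emit for a two-section 'sections' construct using the entry
- // points above. The structure is illustrative; real codegen may differ.
- //
- //   static void sections_example(ident_t *loc, kmp_int32 gtid) {
- //     const kmp_int32 n = 2; // number of sections in the construct
- //     if (__kmpc_sections_init(loc, gtid)) {
- //       kmp_int32 idx; // pull section ids until none remain
- //       while ((idx = __kmpc_next_section(loc, gtid, n)) < n) {
- //         switch (idx) {
- //         case 0: /* body of section 0 */ break;
- //         case 1: /* body of section 1 */ break;
- //         }
- //       }
- //     } else {
- //       // inactive (serialized): this thread executes every section itself
- //     }
- //     __kmpc_end_sections(loc, gtid);
- //     // an explicit barrier would follow here unless the construct is nowait
- //   }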
- template <typename T>
- static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
- kmp_int32 *plastiter, T *plower, T *pupper,
- typename traits_t<T>::signed_t incr) {
- typedef typename traits_t<T>::unsigned_t UT;
- kmp_uint32 team_id;
- kmp_uint32 nteams;
- UT trip_count;
- kmp_team_t *team;
- kmp_info_t *th;
- KMP_DEBUG_ASSERT(plastiter && plower && pupper);
- KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
- #ifdef KMP_DEBUG
- typedef typename traits_t<T>::signed_t ST;
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
- "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
- traits_t<T>::spec, traits_t<T>::spec,
- traits_t<ST>::spec, traits_t<T>::spec);
- KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
- __kmp_str_free(&buff);
- }
- #endif
- if (__kmp_env_consistency_check) {
- if (incr == 0) {
- __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
- loc);
- }
- if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
- // The loop is illegal.
- // Some zero-trip loops maintained by compiler, e.g.:
- // for(i=10;i<0;++i) // lower >= upper - run-time check
- // for(i=0;i>10;--i) // lower <= upper - run-time check
- // for(i=0;i>10;++i) // incr > 0 - compile-time check
- // for(i=10;i<0;--i) // incr < 0 - compile-time check
- // Compiler does not check the following illegal loops:
- // for(i=0;i<10;i+=incr) // where incr<0
- // for(i=10;i>0;i-=incr) // where incr<0
- __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
- }
- }
- __kmp_assert_valid_gtid(gtid);
- th = __kmp_threads[gtid];
- team = th->th.th_team;
- KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
- nteams = th->th.th_teams_size.nteams;
- team_id = team->t.t_master_tid;
- KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
- // compute global trip count
- if (incr == 1) {
- trip_count = *pupper - *plower + 1;
- } else if (incr == -1) {
- trip_count = *plower - *pupper + 1;
- } else if (incr > 0) {
- // upper-lower can exceed the limit of signed type
- trip_count = (UT)(*pupper - *plower) / incr + 1;
- } else {
- trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
- }
- if (trip_count <= nteams) {
- KMP_DEBUG_ASSERT(
- __kmp_static == kmp_sch_static_greedy ||
- __kmp_static ==
- kmp_sch_static_balanced); // Unknown static scheduling type.
- // only some teams get single iteration, others get nothing
- if (team_id < trip_count) {
- *pupper = *plower = *plower + team_id * incr;
- } else {
- *plower = *pupper + incr; // zero-trip loop
- }
- if (plastiter != NULL)
- *plastiter = (team_id == trip_count - 1);
- } else {
- if (__kmp_static == kmp_sch_static_balanced) {
- UT chunk = trip_count / nteams;
- UT extras = trip_count % nteams;
- *plower +=
- incr * (team_id * chunk + (team_id < extras ? team_id : extras));
- *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
- if (plastiter != NULL)
- *plastiter = (team_id == nteams - 1);
- } else {
- T chunk_inc_count =
- (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
- T upper = *pupper;
- KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
- // Unknown static scheduling type.
- *plower += team_id * chunk_inc_count;
- *pupper = *plower + chunk_inc_count - incr;
- // Check/correct bounds if needed
- if (incr > 0) {
- if (*pupper < *plower)
- *pupper = traits_t<T>::max_value;
- if (plastiter != NULL)
- *plastiter = *plower <= upper && *pupper > upper - incr;
- if (*pupper > upper)
- *pupper = upper; // tracker C73258
- } else {
- if (*pupper > *plower)
- *pupper = traits_t<T>::min_value;
- if (plastiter != NULL)
- *plastiter = *plower >= upper && *pupper < upper - incr;
- if (*pupper < upper)
- *pupper = upper; // tracker C73258
- }
- }
- }
- }
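- // Worked example for the balanced split above (illustrative numbers): with
- // trip_count = 10, nteams = 4, incr = 1 and *plower = 0, chunk = 2 and
- // extras = 2, so the teams receive [0,2], [3,5], [6,7], [8,9]: the first
- // 'extras' teams get one extra iteration, and team 3 reports plastiter.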
- //-----------------------------------------------------------------------------
- // Dispatch routines
- // Transfer call to template< type T >
- // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
- // T lb, T ub, ST st, ST chunk )
- extern "C" {
- /*!
- @ingroup WORK_SHARING
- @{
- @param loc Source location
- @param gtid Global thread id
- @param schedule Schedule type
- @param lb Lower bound
- @param ub Upper bound
- @param st Step (or increment if you prefer)
- @param chunk The chunk size to block with
- This function prepares the runtime to start a dynamically scheduled for loop,
- saving the loop arguments.
- These functions are all identical apart from the types of the arguments.
- */
- void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
- enum sched_type schedule, kmp_int32 lb,
- kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
- KMP_DEBUG_ASSERT(__kmp_init_serial);
- #if OMPT_SUPPORT && OMPT_OPTIONAL
- OMPT_STORE_RETURN_ADDRESS(gtid);
- #endif
- __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
- }
- /*!
- See @ref __kmpc_dispatch_init_4
- */
- void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
- enum sched_type schedule, kmp_uint32 lb,
- kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
- KMP_DEBUG_ASSERT(__kmp_init_serial);
- #if OMPT_SUPPORT && OMPT_OPTIONAL
- OMPT_STORE_RETURN_ADDRESS(gtid);
- #endif
- __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
- }
- /*!
- See @ref __kmpc_dispatch_init_4
- */
- void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
- enum sched_type schedule, kmp_int64 lb,
- kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
- KMP_DEBUG_ASSERT(__kmp_init_serial);
- #if OMPT_SUPPORT && OMPT_OPTIONAL
- OMPT_STORE_RETURN_ADDRESS(gtid);
- #endif
- __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
- }
- /*!
- See @ref __kmpc_dispatch_init_4
- */
- void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
- enum sched_type schedule, kmp_uint64 lb,
- kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
- KMP_DEBUG_ASSERT(__kmp_init_serial);
- #if OMPT_SUPPORT && OMPT_OPTIONAL
- OMPT_STORE_RETURN_ADDRESS(gtid);
- #endif
- __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
- }
- /*!
- See @ref __kmpc_dispatch_init_4
- These differ from the __kmpc_dispatch_init set of functions in that they are
- called for the composite 'distribute parallel for' construct, so the per-team
- iteration space must be computed before the regular iterations are dispatched.
- These functions are all identical apart from the types of the arguments.
- */
- void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
- enum sched_type schedule, kmp_int32 *p_last,
- kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
- kmp_int32 chunk) {
- KMP_DEBUG_ASSERT(__kmp_init_serial);
- #if OMPT_SUPPORT && OMPT_OPTIONAL
- OMPT_STORE_RETURN_ADDRESS(gtid);
- #endif
- __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
- __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
- }
- void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
- enum sched_type schedule, kmp_int32 *p_last,
- kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
- kmp_int32 chunk) {
- KMP_DEBUG_ASSERT(__kmp_init_serial);
- #if OMPT_SUPPORT && OMPT_OPTIONAL
- OMPT_STORE_RETURN_ADDRESS(gtid);
- #endif
- __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
- __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
- }
- void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
- enum sched_type schedule, kmp_int32 *p_last,
- kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
- kmp_int64 chunk) {
- KMP_DEBUG_ASSERT(__kmp_init_serial);
- #if OMPT_SUPPORT && OMPT_OPTIONAL
- OMPT_STORE_RETURN_ADDRESS(gtid);
- #endif
- __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
- __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
- }
- void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
- enum sched_type schedule, kmp_int32 *p_last,
- kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
- kmp_int64 chunk) {
- KMP_DEBUG_ASSERT(__kmp_init_serial);
- #if OMPT_SUPPORT && OMPT_OPTIONAL
- OMPT_STORE_RETURN_ADDRESS(gtid);
- #endif
- __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
- __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
- }
- /*!
- @param loc Source code location
- @param gtid Global thread id
- @param p_last Pointer to a flag set to one if this is the last chunk or zero
- otherwise
- @param p_lb Pointer to the lower bound for the next chunk of work
- @param p_ub Pointer to the upper bound for the next chunk of work
- @param p_st Pointer to the stride for the next chunk of work
- @return one if there is work to be done, zero otherwise
- Get the next dynamically allocated chunk of work for this thread.
- If there is no more work, then lb, ub and stride need not be modified.
- */
- int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
- kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
- #if OMPT_SUPPORT && OMPT_OPTIONAL
- OMPT_STORE_RETURN_ADDRESS(gtid);
- #endif
- return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
- #if OMPT_SUPPORT && OMPT_OPTIONAL
- ,
- OMPT_LOAD_RETURN_ADDRESS(gtid)
- #endif
- );
- }
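- // A hedged usage sketch (not part of the runtime): one plausible pattern a
- // compiler could emit for '#pragma omp for schedule(dynamic, 4)' over
- // i in [0, N) using the init/next entry points above. The structure is
- // illustrative; real codegen may differ.
- //
- //   static void dynamic_loop_example(ident_t *loc, kmp_int32 gtid,
- //                                    kmp_int32 N) {
- //     kmp_int32 lb, ub, st, last;
- //     __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked,
- //                            0, N - 1, 1, 4); // lb, ub (inclusive), st, chunk
- //     while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
- //       for (kmp_int32 i = lb; i <= ub; i += st) {
- //         /* loop body */
- //       }
- //     }
- //     // for loops with an ordered clause the compiler would also call
- //     // __kmpc_dispatch_fini_4 after each chunk; a barrier (unless nowait)
- //     // is emitted separately.
- //   }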
- /*!
- See @ref __kmpc_dispatch_next_4
- */
- int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
- kmp_uint32 *p_lb, kmp_uint32 *p_ub,
- kmp_int32 *p_st) {
- #if OMPT_SUPPORT && OMPT_OPTIONAL
- OMPT_STORE_RETURN_ADDRESS(gtid);
- #endif
- return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
- #if OMPT_SUPPORT && OMPT_OPTIONAL
- ,
- OMPT_LOAD_RETURN_ADDRESS(gtid)
- #endif
- );
- }
- /*!
- See @ref __kmpc_dispatch_next_4
- */
- int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
- kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
- #if OMPT_SUPPORT && OMPT_OPTIONAL
- OMPT_STORE_RETURN_ADDRESS(gtid);
- #endif
- return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
- #if OMPT_SUPPORT && OMPT_OPTIONAL
- ,
- OMPT_LOAD_RETURN_ADDRESS(gtid)
- #endif
- );
- }
- /*!
- See @ref __kmpc_dispatch_next_4
- */
- int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
- kmp_uint64 *p_lb, kmp_uint64 *p_ub,
- kmp_int64 *p_st) {
- #if OMPT_SUPPORT && OMPT_OPTIONAL
- OMPT_STORE_RETURN_ADDRESS(gtid);
- #endif
- return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
- #if OMPT_SUPPORT && OMPT_OPTIONAL
- ,
- OMPT_LOAD_RETURN_ADDRESS(gtid)
- #endif
- );
- }
- /*!
- @param loc Source code location
- @param gtid Global thread id
- Mark the end of a dynamic loop.
- */
- void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
- __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
- }
- /*!
- See @ref __kmpc_dispatch_fini_4
- */
- void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
- __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
- }
- /*!
- See @ref __kmpc_dispatch_fini_4
- */
- void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
- __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
- }
- /*!
- See @ref __kmpc_dispatch_fini_4
- */
- void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
- __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
- }
- /*! @} */
- //-----------------------------------------------------------------------------
- // Non-template routines from kmp_dispatch.cpp used in other sources
- kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
- return value == checker;
- }
- kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
- return value != checker;
- }
- kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
- return value < checker;
- }
- kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
- return value >= checker;
- }
- kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
- return value <= checker;
- }
- kmp_uint32
- __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
- kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
- void *obj // Higher-level synchronization object, or NULL.
- ) {
- // note: we may not belong to a team at this point
- volatile kmp_uint32 *spin = spinner;
- kmp_uint32 check = checker;
- kmp_uint32 spins;
- kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
- kmp_uint32 r;
- kmp_uint64 time;
- KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
- KMP_INIT_YIELD(spins);
- KMP_INIT_BACKOFF(time);
- // main wait spin loop
- while (!f(r = TCR_4(*spin), check)) {
- KMP_FSYNC_SPIN_PREPARE(obj);
- /* GEH - remove this since it was accidentally introduced when kmp_wait was
- split. It causes problems with infinite recursion because of exit lock */
- /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
- __kmp_abort_thread(); */
- KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
- }
- KMP_FSYNC_SPIN_ACQUIRED(obj);
- return r;
- }
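- // A minimal usage sketch (illustrative, not from the runtime) of the spin-wait
- // helper above, using one of the predicate functions defined earlier:
- //
- //   volatile kmp_uint32 flag = 0; // set to a non-zero value by another thread
- //   ...
- //   // spin until flag >= 1, yielding when the machine is oversubscribed
- //   kmp_uint32 seen = __kmp_wait_4(&flag, 1, __kmp_ge_4, NULL);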
- void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
- kmp_uint32 (*pred)(void *, kmp_uint32),
- void *obj // Higher-level synchronization object, or NULL.
- ) {
- // note: we may not belong to a team at this point
- void *spin = spinner;
- kmp_uint32 check = checker;
- kmp_uint32 spins;
- kmp_uint32 (*f)(void *, kmp_uint32) = pred;
- kmp_uint64 time;
- KMP_FSYNC_SPIN_INIT(obj, spin);
- KMP_INIT_YIELD(spins);
- KMP_INIT_BACKOFF(time);
- // main wait spin loop
- while (!f(spin, check)) {
- KMP_FSYNC_SPIN_PREPARE(obj);
- /* if we have waited a bit, or are oversubscribed, yield */
- /* pause is in the following code */
- KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
- }
- KMP_FSYNC_SPIN_ACQUIRED(obj);
- }
- } // extern "C"
- #ifdef KMP_GOMP_COMPAT
- void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
- enum sched_type schedule, kmp_int32 lb,
- kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
- int push_ws) {
- __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
- push_ws);
- }
- void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
- enum sched_type schedule, kmp_uint32 lb,
- kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
- int push_ws) {
- __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
- push_ws);
- }
- void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
- enum sched_type schedule, kmp_int64 lb,
- kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
- int push_ws) {
- __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
- push_ws);
- }
- void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
- enum sched_type schedule, kmp_uint64 lb,
- kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
- int push_ws) {
- __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
- push_ws);
- }
- void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
- __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
- }
- void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
- __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
- }
- void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
- __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
- }
- void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
- __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
- }
- #endif /* KMP_GOMP_COMPAT */
- /* ------------------------------------------------------------------------ */
|