  1. /*
  2. * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
  3. */
  4. //===----------------------------------------------------------------------===//
  5. //
  6. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  7. // See https://llvm.org/LICENSE.txt for license information.
  8. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  9. //
  10. //===----------------------------------------------------------------------===//
  11. /* Dynamic scheduling initialization and dispatch.
  12. *
13. * NOTE: __kmp_nth is constant inside any dispatch loop; however,
14. * it may change between parallel regions. __kmp_max_nth
15. * is the largest value __kmp_nth may take; 1 is the smallest.
  16. */
  17. #include "kmp.h"
  18. #include "kmp_error.h"
  19. #include "kmp_i18n.h"
  20. #include "kmp_itt.h"
  21. #include "kmp_stats.h"
  22. #include "kmp_str.h"
  23. #if KMP_USE_X87CONTROL
  24. #include <float.h>
  25. #endif
  26. #include "kmp_lock.h"
  27. #include "kmp_dispatch.h"
  28. #if KMP_USE_HIER_SCHED
29. #include "kmp_dispatch_hier.h"
  30. #endif
  31. #if OMPT_SUPPORT
  32. #include "ompt-specific.h"
  33. #endif
  34. /* ------------------------------------------------------------------------ */
  35. /* ------------------------------------------------------------------------ */
  36. void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  37. kmp_info_t *th;
  38. KMP_DEBUG_ASSERT(gtid_ref);
  39. if (__kmp_env_consistency_check) {
  40. th = __kmp_threads[*gtid_ref];
  41. if (th->th.th_root->r.r_active &&
  42. (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
  43. #if KMP_USE_DYNAMIC_LOCK
  44. __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
  45. #else
  46. __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
  47. #endif
  48. }
  49. }
  50. }
  51. void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  52. kmp_info_t *th;
  53. if (__kmp_env_consistency_check) {
  54. th = __kmp_threads[*gtid_ref];
  55. if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
  56. __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
  57. }
  58. }
  59. }
  60. // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
  61. static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
  62. bool use_hier = false) {
  63. // Pick up the nonmonotonic/monotonic bits from the scheduling type
  64. // Nonmonotonic as default for dynamic schedule when no modifier is specified
  65. int monotonicity = SCHEDULE_NONMONOTONIC;
66. // Let the default be monotonic for executables
67. // compiled with OpenMP* 4.5 or earlier compilers
  68. if (loc != NULL && loc->get_openmp_version() < 50)
  69. monotonicity = SCHEDULE_MONOTONIC;
  70. if (use_hier || __kmp_force_monotonic)
  71. monotonicity = SCHEDULE_MONOTONIC;
  72. else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
  73. monotonicity = SCHEDULE_NONMONOTONIC;
  74. else if (SCHEDULE_HAS_MONOTONIC(schedule))
  75. monotonicity = SCHEDULE_MONOTONIC;
  76. return monotonicity;
  77. }
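// --- Editorial sketch (not part of libomp) --------------------------------
// A hedged, standalone model of the precedence implemented by
// __kmp_get_monotonicity() above: a forced/hierarchical request wins, then an
// explicit (non)monotonic schedule modifier, then the OpenMP-version default.
// The names below (pick_monotonicity, force_monotonic, ...) are hypothetical;
// force_monotonic stands in for both use_hier and __kmp_force_monotonic.
#if 0 // illustrative only; never compiled
#include <cassert>

enum Mono { NONMONOTONIC, MONOTONIC };

static Mono pick_monotonicity(int openmp_version, bool force_monotonic,
                              bool has_nonmono_modifier,
                              bool has_mono_modifier) {
  Mono m = NONMONOTONIC; // default for OpenMP 5.0 and newer
  if (openmp_version < 50)
    m = MONOTONIC; // legacy default for pre-5.0 binaries
  if (force_monotonic)
    m = MONOTONIC; // use_hier / __kmp_force_monotonic override wins
  else if (has_nonmono_modifier)
    m = NONMONOTONIC; // explicit schedule(nonmonotonic:...)
  else if (has_mono_modifier)
    m = MONOTONIC; // explicit schedule(monotonic:...)
  return m;
}

static void monotonicity_examples() {
  assert(pick_monotonicity(50, false, false, false) == NONMONOTONIC);
  assert(pick_monotonicity(45, false, false, false) == MONOTONIC);
  assert(pick_monotonicity(45, false, true, false) == NONMONOTONIC);
  assert(pick_monotonicity(50, true, true, false) == MONOTONIC);
}
#endif
// ---------------------------------------------------------------------------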
  78. #if KMP_STATIC_STEAL_ENABLED
  79. enum { // values for steal_flag (possible states of private per-loop buffer)
  80. UNUSED = 0,
  81. CLAIMED = 1, // owner thread started initialization
  82. READY = 2, // available for stealing
  83. THIEF = 3 // finished by owner, or claimed by thief
  84. // possible state changes:
  85. // 0 -> 1 owner only, sync
  86. // 0 -> 3 thief only, sync
  87. // 1 -> 2 owner only, async
  88. // 2 -> 3 owner only, async
  89. // 3 -> 2 owner only, async
  90. // 3 -> 0 last thread finishing the loop, async
  91. };
  92. #endif
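// --- Editorial sketch (not part of libomp) --------------------------------
// A hedged model of the steal_flag state machine documented above, using
// std::atomic. Only the two 0 -> {1,3} edges can race (owner vs. thief), so
// only they use compare-and-swap; every other transition is made by the owner
// alone. The StealBuffer type and its method names are hypothetical.
#if 0 // illustrative only; never compiled
#include <atomic>

struct StealBuffer {
  std::atomic<unsigned> steal_flag{0 /* UNUSED */};

  bool owner_claim() { // 0 -> 1, synchronized
    unsigned expected = 0 /* UNUSED */;
    return steal_flag.compare_exchange_strong(expected, 1 /* CLAIMED */);
  }
  bool thief_claim_whole_range() { // 0 -> 3, synchronized
    unsigned expected = 0 /* UNUSED */;
    return steal_flag.compare_exchange_strong(expected, 3 /* THIEF */);
  }
  void owner_publish() { // 1 -> 2 or 3 -> 2, owner only
    steal_flag.store(2 /* READY */, std::memory_order_release);
  }
  void owner_retire() { // 2 -> 3, owner only
    steal_flag.store(3 /* THIEF */, std::memory_order_relaxed);
  }
  void last_finisher_reset() { // 3 -> 0, last thread finishing the loop
    steal_flag.store(0 /* UNUSED */, std::memory_order_relaxed);
  }
};
#endif
// ---------------------------------------------------------------------------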
  93. // Initialize a dispatch_private_info_template<T> buffer for a particular
94. // type of schedule and chunk. The loop description is found in lb (lower bound),
  95. // ub (upper bound), and st (stride). nproc is the number of threads relevant
  96. // to the scheduling (often the number of threads in a team, but not always if
  97. // hierarchical scheduling is used). tid is the id of the thread calling
  98. // the function within the group of nproc threads. It will have a value
  99. // between 0 and nproc - 1. This is often just the thread id within a team, but
  100. // is not necessarily the case when using hierarchical scheduling.
  101. // loc is the source file location of the corresponding loop
  102. // gtid is the global thread id
  103. template <typename T>
  104. void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
  105. dispatch_private_info_template<T> *pr,
  106. enum sched_type schedule, T lb, T ub,
  107. typename traits_t<T>::signed_t st,
  108. #if USE_ITT_BUILD
  109. kmp_uint64 *cur_chunk,
  110. #endif
  111. typename traits_t<T>::signed_t chunk,
  112. T nproc, T tid) {
  113. typedef typename traits_t<T>::unsigned_t UT;
  114. typedef typename traits_t<T>::floating_t DBL;
  115. int active;
  116. T tc;
  117. kmp_info_t *th;
  118. kmp_team_t *team;
  119. int monotonicity;
  120. bool use_hier;
  121. #ifdef KMP_DEBUG
  122. typedef typename traits_t<T>::signed_t ST;
  123. {
  124. char *buff;
  125. // create format specifiers before the debug output
  126. buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
  127. "pr:%%p lb:%%%s ub:%%%s st:%%%s "
  128. "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
  129. traits_t<T>::spec, traits_t<T>::spec,
  130. traits_t<ST>::spec, traits_t<ST>::spec,
  131. traits_t<T>::spec, traits_t<T>::spec);
  132. KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
  133. __kmp_str_free(&buff);
  134. }
  135. #endif
  136. /* setup data */
  137. th = __kmp_threads[gtid];
  138. team = th->th.th_team;
  139. active = !team->t.t_serialized;
  140. #if USE_ITT_BUILD
  141. int itt_need_metadata_reporting =
  142. __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
  143. KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
  144. team->t.t_active_level == 1;
  145. #endif
  146. #if KMP_USE_HIER_SCHED
  147. use_hier = pr->flags.use_hier;
  148. #else
  149. use_hier = false;
  150. #endif
  151. /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
  152. monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
  153. schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
  154. /* Pick up the nomerge/ordered bits from the scheduling type */
  155. if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
  156. pr->flags.nomerge = TRUE;
  157. schedule =
  158. (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
  159. } else {
  160. pr->flags.nomerge = FALSE;
  161. }
  162. pr->type_size = traits_t<T>::type_size; // remember the size of variables
  163. if (kmp_ord_lower & schedule) {
  164. pr->flags.ordered = TRUE;
  165. schedule =
  166. (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
  167. } else {
  168. pr->flags.ordered = FALSE;
  169. }
  170. // Ordered overrides nonmonotonic
  171. if (pr->flags.ordered) {
  172. monotonicity = SCHEDULE_MONOTONIC;
  173. }
  174. if (schedule == kmp_sch_static) {
  175. schedule = __kmp_static;
  176. } else {
  177. if (schedule == kmp_sch_runtime) {
  178. // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
  179. // not specified)
  180. schedule = team->t.t_sched.r_sched_type;
  181. monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
  182. schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
  183. if (pr->flags.ordered) // correct monotonicity for ordered loop if needed
  184. monotonicity = SCHEDULE_MONOTONIC;
  185. // Detail the schedule if needed (global controls are differentiated
  186. // appropriately)
  187. if (schedule == kmp_sch_guided_chunked) {
  188. schedule = __kmp_guided;
  189. } else if (schedule == kmp_sch_static) {
  190. schedule = __kmp_static;
  191. }
  192. // Use the chunk size specified by OMP_SCHEDULE (or default if not
  193. // specified)
  194. chunk = team->t.t_sched.chunk;
  195. #if USE_ITT_BUILD
  196. if (cur_chunk)
  197. *cur_chunk = chunk;
  198. #endif
  199. #ifdef KMP_DEBUG
  200. {
  201. char *buff;
  202. // create format specifiers before the debug output
  203. buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
  204. "schedule:%%d chunk:%%%s\n",
  205. traits_t<ST>::spec);
  206. KD_TRACE(10, (buff, gtid, schedule, chunk));
  207. __kmp_str_free(&buff);
  208. }
  209. #endif
  210. } else {
  211. if (schedule == kmp_sch_guided_chunked) {
  212. schedule = __kmp_guided;
  213. }
  214. if (chunk <= 0) {
  215. chunk = KMP_DEFAULT_CHUNK;
  216. }
  217. }
  218. if (schedule == kmp_sch_auto) {
  219. // mapping and differentiation: in the __kmp_do_serial_initialize()
  220. schedule = __kmp_auto;
  221. #ifdef KMP_DEBUG
  222. {
  223. char *buff;
  224. // create format specifiers before the debug output
  225. buff = __kmp_str_format(
  226. "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
  227. "schedule:%%d chunk:%%%s\n",
  228. traits_t<ST>::spec);
  229. KD_TRACE(10, (buff, gtid, schedule, chunk));
  230. __kmp_str_free(&buff);
  231. }
  232. #endif
  233. }
  234. #if KMP_STATIC_STEAL_ENABLED
  235. // map nonmonotonic:dynamic to static steal
  236. if (schedule == kmp_sch_dynamic_chunked) {
  237. if (monotonicity == SCHEDULE_NONMONOTONIC)
  238. schedule = kmp_sch_static_steal;
  239. }
  240. #endif
  241. /* guided analytical not safe for too many threads */
  242. if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
  243. schedule = kmp_sch_guided_iterative_chunked;
  244. KMP_WARNING(DispatchManyThreads);
  245. }
  246. if (schedule == kmp_sch_runtime_simd) {
  247. // compiler provides simd_width in the chunk parameter
  248. schedule = team->t.t_sched.r_sched_type;
  249. monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
  250. schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
  251. // Detail the schedule if needed (global controls are differentiated
  252. // appropriately)
  253. if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
  254. schedule == __kmp_static) {
  255. schedule = kmp_sch_static_balanced_chunked;
  256. } else {
  257. if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
  258. schedule = kmp_sch_guided_simd;
  259. }
  260. chunk = team->t.t_sched.chunk * chunk;
  261. }
  262. #if USE_ITT_BUILD
  263. if (cur_chunk)
  264. *cur_chunk = chunk;
  265. #endif
  266. #ifdef KMP_DEBUG
  267. {
  268. char *buff;
  269. // create format specifiers before the debug output
  270. buff = __kmp_str_format(
  271. "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
  272. " chunk:%%%s\n",
  273. traits_t<ST>::spec);
  274. KD_TRACE(10, (buff, gtid, schedule, chunk));
  275. __kmp_str_free(&buff);
  276. }
  277. #endif
  278. }
  279. pr->u.p.parm1 = chunk;
  280. }
  281. KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
  282. "unknown scheduling type");
  283. pr->u.p.count = 0;
  284. if (__kmp_env_consistency_check) {
  285. if (st == 0) {
  286. __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
  287. (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
  288. }
  289. }
  290. // compute trip count
  291. if (st == 1) { // most common case
  292. if (ub >= lb) {
  293. tc = ub - lb + 1;
  294. } else { // ub < lb
  295. tc = 0; // zero-trip
  296. }
  297. } else if (st < 0) {
  298. if (lb >= ub) {
  299. // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
  300. // where the division needs to be unsigned regardless of the result type
  301. tc = (UT)(lb - ub) / (-st) + 1;
  302. } else { // lb < ub
  303. tc = 0; // zero-trip
  304. }
  305. } else { // st > 0
  306. if (ub >= lb) {
  307. // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
  308. // where the division needs to be unsigned regardless of the result type
  309. tc = (UT)(ub - lb) / st + 1;
  310. } else { // ub < lb
  311. tc = 0; // zero-trip
  312. }
  313. }
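// --- Editorial sketch (not part of libomp) --------------------------------
// A hedged 32-bit restatement of the trip-count computation above. The
// subtraction is done in the unsigned type so that loops spanning nearly the
// whole signed range (e.g. i = 2B; i > -2B; i -= 1B) do not overflow before
// the division. trip_count32 is a hypothetical helper name.
#if 0 // illustrative only; never compiled
#include <cassert>
#include <cstdint>

static uint32_t trip_count32(int32_t lb, int32_t ub, int32_t st) {
  if (st == 1) // most common case
    return ub >= lb ? (uint32_t)ub - (uint32_t)lb + 1 : 0;
  if (st < 0) // assumes st != INT32_MIN, as in the code above
    return lb >= ub ? ((uint32_t)lb - (uint32_t)ub) / (uint32_t)(-st) + 1 : 0;
  return ub >= lb ? ((uint32_t)ub - (uint32_t)lb) / (uint32_t)st + 1 : 0;
}

static void trip_count_examples() {
  assert(trip_count32(0, 9, 1) == 10);  // 0..9
  assert(trip_count32(10, 1, -3) == 4); // 10, 7, 4, 1
  assert(trip_count32(5, 4, 1) == 0);   // zero-trip
  // lb - ub would overflow a signed 32-bit int; the unsigned math still
  // yields the right answer: 2B, 1B, 0, -1B, -2B -> 5 iterations.
  assert(trip_count32(2000000000, -2000000000, -1000000000) == 5);
}
#endif
// ---------------------------------------------------------------------------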
  314. #if KMP_STATS_ENABLED
  315. if (KMP_MASTER_GTID(gtid)) {
  316. KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
  317. }
  318. #endif
  319. pr->u.p.lb = lb;
  320. pr->u.p.ub = ub;
  321. pr->u.p.st = st;
  322. pr->u.p.tc = tc;
  323. #if KMP_OS_WINDOWS
  324. pr->u.p.last_upper = ub + st;
  325. #endif /* KMP_OS_WINDOWS */
326. /* NOTE: only the active parallel region(s) have active ordered sections */
  327. if (active) {
  328. if (pr->flags.ordered) {
  329. pr->ordered_bumped = 0;
  330. pr->u.p.ordered_lower = 1;
  331. pr->u.p.ordered_upper = 0;
  332. }
  333. }
  334. switch (schedule) {
  335. #if KMP_STATIC_STEAL_ENABLED
  336. case kmp_sch_static_steal: {
  337. T ntc, init;
  338. KD_TRACE(100,
  339. ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
  340. gtid));
  341. ntc = (tc % chunk ? 1 : 0) + tc / chunk;
  342. if (nproc > 1 && ntc >= nproc) {
  343. KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
  344. T id = tid;
  345. T small_chunk, extras;
  346. kmp_uint32 old = UNUSED;
  347. int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
  348. if (traits_t<T>::type_size > 4) {
  349. // AC: TODO: check if 16-byte CAS available and use it to
  350. // improve performance (probably wait for explicit request
  351. // before spending time on this).
  352. // For now use dynamically allocated per-private-buffer lock,
  353. // free memory in __kmp_dispatch_next when status==0.
  354. pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
  355. __kmp_init_lock(pr->u.p.steal_lock);
  356. }
  357. small_chunk = ntc / nproc;
  358. extras = ntc % nproc;
  359. init = id * small_chunk + (id < extras ? id : extras);
  360. pr->u.p.count = init;
361. if (claimed) { // did we succeed in claiming our own buffer?
  362. pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
  363. // Other threads will inspect steal_flag when searching for a victim.
  364. // READY means other threads may steal from this thread from now on.
  365. KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
  366. } else {
367. // another thread has stolen our whole range
368. KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);
369. pr->u.p.ub = init; // mark that there are no iterations to work on
  370. }
  371. pr->u.p.parm2 = ntc; // save number of chunks
372. // parm3 is the number of times to attempt stealing, which is
373. // nproc (just a heuristic; could be optimized later on).
  374. pr->u.p.parm3 = nproc;
  375. pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
  376. break;
  377. } else {
  378. /* too few chunks: switching to kmp_sch_dynamic_chunked */
  379. schedule = kmp_sch_dynamic_chunked;
  380. KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
  381. "kmp_sch_dynamic_chunked\n",
  382. gtid));
  383. goto dynamic_init;
  384. break;
  385. } // if
  386. } // case
  387. #endif
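// --- Editorial sketch (not part of libomp) --------------------------------
// A hedged check of the initial chunk partition used by the static_steal case
// above: each thread starts with a contiguous range of chunk indexes
// [init, ub), where the first `extras` threads get one extra chunk. The
// helper below verifies that these ranges tile [0, ntc) exactly; its name and
// the example values are hypothetical.
#if 0 // illustrative only; never compiled
#include <cassert>

static void steal_partition_example() {
  const unsigned ntc = 23, nproc = 5; // 23 chunks over 5 threads
  unsigned small_chunk = ntc / nproc; // 4
  unsigned extras = ntc % nproc;      // 3
  unsigned expected_begin = 0;
  for (unsigned id = 0; id < nproc; ++id) {
    unsigned init = id * small_chunk + (id < extras ? id : extras);
    unsigned ub = init + small_chunk + (id < extras ? 1 : 0);
    assert(init == expected_begin); // contiguous with the previous range
    expected_begin = ub;
  }
  assert(expected_begin == ntc); // every chunk owned by exactly one thread
}
#endif
// ---------------------------------------------------------------------------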
  388. case kmp_sch_static_balanced: {
  389. T init, limit;
  390. KD_TRACE(
  391. 100,
  392. ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
  393. gtid));
  394. if (nproc > 1) {
  395. T id = tid;
  396. if (tc < nproc) {
  397. if (id < tc) {
  398. init = id;
  399. limit = id;
  400. pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
  401. } else {
  402. pr->u.p.count = 1; /* means no more chunks to execute */
  403. pr->u.p.parm1 = FALSE;
  404. break;
  405. }
  406. } else {
  407. T small_chunk = tc / nproc;
  408. T extras = tc % nproc;
  409. init = id * small_chunk + (id < extras ? id : extras);
  410. limit = init + small_chunk - (id < extras ? 0 : 1);
  411. pr->u.p.parm1 = (id == nproc - 1);
  412. }
  413. } else {
  414. if (tc > 0) {
  415. init = 0;
  416. limit = tc - 1;
  417. pr->u.p.parm1 = TRUE;
  418. } else {
  419. // zero trip count
  420. pr->u.p.count = 1; /* means no more chunks to execute */
  421. pr->u.p.parm1 = FALSE;
  422. break;
  423. }
  424. }
  425. #if USE_ITT_BUILD
  426. // Calculate chunk for metadata report
  427. if (itt_need_metadata_reporting)
  428. if (cur_chunk)
  429. *cur_chunk = limit - init + 1;
  430. #endif
  431. if (st == 1) {
  432. pr->u.p.lb = lb + init;
  433. pr->u.p.ub = lb + limit;
  434. } else {
  435. // calculated upper bound, "ub" is user-defined upper bound
  436. T ub_tmp = lb + limit * st;
  437. pr->u.p.lb = lb + init * st;
  438. // adjust upper bound to "ub" if needed, so that MS lastprivate will match
  439. // it exactly
  440. if (st > 0) {
  441. pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
  442. } else {
  443. pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
  444. }
  445. }
  446. if (pr->flags.ordered) {
  447. pr->u.p.ordered_lower = init;
  448. pr->u.p.ordered_upper = limit;
  449. }
  450. break;
  451. } // case
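// --- Editorial sketch (not part of libomp) --------------------------------
// A hedged walk-through of the static_balanced mapping above for the loop
// "for (i = 3; i <= 17; i += 2)" (lb=3, ub=17, st=2, tc=8) on 3 threads:
// iteration-index ranges [0,2], [3,5], [6,7] become loop bounds 3..7, 9..13,
// 15..17, and the computed upper bound is clamped to the user's ub so the
// lastprivate owner sees the exact user bound. Names/values are hypothetical.
#if 0 // illustrative only; never compiled
#include <cassert>

static void balanced_bounds_example() {
  const int lb = 3, ub = 17, st = 2, nproc = 3;
  const unsigned tc = (unsigned)(ub - lb) / (unsigned)st + 1;   // 8
  const unsigned small_chunk = tc / nproc, extras = tc % nproc; // 2 and 2
  for (int id = 0; id < nproc; ++id) {
    unsigned init = id * small_chunk + (id < (int)extras ? id : extras);
    unsigned limit = init + small_chunk - (id < (int)extras ? 0 : 1);
    int my_lb = lb + (int)init * st;
    int my_ub = lb + (int)limit * st;
    if (my_ub + st > ub) // clamp, mirroring the st > 0 branch above
      my_ub = ub;
    assert(my_lb >= lb && my_ub <= ub);
  }
}
#endif
// ---------------------------------------------------------------------------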
  452. case kmp_sch_static_balanced_chunked: {
  453. // similar to balanced, but chunk adjusted to multiple of simd width
  454. T nth = nproc;
  455. KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
  456. " -> falling-through to static_greedy\n",
  457. gtid));
  458. schedule = kmp_sch_static_greedy;
  459. if (nth > 1)
  460. pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
  461. else
  462. pr->u.p.parm1 = tc;
  463. break;
  464. } // case
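// --- Editorial sketch (not part of libomp) --------------------------------
// A hedged check of the per-thread chunk formula used just above:
// ceil(tc / nth) rounded up to a multiple of the simd width. The mask form
// `& ~(chunk - 1)` assumes the width passed by the compiler is a power of
// two. The example values below are hypothetical.
#if 0 // illustrative only; never compiled
#include <cassert>

static void balanced_chunked_example() {
  const unsigned tc = 1000, nth = 6, chunk = 8; // assume simd width 8
  unsigned per_thread = (tc + nth - 1) / nth;                 // 167
  unsigned rounded = (per_thread + chunk - 1) & ~(chunk - 1); // 168
  assert(rounded % chunk == 0);
  assert(rounded >= per_thread && rounded - per_thread < chunk);
  assert(rounded == 168);
}
#endif
// ---------------------------------------------------------------------------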
  465. case kmp_sch_guided_simd:
  466. case kmp_sch_guided_iterative_chunked: {
  467. KD_TRACE(
  468. 100,
  469. ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
  470. " case\n",
  471. gtid));
  472. if (nproc > 1) {
  473. if ((2L * chunk + 1) * nproc >= tc) {
  474. /* chunk size too large, switch to dynamic */
  475. schedule = kmp_sch_dynamic_chunked;
  476. goto dynamic_init;
  477. } else {
  478. // when remaining iters become less than parm2 - switch to dynamic
  479. pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
  480. *(double *)&pr->u.p.parm3 =
  481. guided_flt_param / (double)nproc; // may occupy parm3 and parm4
  482. }
  483. } else {
  484. KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
  485. "kmp_sch_static_greedy\n",
  486. gtid));
  487. schedule = kmp_sch_static_greedy;
  488. /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
  489. KD_TRACE(
  490. 100,
  491. ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
  492. gtid));
  493. pr->u.p.parm1 = tc;
  494. } // if
  495. } // case
  496. break;
  497. case kmp_sch_guided_analytical_chunked: {
  498. KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
  499. "kmp_sch_guided_analytical_chunked case\n",
  500. gtid));
  501. if (nproc > 1) {
  502. if ((2L * chunk + 1) * nproc >= tc) {
  503. /* chunk size too large, switch to dynamic */
  504. schedule = kmp_sch_dynamic_chunked;
  505. goto dynamic_init;
  506. } else {
  507. /* commonly used term: (2 nproc - 1)/(2 nproc) */
  508. DBL x;
  509. #if KMP_USE_X87CONTROL
  510. /* Linux* OS already has 64-bit computation by default for long double,
  511. and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
  512. Windows* OS on IA-32 architecture, we need to set precision to 64-bit
  513. instead of the default 53-bit. Even though long double doesn't work
  514. on Windows* OS on Intel(R) 64, the resulting lack of precision is not
  515. expected to impact the correctness of the algorithm, but this has not
  516. been mathematically proven. */
  517. // save original FPCW and set precision to 64-bit, as
  518. // Windows* OS on IA-32 architecture defaults to 53-bit
  519. unsigned int oldFpcw = _control87(0, 0);
  520. _control87(_PC_64, _MCW_PC); // 0,0x30000
  521. #endif
  522. /* value used for comparison in solver for cross-over point */
  523. KMP_ASSERT(tc > 0);
  524. long double target = ((long double)chunk * 2 + 1) * nproc / tc;
  525. /* crossover point--chunk indexes equal to or greater than
  526. this point switch to dynamic-style scheduling */
  527. UT cross;
  528. /* commonly used term: (2 nproc - 1)/(2 nproc) */
  529. x = 1.0 - 0.5 / (double)nproc;
  530. #ifdef KMP_DEBUG
  531. { // test natural alignment
  532. struct _test_a {
  533. char a;
  534. union {
  535. char b;
  536. DBL d;
  537. };
  538. } t;
  539. ptrdiff_t natural_alignment =
  540. (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
  541. //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
  542. // long)natural_alignment );
  543. KMP_DEBUG_ASSERT(
  544. (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
  545. }
  546. #endif // KMP_DEBUG
  547. /* save the term in thread private dispatch structure */
  548. *(DBL *)&pr->u.p.parm3 = x;
  549. /* solve for the crossover point to the nearest integer i for which C_i
  550. <= chunk */
  551. {
  552. UT left, right, mid;
  553. long double p;
  554. /* estimate initial upper and lower bound */
  555. /* doesn't matter what value right is as long as it is positive, but
  556. it affects performance of the solver */
  557. right = 229;
  558. p = __kmp_pow<UT>(x, right);
  559. if (p > target) {
  560. do {
  561. p *= p;
  562. right <<= 1;
  563. } while (p > target && right < (1 << 27));
  564. /* lower bound is previous (failed) estimate of upper bound */
  565. left = right >> 1;
  566. } else {
  567. left = 0;
  568. }
  569. /* bisection root-finding method */
  570. while (left + 1 < right) {
  571. mid = (left + right) / 2;
  572. if (__kmp_pow<UT>(x, mid) > target) {
  573. left = mid;
  574. } else {
  575. right = mid;
  576. }
  577. } // while
  578. cross = right;
  579. }
  580. /* assert sanity of computed crossover point */
  581. KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
  582. __kmp_pow<UT>(x, cross) <= target);
  583. /* save the crossover point in thread private dispatch structure */
  584. pr->u.p.parm2 = cross;
  585. // C75803
  586. #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
  587. #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
  588. #else
  589. #define GUIDED_ANALYTICAL_WORKAROUND (x)
  590. #endif
  591. /* dynamic-style scheduling offset */
  592. pr->u.p.count = tc -
  593. __kmp_dispatch_guided_remaining(
  594. tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
  595. cross * chunk;
  596. #if KMP_USE_X87CONTROL
  597. // restore FPCW
  598. _control87(oldFpcw, _MCW_PC);
  599. #endif
  600. } // if
  601. } else {
  602. KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
  603. "kmp_sch_static_greedy\n",
  604. gtid));
  605. schedule = kmp_sch_static_greedy;
  606. /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
  607. pr->u.p.parm1 = tc;
  608. } // if
  609. } // case
  610. break;
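// --- Editorial sketch (not part of libomp) --------------------------------
// A hedged standalone version of the crossover search used by the
// guided_analytical case above: bracket the answer by repeated squaring, then
// bisect until left + 1 == right, so that pow(x, cross) <= target while
// pow(x, cross - 1) > target. Plain double is used instead of the library's
// DBL/long double, and find_crossover is a hypothetical name.
#if 0 // illustrative only; never compiled
#include <cassert>
#include <cmath>

static unsigned long long find_crossover(double x, double target) {
  unsigned long long left, right = 229; // any positive start works
  double p = std::pow(x, (double)right);
  if (p > target) {
    do {
      p *= p; // p becomes pow(x, right * 2)
      right <<= 1;
    } while (p > target && right < (1ULL << 27));
    left = right >> 1; // previous (failed) upper-bound estimate
  } else {
    left = 0;
  }
  while (left + 1 < right) { // bisection
    unsigned long long mid = (left + right) / 2;
    if (std::pow(x, (double)mid) > target)
      left = mid;
    else
      right = mid;
  }
  return right;
}

static void crossover_example() {
  const double nproc = 8, chunk = 4, tc = 100000;
  double x = 1.0 - 0.5 / nproc;                 // (2*nproc - 1) / (2*nproc)
  double target = (2 * chunk + 1) * nproc / tc; // chunks at/after the
  unsigned long long cross = find_crossover(x, target); // crossover go dynamic
  assert(std::pow(x, (double)cross) <= target &&
         std::pow(x, (double)(cross - 1)) > target);
}
#endif
// ---------------------------------------------------------------------------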
  611. case kmp_sch_static_greedy:
  612. KD_TRACE(
  613. 100,
  614. ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
  615. gtid));
  616. pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
  617. break;
  618. case kmp_sch_static_chunked:
  619. case kmp_sch_dynamic_chunked:
  620. dynamic_init:
  621. if (tc == 0)
  622. break;
  623. if (pr->u.p.parm1 <= 0)
  624. pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
  625. else if (pr->u.p.parm1 > tc)
  626. pr->u.p.parm1 = tc;
  627. // Store the total number of chunks to prevent integer overflow during
  628. // bounds calculations in the get next chunk routine.
  629. pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0);
  630. KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
  631. "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
  632. gtid));
  633. break;
  634. case kmp_sch_trapezoidal: {
  635. /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
  636. T parm1, parm2, parm3, parm4;
  637. KD_TRACE(100,
  638. ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
  639. gtid));
  640. parm1 = chunk;
  641. /* F : size of the first cycle */
  642. parm2 = (tc / (2 * nproc));
  643. if (parm2 < 1) {
  644. parm2 = 1;
  645. }
  646. /* L : size of the last cycle. Make sure the last cycle is not larger
  647. than the first cycle. */
  648. if (parm1 < 1) {
  649. parm1 = 1;
  650. } else if (parm1 > parm2) {
  651. parm1 = parm2;
  652. }
  653. /* N : number of cycles */
  654. parm3 = (parm2 + parm1);
  655. parm3 = (2 * tc + parm3 - 1) / parm3;
  656. if (parm3 < 2) {
  657. parm3 = 2;
  658. }
  659. /* sigma : decreasing incr of the trapezoid */
  660. parm4 = (parm3 - 1);
  661. parm4 = (parm2 - parm1) / parm4;
  662. // pointless check, because parm4 >= 0 always
  663. // if ( parm4 < 0 ) {
  664. // parm4 = 0;
  665. //}
  666. pr->u.p.parm1 = parm1;
  667. pr->u.p.parm2 = parm2;
  668. pr->u.p.parm3 = parm3;
  669. pr->u.p.parm4 = parm4;
  670. } // case
  671. break;
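// --- Editorial sketch (not part of libomp) --------------------------------
// A hedged replay of the trapezoid parameters computed above for tc = 1000,
// nproc = 4, minimum chunk 5: F = tc/(2*nproc) = 125, L = 5,
// N = ceil(2*tc/(F+L)) = 16, delta = (F-L)/(N-1) = 8. The chunk sizes then
// step down 125, 117, ..., 5 and together cover at least tc iterations, so
// the schedule cannot run out of chunks early. The values are hypothetical.
#if 0 // illustrative only; never compiled
#include <cassert>

static void trapezoid_example() {
  const unsigned tc = 1000, nproc = 4;
  unsigned L = 5;                // parm1: minimum (last) chunk size
  unsigned F = tc / (2 * nproc); // parm2: first (largest) chunk size
  if (F < 1)
    F = 1;
  if (L > F)
    L = F;
  unsigned N = (2 * tc + (F + L) - 1) / (F + L); // parm3: number of cycles
  if (N < 2)
    N = 2;
  unsigned delta = (F - L) / (N - 1); // parm4: per-cycle decrement
  unsigned long long covered = 0;
  for (unsigned i = 0; i < N; ++i)
    covered += F - i * delta;
  assert(F == 125 && N == 16 && delta == 8);
  assert(covered >= tc); // flooring delta only makes chunks larger
}
#endif
// ---------------------------------------------------------------------------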
  672. default: {
  673. __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
  674. KMP_HNT(GetNewerLibrary), // Hint
  675. __kmp_msg_null // Variadic argument list terminator
  676. );
  677. } break;
  678. } // switch
  679. pr->schedule = schedule;
  680. }
  681. #if KMP_USE_HIER_SCHED
  682. template <typename T>
  683. inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
  684. typename traits_t<T>::signed_t st);
  685. template <>
  686. inline void
  687. __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
  688. kmp_int32 ub, kmp_int32 st) {
  689. __kmp_dispatch_init_hierarchy<kmp_int32>(
  690. loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
  691. __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
  692. }
  693. template <>
  694. inline void
  695. __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
  696. kmp_uint32 ub, kmp_int32 st) {
  697. __kmp_dispatch_init_hierarchy<kmp_uint32>(
  698. loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
  699. __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
  700. }
  701. template <>
  702. inline void
  703. __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
  704. kmp_int64 ub, kmp_int64 st) {
  705. __kmp_dispatch_init_hierarchy<kmp_int64>(
  706. loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
  707. __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
  708. }
  709. template <>
  710. inline void
  711. __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
  712. kmp_uint64 ub, kmp_int64 st) {
  713. __kmp_dispatch_init_hierarchy<kmp_uint64>(
  714. loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
  715. __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
  716. }
  717. // free all the hierarchy scheduling memory associated with the team
  718. void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
  719. int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  720. for (int i = 0; i < num_disp_buff; ++i) {
  721. // type does not matter here so use kmp_int32
  722. auto sh =
  723. reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
  724. &team->t.t_disp_buffer[i]);
  725. if (sh->hier) {
  726. sh->hier->deallocate();
  727. __kmp_free(sh->hier);
  728. }
  729. }
  730. }
  731. #endif
  732. // UT - unsigned flavor of T, ST - signed flavor of T,
  733. // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
  734. template <typename T>
  735. static void
  736. __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
  737. T ub, typename traits_t<T>::signed_t st,
  738. typename traits_t<T>::signed_t chunk, int push_ws) {
  739. typedef typename traits_t<T>::unsigned_t UT;
  740. int active;
  741. kmp_info_t *th;
  742. kmp_team_t *team;
  743. kmp_uint32 my_buffer_index;
  744. dispatch_private_info_template<T> *pr;
  745. dispatch_shared_info_template<T> volatile *sh;
  746. KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
  747. sizeof(dispatch_private_info));
  748. KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
  749. sizeof(dispatch_shared_info));
  750. __kmp_assert_valid_gtid(gtid);
  751. if (!TCR_4(__kmp_init_parallel))
  752. __kmp_parallel_initialize();
  753. __kmp_resume_if_soft_paused();
  754. #if INCLUDE_SSC_MARKS
  755. SSC_MARK_DISPATCH_INIT();
  756. #endif
  757. #ifdef KMP_DEBUG
  758. typedef typename traits_t<T>::signed_t ST;
  759. {
  760. char *buff;
  761. // create format specifiers before the debug output
  762. buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
  763. "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
  764. traits_t<ST>::spec, traits_t<T>::spec,
  765. traits_t<T>::spec, traits_t<ST>::spec);
  766. KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
  767. __kmp_str_free(&buff);
  768. }
  769. #endif
  770. /* setup data */
  771. th = __kmp_threads[gtid];
  772. team = th->th.th_team;
  773. active = !team->t.t_serialized;
  774. th->th.th_ident = loc;
  775. // Any half-decent optimizer will remove this test when the blocks are empty
  776. // since the macros expand to nothing
  777. // when statistics are disabled.
  778. if (schedule == __kmp_static) {
  779. KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
  780. } else {
  781. KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
  782. }
  783. #if KMP_USE_HIER_SCHED
784. // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE environment variable.
785. // Hierarchical scheduling does not work with ordered, so if ordered is
786. // detected, then revert to threaded scheduling.
  787. bool ordered;
  788. enum sched_type my_sched = schedule;
  789. my_buffer_index = th->th.th_dispatch->th_disp_index;
  790. pr = reinterpret_cast<dispatch_private_info_template<T> *>(
  791. &th->th.th_dispatch
  792. ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  793. my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
  794. if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
  795. my_sched =
  796. (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
  797. ordered = (kmp_ord_lower & my_sched);
  798. if (pr->flags.use_hier) {
  799. if (ordered) {
  800. KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
  801. "Disabling hierarchical scheduling.\n",
  802. gtid));
  803. pr->flags.use_hier = FALSE;
  804. }
  805. }
  806. if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
  807. // Don't use hierarchical for ordered parallel loops and don't
  808. // use the runtime hierarchy if one was specified in the program
  809. if (!ordered && !pr->flags.use_hier)
  810. __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
  811. }
  812. #endif // KMP_USE_HIER_SCHED
  813. #if USE_ITT_BUILD
  814. kmp_uint64 cur_chunk = chunk;
  815. int itt_need_metadata_reporting =
  816. __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
  817. KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
  818. team->t.t_active_level == 1;
  819. #endif
  820. if (!active) {
  821. pr = reinterpret_cast<dispatch_private_info_template<T> *>(
  822. th->th.th_dispatch->th_disp_buffer); /* top of the stack */
  823. } else {
  824. KMP_DEBUG_ASSERT(th->th.th_dispatch ==
  825. &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  826. my_buffer_index = th->th.th_dispatch->th_disp_index++;
827. /* What happens when the number of threads changes? Do we need to resize the buffer? */
  828. pr = reinterpret_cast<dispatch_private_info_template<T> *>(
  829. &th->th.th_dispatch
  830. ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  831. sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
  832. &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  833. KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
  834. my_buffer_index));
  835. if (sh->buffer_index != my_buffer_index) { // too many loops in progress?
  836. KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d"
  837. " sh->buffer_index:%d\n",
  838. gtid, my_buffer_index, sh->buffer_index));
  839. __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
  840. __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
841. // Note: KMP_WAIT() cannot be used here: buffer index and
842. // my_buffer_index are *always* 32-bit integers.
  843. KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
  844. "sh->buffer_index:%d\n",
  845. gtid, my_buffer_index, sh->buffer_index));
  846. }
  847. }
  848. __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
  849. #if USE_ITT_BUILD
  850. &cur_chunk,
  851. #endif
  852. chunk, (T)th->th.th_team_nproc,
  853. (T)th->th.th_info.ds.ds_tid);
  854. if (active) {
  855. if (pr->flags.ordered == 0) {
  856. th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
  857. th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
  858. } else {
  859. th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
  860. th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
  861. }
  862. th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
  863. th->th.th_dispatch->th_dispatch_sh_current =
  864. CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
  865. #if USE_ITT_BUILD
  866. if (pr->flags.ordered) {
  867. __kmp_itt_ordered_init(gtid);
  868. }
  869. // Report loop metadata
  870. if (itt_need_metadata_reporting) {
  871. // Only report metadata by primary thread of active team at level 1
  872. kmp_uint64 schedtype = 0;
  873. switch (schedule) {
  874. case kmp_sch_static_chunked:
  875. case kmp_sch_static_balanced: // Chunk is calculated in the switch above
  876. break;
  877. case kmp_sch_static_greedy:
  878. cur_chunk = pr->u.p.parm1;
  879. break;
  880. case kmp_sch_dynamic_chunked:
  881. schedtype = 1;
  882. break;
  883. case kmp_sch_guided_iterative_chunked:
  884. case kmp_sch_guided_analytical_chunked:
  885. case kmp_sch_guided_simd:
  886. schedtype = 2;
  887. break;
  888. default:
  889. // Should we put this case under "static"?
  890. // case kmp_sch_static_steal:
  891. schedtype = 3;
  892. break;
  893. }
  894. __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
  895. }
  896. #if KMP_USE_HIER_SCHED
  897. if (pr->flags.use_hier) {
  898. pr->u.p.count = 0;
  899. pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
  900. }
901. #endif // KMP_USE_HIER_SCHED
  902. #endif /* USE_ITT_BUILD */
  903. }
  904. #ifdef KMP_DEBUG
  905. {
  906. char *buff;
  907. // create format specifiers before the debug output
  908. buff = __kmp_str_format(
  909. "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
  910. "lb:%%%s ub:%%%s"
  911. " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
  912. " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
  913. traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
  914. traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
  915. traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
  916. traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
  917. KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
  918. pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
  919. pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
  920. pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
  921. __kmp_str_free(&buff);
  922. }
  923. #endif
  924. #if OMPT_SUPPORT && OMPT_OPTIONAL
  925. if (ompt_enabled.ompt_callback_work) {
  926. ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
  927. ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
  928. ompt_callbacks.ompt_callback(ompt_callback_work)(
  929. ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
  930. &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
  931. }
  932. #endif
  933. KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
  934. }
  935. /* For ordered loops, either __kmp_dispatch_finish() should be called after
  936. * every iteration, or __kmp_dispatch_finish_chunk() should be called after
  937. * every chunk of iterations. If the ordered section(s) were not executed
  938. * for this iteration (or every iteration in this chunk), we need to set the
  939. * ordered iteration counters so that the next thread can proceed. */
  940. template <typename UT>
  941. static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
  942. typedef typename traits_t<UT>::signed_t ST;
  943. __kmp_assert_valid_gtid(gtid);
  944. kmp_info_t *th = __kmp_threads[gtid];
  945. KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
  946. if (!th->th.th_team->t.t_serialized) {
  947. dispatch_private_info_template<UT> *pr =
  948. reinterpret_cast<dispatch_private_info_template<UT> *>(
  949. th->th.th_dispatch->th_dispatch_pr_current);
  950. dispatch_shared_info_template<UT> volatile *sh =
  951. reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
  952. th->th.th_dispatch->th_dispatch_sh_current);
  953. KMP_DEBUG_ASSERT(pr);
  954. KMP_DEBUG_ASSERT(sh);
  955. KMP_DEBUG_ASSERT(th->th.th_dispatch ==
  956. &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  957. if (pr->ordered_bumped) {
  958. KD_TRACE(
  959. 1000,
  960. ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
  961. gtid));
  962. pr->ordered_bumped = 0;
  963. } else {
  964. UT lower = pr->u.p.ordered_lower;
  965. #ifdef KMP_DEBUG
  966. {
  967. char *buff;
  968. // create format specifiers before the debug output
  969. buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
  970. "ordered_iteration:%%%s lower:%%%s\n",
  971. traits_t<UT>::spec, traits_t<UT>::spec);
  972. KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
  973. __kmp_str_free(&buff);
  974. }
  975. #endif
  976. __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
  977. __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
  978. KMP_MB(); /* is this necessary? */
  979. #ifdef KMP_DEBUG
  980. {
  981. char *buff;
  982. // create format specifiers before the debug output
  983. buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
  984. "ordered_iteration:%%%s lower:%%%s\n",
  985. traits_t<UT>::spec, traits_t<UT>::spec);
  986. KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
  987. __kmp_str_free(&buff);
  988. }
  989. #endif
  990. test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
  991. } // if
  992. } // if
  993. KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
  994. }
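// --- Editorial sketch (not part of libomp) --------------------------------
// A hedged model of the counter handshake above: when the ordered region was
// not entered for an iteration, its owner still waits until the shared
// counter reaches its ordered_lower (the __kmp_wait above) and then bumps it
// (the test_then_inc above), so the owner of the next iteration is released.
// Names are hypothetical; the real runtime waits with back-off, not a bare
// yield loop.
#if 0 // illustrative only; never compiled
#include <atomic>
#include <thread>

static std::atomic<unsigned long long> ordered_iteration{0};

static void finish_ordered_iteration(unsigned long long mine) {
  while (ordered_iteration.load(std::memory_order_acquire) < mine)
    std::this_thread::yield(); // stand-in for __kmp_wait<UT>()
  ordered_iteration.fetch_add(1, std::memory_order_release); // release next
}
#endif
// ---------------------------------------------------------------------------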
  995. #ifdef KMP_GOMP_COMPAT
  996. template <typename UT>
  997. static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
  998. typedef typename traits_t<UT>::signed_t ST;
  999. __kmp_assert_valid_gtid(gtid);
  1000. kmp_info_t *th = __kmp_threads[gtid];
  1001. KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
  1002. if (!th->th.th_team->t.t_serialized) {
  1003. dispatch_private_info_template<UT> *pr =
  1004. reinterpret_cast<dispatch_private_info_template<UT> *>(
  1005. th->th.th_dispatch->th_dispatch_pr_current);
  1006. dispatch_shared_info_template<UT> volatile *sh =
  1007. reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
  1008. th->th.th_dispatch->th_dispatch_sh_current);
  1009. KMP_DEBUG_ASSERT(pr);
  1010. KMP_DEBUG_ASSERT(sh);
  1011. KMP_DEBUG_ASSERT(th->th.th_dispatch ==
  1012. &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  1013. UT lower = pr->u.p.ordered_lower;
  1014. UT upper = pr->u.p.ordered_upper;
  1015. UT inc = upper - lower + 1;
  1016. if (pr->ordered_bumped == inc) {
  1017. KD_TRACE(
  1018. 1000,
  1019. ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
  1020. gtid));
  1021. pr->ordered_bumped = 0;
  1022. } else {
  1023. inc -= pr->ordered_bumped;
  1024. #ifdef KMP_DEBUG
  1025. {
  1026. char *buff;
  1027. // create format specifiers before the debug output
  1028. buff = __kmp_str_format(
  1029. "__kmp_dispatch_finish_chunk: T#%%d before wait: "
  1030. "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
  1031. traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
  1032. KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
  1033. __kmp_str_free(&buff);
  1034. }
  1035. #endif
  1036. __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
  1037. __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
  1038. KMP_MB(); /* is this necessary? */
  1039. KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
  1040. "ordered_bumped to zero\n",
  1041. gtid));
  1042. pr->ordered_bumped = 0;
1043. // TODO: check whether the inc should be unsigned or signed
  1044. #ifdef KMP_DEBUG
  1045. {
  1046. char *buff;
  1047. // create format specifiers before the debug output
  1048. buff = __kmp_str_format(
  1049. "__kmp_dispatch_finish_chunk: T#%%d after wait: "
  1050. "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
  1051. traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
  1052. traits_t<UT>::spec);
  1053. KD_TRACE(1000,
  1054. (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
  1055. __kmp_str_free(&buff);
  1056. }
  1057. #endif
  1058. test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
  1059. }
  1060. // }
  1061. }
  1062. KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
  1063. }
  1064. #endif /* KMP_GOMP_COMPAT */
  1065. template <typename T>
  1066. int __kmp_dispatch_next_algorithm(int gtid,
  1067. dispatch_private_info_template<T> *pr,
  1068. dispatch_shared_info_template<T> volatile *sh,
  1069. kmp_int32 *p_last, T *p_lb, T *p_ub,
  1070. typename traits_t<T>::signed_t *p_st, T nproc,
  1071. T tid) {
  1072. typedef typename traits_t<T>::unsigned_t UT;
  1073. typedef typename traits_t<T>::signed_t ST;
  1074. typedef typename traits_t<T>::floating_t DBL;
  1075. int status = 0;
  1076. bool last = false;
  1077. T start;
  1078. ST incr;
  1079. UT limit, trip, init;
  1080. kmp_info_t *th = __kmp_threads[gtid];
  1081. kmp_team_t *team = th->th.th_team;
  1082. KMP_DEBUG_ASSERT(th->th.th_dispatch ==
  1083. &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  1084. KMP_DEBUG_ASSERT(pr);
  1085. KMP_DEBUG_ASSERT(sh);
  1086. KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
  1087. #ifdef KMP_DEBUG
  1088. {
  1089. char *buff;
  1090. // create format specifiers before the debug output
  1091. buff =
  1092. __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
  1093. "sh:%%p nproc:%%%s tid:%%%s\n",
  1094. traits_t<T>::spec, traits_t<T>::spec);
  1095. KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
  1096. __kmp_str_free(&buff);
  1097. }
  1098. #endif
  1099. // zero trip count
  1100. if (pr->u.p.tc == 0) {
  1101. KD_TRACE(10,
  1102. ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
  1103. "zero status:%d\n",
  1104. gtid, status));
  1105. return 0;
  1106. }
  1107. switch (pr->schedule) {
  1108. #if KMP_STATIC_STEAL_ENABLED
  1109. case kmp_sch_static_steal: {
  1110. T chunk = pr->u.p.parm1;
  1111. UT nchunks = pr->u.p.parm2;
  1112. KD_TRACE(100,
  1113. ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
  1114. gtid));
  1115. trip = pr->u.p.tc - 1;
  1116. if (traits_t<T>::type_size > 4) {
  1117. // use lock for 8-byte induction variable.
  1118. // TODO (optional): check presence and use 16-byte CAS
  1119. kmp_lock_t *lck = pr->u.p.steal_lock;
  1120. KMP_DEBUG_ASSERT(lck != NULL);
  1121. if (pr->u.p.count < (UT)pr->u.p.ub) {
  1122. KMP_DEBUG_ASSERT(pr->steal_flag == READY);
  1123. __kmp_acquire_lock(lck, gtid);
  1124. // try to get own chunk of iterations
  1125. init = (pr->u.p.count)++;
  1126. status = (init < (UT)pr->u.p.ub);
  1127. __kmp_release_lock(lck, gtid);
  1128. } else {
  1129. status = 0; // no own chunks
  1130. }
  1131. if (!status) { // try to steal
  1132. kmp_lock_t *lckv; // victim buffer's lock
  1133. T while_limit = pr->u.p.parm3;
  1134. T while_index = 0;
  1135. int idx = (th->th.th_dispatch->th_disp_index - 1) %
  1136. __kmp_dispatch_num_buffers; // current loop index
  1137. // note: victim thread can potentially execute another loop
  1138. KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
  1139. while ((!status) && (while_limit != ++while_index)) {
  1140. dispatch_private_info_template<T> *v;
  1141. T remaining;
  1142. T victimId = pr->u.p.parm4;
  1143. T oldVictimId = victimId ? victimId - 1 : nproc - 1;
  1144. v = reinterpret_cast<dispatch_private_info_template<T> *>(
  1145. &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
  1146. KMP_DEBUG_ASSERT(v);
  1147. while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
  1148. oldVictimId != victimId) {
  1149. victimId = (victimId + 1) % nproc;
  1150. v = reinterpret_cast<dispatch_private_info_template<T> *>(
  1151. &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
  1152. KMP_DEBUG_ASSERT(v);
  1153. }
  1154. if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
  1155. continue; // try once more (nproc attempts in total)
  1156. }
  1157. if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
  1158. kmp_uint32 old = UNUSED;
  1159. // try to steal whole range from inactive victim
  1160. status = v->steal_flag.compare_exchange_strong(old, THIEF);
  1161. if (status) {
  1162. // initialize self buffer with victim's whole range of chunks
  1163. T id = victimId;
  1164. T small_chunk, extras;
  1165. small_chunk = nchunks / nproc; // chunks per thread
  1166. extras = nchunks % nproc;
  1167. init = id * small_chunk + (id < extras ? id : extras);
  1168. __kmp_acquire_lock(lck, gtid);
  1169. pr->u.p.count = init + 1; // exclude one we execute immediately
  1170. pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
  1171. __kmp_release_lock(lck, gtid);
  1172. pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
  1173. // no need to reinitialize other thread invariants: lb, st, etc.
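// Illustrative example (added, not from the original source): with
// nchunks = 10 and nproc = 4, small_chunk = 2 and extras = 2, so the
// per-thread chunk ranges are [0,3), [3,6), [6,8), [8,10). Adopting
// victim id = 1 therefore sets init = 3, count = 4 (chunk 3 is executed
// immediately) and ub = 6.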
  1174. #ifdef KMP_DEBUG
  1175. {
  1176. char *buff;
  1177. // create format specifiers before the debug output
  1178. buff = __kmp_str_format(
  1179. "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
  1180. "count:%%%s ub:%%%s\n",
  1181. traits_t<UT>::spec, traits_t<T>::spec);
  1182. KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
  1183. __kmp_str_free(&buff);
  1184. }
  1185. #endif
  1186. // activate non-empty buffer and let others steal from us
  1187. if (pr->u.p.count < (UT)pr->u.p.ub)
  1188. KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
  1189. break;
  1190. }
  1191. }
  1192. if (KMP_ATOMIC_LD_RLX(&v->steal_flag) != READY ||
  1193. v->u.p.count >= (UT)v->u.p.ub) {
  1194. pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
  1195. continue; // no chunks to steal, try next victim
  1196. }
  1197. lckv = v->u.p.steal_lock;
  1198. KMP_ASSERT(lckv != NULL);
  1199. __kmp_acquire_lock(lckv, gtid);
  1200. limit = v->u.p.ub; // keep initial ub
  1201. if (v->u.p.count >= limit) {
  1202. __kmp_release_lock(lckv, gtid);
  1203. pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
  1204. continue; // no chunks to steal, try next victim
  1205. }
1206. // stealing succeeded, reduce victim's ub by 1/4 of undone chunks
1207. // TODO: is this heuristic good enough??
  1208. remaining = limit - v->u.p.count;
  1209. if (remaining > 7) {
  1210. // steal 1/4 of remaining
  1211. KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
  1212. init = (v->u.p.ub -= (remaining >> 2));
  1213. } else {
  1214. // steal 1 chunk of 1..7 remaining
  1215. KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
  1216. init = (v->u.p.ub -= 1);
  1217. }
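// Worked example (illustrative): with remaining = 20 undone chunks the thief
// moves the victim's ub down by 20 >> 2 = 5, so the victim keeps 15 chunks
// and the thief takes the 5 tail chunks starting at the new ub (init).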
  1218. __kmp_release_lock(lckv, gtid);
  1219. #ifdef KMP_DEBUG
  1220. {
  1221. char *buff;
  1222. // create format specifiers before the debug output
  1223. buff = __kmp_str_format(
  1224. "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
  1225. "count:%%%s ub:%%%s\n",
  1226. traits_t<UT>::spec, traits_t<UT>::spec);
  1227. KD_TRACE(10, (buff, gtid, victimId, init, limit));
  1228. __kmp_str_free(&buff);
  1229. }
  1230. #endif
  1231. KMP_DEBUG_ASSERT(init + 1 <= limit);
  1232. pr->u.p.parm4 = victimId; // remember victim to steal from
  1233. status = 1;
  1234. // now update own count and ub with stolen range excluding init chunk
  1235. __kmp_acquire_lock(lck, gtid);
  1236. pr->u.p.count = init + 1;
  1237. pr->u.p.ub = limit;
  1238. __kmp_release_lock(lck, gtid);
  1239. // activate non-empty buffer and let others steal from us
  1240. if (init + 1 < limit)
  1241. KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
  1242. } // while (search for victim)
  1243. } // if (try to find victim and steal)
  1244. } else {
  1245. // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
  1246. // as all operations on pair (count, ub) must be done atomically
  1247. typedef union {
  1248. struct {
  1249. UT count;
  1250. T ub;
  1251. } p;
  1252. kmp_int64 b;
  1253. } union_i4;
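// Note (explanatory, added): in this branch the induction type is at most
// 4 bytes, so the (count, ub) pair fits in one 8-byte word and a single
// 64-bit CAS can update both fields together, keeping the owner and
// concurrent thieves consistent without a lock.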
  1254. union_i4 vold, vnew;
  1255. if (pr->u.p.count < (UT)pr->u.p.ub) {
  1256. KMP_DEBUG_ASSERT(pr->steal_flag == READY);
  1257. vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
  1258. vnew.b = vold.b;
  1259. vnew.p.count++; // get chunk from head of self range
  1260. while (!KMP_COMPARE_AND_STORE_REL64(
  1261. (volatile kmp_int64 *)&pr->u.p.count,
  1262. *VOLATILE_CAST(kmp_int64 *) & vold.b,
  1263. *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
  1264. KMP_CPU_PAUSE();
  1265. vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
  1266. vnew.b = vold.b;
  1267. vnew.p.count++;
  1268. }
  1269. init = vold.p.count;
  1270. status = (init < (UT)vold.p.ub);
  1271. } else {
  1272. status = 0; // no own chunks
  1273. }
  1274. if (!status) { // try to steal
  1275. T while_limit = pr->u.p.parm3;
  1276. T while_index = 0;
  1277. int idx = (th->th.th_dispatch->th_disp_index - 1) %
  1278. __kmp_dispatch_num_buffers; // current loop index
  1279. // note: victim thread can potentially execute another loop
  1280. KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
  1281. while ((!status) && (while_limit != ++while_index)) {
  1282. dispatch_private_info_template<T> *v;
  1283. T remaining;
  1284. T victimId = pr->u.p.parm4;
  1285. T oldVictimId = victimId ? victimId - 1 : nproc - 1;
  1286. v = reinterpret_cast<dispatch_private_info_template<T> *>(
  1287. &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
  1288. KMP_DEBUG_ASSERT(v);
  1289. while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
  1290. oldVictimId != victimId) {
  1291. victimId = (victimId + 1) % nproc;
  1292. v = reinterpret_cast<dispatch_private_info_template<T> *>(
  1293. &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
  1294. KMP_DEBUG_ASSERT(v);
  1295. }
  1296. if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
  1297. continue; // try once more (nproc attempts in total)
  1298. }
  1299. if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
  1300. kmp_uint32 old = UNUSED;
  1301. // try to steal whole range from inactive victim
  1302. status = v->steal_flag.compare_exchange_strong(old, THIEF);
  1303. if (status) {
  1304. // initialize self buffer with victim's whole range of chunks
  1305. T id = victimId;
  1306. T small_chunk, extras;
  1307. small_chunk = nchunks / nproc; // chunks per thread
  1308. extras = nchunks % nproc;
  1309. init = id * small_chunk + (id < extras ? id : extras);
  1310. vnew.p.count = init + 1;
  1311. vnew.p.ub = init + small_chunk + (id < extras ? 1 : 0);
  1312. // write pair (count, ub) at once atomically
  1313. #if KMP_ARCH_X86
  1314. KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
  1315. #else
  1316. *(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;
  1317. #endif
  1318. pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
  1319. // no need to initialize other thread invariants: lb, st, etc.
  1320. #ifdef KMP_DEBUG
  1321. {
  1322. char *buff;
  1323. // create format specifiers before the debug output
  1324. buff = __kmp_str_format(
  1325. "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
  1326. "count:%%%s ub:%%%s\n",
  1327. traits_t<UT>::spec, traits_t<T>::spec);
  1328. KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
  1329. __kmp_str_free(&buff);
  1330. }
  1331. #endif
  1332. // activate non-empty buffer and let others steal from us
  1333. if (pr->u.p.count < (UT)pr->u.p.ub)
  1334. KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
  1335. break;
  1336. }
  1337. }
  1338. while (1) { // CAS loop with check if victim still has enough chunks
  1339. // many threads may be stealing concurrently from same victim
  1340. vold.b = *(volatile kmp_int64 *)(&v->u.p.count);
  1341. if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
  1342. vold.p.count >= (UT)vold.p.ub) {
  1343. pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id
  1344. break; // no chunks to steal, try next victim
  1345. }
  1346. vnew.b = vold.b;
  1347. remaining = vold.p.ub - vold.p.count;
  1348. // try to steal 1/4 of remaining
1349. // TODO: is this heuristic good enough??
  1350. if (remaining > 7) {
  1351. vnew.p.ub -= remaining >> 2; // steal from tail of victim's range
  1352. } else {
  1353. vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining
  1354. }
  1355. KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);
  1356. if (KMP_COMPARE_AND_STORE_REL64(
  1357. (volatile kmp_int64 *)&v->u.p.count,
  1358. *VOLATILE_CAST(kmp_int64 *) & vold.b,
  1359. *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1360. // stealing succeeded
  1361. #ifdef KMP_DEBUG
  1362. {
  1363. char *buff;
  1364. // create format specifiers before the debug output
  1365. buff = __kmp_str_format(
  1366. "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
  1367. "count:%%%s ub:%%%s\n",
  1368. traits_t<T>::spec, traits_t<T>::spec);
  1369. KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));
  1370. __kmp_str_free(&buff);
  1371. }
  1372. #endif
  1373. KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
  1374. vold.p.ub - vnew.p.ub);
  1375. status = 1;
  1376. pr->u.p.parm4 = victimId; // keep victim id
  1377. // now update own count and ub
  1378. init = vnew.p.ub;
  1379. vold.p.count = init + 1;
  1380. #if KMP_ARCH_X86
  1381. KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
  1382. #else
  1383. *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
  1384. #endif
  1385. // activate non-empty buffer and let others steal from us
  1386. if (vold.p.count < (UT)vold.p.ub)
  1387. KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
  1388. break;
  1389. } // if (check CAS result)
1390. KMP_CPU_PAUSE(); // CAS failed, retry
  1391. } // while (try to steal from particular victim)
  1392. } // while (search for victim)
  1393. } // if (try to find victim and steal)
  1394. } // if (4-byte induction variable)
  1395. if (!status) {
  1396. *p_lb = 0;
  1397. *p_ub = 0;
  1398. if (p_st != NULL)
  1399. *p_st = 0;
  1400. } else {
  1401. start = pr->u.p.lb;
  1402. init *= chunk;
  1403. limit = chunk + init - 1;
  1404. incr = pr->u.p.st;
  1405. KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
  1406. KMP_DEBUG_ASSERT(init <= trip);
  1407. // keep track of done chunks for possible early exit from stealing
  1408. // TODO: count executed chunks locally with rare update of shared location
  1409. // test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
  1410. if ((last = (limit >= trip)) != 0)
  1411. limit = trip;
  1412. if (p_st != NULL)
  1413. *p_st = incr;
  1414. if (incr == 1) {
  1415. *p_lb = start + init;
  1416. *p_ub = start + limit;
  1417. } else {
  1418. *p_lb = start + init * incr;
  1419. *p_ub = start + limit * incr;
  1420. }
  1421. } // if
  1422. break;
  1423. } // case
  1424. #endif // KMP_STATIC_STEAL_ENABLED
  1425. case kmp_sch_static_balanced: {
  1426. KD_TRACE(
  1427. 10,
  1428. ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
  1429. gtid));
  1430. /* check if thread has any iteration to do */
  1431. if ((status = !pr->u.p.count) != 0) {
  1432. pr->u.p.count = 1;
  1433. *p_lb = pr->u.p.lb;
  1434. *p_ub = pr->u.p.ub;
  1435. last = (pr->u.p.parm1 != 0);
  1436. if (p_st != NULL)
  1437. *p_st = pr->u.p.st;
  1438. } else { /* no iterations to do */
  1439. pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
  1440. }
  1441. } // case
  1442. break;
  1443. case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
  1444. merged here */
  1445. case kmp_sch_static_chunked: {
  1446. T parm1;
  1447. KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
  1448. "kmp_sch_static_[affinity|chunked] case\n",
  1449. gtid));
  1450. parm1 = pr->u.p.parm1;
  1451. trip = pr->u.p.tc - 1;
  1452. init = parm1 * (pr->u.p.count + tid);
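// Illustrative note (added): chunks are handed out round-robin. Assuming
// count starts at 0, with nproc = 4 and parm1 = 8 the thread with tid = 1
// takes chunk indices 1, 5, 9, ..., i.e. iteration blocks [8,15], [40,47],
// [72,79], ... (count advances by nproc below).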
  1453. if ((status = (init <= trip)) != 0) {
  1454. start = pr->u.p.lb;
  1455. incr = pr->u.p.st;
  1456. limit = parm1 + init - 1;
  1457. if ((last = (limit >= trip)) != 0)
  1458. limit = trip;
  1459. if (p_st != NULL)
  1460. *p_st = incr;
  1461. pr->u.p.count += nproc;
  1462. if (incr == 1) {
  1463. *p_lb = start + init;
  1464. *p_ub = start + limit;
  1465. } else {
  1466. *p_lb = start + init * incr;
  1467. *p_ub = start + limit * incr;
  1468. }
  1469. if (pr->flags.ordered) {
  1470. pr->u.p.ordered_lower = init;
  1471. pr->u.p.ordered_upper = limit;
  1472. } // if
  1473. } // if
  1474. } // case
  1475. break;
  1476. case kmp_sch_dynamic_chunked: {
  1477. UT chunk_number;
  1478. UT chunk_size = pr->u.p.parm1;
  1479. UT nchunks = pr->u.p.parm2;
  1480. KD_TRACE(
  1481. 100,
  1482. ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
  1483. gtid));
  1484. chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
  1485. status = (chunk_number < nchunks);
  1486. if (!status) {
  1487. *p_lb = 0;
  1488. *p_ub = 0;
  1489. if (p_st != NULL)
  1490. *p_st = 0;
  1491. } else {
  1492. init = chunk_size * chunk_number;
  1493. trip = pr->u.p.tc - 1;
  1494. start = pr->u.p.lb;
  1495. incr = pr->u.p.st;
  1496. if ((last = (trip - init < (UT)chunk_size)))
  1497. limit = trip;
  1498. else
  1499. limit = chunk_size + init - 1;
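// Illustrative example: with tc = 103 (trip = 102) and chunk_size = 10, the
// chunk starting at init = 100 has only trip - init = 2 < 10 iterations left,
// so last is set and limit is clamped to trip, giving the block [100,102].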
  1500. if (p_st != NULL)
  1501. *p_st = incr;
  1502. if (incr == 1) {
  1503. *p_lb = start + init;
  1504. *p_ub = start + limit;
  1505. } else {
  1506. *p_lb = start + init * incr;
  1507. *p_ub = start + limit * incr;
  1508. }
  1509. if (pr->flags.ordered) {
  1510. pr->u.p.ordered_lower = init;
  1511. pr->u.p.ordered_upper = limit;
  1512. } // if
  1513. } // if
  1514. } // case
  1515. break;
  1516. case kmp_sch_guided_iterative_chunked: {
  1517. T chunkspec = pr->u.p.parm1;
  1518. KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
  1519. "iterative case\n",
  1520. gtid));
  1521. trip = pr->u.p.tc;
  1522. // Start atomic part of calculations
  1523. while (1) {
  1524. ST remaining; // signed, because can be < 0
  1525. init = sh->u.s.iteration; // shared value
  1526. remaining = trip - init;
  1527. if (remaining <= 0) { // AC: need to compare with 0 first
  1528. // nothing to do, don't try atomic op
  1529. status = 0;
  1530. break;
  1531. }
  1532. if ((T)remaining <
  1533. pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
  1534. // use dynamic-style schedule
  1535. // atomically increment iterations, get old value
  1536. init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
  1537. (ST)chunkspec);
  1538. remaining = trip - init;
  1539. if (remaining <= 0) {
  1540. status = 0; // all iterations got by other threads
  1541. } else {
  1542. // got some iterations to work on
  1543. status = 1;
  1544. if ((T)remaining > chunkspec) {
  1545. limit = init + chunkspec - 1;
  1546. } else {
  1547. last = true; // the last chunk
  1548. limit = init + remaining - 1;
  1549. } // if
  1550. } // if
  1551. break;
  1552. } // if
  1553. limit = init + (UT)((double)remaining *
  1554. *(double *)&pr->u.p.parm3); // divide by K*nproc
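// Illustrative example (assuming parm3 holds 1/(K*nproc) = 1/16, e.g.
// nproc = 8 with the default K = 2): remaining = 1000 yields
// limit = init + 62, i.e. this thread tries to claim 62 iterations; the CAS
// below publishes the claim and --limit converts it to an inclusive bound.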
  1555. if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
  1556. (ST)init, (ST)limit)) {
  1557. // CAS was successful, chunk obtained
  1558. status = 1;
  1559. --limit;
  1560. break;
  1561. } // if
  1562. } // while
  1563. if (status != 0) {
  1564. start = pr->u.p.lb;
  1565. incr = pr->u.p.st;
  1566. if (p_st != NULL)
  1567. *p_st = incr;
  1568. *p_lb = start + init * incr;
  1569. *p_ub = start + limit * incr;
  1570. if (pr->flags.ordered) {
  1571. pr->u.p.ordered_lower = init;
  1572. pr->u.p.ordered_upper = limit;
  1573. } // if
  1574. } else {
  1575. *p_lb = 0;
  1576. *p_ub = 0;
  1577. if (p_st != NULL)
  1578. *p_st = 0;
  1579. } // if
  1580. } // case
  1581. break;
  1582. case kmp_sch_guided_simd: {
  1583. // same as iterative but curr-chunk adjusted to be multiple of given
  1584. // chunk
  1585. T chunk = pr->u.p.parm1;
  1586. KD_TRACE(100,
  1587. ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
  1588. gtid));
  1589. trip = pr->u.p.tc;
  1590. // Start atomic part of calculations
  1591. while (1) {
  1592. ST remaining; // signed, because can be < 0
  1593. init = sh->u.s.iteration; // shared value
  1594. remaining = trip - init;
  1595. if (remaining <= 0) { // AC: need to compare with 0 first
  1596. status = 0; // nothing to do, don't try atomic op
  1597. break;
  1598. }
  1599. KMP_DEBUG_ASSERT(chunk && init % chunk == 0);
  1600. // compare with K*nproc*(chunk+1), K=2 by default
  1601. if ((T)remaining < pr->u.p.parm2) {
  1602. // use dynamic-style schedule
  1603. // atomically increment iterations, get old value
  1604. init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
  1605. (ST)chunk);
  1606. remaining = trip - init;
  1607. if (remaining <= 0) {
  1608. status = 0; // all iterations got by other threads
  1609. } else {
  1610. // got some iterations to work on
  1611. status = 1;
  1612. if ((T)remaining > chunk) {
  1613. limit = init + chunk - 1;
  1614. } else {
  1615. last = true; // the last chunk
  1616. limit = init + remaining - 1;
  1617. } // if
  1618. } // if
  1619. break;
  1620. } // if
  1621. // divide by K*nproc
  1622. UT span;
  1623. __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
  1624. &span);
  1625. UT rem = span % chunk;
  1626. if (rem) // adjust so that span%chunk == 0
  1627. span += chunk - rem;
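// Illustrative example: if the guided estimate is span = 100 and chunk = 16,
// then rem = 4 and span is rounded up to 112, keeping every claimed block a
// whole multiple of the requested (SIMD-friendly) chunk size.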
  1628. limit = init + span;
  1629. if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
  1630. (ST)init, (ST)limit)) {
  1631. // CAS was successful, chunk obtained
  1632. status = 1;
  1633. --limit;
  1634. break;
  1635. } // if
  1636. } // while
  1637. if (status != 0) {
  1638. start = pr->u.p.lb;
  1639. incr = pr->u.p.st;
  1640. if (p_st != NULL)
  1641. *p_st = incr;
  1642. *p_lb = start + init * incr;
  1643. *p_ub = start + limit * incr;
  1644. if (pr->flags.ordered) {
  1645. pr->u.p.ordered_lower = init;
  1646. pr->u.p.ordered_upper = limit;
  1647. } // if
  1648. } else {
  1649. *p_lb = 0;
  1650. *p_ub = 0;
  1651. if (p_st != NULL)
  1652. *p_st = 0;
  1653. } // if
  1654. } // case
  1655. break;
  1656. case kmp_sch_guided_analytical_chunked: {
  1657. T chunkspec = pr->u.p.parm1;
  1658. UT chunkIdx;
  1659. #if KMP_USE_X87CONTROL
  1660. /* for storing original FPCW value for Windows* OS on
  1661. IA-32 architecture 8-byte version */
  1662. unsigned int oldFpcw;
  1663. unsigned int fpcwSet = 0;
  1664. #endif
  1665. KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
  1666. "kmp_sch_guided_analytical_chunked case\n",
  1667. gtid));
  1668. trip = pr->u.p.tc;
  1669. KMP_DEBUG_ASSERT(nproc > 1);
  1670. KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
  1671. while (1) { /* this while loop is a safeguard against unexpected zero
  1672. chunk sizes */
  1673. chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
  1674. if (chunkIdx >= (UT)pr->u.p.parm2) {
  1675. --trip;
  1676. /* use dynamic-style scheduling */
  1677. init = chunkIdx * chunkspec + pr->u.p.count;
  1678. /* need to verify init > 0 in case of overflow in the above
  1679. * calculation */
  1680. if ((status = (init > 0 && init <= trip)) != 0) {
  1681. limit = init + chunkspec - 1;
  1682. if ((last = (limit >= trip)) != 0)
  1683. limit = trip;
  1684. }
  1685. break;
  1686. } else {
  1687. /* use exponential-style scheduling */
1688. /* The following check works around the lack of long double precision on
1689. Windows* OS,
1690. which can cause the effect that init != 0 for chunkIdx == 0.
1691. */
  1692. #if KMP_USE_X87CONTROL
  1693. /* If we haven't already done so, save original
  1694. FPCW and set precision to 64-bit, as Windows* OS
  1695. on IA-32 architecture defaults to 53-bit */
  1696. if (!fpcwSet) {
  1697. oldFpcw = _control87(0, 0);
  1698. _control87(_PC_64, _MCW_PC);
  1699. fpcwSet = 0x30000;
  1700. }
  1701. #endif
  1702. if (chunkIdx) {
  1703. init = __kmp_dispatch_guided_remaining<T>(
  1704. trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
  1705. KMP_DEBUG_ASSERT(init);
  1706. init = trip - init;
  1707. } else
  1708. init = 0;
  1709. limit = trip - __kmp_dispatch_guided_remaining<T>(
  1710. trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
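// Explanatory note (added): init and limit come from the closed-form
// estimate of iterations remaining after k chunks, so chunk k covers
// [trip - remaining(k), trip - remaining(k+1)), with chunks shrinking until
// the dynamic-style fallback above takes over.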
  1711. KMP_ASSERT(init <= limit);
  1712. if (init < limit) {
  1713. KMP_DEBUG_ASSERT(limit <= trip);
  1714. --limit;
  1715. status = 1;
  1716. break;
  1717. } // if
  1718. } // if
  1719. } // while (1)
  1720. #if KMP_USE_X87CONTROL
  1721. /* restore FPCW if necessary
  1722. AC: check fpcwSet flag first because oldFpcw can be uninitialized here
  1723. */
  1724. if (fpcwSet && (oldFpcw & fpcwSet))
  1725. _control87(oldFpcw, _MCW_PC);
  1726. #endif
  1727. if (status != 0) {
  1728. start = pr->u.p.lb;
  1729. incr = pr->u.p.st;
  1730. if (p_st != NULL)
  1731. *p_st = incr;
  1732. *p_lb = start + init * incr;
  1733. *p_ub = start + limit * incr;
  1734. if (pr->flags.ordered) {
  1735. pr->u.p.ordered_lower = init;
  1736. pr->u.p.ordered_upper = limit;
  1737. }
  1738. } else {
  1739. *p_lb = 0;
  1740. *p_ub = 0;
  1741. if (p_st != NULL)
  1742. *p_st = 0;
  1743. }
  1744. } // case
  1745. break;
  1746. case kmp_sch_trapezoidal: {
  1747. UT index;
  1748. T parm2 = pr->u.p.parm2;
  1749. T parm3 = pr->u.p.parm3;
  1750. T parm4 = pr->u.p.parm4;
  1751. KD_TRACE(100,
  1752. ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
  1753. gtid));
  1754. index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
  1755. init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
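// Explanatory note (added): this is the prefix sum of a decreasing arithmetic
// sequence of chunk sizes, where parm2 acts as the first chunk size and parm4
// as the per-chunk decrement. E.g. parm2 = 10, parm4 = 2, index = 3 gives
// init = 3*(20 - 2*2)/2 = 24 = 10 + 8 + 6, and the matching limit computed
// below is (4*(20 - 3*2))/2 - 1 = 27, i.e. a chunk of 4 iterations.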
  1756. trip = pr->u.p.tc - 1;
  1757. if ((status = ((T)index < parm3 && init <= trip)) == 0) {
  1758. *p_lb = 0;
  1759. *p_ub = 0;
  1760. if (p_st != NULL)
  1761. *p_st = 0;
  1762. } else {
  1763. start = pr->u.p.lb;
  1764. limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
  1765. incr = pr->u.p.st;
  1766. if ((last = (limit >= trip)) != 0)
  1767. limit = trip;
  1768. if (p_st != NULL)
  1769. *p_st = incr;
  1770. if (incr == 1) {
  1771. *p_lb = start + init;
  1772. *p_ub = start + limit;
  1773. } else {
  1774. *p_lb = start + init * incr;
  1775. *p_ub = start + limit * incr;
  1776. }
  1777. if (pr->flags.ordered) {
  1778. pr->u.p.ordered_lower = init;
  1779. pr->u.p.ordered_upper = limit;
  1780. } // if
  1781. } // if
  1782. } // case
  1783. break;
  1784. default: {
  1785. status = 0; // to avoid complaints on uninitialized variable use
  1786. __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
  1787. KMP_HNT(GetNewerLibrary), // Hint
  1788. __kmp_msg_null // Variadic argument list terminator
  1789. );
  1790. } break;
  1791. } // switch
  1792. if (p_last)
  1793. *p_last = last;
  1794. #ifdef KMP_DEBUG
  1795. if (pr->flags.ordered) {
  1796. char *buff;
  1797. // create format specifiers before the debug output
  1798. buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
  1799. "ordered_lower:%%%s ordered_upper:%%%s\n",
  1800. traits_t<UT>::spec, traits_t<UT>::spec);
  1801. KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
  1802. __kmp_str_free(&buff);
  1803. }
  1804. {
  1805. char *buff;
  1806. // create format specifiers before the debug output
  1807. buff = __kmp_str_format(
  1808. "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
  1809. "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
  1810. traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
  1811. KMP_DEBUG_ASSERT(p_last);
  1812. KMP_DEBUG_ASSERT(p_st);
  1813. KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
  1814. __kmp_str_free(&buff);
  1815. }
  1816. #endif
  1817. return status;
  1818. }
  1819. /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
  1820. work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
  1821. is not called. */
  1822. #if OMPT_SUPPORT && OMPT_OPTIONAL
  1823. #define OMPT_LOOP_END \
  1824. if (status == 0) { \
  1825. if (ompt_enabled.ompt_callback_work) { \
  1826. ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
  1827. ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
  1828. ompt_callbacks.ompt_callback(ompt_callback_work)( \
  1829. ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \
  1830. &(task_info->task_data), 0, codeptr); \
  1831. } \
  1832. }
  1833. #define OMPT_LOOP_DISPATCH(lb, ub, st, status) \
  1834. if (ompt_enabled.ompt_callback_dispatch && status) { \
  1835. ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
  1836. ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
  1837. ompt_dispatch_chunk_t chunk; \
  1838. ompt_data_t instance = ompt_data_none; \
  1839. OMPT_GET_DISPATCH_CHUNK(chunk, lb, ub, st); \
  1840. instance.ptr = &chunk; \
  1841. ompt_callbacks.ompt_callback(ompt_callback_dispatch)( \
  1842. &(team_info->parallel_data), &(task_info->task_data), \
  1843. ompt_dispatch_ws_loop_chunk, instance); \
  1844. }
  1845. // TODO: implement count
  1846. #else
  1847. #define OMPT_LOOP_END // no-op
  1848. #define OMPT_LOOP_DISPATCH(lb, ub, st, status) // no-op
  1849. #endif
  1850. #if KMP_STATS_ENABLED
  1851. #define KMP_STATS_LOOP_END \
  1852. { \
  1853. kmp_int64 u, l, t, i; \
  1854. l = (kmp_int64)(*p_lb); \
  1855. u = (kmp_int64)(*p_ub); \
  1856. i = (kmp_int64)(pr->u.p.st); \
  1857. if (status == 0) { \
  1858. t = 0; \
  1859. KMP_POP_PARTITIONED_TIMER(); \
  1860. } else if (i == 1) { \
  1861. if (u >= l) \
  1862. t = u - l + 1; \
  1863. else \
  1864. t = 0; \
  1865. } else if (i < 0) { \
  1866. if (l >= u) \
  1867. t = (l - u) / (-i) + 1; \
  1868. else \
  1869. t = 0; \
  1870. } else { \
  1871. if (u >= l) \
  1872. t = (u - l) / i + 1; \
  1873. else \
  1874. t = 0; \
  1875. } \
  1876. KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \
  1877. }
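// Example of the trip-count formula above (added for clarity): with
// *p_lb = 0, *p_ub = 9 and stride i = 3 the chunk covers iterations
// 0, 3, 6, 9, so t = (9 - 0) / 3 + 1 = 4.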
  1878. #else
  1879. #define KMP_STATS_LOOP_END /* Nothing */
  1880. #endif
  1881. template <typename T>
  1882. static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
  1883. T *p_lb, T *p_ub,
  1884. typename traits_t<T>::signed_t *p_st
  1885. #if OMPT_SUPPORT && OMPT_OPTIONAL
  1886. ,
  1887. void *codeptr
  1888. #endif
  1889. ) {
  1890. typedef typename traits_t<T>::unsigned_t UT;
  1891. typedef typename traits_t<T>::signed_t ST;
  1892. // This is potentially slightly misleading, schedule(runtime) will appear here
  1893. // even if the actual runtime schedule is static. (Which points out a
  1894. // disadvantage of schedule(runtime): even when static scheduling is used it
  1895. // costs more than a compile time choice to use static scheduling would.)
  1896. KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
  1897. int status;
  1898. dispatch_private_info_template<T> *pr;
  1899. __kmp_assert_valid_gtid(gtid);
  1900. kmp_info_t *th = __kmp_threads[gtid];
  1901. kmp_team_t *team = th->th.th_team;
  1902. KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
  1903. KD_TRACE(
  1904. 1000,
  1905. ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
  1906. gtid, p_lb, p_ub, p_st, p_last));
  1907. if (team->t.t_serialized) {
  1908. /* NOTE: serialize this dispatch because we are not at the active level */
  1909. pr = reinterpret_cast<dispatch_private_info_template<T> *>(
  1910. th->th.th_dispatch->th_disp_buffer); /* top of the stack */
  1911. KMP_DEBUG_ASSERT(pr);
  1912. if ((status = (pr->u.p.tc != 0)) == 0) {
  1913. *p_lb = 0;
  1914. *p_ub = 0;
  1915. // if ( p_last != NULL )
  1916. // *p_last = 0;
  1917. if (p_st != NULL)
  1918. *p_st = 0;
  1919. if (__kmp_env_consistency_check) {
  1920. if (pr->pushed_ws != ct_none) {
  1921. pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
  1922. }
  1923. }
  1924. } else if (pr->flags.nomerge) {
  1925. kmp_int32 last;
  1926. T start;
  1927. UT limit, trip, init;
  1928. ST incr;
  1929. T chunk = pr->u.p.parm1;
  1930. KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
  1931. gtid));
  1932. init = chunk * pr->u.p.count++;
  1933. trip = pr->u.p.tc - 1;
  1934. if ((status = (init <= trip)) == 0) {
  1935. *p_lb = 0;
  1936. *p_ub = 0;
  1937. // if ( p_last != NULL )
  1938. // *p_last = 0;
  1939. if (p_st != NULL)
  1940. *p_st = 0;
  1941. if (__kmp_env_consistency_check) {
  1942. if (pr->pushed_ws != ct_none) {
  1943. pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
  1944. }
  1945. }
  1946. } else {
  1947. start = pr->u.p.lb;
  1948. limit = chunk + init - 1;
  1949. incr = pr->u.p.st;
  1950. if ((last = (limit >= trip)) != 0) {
  1951. limit = trip;
  1952. #if KMP_OS_WINDOWS
  1953. pr->u.p.last_upper = pr->u.p.ub;
  1954. #endif /* KMP_OS_WINDOWS */
  1955. }
  1956. if (p_last != NULL)
  1957. *p_last = last;
  1958. if (p_st != NULL)
  1959. *p_st = incr;
  1960. if (incr == 1) {
  1961. *p_lb = start + init;
  1962. *p_ub = start + limit;
  1963. } else {
  1964. *p_lb = start + init * incr;
  1965. *p_ub = start + limit * incr;
  1966. }
  1967. if (pr->flags.ordered) {
  1968. pr->u.p.ordered_lower = init;
  1969. pr->u.p.ordered_upper = limit;
  1970. #ifdef KMP_DEBUG
  1971. {
  1972. char *buff;
  1973. // create format specifiers before the debug output
  1974. buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
  1975. "ordered_lower:%%%s ordered_upper:%%%s\n",
  1976. traits_t<UT>::spec, traits_t<UT>::spec);
  1977. KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
  1978. pr->u.p.ordered_upper));
  1979. __kmp_str_free(&buff);
  1980. }
  1981. #endif
  1982. } // if
  1983. } // if
  1984. } else {
  1985. pr->u.p.tc = 0;
  1986. *p_lb = pr->u.p.lb;
  1987. *p_ub = pr->u.p.ub;
  1988. #if KMP_OS_WINDOWS
  1989. pr->u.p.last_upper = *p_ub;
  1990. #endif /* KMP_OS_WINDOWS */
  1991. if (p_last != NULL)
  1992. *p_last = TRUE;
  1993. if (p_st != NULL)
  1994. *p_st = pr->u.p.st;
  1995. } // if
  1996. #ifdef KMP_DEBUG
  1997. {
  1998. char *buff;
  1999. // create format specifiers before the debug output
  2000. buff = __kmp_str_format(
  2001. "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
  2002. "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
  2003. traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
  2004. KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
  2005. (p_last ? *p_last : 0), status));
  2006. __kmp_str_free(&buff);
  2007. }
  2008. #endif
  2009. #if INCLUDE_SSC_MARKS
  2010. SSC_MARK_DISPATCH_NEXT();
  2011. #endif
  2012. OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
  2013. OMPT_LOOP_END;
  2014. KMP_STATS_LOOP_END;
  2015. return status;
  2016. } else {
  2017. kmp_int32 last = 0;
  2018. dispatch_shared_info_template<T> volatile *sh;
  2019. KMP_DEBUG_ASSERT(th->th.th_dispatch ==
  2020. &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  2021. pr = reinterpret_cast<dispatch_private_info_template<T> *>(
  2022. th->th.th_dispatch->th_dispatch_pr_current);
  2023. KMP_DEBUG_ASSERT(pr);
  2024. sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
  2025. th->th.th_dispatch->th_dispatch_sh_current);
  2026. KMP_DEBUG_ASSERT(sh);
  2027. #if KMP_USE_HIER_SCHED
  2028. if (pr->flags.use_hier)
  2029. status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
  2030. else
  2031. #endif // KMP_USE_HIER_SCHED
  2032. status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
  2033. p_st, th->th.th_team_nproc,
  2034. th->th.th_info.ds.ds_tid);
  2035. // status == 0: no more iterations to execute
  2036. if (status == 0) {
  2037. ST num_done;
  2038. num_done = test_then_inc<ST>(&sh->u.s.num_done);
  2039. #ifdef KMP_DEBUG
  2040. {
  2041. char *buff;
  2042. // create format specifiers before the debug output
  2043. buff = __kmp_str_format(
  2044. "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
  2045. traits_t<ST>::spec);
  2046. KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
  2047. __kmp_str_free(&buff);
  2048. }
  2049. #endif
  2050. #if KMP_USE_HIER_SCHED
  2051. pr->flags.use_hier = FALSE;
  2052. #endif
  2053. if (num_done == th->th.th_team_nproc - 1) {
  2054. #if KMP_STATIC_STEAL_ENABLED
  2055. if (pr->schedule == kmp_sch_static_steal) {
  2056. int i;
  2057. int idx = (th->th.th_dispatch->th_disp_index - 1) %
  2058. __kmp_dispatch_num_buffers; // current loop index
  2059. // loop complete, safe to destroy locks used for stealing
  2060. for (i = 0; i < th->th.th_team_nproc; ++i) {
  2061. dispatch_private_info_template<T> *buf =
  2062. reinterpret_cast<dispatch_private_info_template<T> *>(
  2063. &team->t.t_dispatch[i].th_disp_buffer[idx]);
  2064. KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive
  2065. KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED);
  2066. if (traits_t<T>::type_size > 4) {
  2067. // destroy locks used for stealing
  2068. kmp_lock_t *lck = buf->u.p.steal_lock;
  2069. KMP_ASSERT(lck != NULL);
  2070. __kmp_destroy_lock(lck);
  2071. __kmp_free(lck);
  2072. buf->u.p.steal_lock = NULL;
  2073. }
  2074. }
  2075. }
  2076. #endif
  2077. /* NOTE: release shared buffer to be reused */
  2078. KMP_MB(); /* Flush all pending memory write invalidates. */
  2079. sh->u.s.num_done = 0;
  2080. sh->u.s.iteration = 0;
  2081. /* TODO replace with general release procedure? */
  2082. if (pr->flags.ordered) {
  2083. sh->u.s.ordered_iteration = 0;
  2084. }
  2085. sh->buffer_index += __kmp_dispatch_num_buffers;
  2086. KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
  2087. gtid, sh->buffer_index));
  2088. KMP_MB(); /* Flush all pending memory write invalidates. */
  2089. } // if
  2090. if (__kmp_env_consistency_check) {
  2091. if (pr->pushed_ws != ct_none) {
  2092. pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
  2093. }
  2094. }
  2095. th->th.th_dispatch->th_deo_fcn = NULL;
  2096. th->th.th_dispatch->th_dxo_fcn = NULL;
  2097. th->th.th_dispatch->th_dispatch_sh_current = NULL;
  2098. th->th.th_dispatch->th_dispatch_pr_current = NULL;
  2099. } // if (status == 0)
  2100. #if KMP_OS_WINDOWS
  2101. else if (last) {
  2102. pr->u.p.last_upper = pr->u.p.ub;
  2103. }
  2104. #endif /* KMP_OS_WINDOWS */
  2105. if (p_last != NULL && status != 0)
  2106. *p_last = last;
  2107. } // if
  2108. #ifdef KMP_DEBUG
  2109. {
  2110. char *buff;
  2111. // create format specifiers before the debug output
  2112. buff = __kmp_str_format(
  2113. "__kmp_dispatch_next: T#%%d normal case: "
  2114. "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
  2115. traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
  2116. KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
  2117. (p_last ? *p_last : 0), status));
  2118. __kmp_str_free(&buff);
  2119. }
  2120. #endif
  2121. #if INCLUDE_SSC_MARKS
  2122. SSC_MARK_DISPATCH_NEXT();
  2123. #endif
  2124. OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
  2125. OMPT_LOOP_END;
  2126. KMP_STATS_LOOP_END;
  2127. return status;
  2128. }
  2129. /*!
  2130. @ingroup WORK_SHARING
  2131. @param loc source location information
  2132. @param global_tid global thread number
  2133. @return Zero if the parallel region is not active and this thread should execute
  2134. all sections, non-zero otherwise.
  2135. Beginning of sections construct.
2136. There are no implicit barriers in the "sections" calls; rather, the compiler
2137. should introduce an explicit barrier if one is required.
2138. This implementation is based on __kmp_dispatch_init, using the same constructs
2139. for shared data (sections cannot be nested directly in an omp for loop; there
2140. must be a parallel region in between).
  2141. */
  2142. kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 gtid) {
  2143. int active;
  2144. kmp_info_t *th;
  2145. kmp_team_t *team;
  2146. kmp_uint32 my_buffer_index;
  2147. dispatch_shared_info_template<kmp_int32> volatile *sh;
  2148. KMP_DEBUG_ASSERT(__kmp_init_serial);
  2149. if (!TCR_4(__kmp_init_parallel))
  2150. __kmp_parallel_initialize();
  2151. __kmp_resume_if_soft_paused();
  2152. /* setup data */
  2153. th = __kmp_threads[gtid];
  2154. team = th->th.th_team;
  2155. active = !team->t.t_serialized;
  2156. th->th.th_ident = loc;
  2157. KMP_COUNT_BLOCK(OMP_SECTIONS);
  2158. KD_TRACE(10, ("__kmpc_sections: called by T#%d\n", gtid));
  2159. if (active) {
  2160. // Setup sections in the same way as dynamic scheduled loops.
  2161. // We need one shared data: which section is to execute next.
  2162. // (in case parallel is not active, all sections will be executed on the
  2163. // same thread)
  2164. KMP_DEBUG_ASSERT(th->th.th_dispatch ==
  2165. &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  2166. my_buffer_index = th->th.th_dispatch->th_disp_index++;
  2167. // reuse shared data structures from dynamic sched loops:
  2168. sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
  2169. &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  2170. KD_TRACE(10, ("__kmpc_sections_init: T#%d my_buffer_index:%d\n", gtid,
  2171. my_buffer_index));
  2172. th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
  2173. th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
  2174. KD_TRACE(100, ("__kmpc_sections_init: T#%d before wait: my_buffer_index:%d "
  2175. "sh->buffer_index:%d\n",
  2176. gtid, my_buffer_index, sh->buffer_index));
  2177. __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
  2178. __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
2179. // Note: KMP_WAIT() cannot be used here: buffer index and
2180. // my_buffer_index are *always* 32-bit integers.
  2181. KMP_MB();
  2182. KD_TRACE(100, ("__kmpc_sections_init: T#%d after wait: my_buffer_index:%d "
  2183. "sh->buffer_index:%d\n",
  2184. gtid, my_buffer_index, sh->buffer_index));
  2185. th->th.th_dispatch->th_dispatch_pr_current =
  2186. nullptr; // sections construct doesn't need private data
  2187. th->th.th_dispatch->th_dispatch_sh_current =
  2188. CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
  2189. }
  2190. #if OMPT_SUPPORT && OMPT_OPTIONAL
  2191. if (ompt_enabled.ompt_callback_work) {
  2192. ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
  2193. ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
  2194. ompt_callbacks.ompt_callback(ompt_callback_work)(
  2195. ompt_work_sections, ompt_scope_begin, &(team_info->parallel_data),
  2196. &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
  2197. }
  2198. #endif
  2199. KMP_PUSH_PARTITIONED_TIMER(OMP_sections);
  2200. return active;
  2201. }
  2202. /*!
  2203. @ingroup WORK_SHARING
  2204. @param loc source location information
  2205. @param global_tid global thread number
  2206. @param numberOfSections number of sections in the 'sections' construct
2207. @return unsigned value in [0, n): the id of the section to execute next on this
2208. thread; n (or any other value outside that range) means there is nothing to
2209. execute on this thread
  2210. */
  2211. kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 gtid,
  2212. kmp_int32 numberOfSections) {
  2213. KMP_TIME_PARTITIONED_BLOCK(OMP_sections);
  2214. kmp_info_t *th = __kmp_threads[gtid];
  2215. #ifdef KMP_DEBUG
  2216. kmp_team_t *team = th->th.th_team;
  2217. #endif
  2218. KD_TRACE(1000, ("__kmp_dispatch_next: T#%d; number of sections:%d\n", gtid,
  2219. numberOfSections));
2220. // This function should not be called for the serialized case:
  2221. KMP_DEBUG_ASSERT(!team->t.t_serialized);
  2222. dispatch_shared_info_template<kmp_int32> volatile *sh;
  2223. KMP_DEBUG_ASSERT(th->th.th_dispatch ==
  2224. &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  2225. KMP_DEBUG_ASSERT(!(th->th.th_dispatch->th_dispatch_pr_current));
  2226. sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
  2227. th->th.th_dispatch->th_dispatch_sh_current);
  2228. KMP_DEBUG_ASSERT(sh);
  2229. kmp_int32 sectionIndex = 0;
  2230. bool moreSectionsToExecute = true;
  2231. // Find section to execute:
  2232. sectionIndex = test_then_inc<kmp_int32>((kmp_int32 *)&sh->u.s.iteration);
  2233. if (sectionIndex >= numberOfSections) {
  2234. moreSectionsToExecute = false;
  2235. }
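// Illustrative example: with numberOfSections = 3 the first three increments
// across the team return sectionIndex 0, 1, 2; any later caller gets a value
// >= 3 and falls through to the "no more sections" path below.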
  2236. // status == 0: no more sections to execute;
  2237. // OMPTODO: __kmpc_end_sections could be bypassed?
  2238. if (!moreSectionsToExecute) {
  2239. kmp_int32 num_done;
  2240. num_done = test_then_inc<kmp_int32>((kmp_int32 *)(&sh->u.s.num_done));
  2241. if (num_done == th->th.th_team_nproc - 1) {
  2242. /* NOTE: release this buffer to be reused */
  2243. KMP_MB(); /* Flush all pending memory write invalidates. */
  2244. sh->u.s.num_done = 0;
  2245. sh->u.s.iteration = 0;
  2246. KMP_MB(); /* Flush all pending memory write invalidates. */
  2247. sh->buffer_index += __kmp_dispatch_num_buffers;
  2248. KD_TRACE(100, ("__kmpc_next_section: T#%d change buffer_index:%d\n", gtid,
  2249. sh->buffer_index));
  2250. KMP_MB(); /* Flush all pending memory write invalidates. */
  2251. } // if
  2252. th->th.th_dispatch->th_deo_fcn = NULL;
  2253. th->th.th_dispatch->th_dxo_fcn = NULL;
  2254. th->th.th_dispatch->th_dispatch_sh_current = NULL;
  2255. th->th.th_dispatch->th_dispatch_pr_current = NULL;
  2256. #if OMPT_SUPPORT && OMPT_OPTIONAL
  2257. if (ompt_enabled.ompt_callback_dispatch) {
  2258. ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
  2259. ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
  2260. ompt_data_t instance = ompt_data_none;
  2261. instance.ptr = OMPT_GET_RETURN_ADDRESS(0);
  2262. ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
  2263. &(team_info->parallel_data), &(task_info->task_data),
  2264. ompt_dispatch_section, instance);
  2265. }
  2266. #endif
  2267. KMP_POP_PARTITIONED_TIMER();
  2268. }
  2269. return sectionIndex;
  2270. }
  2271. /*!
  2272. @ingroup WORK_SHARING
  2273. @param loc source location information
  2274. @param global_tid global thread number
  2275. End of "sections" construct.
  2276. Don't need to wait here: barrier is added separately when needed.
  2277. */
  2278. void __kmpc_end_sections(ident_t *loc, kmp_int32 gtid) {
  2279. kmp_info_t *th = __kmp_threads[gtid];
  2280. int active = !th->th.th_team->t.t_serialized;
  2281. KD_TRACE(100, ("__kmpc_end_sections: T#%d called\n", gtid));
  2282. if (!active) {
2283. // In the active case, finalization is done in __kmpc_next_section
  2284. #if OMPT_SUPPORT && OMPT_OPTIONAL
  2285. if (ompt_enabled.ompt_callback_work) {
  2286. ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
  2287. ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
  2288. ompt_callbacks.ompt_callback(ompt_callback_work)(
  2289. ompt_work_sections, ompt_scope_end, &(team_info->parallel_data),
  2290. &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
  2291. }
  2292. #endif
  2293. KMP_POP_PARTITIONED_TIMER();
  2294. }
  2295. KD_TRACE(100, ("__kmpc_end_sections: T#%d returned\n", gtid));
  2296. }
  2297. template <typename T>
  2298. static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
  2299. kmp_int32 *plastiter, T *plower, T *pupper,
  2300. typename traits_t<T>::signed_t incr) {
  2301. typedef typename traits_t<T>::unsigned_t UT;
  2302. kmp_uint32 team_id;
  2303. kmp_uint32 nteams;
  2304. UT trip_count;
  2305. kmp_team_t *team;
  2306. kmp_info_t *th;
  2307. KMP_DEBUG_ASSERT(plastiter && plower && pupper);
  2308. KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
  2309. #ifdef KMP_DEBUG
  2310. typedef typename traits_t<T>::signed_t ST;
  2311. {
  2312. char *buff;
  2313. // create format specifiers before the debug output
  2314. buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
  2315. "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
  2316. traits_t<T>::spec, traits_t<T>::spec,
  2317. traits_t<ST>::spec, traits_t<T>::spec);
  2318. KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
  2319. __kmp_str_free(&buff);
  2320. }
  2321. #endif
  2322. if (__kmp_env_consistency_check) {
  2323. if (incr == 0) {
  2324. __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
  2325. loc);
  2326. }
  2327. if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
  2328. // The loop is illegal.
  2329. // Some zero-trip loops maintained by compiler, e.g.:
  2330. // for(i=10;i<0;++i) // lower >= upper - run-time check
  2331. // for(i=0;i>10;--i) // lower <= upper - run-time check
  2332. // for(i=0;i>10;++i) // incr > 0 - compile-time check
  2333. // for(i=10;i<0;--i) // incr < 0 - compile-time check
  2334. // Compiler does not check the following illegal loops:
  2335. // for(i=0;i<10;i+=incr) // where incr<0
  2336. // for(i=10;i>0;i-=incr) // where incr<0
  2337. __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
  2338. }
  2339. }
  2340. __kmp_assert_valid_gtid(gtid);
  2341. th = __kmp_threads[gtid];
  2342. team = th->th.th_team;
  2343. KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
  2344. nteams = th->th.th_teams_size.nteams;
  2345. team_id = team->t.t_master_tid;
  2346. KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
  2347. // compute global trip count
  2348. if (incr == 1) {
  2349. trip_count = *pupper - *plower + 1;
  2350. } else if (incr == -1) {
  2351. trip_count = *plower - *pupper + 1;
  2352. } else if (incr > 0) {
  2353. // upper-lower can exceed the limit of signed type
  2354. trip_count = (UT)(*pupper - *plower) / incr + 1;
  2355. } else {
  2356. trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
  2357. }
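// Illustrative example: lb = 0, ub = 99, incr = 2 gives
// trip_count = (99 - 0) / 2 + 1 = 50.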
  2358. if (trip_count <= nteams) {
  2359. KMP_DEBUG_ASSERT(
  2360. __kmp_static == kmp_sch_static_greedy ||
  2361. __kmp_static ==
  2362. kmp_sch_static_balanced); // Unknown static scheduling type.
  2363. // only some teams get single iteration, others get nothing
  2364. if (team_id < trip_count) {
  2365. *pupper = *plower = *plower + team_id * incr;
  2366. } else {
  2367. *plower = *pupper + incr; // zero-trip loop
  2368. }
  2369. if (plastiter != NULL)
  2370. *plastiter = (team_id == trip_count - 1);
  2371. } else {
  2372. if (__kmp_static == kmp_sch_static_balanced) {
  2373. UT chunk = trip_count / nteams;
  2374. UT extras = trip_count % nteams;
  2375. *plower +=
  2376. incr * (team_id * chunk + (team_id < extras ? team_id : extras));
  2377. *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
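// Illustrative example: trip_count = 50, nteams = 4 gives chunk = 12 and
// extras = 2, so teams 0 and 1 receive 13 iterations each and teams 2 and 3
// receive 12, matching the lower/upper bounds computed above.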
  2378. if (plastiter != NULL)
  2379. *plastiter = (team_id == nteams - 1);
  2380. } else {
  2381. T chunk_inc_count =
  2382. (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
  2383. T upper = *pupper;
  2384. KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
  2385. // Unknown static scheduling type.
  2386. *plower += team_id * chunk_inc_count;
  2387. *pupper = *plower + chunk_inc_count - incr;
  2388. // Check/correct bounds if needed
  2389. if (incr > 0) {
  2390. if (*pupper < *plower)
  2391. *pupper = traits_t<T>::max_value;
  2392. if (plastiter != NULL)
  2393. *plastiter = *plower <= upper && *pupper > upper - incr;
  2394. if (*pupper > upper)
  2395. *pupper = upper; // tracker C73258
  2396. } else {
  2397. if (*pupper > *plower)
  2398. *pupper = traits_t<T>::min_value;
  2399. if (plastiter != NULL)
  2400. *plastiter = *plower >= upper && *pupper < upper - incr;
  2401. if (*pupper < upper)
  2402. *pupper = upper; // tracker C73258
  2403. }
  2404. }
  2405. }
  2406. }
  2407. //-----------------------------------------------------------------------------
  2408. // Dispatch routines
  2409. // Transfer call to template< type T >
  2410. // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
  2411. // T lb, T ub, ST st, ST chunk )
  2412. extern "C" {
  2413. /*!
  2414. @ingroup WORK_SHARING
  2415. @{
  2416. @param loc Source location
  2417. @param gtid Global thread id
  2418. @param schedule Schedule type
  2419. @param lb Lower bound
  2420. @param ub Upper bound
  2421. @param st Step (or increment if you prefer)
  2422. @param chunk The chunk size to block with
  2423. This function prepares the runtime to start a dynamically scheduled for loop,
  2424. saving the loop arguments.
  2425. These functions are all identical apart from the types of the arguments.
  2426. */
  2427. void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
  2428. enum sched_type schedule, kmp_int32 lb,
  2429. kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
  2430. KMP_DEBUG_ASSERT(__kmp_init_serial);
  2431. #if OMPT_SUPPORT && OMPT_OPTIONAL
  2432. OMPT_STORE_RETURN_ADDRESS(gtid);
  2433. #endif
  2434. __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
  2435. }
  2436. /*!
  2437. See @ref __kmpc_dispatch_init_4
  2438. */
  2439. void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
  2440. enum sched_type schedule, kmp_uint32 lb,
  2441. kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
  2442. KMP_DEBUG_ASSERT(__kmp_init_serial);
  2443. #if OMPT_SUPPORT && OMPT_OPTIONAL
  2444. OMPT_STORE_RETURN_ADDRESS(gtid);
  2445. #endif
  2446. __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
  2447. }
  2448. /*!
  2449. See @ref __kmpc_dispatch_init_4
  2450. */
  2451. void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
  2452. enum sched_type schedule, kmp_int64 lb,
  2453. kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
  2454. KMP_DEBUG_ASSERT(__kmp_init_serial);
  2455. #if OMPT_SUPPORT && OMPT_OPTIONAL
  2456. OMPT_STORE_RETURN_ADDRESS(gtid);
  2457. #endif
  2458. __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
  2459. }
  2460. /*!
  2461. See @ref __kmpc_dispatch_init_4
  2462. */
  2463. void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
  2464. enum sched_type schedule, kmp_uint64 lb,
  2465. kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
  2466. KMP_DEBUG_ASSERT(__kmp_init_serial);
  2467. #if OMPT_SUPPORT && OMPT_OPTIONAL
  2468. OMPT_STORE_RETURN_ADDRESS(gtid);
  2469. #endif
  2470. __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
  2471. }
  2472. /*!
  2473. See @ref __kmpc_dispatch_init_4
2474. These functions differ from the __kmpc_dispatch_init set in that they are called
2475. for the composite 'distribute parallel for' construct, so the per-team iteration
2476. space must be computed before regular iteration dispatching.
  2477. These functions are all identical apart from the types of the arguments.
  2478. */
  2479. void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
  2480. enum sched_type schedule, kmp_int32 *p_last,
  2481. kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
  2482. kmp_int32 chunk) {
  2483. KMP_DEBUG_ASSERT(__kmp_init_serial);
  2484. #if OMPT_SUPPORT && OMPT_OPTIONAL
  2485. OMPT_STORE_RETURN_ADDRESS(gtid);
  2486. #endif
  2487. __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
  2488. __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
  2489. }
  2490. void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
  2491. enum sched_type schedule, kmp_int32 *p_last,
  2492. kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
  2493. kmp_int32 chunk) {
  2494. KMP_DEBUG_ASSERT(__kmp_init_serial);
  2495. #if OMPT_SUPPORT && OMPT_OPTIONAL
  2496. OMPT_STORE_RETURN_ADDRESS(gtid);
  2497. #endif
  2498. __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
  2499. __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
  2500. }
  2501. void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
  2502. enum sched_type schedule, kmp_int32 *p_last,
  2503. kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
  2504. kmp_int64 chunk) {
  2505. KMP_DEBUG_ASSERT(__kmp_init_serial);
  2506. #if OMPT_SUPPORT && OMPT_OPTIONAL
  2507. OMPT_STORE_RETURN_ADDRESS(gtid);
  2508. #endif
  2509. __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
  2510. __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
  2511. }
  2512. void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
  2513. enum sched_type schedule, kmp_int32 *p_last,
  2514. kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
  2515. kmp_int64 chunk) {
  2516. KMP_DEBUG_ASSERT(__kmp_init_serial);
  2517. #if OMPT_SUPPORT && OMPT_OPTIONAL
  2518. OMPT_STORE_RETURN_ADDRESS(gtid);
  2519. #endif
  2520. __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
  2521. __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
  2522. }
  2523. /*!
  2524. @param loc Source code location
  2525. @param gtid Global thread id
  2526. @param p_last Pointer to a flag set to one if this is the last chunk or zero
  2527. otherwise
  2528. @param p_lb Pointer to the lower bound for the next chunk of work
  2529. @param p_ub Pointer to the upper bound for the next chunk of work
  2530. @param p_st Pointer to the stride for the next chunk of work
  2531. @return one if there is work to be done, zero otherwise
  2532. Get the next dynamically allocated chunk of work for this thread.
2533. If there is no more work, then lb, ub and stride need not be modified.
  2534. */
  2535. int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
  2536. kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
  2537. #if OMPT_SUPPORT && OMPT_OPTIONAL
  2538. OMPT_STORE_RETURN_ADDRESS(gtid);
  2539. #endif
  2540. return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
  2541. #if OMPT_SUPPORT && OMPT_OPTIONAL
  2542. ,
  2543. OMPT_LOAD_RETURN_ADDRESS(gtid)
  2544. #endif
  2545. );
  2546. }
  2547. /*!
  2548. See @ref __kmpc_dispatch_next_4
  2549. */
  2550. int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
  2551. kmp_uint32 *p_lb, kmp_uint32 *p_ub,
  2552. kmp_int32 *p_st) {
  2553. #if OMPT_SUPPORT && OMPT_OPTIONAL
  2554. OMPT_STORE_RETURN_ADDRESS(gtid);
  2555. #endif
  2556. return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
  2557. #if OMPT_SUPPORT && OMPT_OPTIONAL
  2558. ,
  2559. OMPT_LOAD_RETURN_ADDRESS(gtid)
  2560. #endif
  2561. );
  2562. }
  2563. /*!
  2564. See @ref __kmpc_dispatch_next_4
  2565. */
  2566. int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
  2567. kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
  2568. #if OMPT_SUPPORT && OMPT_OPTIONAL
  2569. OMPT_STORE_RETURN_ADDRESS(gtid);
  2570. #endif
  2571. return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
  2572. #if OMPT_SUPPORT && OMPT_OPTIONAL
  2573. ,
  2574. OMPT_LOAD_RETURN_ADDRESS(gtid)
  2575. #endif
  2576. );
  2577. }
  2578. /*!
  2579. See @ref __kmpc_dispatch_next_4
  2580. */
  2581. int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
  2582. kmp_uint64 *p_lb, kmp_uint64 *p_ub,
  2583. kmp_int64 *p_st) {
  2584. #if OMPT_SUPPORT && OMPT_OPTIONAL
  2585. OMPT_STORE_RETURN_ADDRESS(gtid);
  2586. #endif
  2587. return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
  2588. #if OMPT_SUPPORT && OMPT_OPTIONAL
  2589. ,
  2590. OMPT_LOAD_RETURN_ADDRESS(gtid)
  2591. #endif
  2592. );
  2593. }
  2594. /*!
  2595. @param loc Source code location
  2596. @param gtid Global thread id
  2597. Mark the end of a dynamic loop.
  2598. */
  2599. void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
  2600. __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
  2601. }
  2602. /*!
  2603. See @ref __kmpc_dispatch_fini_4
  2604. */
  2605. void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
  2606. __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
  2607. }
  2608. /*!
  2609. See @ref __kmpc_dispatch_fini_4
  2610. */
  2611. void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
  2612. __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
  2613. }
  2614. /*!
  2615. See @ref __kmpc_dispatch_fini_4
  2616. */
  2617. void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
  2618. __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
  2619. }
  2620. /*! @} */
  2621. //-----------------------------------------------------------------------------
  2622. // Non-template routines from kmp_dispatch.cpp used in other sources
  2623. kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
  2624. return value == checker;
  2625. }
  2626. kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
  2627. return value != checker;
  2628. }
  2629. kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
  2630. return value < checker;
  2631. }
  2632. kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
  2633. return value >= checker;
  2634. }
  2635. kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
  2636. return value <= checker;
  2637. }
  2638. kmp_uint32
  2639. __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
  2640. kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
  2641. void *obj // Higher-level synchronization object, or NULL.
  2642. ) {
  2643. // note: we may not belong to a team at this point
  2644. volatile kmp_uint32 *spin = spinner;
  2645. kmp_uint32 check = checker;
  2646. kmp_uint32 spins;
  2647. kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
  2648. kmp_uint32 r;
  2649. kmp_uint64 time;
  2650. KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
  2651. KMP_INIT_YIELD(spins);
  2652. KMP_INIT_BACKOFF(time);
  2653. // main wait spin loop
  2654. while (!f(r = TCR_4(*spin), check)) {
  2655. KMP_FSYNC_SPIN_PREPARE(obj);
  2656. /* GEH - remove this since it was accidentally introduced when kmp_wait was
  2657. split. It causes problems with infinite recursion because of exit lock */
  2658. /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
  2659. __kmp_abort_thread(); */
  2660. KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
  2661. }
  2662. KMP_FSYNC_SPIN_ACQUIRED(obj);
  2663. return r;
  2664. }
  2665. void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
  2666. kmp_uint32 (*pred)(void *, kmp_uint32),
  2667. void *obj // Higher-level synchronization object, or NULL.
  2668. ) {
  2669. // note: we may not belong to a team at this point
  2670. void *spin = spinner;
  2671. kmp_uint32 check = checker;
  2672. kmp_uint32 spins;
  2673. kmp_uint32 (*f)(void *, kmp_uint32) = pred;
  2674. kmp_uint64 time;
  2675. KMP_FSYNC_SPIN_INIT(obj, spin);
  2676. KMP_INIT_YIELD(spins);
  2677. KMP_INIT_BACKOFF(time);
  2678. // main wait spin loop
  2679. while (!f(spin, check)) {
  2680. KMP_FSYNC_SPIN_PREPARE(obj);
2681. /* if we have waited a bit, or are oversubscribed, yield */
  2682. /* pause is in the following code */
  2683. KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
  2684. }
  2685. KMP_FSYNC_SPIN_ACQUIRED(obj);
  2686. }
  2687. } // extern "C"
  2688. #ifdef KMP_GOMP_COMPAT
  2689. void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
  2690. enum sched_type schedule, kmp_int32 lb,
  2691. kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
  2692. int push_ws) {
  2693. __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
  2694. push_ws);
  2695. }
  2696. void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
  2697. enum sched_type schedule, kmp_uint32 lb,
  2698. kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
  2699. int push_ws) {
  2700. __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
  2701. push_ws);
  2702. }
  2703. void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
  2704. enum sched_type schedule, kmp_int64 lb,
  2705. kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
  2706. int push_ws) {
  2707. __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
  2708. push_ws);
  2709. }
  2710. void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
  2711. enum sched_type schedule, kmp_uint64 lb,
  2712. kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
  2713. int push_ws) {
  2714. __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
  2715. push_ws);
  2716. }
  2717. void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
  2718. __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
  2719. }
  2720. void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
  2721. __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
  2722. }
  2723. void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
  2724. __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
  2725. }
  2726. void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
  2727. __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
  2728. }
  2729. #endif /* KMP_GOMP_COMPAT */
  2730. /* ------------------------------------------------------------------------ */