kmp_barrier.cpp

  1. /*
  2. * kmp_barrier.cpp
  3. */
  4. //===----------------------------------------------------------------------===//
  5. //
  6. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  7. // See https://llvm.org/LICENSE.txt for license information.
  8. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  9. //
  10. //===----------------------------------------------------------------------===//
  11. #include "kmp_wait_release.h"
  12. #include "kmp_barrier.h"
  13. #include "kmp_itt.h"
  14. #include "kmp_os.h"
  15. #include "kmp_stats.h"
  16. #include "ompt-specific.h"
  17. // for distributed barrier
  18. #include "kmp_affinity.h"
  19. #if KMP_MIC
  20. #include <immintrin.h>
  21. #define USE_NGO_STORES 1
  22. #endif // KMP_MIC
  23. #if KMP_MIC && USE_NGO_STORES
  24. // ICV copying
  25. #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
  26. #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
  27. #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
  28. #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
  29. #else
  30. #define ngo_load(src) ((void)0)
  31. #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
  32. #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
  33. #define ngo_sync() ((void)0)
  34. #endif /* KMP_MIC && USE_NGO_STORES */
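// On KMP_MIC with USE_NGO_STORES, ICVs and go flags are copied with 512-bit
// non-globally-ordered stores, and ngo_sync() issues a locked add that acts as
// a full memory fence; on every other target the ngo_* macros fall back to
// copy_icvs()/KMP_MEMCPY or expand to nothing.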
  35. void __kmp_print_structure(void); // Forward declaration
  36. // ---------------------------- Barrier Algorithms ----------------------------
  37. // Distributed barrier
  38. // Compute how many threads to have polling each cache-line.
  39. // We want to limit the number of writes to IDEAL_GO_RESOLUTION.
  40. void distributedBarrier::computeVarsForN(size_t n) {
  41. int nsockets = 1;
  42. if (__kmp_topology) {
  43. int socket_level = __kmp_topology->get_level(KMP_HW_SOCKET);
  44. int core_level = __kmp_topology->get_level(KMP_HW_CORE);
  45. int ncores_per_socket =
  46. __kmp_topology->calculate_ratio(core_level, socket_level);
  47. nsockets = __kmp_topology->get_count(socket_level);
  48. if (nsockets <= 0)
  49. nsockets = 1;
  50. if (ncores_per_socket <= 0)
  51. ncores_per_socket = 1;
  52. threads_per_go = ncores_per_socket >> 1;
  53. if (!fix_threads_per_go) {
  54. // Minimize num_gos
  55. if (threads_per_go > 4) {
  56. if (KMP_OPTIMIZE_FOR_REDUCTIONS) {
  57. threads_per_go = threads_per_go >> 1;
  58. }
  59. if (threads_per_go > 4 && nsockets == 1)
  60. threads_per_go = threads_per_go >> 1;
  61. }
  62. }
  63. if (threads_per_go == 0)
  64. threads_per_go = 1;
  65. fix_threads_per_go = true;
  66. num_gos = n / threads_per_go;
  67. if (n % threads_per_go)
  68. num_gos++;
  69. if (nsockets == 1 || num_gos == 1)
  70. num_groups = 1;
  71. else {
  72. num_groups = num_gos / nsockets;
  73. if (num_gos % nsockets)
  74. num_groups++;
  75. }
  76. if (num_groups <= 0)
  77. num_groups = 1;
  78. gos_per_group = num_gos / num_groups;
  79. if (num_gos % num_groups)
  80. gos_per_group++;
  81. threads_per_group = threads_per_go * gos_per_group;
  82. } else {
  83. num_gos = n / threads_per_go;
  84. if (n % threads_per_go)
  85. num_gos++;
  86. if (num_gos == 1)
  87. num_groups = 1;
  88. else {
  89. num_groups = num_gos / 2;
  90. if (num_gos % 2)
  91. num_groups++;
  92. }
  93. gos_per_group = num_gos / num_groups;
  94. if (num_gos % num_groups)
  95. gos_per_group++;
  96. threads_per_group = threads_per_go * gos_per_group;
  97. }
  98. }
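// Illustrative numbers (not from any particular machine): ncores_per_socket of
// 8 gives threads_per_go = 4; with n = 22 threads and nsockets = 2,
// num_gos = ceil(22/4) = 6, num_groups = ceil(6/2) = 3,
// gos_per_group = ceil(6/3) = 2, and threads_per_group = 4 * 2 = 8.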
  99. void distributedBarrier::computeGo(size_t n) {
  100. // Minimize num_gos
  101. for (num_gos = 1;; num_gos++)
  102. if (IDEAL_CONTENTION * num_gos >= n)
  103. break;
  104. threads_per_go = n / num_gos;
  105. if (n % num_gos)
  106. threads_per_go++;
  107. while (num_gos > MAX_GOS) {
  108. threads_per_go++;
  109. num_gos = n / threads_per_go;
  110. if (n % threads_per_go)
  111. num_gos++;
  112. }
  113. computeVarsForN(n);
  114. }
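// computeGo() picks the smallest num_gos such that IDEAL_CONTENTION * num_gos
// covers n, grows threads_per_go until num_gos fits within MAX_GOS, and then
// lets computeVarsForN(n) refine the grouping. For example, assuming (not
// checked here) IDEAL_CONTENTION == 16, n = 100 yields num_gos = 7 and
// threads_per_go = ceil(100/7) = 15 before the refinement step.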
  115. // This function resizes the barrier arrays when the new number of threads
  116. // exceeds max_threads, the current capacity of all the arrays.
  117. void distributedBarrier::resize(size_t nthr) {
  118. KMP_DEBUG_ASSERT(nthr > max_threads);
  119. // expand to requested size * 2
  120. max_threads = nthr * 2;
  121. // allocate arrays to new max threads
  122. for (int i = 0; i < MAX_ITERS; ++i) {
  123. if (flags[i])
  124. flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i],
  125. max_threads * sizeof(flags_s));
  126. else
  127. flags[i] = (flags_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(flags_s));
  128. }
  129. if (go)
  130. go = (go_s *)KMP_INTERNAL_REALLOC(go, max_threads * sizeof(go_s));
  131. else
  132. go = (go_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(go_s));
  133. if (iter)
  134. iter = (iter_s *)KMP_INTERNAL_REALLOC(iter, max_threads * sizeof(iter_s));
  135. else
  136. iter = (iter_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(iter_s));
  137. if (sleep)
  138. sleep =
  139. (sleep_s *)KMP_INTERNAL_REALLOC(sleep, max_threads * sizeof(sleep_s));
  140. else
  141. sleep = (sleep_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(sleep_s));
  142. }
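// Note: growth is geometric (twice the requested size), so repeated team-size
// increases amortize the reallocation cost; flags[] is MAX_ITERS parallel
// arrays, one per tracked barrier iteration.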
  143. // This function sets all the go flags that threads might be waiting on;
  144. // when blocktime is not infinite, it should be followed by a wake-up call
  145. // to each thread.
  146. kmp_uint64 distributedBarrier::go_release() {
  147. kmp_uint64 next_go = iter[0].iter + distributedBarrier::MAX_ITERS;
  148. for (size_t j = 0; j < num_gos; j++) {
  149. go[j].go.store(next_go);
  150. }
  151. return next_go;
  152. }
  153. void distributedBarrier::go_reset() {
  154. for (size_t j = 0; j < max_threads; ++j) {
  155. for (size_t i = 0; i < distributedBarrier::MAX_ITERS; ++i) {
  156. flags[i][j].stillNeed = 1;
  157. }
  158. go[j].go.store(0);
  159. iter[j].iter = 0;
  160. }
  161. }
  162. // This function initializes or re-initializes the distributed barrier for a
  163. // particular number of threads, calling resize() first if the arrays must grow.
  164. void distributedBarrier::init(size_t nthr) {
  165. size_t old_max = max_threads;
  166. if (nthr > max_threads) { // need more space in arrays
  167. resize(nthr);
  168. }
  169. for (size_t i = 0; i < max_threads; i++) {
  170. for (size_t j = 0; j < distributedBarrier::MAX_ITERS; j++) {
  171. flags[j][i].stillNeed = 1;
  172. }
  173. go[i].go.store(0);
  174. iter[i].iter = 0;
  175. if (i >= old_max)
  176. sleep[i].sleep = false;
  177. }
  178. // Recalculate num_gos, etc. based on new nthr
  179. computeVarsForN(nthr);
  180. num_threads = nthr;
  181. if (team_icvs == NULL)
  182. team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t));
  183. }
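// Note that init() re-arms every slot up to max_threads, not just the first
// nthr, so slots left over from a previously larger team stay in a consistent
// state; only slots beyond the old maximum get their sleep flag cleared here.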
  184. // This function is used only when KMP_BLOCKTIME is not infinite.
  185. // static
  186. void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team,
  187. size_t start, size_t stop, size_t inc,
  188. size_t tid) {
  189. KMP_DEBUG_ASSERT(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME);
  190. if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
  191. return;
  192. kmp_info_t **other_threads = team->t.t_threads;
  193. for (size_t thr = start; thr < stop; thr += inc) {
  194. KMP_DEBUG_ASSERT(other_threads[thr]);
  195. int gtid = other_threads[thr]->th.th_info.ds.ds_gtid;
  196. // Wake up the worker regardless of whether it appears to be sleeping
  197. __kmp_atomic_resume_64(gtid, (kmp_atomic_flag_64<> *)NULL);
  198. }
  199. }
  200. static void __kmp_dist_barrier_gather(
  201. enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
  202. void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  203. KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather);
  204. kmp_team_t *team;
  205. distributedBarrier *b;
  206. kmp_info_t **other_threads;
  207. kmp_uint64 my_current_iter, my_next_iter;
  208. kmp_uint32 nproc;
  209. bool group_leader;
  210. team = this_thr->th.th_team;
  211. nproc = this_thr->th.th_team_nproc;
  212. other_threads = team->t.t_threads;
  213. b = team->t.b;
  214. my_current_iter = b->iter[tid].iter;
  215. my_next_iter = (my_current_iter + 1) % distributedBarrier::MAX_ITERS;
  216. group_leader = ((tid % b->threads_per_group) == 0);
  217. KA_TRACE(20,
  218. ("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n",
  219. gtid, team->t.t_id, tid, bt));
  220. #if USE_ITT_BUILD && USE_ITT_NOTIFY
  221. // Barrier imbalance - save arrive time to the thread
  222. if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
  223. this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
  224. __itt_get_timestamp();
  225. }
  226. #endif
  227. if (group_leader) {
  228. // Start from the thread after the group leader
  229. size_t group_start = tid + 1;
  230. size_t group_end = tid + b->threads_per_group;
  231. size_t threads_pending = 0;
  232. if (group_end > nproc)
  233. group_end = nproc;
  234. do { // wait for threads in my group
  235. threads_pending = 0;
  236. // Check all the flags every time to avoid branch misprediction
  237. for (size_t thr = group_start; thr < group_end; thr++) {
  238. // Each thread uses a different cache line
  239. threads_pending += b->flags[my_current_iter][thr].stillNeed;
  240. }
  241. // Execute tasks here
  242. if (__kmp_tasking_mode != tskm_immediate_exec) {
  243. kmp_task_team_t *task_team = this_thr->th.th_task_team;
  244. if (task_team != NULL) {
  245. if (TCR_SYNC_4(task_team->tt.tt_active)) {
  246. if (KMP_TASKING_ENABLED(task_team)) {
  247. int tasks_completed = FALSE;
  248. __kmp_atomic_execute_tasks_64(
  249. this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
  250. &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
  251. } else
  252. this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
  253. }
  254. } else {
  255. this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
  256. } // if
  257. }
  258. if (TCR_4(__kmp_global.g.g_done)) {
  259. if (__kmp_global.g.g_abort)
  260. __kmp_abort_thread();
  261. break;
  262. } else if (__kmp_tasking_mode != tskm_immediate_exec &&
  263. this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
  264. this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
  265. }
  266. } while (threads_pending > 0);
  267. if (reduce) { // Perform reduction if needed
  268. OMPT_REDUCTION_DECL(this_thr, gtid);
  269. OMPT_REDUCTION_BEGIN;
  270. // Group leader reduces all threads in group
  271. for (size_t thr = group_start; thr < group_end; thr++) {
  272. (*reduce)(this_thr->th.th_local.reduce_data,
  273. other_threads[thr]->th.th_local.reduce_data);
  274. }
  275. OMPT_REDUCTION_END;
  276. }
  277. // Set flag for next iteration
  278. b->flags[my_next_iter][tid].stillNeed = 1;
  279. // Each thread uses a different cache line; reset stillNeed to 0 to
  280. // indicate that this thread has reached the barrier
  281. b->flags[my_current_iter][tid].stillNeed = 0;
  282. do { // wait for all group leaders
  283. threads_pending = 0;
  284. for (size_t thr = 0; thr < nproc; thr += b->threads_per_group) {
  285. threads_pending += b->flags[my_current_iter][thr].stillNeed;
  286. }
  287. // Execute tasks here
  288. if (__kmp_tasking_mode != tskm_immediate_exec) {
  289. kmp_task_team_t *task_team = this_thr->th.th_task_team;
  290. if (task_team != NULL) {
  291. if (TCR_SYNC_4(task_team->tt.tt_active)) {
  292. if (KMP_TASKING_ENABLED(task_team)) {
  293. int tasks_completed = FALSE;
  294. __kmp_atomic_execute_tasks_64(
  295. this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
  296. &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
  297. } else
  298. this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
  299. }
  300. } else {
  301. this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
  302. } // if
  303. }
  304. if (TCR_4(__kmp_global.g.g_done)) {
  305. if (__kmp_global.g.g_abort)
  306. __kmp_abort_thread();
  307. break;
  308. } else if (__kmp_tasking_mode != tskm_immediate_exec &&
  309. this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
  310. this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
  311. }
  312. } while (threads_pending > 0);
  313. if (reduce) { // Perform reduction if needed
  314. if (KMP_MASTER_TID(tid)) { // Master reduces over group leaders
  315. OMPT_REDUCTION_DECL(this_thr, gtid);
  316. OMPT_REDUCTION_BEGIN;
  317. for (size_t thr = b->threads_per_group; thr < nproc;
  318. thr += b->threads_per_group) {
  319. (*reduce)(this_thr->th.th_local.reduce_data,
  320. other_threads[thr]->th.th_local.reduce_data);
  321. }
  322. OMPT_REDUCTION_END;
  323. }
  324. }
  325. } else {
  326. // Set flag for next iteration
  327. b->flags[my_next_iter][tid].stillNeed = 1;
  329. // Each thread uses a different cache line; reset stillNeed to 0 to
  330. // indicate that this thread has reached the barrier
  330. b->flags[my_current_iter][tid].stillNeed = 0;
  331. }
  332. KMP_MFENCE();
  333. KA_TRACE(20,
  334. ("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
  335. gtid, team->t.t_id, tid, bt));
  336. }
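// Summary of the gather above: each group leader spins on the stillNeed flags
// of the threads in its group (executing tasks while it waits), reduces their
// data if a reduce callback was supplied, flips its own stillNeed flags, and
// then spins on the stillNeed flags of the other group leaders; the primary
// thread finally reduces across the group leaders. Non-leaders only flip their
// own flags and leave.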
  337. static void __kmp_dist_barrier_release(
  338. enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
  339. int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  340. KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release);
  341. kmp_team_t *team;
  342. distributedBarrier *b;
  343. kmp_bstate_t *thr_bar;
  344. kmp_uint64 my_current_iter, next_go;
  345. size_t my_go_index;
  346. bool group_leader;
  347. KA_TRACE(20, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n",
  348. gtid, tid, bt));
  349. thr_bar = &this_thr->th.th_bar[bt].bb;
  350. if (!KMP_MASTER_TID(tid)) {
  351. // Workers and non-master group leaders need to check their presence in the team
  352. do {
  353. if (this_thr->th.th_used_in_team.load() != 1 &&
  354. this_thr->th.th_used_in_team.load() != 3) {
  355. // Thread is not in use in a team. Wait on location in tid's thread
  356. // struct. The 0 value tells anyone looking that this thread is spinning
  357. // or sleeping until this location becomes 3 again; 3 is the transition
  358. // state to get to 1 which is waiting on go and being in the team
  359. kmp_flag_32<false, false> my_flag(&(this_thr->th.th_used_in_team), 3);
  360. if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2,
  361. 0) ||
  362. this_thr->th.th_used_in_team.load() == 0) {
  363. my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
  364. }
  365. #if USE_ITT_BUILD && USE_ITT_NOTIFY
  366. if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
  367. // In fork barrier where we could not get the object reliably
  368. itt_sync_obj =
  369. __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
  370. // Cancel wait on previous parallel region...
  371. __kmp_itt_task_starting(itt_sync_obj);
  372. if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
  373. return;
  374. itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
  375. if (itt_sync_obj != NULL)
  376. // Call prepare as early as possible for "new" barrier
  377. __kmp_itt_task_finished(itt_sync_obj);
  378. } else
  379. #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
  380. if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
  381. return;
  382. }
  383. if (this_thr->th.th_used_in_team.load() != 1 &&
  384. this_thr->th.th_used_in_team.load() != 3) // spurious wake-up?
  385. continue;
  386. if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
  387. return;
  388. // At this point, the thread thinks it is in use in a team, or in
  389. // transition to be used in a team, but it might have reached this barrier
  390. // before it was marked unused by the team. Unused threads are awoken and
  391. // shifted to wait on local thread struct elsewhere. It also might reach
  392. // this point by being picked up for use by a different team. Either way,
  393. // we need to update the tid.
  394. tid = __kmp_tid_from_gtid(gtid);
  395. team = this_thr->th.th_team;
  396. KMP_DEBUG_ASSERT(tid >= 0);
  397. KMP_DEBUG_ASSERT(team);
  398. b = team->t.b;
  399. my_current_iter = b->iter[tid].iter;
  400. next_go = my_current_iter + distributedBarrier::MAX_ITERS;
  401. my_go_index = tid / b->threads_per_go;
  402. if (this_thr->th.th_used_in_team.load() == 3) {
  403. KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3, 1);
  404. }
  405. // Check if go flag is set
  406. if (b->go[my_go_index].go.load() != next_go) {
  407. // Wait on go flag on team
  408. kmp_atomic_flag_64<false, true> my_flag(
  409. &(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep));
  410. my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
  411. KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter ||
  412. b->iter[tid].iter == 0);
  413. KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false);
  414. }
  415. if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
  416. return;
  417. // At this point, the thread's go location was set. This means the primary
  418. // thread is safely in the barrier, and so this thread's data is
  419. // up-to-date, but we should check again that this thread is really in
  420. // use in the team, as it could have been woken up for the purpose of
  421. // changing team size, or reaping threads at shutdown.
  422. if (this_thr->th.th_used_in_team.load() == 1)
  423. break;
  424. } while (1);
  425. if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
  426. return;
  427. group_leader = ((tid % b->threads_per_group) == 0);
  428. if (group_leader) {
  429. // Tell all the threads in my group they can go!
  430. for (size_t go_idx = my_go_index + 1;
  431. go_idx < my_go_index + b->gos_per_group; go_idx++) {
  432. b->go[go_idx].go.store(next_go);
  433. }
  434. // Fence added so that workers can see changes to go. sfence inadequate.
  435. KMP_MFENCE();
  436. }
  437. #if KMP_BARRIER_ICV_PUSH
  438. if (propagate_icvs) { // copy ICVs to final dest
  439. __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
  440. tid, FALSE);
  441. copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
  442. (kmp_internal_control_t *)team->t.b->team_icvs);
  443. copy_icvs(&thr_bar->th_fixed_icvs,
  444. &team->t.t_implicit_task_taskdata[tid].td_icvs);
  445. }
  446. #endif
  447. if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) {
  448. // This thread is now awake and participating in the barrier;
  449. // wake up the other threads in the group
  450. size_t nproc = this_thr->th.th_team_nproc;
  451. size_t group_end = tid + b->threads_per_group;
  452. if (nproc < group_end)
  453. group_end = nproc;
  454. __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
  455. }
  456. } else { // Primary thread
  457. team = this_thr->th.th_team;
  458. b = team->t.b;
  459. my_current_iter = b->iter[tid].iter;
  460. next_go = my_current_iter + distributedBarrier::MAX_ITERS;
  461. #if KMP_BARRIER_ICV_PUSH
  462. if (propagate_icvs) {
  463. // primary thread has ICVs in final destination; copy
  464. copy_icvs(&thr_bar->th_fixed_icvs,
  465. &team->t.t_implicit_task_taskdata[tid].td_icvs);
  466. }
  467. #endif
  468. // Tell all the group leaders they can go!
  469. for (size_t go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group) {
  470. b->go[go_idx].go.store(next_go);
  471. }
  472. if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
  473. // Wake-up the group leaders
  474. size_t nproc = this_thr->th.th_team_nproc;
  475. __kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc,
  476. b->threads_per_group, tid);
  477. }
  478. // Tell all the threads in my group they can go!
  479. for (size_t go_idx = 1; go_idx < b->gos_per_group; go_idx++) {
  480. b->go[go_idx].go.store(next_go);
  481. }
  482. // Fence added so that workers can see changes to go. sfence inadequate.
  483. KMP_MFENCE();
  484. if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
  485. // Wake-up the other threads in my group
  486. size_t nproc = this_thr->th.th_team_nproc;
  487. size_t group_end = tid + b->threads_per_group;
  488. if (nproc < group_end)
  489. group_end = nproc;
  490. __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
  491. }
  492. }
  493. // Update to next iteration
  494. KMP_ASSERT(my_current_iter == b->iter[tid].iter);
  495. b->iter[tid].iter = (b->iter[tid].iter + 1) % distributedBarrier::MAX_ITERS;
  496. KA_TRACE(
  497. 20, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
  498. gtid, team->t.t_id, tid, bt));
  499. }
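// Summary of the release above: the primary thread bumps next_go and stores it
// into the go flag of every group leader and into the remaining go slots of
// its own group; each group leader then fans the value out to the go slots of
// its group. Workers that were parked on th_used_in_team first re-join the
// team, then wait on their group's go flag. When blocktime is finite, the
// stores are followed by explicit wake-ups via __kmp_dist_barrier_wakeup.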
  500. // Linear Barrier
  501. template <bool cancellable = false>
  502. static bool __kmp_linear_barrier_gather_template(
  503. enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
  504. void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  505. KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
  506. kmp_team_t *team = this_thr->th.th_team;
  507. kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  508. kmp_info_t **other_threads = team->t.t_threads;
  509. KA_TRACE(
  510. 20,
  511. ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
  512. gtid, team->t.t_id, tid, bt));
  513. KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
  514. #if USE_ITT_BUILD && USE_ITT_NOTIFY
  515. // Barrier imbalance - save arrive time to the thread
  516. if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
  517. this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
  518. __itt_get_timestamp();
  519. }
  520. #endif
  521. // We now perform a linear reduction to signal that all of the threads have
  522. // arrived.
  523. if (!KMP_MASTER_TID(tid)) {
  524. KA_TRACE(20,
  525. ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
  526. "arrived(%p): %llu => %llu\n",
  527. gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
  528. team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
  529. thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
  530. // Mark arrival to primary thread
  531. /* After performing this write, a worker thread may not assume that the team
  532. is valid any more - it could be deallocated by the primary thread at any
  533. time. */
  534. kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
  535. flag.release();
  536. } else {
  537. kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
  538. int nproc = this_thr->th.th_team_nproc;
  539. int i;
  540. // Don't have to worry about sleep bit here or atomic since team setting
  541. kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
  542. // Collect all the worker team member threads.
  543. for (i = 1; i < nproc; ++i) {
  544. #if KMP_CACHE_MANAGE
  545. // Prefetch next thread's arrived count
  546. if (i + 1 < nproc)
  547. KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
  548. #endif /* KMP_CACHE_MANAGE */
  549. KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
  550. "arrived(%p) == %llu\n",
  551. gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
  552. team->t.t_id, i,
  553. &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
  554. // Wait for worker thread to arrive
  555. if (cancellable) {
  556. kmp_flag_64<true, false> flag(
  557. &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
  558. if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
  559. return true;
  560. } else {
  561. kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
  562. new_state);
  563. flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
  564. }
  565. #if USE_ITT_BUILD && USE_ITT_NOTIFY
  566. // Barrier imbalance - write min of the thread time and the other thread
  567. // time to the thread.
  568. if (__kmp_forkjoin_frames_mode == 2) {
  569. this_thr->th.th_bar_min_time = KMP_MIN(
  570. this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
  571. }
  572. #endif
  573. if (reduce) {
  574. KA_TRACE(100,
  575. ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
  576. gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
  577. team->t.t_id, i));
  578. OMPT_REDUCTION_DECL(this_thr, gtid);
  579. OMPT_REDUCTION_BEGIN;
  580. (*reduce)(this_thr->th.th_local.reduce_data,
  581. other_threads[i]->th.th_local.reduce_data);
  582. OMPT_REDUCTION_END;
  583. }
  584. }
  585. // Don't have to worry about sleep bit here or atomic since team setting
  586. team_bar->b_arrived = new_state;
  587. KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
  588. "arrived(%p) = %llu\n",
  589. gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
  590. new_state));
  591. }
  592. KA_TRACE(
  593. 20,
  594. ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
  595. gtid, team->t.t_id, tid, bt));
  596. return false;
  597. }
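// The linear gather is O(nproc) work on the primary thread: each worker simply
// releases its own b_arrived flag, and the primary waits on every worker's
// flag in turn, folding in reduce_data as it goes. The cancellable
// instantiation uses a flag type whose wait() returns true when the wait is
// interrupted by cancellation, which propagates out as the template's return
// value.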
  598. template <bool cancellable = false>
  599. static bool __kmp_linear_barrier_release_template(
  600. enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
  601. int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  602. KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
  603. kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  604. kmp_team_t *team;
  605. if (KMP_MASTER_TID(tid)) {
  606. unsigned int i;
  607. kmp_uint32 nproc = this_thr->th.th_team_nproc;
  608. kmp_info_t **other_threads;
  609. team = __kmp_threads[gtid]->th.th_team;
  610. KMP_DEBUG_ASSERT(team != NULL);
  611. other_threads = team->t.t_threads;
  612. KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
  613. "barrier type %d\n",
  614. gtid, team->t.t_id, tid, bt));
  615. if (nproc > 1) {
  616. #if KMP_BARRIER_ICV_PUSH
  617. {
  618. KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
  619. if (propagate_icvs) {
  620. ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
  621. for (i = 1; i < nproc; ++i) {
  622. __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
  623. team, i, FALSE);
  624. ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
  625. &team->t.t_implicit_task_taskdata[0].td_icvs);
  626. }
  627. ngo_sync();
  628. }
  629. }
  630. #endif // KMP_BARRIER_ICV_PUSH
  631. // Now, release all of the worker threads
  632. for (i = 1; i < nproc; ++i) {
  633. #if KMP_CACHE_MANAGE
  634. // Prefetch next thread's go flag
  635. if (i + 1 < nproc)
  636. KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
  637. #endif /* KMP_CACHE_MANAGE */
  638. KA_TRACE(
  639. 20,
  640. ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
  641. "go(%p): %u => %u\n",
  642. gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
  643. team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
  644. other_threads[i]->th.th_bar[bt].bb.b_go,
  645. other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
  646. kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
  647. other_threads[i]);
  648. flag.release();
  649. }
  650. }
  651. } else { // Wait for the PRIMARY thread to release us
  652. KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
  653. gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
  654. if (cancellable) {
  655. kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
  656. if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
  657. return true;
  658. } else {
  659. kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
  660. flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
  661. }
  662. #if USE_ITT_BUILD && USE_ITT_NOTIFY
  663. if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
  664. // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
  665. // disabled)
  666. itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
  667. // Cancel wait on previous parallel region...
  668. __kmp_itt_task_starting(itt_sync_obj);
  669. if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
  670. return false;
  671. itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
  672. if (itt_sync_obj != NULL)
  673. // Call prepare as early as possible for "new" barrier
  674. __kmp_itt_task_finished(itt_sync_obj);
  675. } else
  676. #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
  677. // Early exit for reaping threads releasing forkjoin barrier
  678. if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
  679. return false;
  680. // The worker thread may now assume that the team is valid.
  681. #ifdef KMP_DEBUG
  682. tid = __kmp_tid_from_gtid(gtid);
  683. team = __kmp_threads[gtid]->th.th_team;
  684. #endif
  685. KMP_DEBUG_ASSERT(team != NULL);
  686. TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
  687. KA_TRACE(20,
  688. ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
  689. gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
  690. KMP_MB(); // Flush all pending memory write invalidates.
  691. }
  692. KA_TRACE(
  693. 20,
  694. ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
  695. gtid, team->t.t_id, tid, bt));
  696. return false;
  697. }
  698. static void __kmp_linear_barrier_gather(
  699. enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
  700. void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  701. __kmp_linear_barrier_gather_template<false>(
  702. bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
  703. }
  704. static bool __kmp_linear_barrier_gather_cancellable(
  705. enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
  706. void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  707. return __kmp_linear_barrier_gather_template<true>(
  708. bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
  709. }
  710. static void __kmp_linear_barrier_release(
  711. enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
  712. int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  713. __kmp_linear_barrier_release_template<false>(
  714. bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
  715. }
  716. static bool __kmp_linear_barrier_release_cancellable(
  717. enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
  718. int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  719. return __kmp_linear_barrier_release_template<true>(
  720. bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
  721. }
  722. // Tree barrier
  723. static void __kmp_tree_barrier_gather(
  724. enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
  725. void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  726. KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
  727. kmp_team_t *team = this_thr->th.th_team;
  728. kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  729. kmp_info_t **other_threads = team->t.t_threads;
  730. kmp_uint32 nproc = this_thr->th.th_team_nproc;
  731. kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  732. kmp_uint32 branch_factor = 1 << branch_bits;
  733. kmp_uint32 child;
  734. kmp_uint32 child_tid;
  735. kmp_uint64 new_state = 0;
  736. KA_TRACE(
  737. 20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
  738. gtid, team->t.t_id, tid, bt));
  739. KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
  740. #if USE_ITT_BUILD && USE_ITT_NOTIFY
  741. // Barrier imbalance - save arrive time to the thread
  742. if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
  743. this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
  744. __itt_get_timestamp();
  745. }
  746. #endif
  747. // Perform tree gather to wait until all threads have arrived; reduce any
  748. // required data as we go
  749. child_tid = (tid << branch_bits) + 1;
  750. if (child_tid < nproc) {
  751. // Parent threads wait for all their children to arrive
  752. new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
  753. child = 1;
  754. do {
  755. kmp_info_t *child_thr = other_threads[child_tid];
  756. kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
  757. #if KMP_CACHE_MANAGE
  758. // Prefetch next thread's arrived count
  759. if (child + 1 <= branch_factor && child_tid + 1 < nproc)
  760. KMP_CACHE_PREFETCH(
  761. &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
  762. #endif /* KMP_CACHE_MANAGE */
  763. KA_TRACE(20,
  764. ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
  765. "arrived(%p) == %llu\n",
  766. gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
  767. team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
  768. // Wait for child to arrive
  769. kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
  770. flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
  771. #if USE_ITT_BUILD && USE_ITT_NOTIFY
  772. // Barrier imbalance - write min of the thread time and a child time to
  773. // the thread.
  774. if (__kmp_forkjoin_frames_mode == 2) {
  775. this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
  776. child_thr->th.th_bar_min_time);
  777. }
  778. #endif
  779. if (reduce) {
  780. KA_TRACE(100,
  781. ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
  782. gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
  783. team->t.t_id, child_tid));
  784. OMPT_REDUCTION_DECL(this_thr, gtid);
  785. OMPT_REDUCTION_BEGIN;
  786. (*reduce)(this_thr->th.th_local.reduce_data,
  787. child_thr->th.th_local.reduce_data);
  788. OMPT_REDUCTION_END;
  789. }
  790. child++;
  791. child_tid++;
  792. } while (child <= branch_factor && child_tid < nproc);
  793. }
  794. if (!KMP_MASTER_TID(tid)) { // Worker threads
  795. kmp_int32 parent_tid = (tid - 1) >> branch_bits;
  796. KA_TRACE(20,
  797. ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
  798. "arrived(%p): %llu => %llu\n",
  799. gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
  800. team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
  801. thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
  802. // Mark arrival to parent thread
  803. /* After performing this write, a worker thread may not assume that the team
  804. is valid any more - it could be deallocated by the primary thread at any
  805. time. */
  806. kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
  807. flag.release();
  808. } else {
  809. // Need to update the team arrived pointer if we are the primary thread
  810. if (nproc > 1) // New value was already computed above
  811. team->t.t_bar[bt].b_arrived = new_state;
  812. else
  813. team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
  814. KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
  815. "arrived(%p) = %llu\n",
  816. gtid, team->t.t_id, tid, team->t.t_id,
  817. &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  818. }
  819. KA_TRACE(20,
  820. ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
  821. gtid, team->t.t_id, tid, bt));
  822. }
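// Shape of the gather tree, for illustration: with branch_bits == 2
// (branch_factor == 4), parent tid p waits on children 4*p+1 .. 4*p+4 (capped
// at nproc), and a worker with tid w signals parent (w-1) >> 2; e.g. tid 7
// reports to tid 1, which in turn reports to tid 0.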
  823. static void __kmp_tree_barrier_release(
  824. enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
  825. int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  826. KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
  827. kmp_team_t *team;
  828. kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  829. kmp_uint32 nproc;
  830. kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  831. kmp_uint32 branch_factor = 1 << branch_bits;
  832. kmp_uint32 child;
  833. kmp_uint32 child_tid;
  834. // Perform a tree release for all of the threads that have been gathered
  835. if (!KMP_MASTER_TID(
  836. tid)) { // Handle fork barrier workers who aren't part of a team yet
  837. KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
  838. &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
  839. // Wait for parent thread to release us
  840. kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
  841. flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
  842. #if USE_ITT_BUILD && USE_ITT_NOTIFY
  843. if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
  844. // In fork barrier where we could not get the object reliably (or
  845. // ITTNOTIFY is disabled)
  846. itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
  847. // Cancel wait on previous parallel region...
  848. __kmp_itt_task_starting(itt_sync_obj);
  849. if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
  850. return;
  851. itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
  852. if (itt_sync_obj != NULL)
  853. // Call prepare as early as possible for "new" barrier
  854. __kmp_itt_task_finished(itt_sync_obj);
  855. } else
  856. #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
  857. // Early exit for reaping threads releasing forkjoin barrier
  858. if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
  859. return;
  860. // The worker thread may now assume that the team is valid.
  861. team = __kmp_threads[gtid]->th.th_team;
  862. KMP_DEBUG_ASSERT(team != NULL);
  863. tid = __kmp_tid_from_gtid(gtid);
  864. TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
  865. KA_TRACE(20,
  866. ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
  867. team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
  868. KMP_MB(); // Flush all pending memory write invalidates.
  869. } else {
  870. team = __kmp_threads[gtid]->th.th_team;
  871. KMP_DEBUG_ASSERT(team != NULL);
  872. KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
  873. "barrier type %d\n",
  874. gtid, team->t.t_id, tid, bt));
  875. }
  876. nproc = this_thr->th.th_team_nproc;
  877. child_tid = (tid << branch_bits) + 1;
  878. if (child_tid < nproc) {
  879. kmp_info_t **other_threads = team->t.t_threads;
  880. child = 1;
  881. // Parent threads release all their children
  882. do {
  883. kmp_info_t *child_thr = other_threads[child_tid];
  884. kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
  885. #if KMP_CACHE_MANAGE
  886. // Prefetch next thread's go count
  887. if (child + 1 <= branch_factor && child_tid + 1 < nproc)
  888. KMP_CACHE_PREFETCH(
  889. &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
  890. #endif /* KMP_CACHE_MANAGE */
  891. #if KMP_BARRIER_ICV_PUSH
  892. {
  893. KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
  894. if (propagate_icvs) {
  895. __kmp_init_implicit_task(team->t.t_ident,
  896. team->t.t_threads[child_tid], team,
  897. child_tid, FALSE);
  898. copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
  899. &team->t.t_implicit_task_taskdata[0].td_icvs);
  900. }
  901. }
  902. #endif // KMP_BARRIER_ICV_PUSH
  903. KA_TRACE(20,
  904. ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
  905. "go(%p): %u => %u\n",
  906. gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
  907. team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
  908. child_bar->b_go + KMP_BARRIER_STATE_BUMP));
  909. // Release child from barrier
  910. kmp_flag_64<> flag(&child_bar->b_go, child_thr);
  911. flag.release();
  912. child++;
  913. child_tid++;
  914. } while (child <= branch_factor && child_tid < nproc);
  915. }
  916. KA_TRACE(
  917. 20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
  918. gtid, team->t.t_id, tid, bt));
  919. }
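// The tree release mirrors the gather: each parent releases children
// (tid << branch_bits) + 1 onward by bumping their b_go flags, and with
// KMP_BARRIER_ICV_PUSH it copies the primary thread's ICVs (implicit task 0)
// into each child's implicit task before setting the child's b_go.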
  920. // Hyper Barrier
  921. static void __kmp_hyper_barrier_gather(
  922. enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
  923. void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  924. KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
  925. kmp_team_t *team = this_thr->th.th_team;
  926. kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  927. kmp_info_t **other_threads = team->t.t_threads;
  928. kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
  929. kmp_uint32 num_threads = this_thr->th.th_team_nproc;
  930. kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  931. kmp_uint32 branch_factor = 1 << branch_bits;
  932. kmp_uint32 offset;
  933. kmp_uint32 level;
  934. KA_TRACE(
  935. 20,
  936. ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
  937. gtid, team->t.t_id, tid, bt));
  938. KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
  939. #if USE_ITT_BUILD && USE_ITT_NOTIFY
  940. // Barrier imbalance - save arrive time to the thread
  941. if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
  942. this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
  943. __itt_get_timestamp();
  944. }
  945. #endif
  946. /* Perform a hypercube-embedded tree gather to wait until all of the threads
  947. have arrived, and reduce any required data as we go. */
  948. kmp_flag_64<> p_flag(&thr_bar->b_arrived);
  949. for (level = 0, offset = 1; offset < num_threads;
  950. level += branch_bits, offset <<= branch_bits) {
  951. kmp_uint32 child;
  952. kmp_uint32 child_tid;
  953. if (((tid >> level) & (branch_factor - 1)) != 0) {
  954. kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
  955. KMP_MB(); // Synchronize parent and child threads.
  956. KA_TRACE(20,
  957. ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
  958. "arrived(%p): %llu => %llu\n",
  959. gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
  960. team->t.t_id, parent_tid, &thr_bar->b_arrived,
  961. thr_bar->b_arrived,
  962. thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
  963. // Mark arrival to parent thread
  964. /* After performing this write (in the last iteration of the enclosing for
  965. loop), a worker thread may not assume that the team is valid any more
  966. - it could be deallocated by the primary thread at any time. */
  967. p_flag.set_waiter(other_threads[parent_tid]);
  968. p_flag.release();
  969. break;
  970. }
  971. // Parent threads wait for children to arrive
  972. if (new_state == KMP_BARRIER_UNUSED_STATE)
  973. new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
  974. for (child = 1, child_tid = tid + (1 << level);
  975. child < branch_factor && child_tid < num_threads;
  976. child++, child_tid += (1 << level)) {
  977. kmp_info_t *child_thr = other_threads[child_tid];
  978. kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
  979. #if KMP_CACHE_MANAGE
  980. kmp_uint32 next_child_tid = child_tid + (1 << level);
  981. // Prefetch next thread's arrived count
  982. if (child + 1 < branch_factor && next_child_tid < num_threads)
  983. KMP_CACHE_PREFETCH(
  984. &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
  985. #endif /* KMP_CACHE_MANAGE */
  986. KA_TRACE(20,
  987. ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
  988. "arrived(%p) == %llu\n",
  989. gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
  990. team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
  991. // Wait for child to arrive
  992. kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
  993. c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
  994. KMP_MB(); // Synchronize parent and child threads.
  995. #if USE_ITT_BUILD && USE_ITT_NOTIFY
  996. // Barrier imbalance - write min of the thread time and a child time to
  997. // the thread.
  998. if (__kmp_forkjoin_frames_mode == 2) {
  999. this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
  1000. child_thr->th.th_bar_min_time);
  1001. }
  1002. #endif
  1003. if (reduce) {
  1004. KA_TRACE(100,
  1005. ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
  1006. gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
  1007. team->t.t_id, child_tid));
  1008. OMPT_REDUCTION_DECL(this_thr, gtid);
  1009. OMPT_REDUCTION_BEGIN;
  1010. (*reduce)(this_thr->th.th_local.reduce_data,
  1011. child_thr->th.th_local.reduce_data);
  1012. OMPT_REDUCTION_END;
  1013. }
  1014. }
  1015. }
  1016. if (KMP_MASTER_TID(tid)) {
  1017. // Need to update the team arrived pointer if we are the primary thread
  1018. if (new_state == KMP_BARRIER_UNUSED_STATE)
  1019. team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
  1020. else
  1021. team->t.t_bar[bt].b_arrived = new_state;
  1022. KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
  1023. "arrived(%p) = %llu\n",
  1024. gtid, team->t.t_id, tid, team->t.t_id,
  1025. &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  1026. }
  1027. KA_TRACE(
  1028. 20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
  1029. gtid, team->t.t_id, tid, bt));
  1030. }
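// Communication pattern, for illustration: with branch_bits == 1
// (branch_factor == 2), at level 0 every odd tid signals tid & ~1 (so 3 -> 2,
// 5 -> 4); at level 1 the survivors with bit 1 set signal tid & ~3 (so 2 -> 0,
// 6 -> 4), and so on until only tid 0 remains, giving a hypercube-style gather
// in about log2(num_threads) rounds.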
  1031. // The reverse versions seem to beat the forward versions overall
  1032. #define KMP_REVERSE_HYPER_BAR
  1033. static void __kmp_hyper_barrier_release(
  1034. enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
  1035. int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  1036. KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
  1037. kmp_team_t *team;
  1038. kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  1039. kmp_info_t **other_threads;
  1040. kmp_uint32 num_threads;
  1041. kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  1042. kmp_uint32 branch_factor = 1 << branch_bits;
  1043. kmp_uint32 child;
  1044. kmp_uint32 child_tid;
  1045. kmp_uint32 offset;
  1046. kmp_uint32 level;
  1047. /* Perform a hypercube-embedded tree release for all of the threads that have
  1048. been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
  1049. are released in the reverse order of the corresponding gather, otherwise
  1050. threads are released in the same order. */
  1051. if (KMP_MASTER_TID(tid)) { // primary thread
  1052. team = __kmp_threads[gtid]->th.th_team;
  1053. KMP_DEBUG_ASSERT(team != NULL);
  1054. KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
  1055. "barrier type %d\n",
  1056. gtid, team->t.t_id, tid, bt));
  1057. #if KMP_BARRIER_ICV_PUSH
  1058. if (propagate_icvs) { // primary already has ICVs in final destination; copy
  1059. copy_icvs(&thr_bar->th_fixed_icvs,
  1060. &team->t.t_implicit_task_taskdata[tid].td_icvs);
  1061. }
  1062. #endif
  1063. } else { // Handle fork barrier workers who aren't part of a team yet
  1064. KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
  1065. &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
  1066. // Wait for parent thread to release us
  1067. kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
  1068. flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
  1069. #if USE_ITT_BUILD && USE_ITT_NOTIFY
  1070. if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
  1071. // In fork barrier where we could not get the object reliably
  1072. itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
  1073. // Cancel wait on previous parallel region...
  1074. __kmp_itt_task_starting(itt_sync_obj);
  1075. if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
  1076. return;
  1077. itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
  1078. if (itt_sync_obj != NULL)
  1079. // Call prepare as early as possible for "new" barrier
  1080. __kmp_itt_task_finished(itt_sync_obj);
  1081. } else
  1082. #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
  1083. // Early exit for reaping threads releasing forkjoin barrier
  1084. if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
  1085. return;
  1086. // The worker thread may now assume that the team is valid.
  1087. team = __kmp_threads[gtid]->th.th_team;
  1088. KMP_DEBUG_ASSERT(team != NULL);
  1089. tid = __kmp_tid_from_gtid(gtid);
  1090. TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
  1091. KA_TRACE(20,
  1092. ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
  1093. gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
  1094. KMP_MB(); // Flush all pending memory write invalidates.
  1095. }
  1096. num_threads = this_thr->th.th_team_nproc;
  1097. other_threads = team->t.t_threads;
  1098. #ifdef KMP_REVERSE_HYPER_BAR
  1099. // Count up to correct level for parent
  1100. for (level = 0, offset = 1;
  1101. offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
  1102. level += branch_bits, offset <<= branch_bits)
  1103. ;
  1104. // Now go down from there
  1105. for (level -= branch_bits, offset >>= branch_bits; offset != 0;
  1106. level -= branch_bits, offset >>= branch_bits)
  1107. #else
  1108. // Go down the tree, level by level
  1109. for (level = 0, offset = 1; offset < num_threads;
  1110. level += branch_bits, offset <<= branch_bits)
  1111. #endif // KMP_REVERSE_HYPER_BAR
  1112. {
  1113. #ifdef KMP_REVERSE_HYPER_BAR
  1114. /* Now go in reverse order through the children, highest to lowest.
  1115. Initial setting of child is conservative here. */
  1116. child = num_threads >> ((level == 0) ? level : level - 1);
  1117. for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
  1118. child_tid = tid + (child << level);
  1119. child >= 1; child--, child_tid -= (1 << level))
  1120. #else
  1121. if (((tid >> level) & (branch_factor - 1)) != 0)
  1122. // No need to go lower than this, since this is the level parent would be
  1123. // notified
  1124. break;
  1125. // Iterate through children on this level of the tree
  1126. for (child = 1, child_tid = tid + (1 << level);
  1127. child < branch_factor && child_tid < num_threads;
  1128. child++, child_tid += (1 << level))
  1129. #endif // KMP_REVERSE_HYPER_BAR
  1130. {
  1131. if (child_tid >= num_threads)
  1132. continue; // Child doesn't exist so keep going
  1133. else {
  1134. kmp_info_t *child_thr = other_threads[child_tid];
  1135. kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
  1136. #if KMP_CACHE_MANAGE
  1137. kmp_uint32 next_child_tid = child_tid - (1 << level);
  1138. // Prefetch next thread's go count
  1139. #ifdef KMP_REVERSE_HYPER_BAR
  1140. if (child - 1 >= 1 && next_child_tid < num_threads)
  1141. #else
  1142. if (child + 1 < branch_factor && next_child_tid < num_threads)
  1143. #endif // KMP_REVERSE_HYPER_BAR
  1144. KMP_CACHE_PREFETCH(
  1145. &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
  1146. #endif /* KMP_CACHE_MANAGE */
  1147. #if KMP_BARRIER_ICV_PUSH
  1148. if (propagate_icvs) // push my fixed ICVs to my child
  1149. copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
  1150. #endif // KMP_BARRIER_ICV_PUSH
  1151. KA_TRACE(
  1152. 20,
  1153. ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
  1154. "go(%p): %u => %u\n",
  1155. gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
  1156. team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
  1157. child_bar->b_go + KMP_BARRIER_STATE_BUMP));
  1158. // Release child from barrier
  1159. kmp_flag_64<> flag(&child_bar->b_go, child_thr);
  1160. flag.release();
  1161. }
  1162. }
  1163. }
  1164. #if KMP_BARRIER_ICV_PUSH
  1165. if (propagate_icvs &&
  1166. !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
  1167. __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
  1168. FALSE);
  1169. copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
  1170. &thr_bar->th_fixed_icvs);
  1171. }
  1172. #endif
  1173. KA_TRACE(
  1174. 20,
  1175. ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
  1176. gtid, team->t.t_id, tid, bt));
  1177. }
  1178. // Hierarchical Barrier
  1179. // Initialize thread barrier data
/* Initializes/re-initializes the hierarchical barrier data stored on a thread.
   Performs the minimum amount of initialization required based on how the team
   has changed. Returns true if leaf children will require both on-core and
   traditional wake-up mechanisms. For example, if the team size increases,
   threads already in the team will respond to on-core wakeup on their parent
   thread, but threads newly added to the team will only be listening on their
   local b_go. */
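/* Worked example (illustrative only; the real shape comes from
   __kmp_get_hierarchy, defined elsewhere). Assume nproc == 16, depth == 3 and
   skip_per_level == {1, 4, 16}:
   - tid 0 (primary): my_level = depth - 1 = 2, parent_tid = -1;
   - tid 4: 4 % skip_per_level[1] == 0, so the d == depth - 2 case triggers at
     d == 1, giving parent_tid = 0 and my_level = 1;
   - tid 5: 5 % skip_per_level[1] == 1 != 0 already at d == 0, giving
     parent_tid = 4 and my_level = 0 (a leaf under tid 4, like tids 6 and 7). */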
  1187. static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
  1188. kmp_bstate_t *thr_bar,
  1189. kmp_uint32 nproc, int gtid,
  1190. int tid, kmp_team_t *team) {
  1191. // Checks to determine if (re-)initialization is needed
  1192. bool uninitialized = thr_bar->team == NULL;
  1193. bool team_changed = team != thr_bar->team;
  1194. bool team_sz_changed = nproc != thr_bar->nproc;
  1195. bool tid_changed = tid != thr_bar->old_tid;
  1196. bool retval = false;
  1197. if (uninitialized || team_sz_changed) {
  1198. __kmp_get_hierarchy(nproc, thr_bar);
  1199. }
  1200. if (uninitialized || team_sz_changed || tid_changed) {
  1201. thr_bar->my_level = thr_bar->depth - 1; // default for primary thread
  1202. thr_bar->parent_tid = -1; // default for primary thread
  1203. if (!KMP_MASTER_TID(tid)) {
  1204. // if not primary thread, find parent thread in hierarchy
  1205. kmp_uint32 d = 0;
  1206. while (d < thr_bar->depth) { // find parent based on level of thread in
  1207. // hierarchy, and note level
  1208. kmp_uint32 rem;
  1209. if (d == thr_bar->depth - 2) { // reached level right below the primary
  1210. thr_bar->parent_tid = 0;
  1211. thr_bar->my_level = d;
  1212. break;
  1213. } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
          // TODO: can we make the modulo above faster?
          // This thread is not a subtree root at the next level, so d is its
          // maximum (top) level.
  1216. thr_bar->parent_tid = tid - rem;
  1217. thr_bar->my_level = d;
  1218. break;
  1219. }
  1220. ++d;
  1221. }
  1222. }
  1223. __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
  1224. (thr_bar->skip_per_level[thr_bar->my_level])),
  1225. &(thr_bar->offset));
  1226. thr_bar->old_tid = tid;
  1227. thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
  1228. thr_bar->team = team;
  1229. thr_bar->parent_bar =
  1230. &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
  1231. }
  1232. if (uninitialized || team_changed || tid_changed) {
  1233. thr_bar->team = team;
  1234. thr_bar->parent_bar =
  1235. &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
  1236. retval = true;
  1237. }
  1238. if (uninitialized || team_sz_changed || tid_changed) {
  1239. thr_bar->nproc = nproc;
  1240. thr_bar->leaf_kids = thr_bar->base_leaf_kids;
  1241. if (thr_bar->my_level == 0)
  1242. thr_bar->leaf_kids = 0;
  1243. if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
  1244. __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
  1245. thr_bar->leaf_state = 0;
  1246. for (int i = 0; i < thr_bar->leaf_kids; ++i)
  1247. ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
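    // Byte layout for the on-core (leaf) path: since skip_per_level[0] == 1, a
    // leaf child with tid == parent_tid + k gets offset == 7 - k (computed
    // earlier in this function), and its kmp_flag_oncore slot (offset + 1) ==
    // 8 - k selects one byte of the parent's 64-bit flag word. The loop above
    // marks exactly those bytes (7, 6, ..., 8 - leaf_kids) in leaf_state;
    // e.g. with 3 leaf kids, bytes 7, 6 and 5 belong to children k = 1, 2, 3.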
  1248. }
  1249. return retval;
  1250. }
  1251. static void __kmp_hierarchical_barrier_gather(
  1252. enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
  1253. void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  1254. KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
  1255. kmp_team_t *team = this_thr->th.th_team;
  1256. kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  1257. kmp_uint32 nproc = this_thr->th.th_team_nproc;
  1258. kmp_info_t **other_threads = team->t.t_threads;
  1259. kmp_uint64 new_state = 0;
  1260. int level = team->t.t_level;
  1261. if (other_threads[0]
  1262. ->th.th_teams_microtask) // are we inside the teams construct?
  1263. if (this_thr->th.th_teams_size.nteams > 1)
  1264. ++level; // level was not increased in teams construct for team_of_masters
  1265. if (level == 1)
  1266. thr_bar->use_oncore_barrier = 1;
  1267. else
  1268. thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
  1269. KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
  1270. "barrier type %d\n",
  1271. gtid, team->t.t_id, tid, bt));
  1272. KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
  1273. #if USE_ITT_BUILD && USE_ITT_NOTIFY
  1274. // Barrier imbalance - save arrive time to the thread
  1275. if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
  1276. this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
  1277. }
  1278. #endif
  1279. (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
  1280. team);
  1281. if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
  1282. kmp_int32 child_tid;
  1283. new_state =
  1284. (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
  1285. if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
  1286. thr_bar->use_oncore_barrier) {
  1287. if (thr_bar->leaf_kids) {
  1288. // First, wait for leaf children to check-in on my b_arrived flag
  1289. kmp_uint64 leaf_state =
  1290. KMP_MASTER_TID(tid)
  1291. ? thr_bar->b_arrived | thr_bar->leaf_state
  1292. : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
  1293. KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
  1294. "for leaf kids\n",
  1295. gtid, team->t.t_id, tid));
  1296. kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
  1297. flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
  1298. if (reduce) {
  1299. OMPT_REDUCTION_DECL(this_thr, gtid);
  1300. OMPT_REDUCTION_BEGIN;
  1301. for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
  1302. ++child_tid) {
  1303. KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
  1304. "T#%d(%d:%d)\n",
  1305. gtid, team->t.t_id, tid,
  1306. __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
  1307. child_tid));
  1308. (*reduce)(this_thr->th.th_local.reduce_data,
  1309. other_threads[child_tid]->th.th_local.reduce_data);
  1310. }
  1311. OMPT_REDUCTION_END;
  1312. }
  1313. // clear leaf_state bits
  1314. KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
  1315. }
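      // At this point all on-core leaf children have checked in: each leaf
      // wrote its assigned byte of this thread's b_arrived, the wait above
      // completed once b_arrived matched (arrived value | leaf_state), and the
      // atomic AND just above cleared those leaf bytes again so b_arrived is
      // back to holding only the arrival generation.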
  1316. // Next, wait for higher level children on each child's b_arrived flag
  1317. for (kmp_uint32 d = 1; d < thr_bar->my_level;
  1318. ++d) { // gather lowest level threads first, but skip 0
  1319. kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
  1320. skip = thr_bar->skip_per_level[d];
  1321. if (last > nproc)
  1322. last = nproc;
  1323. for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
  1324. kmp_info_t *child_thr = other_threads[child_tid];
  1325. kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
  1326. KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
  1327. "T#%d(%d:%d) "
  1328. "arrived(%p) == %llu\n",
  1329. gtid, team->t.t_id, tid,
  1330. __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
  1331. child_tid, &child_bar->b_arrived, new_state));
  1332. kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
  1333. flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
  1334. if (reduce) {
  1335. KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
  1336. "T#%d(%d:%d)\n",
  1337. gtid, team->t.t_id, tid,
  1338. __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
  1339. child_tid));
  1340. (*reduce)(this_thr->th.th_local.reduce_data,
  1341. child_thr->th.th_local.reduce_data);
  1342. }
  1343. }
  1344. }
  1345. } else { // Blocktime is not infinite
  1346. for (kmp_uint32 d = 0; d < thr_bar->my_level;
  1347. ++d) { // Gather lowest level threads first
  1348. kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
  1349. skip = thr_bar->skip_per_level[d];
  1350. if (last > nproc)
  1351. last = nproc;
  1352. for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
  1353. kmp_info_t *child_thr = other_threads[child_tid];
  1354. kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
  1355. KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
  1356. "T#%d(%d:%d) "
  1357. "arrived(%p) == %llu\n",
  1358. gtid, team->t.t_id, tid,
  1359. __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
  1360. child_tid, &child_bar->b_arrived, new_state));
  1361. kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
  1362. flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
  1363. if (reduce) {
  1364. KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
  1365. "T#%d(%d:%d)\n",
  1366. gtid, team->t.t_id, tid,
  1367. __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
  1368. child_tid));
  1369. (*reduce)(this_thr->th.th_local.reduce_data,
  1370. child_thr->th.th_local.reduce_data);
  1371. }
  1372. }
  1373. }
  1374. }
  1375. }
  1376. // All subordinates are gathered; now release parent if not primary thread
  1377. if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
  1378. KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
  1379. " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
  1380. gtid, team->t.t_id, tid,
  1381. __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
  1382. thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
  1383. thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
  1384. /* Mark arrival to parent: After performing this write, a worker thread may
  1385. not assume that the team is valid any more - it could be deallocated by
  1386. the primary thread at any time. */
  1387. if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
  1388. !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
  1389. // flag; release it
  1390. kmp_flag_64<> flag(&thr_bar->b_arrived,
  1391. other_threads[thr_bar->parent_tid]);
  1392. flag.release();
  1393. } else {
  1394. // Leaf does special release on "offset" bits of parent's b_arrived flag
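      // The store below first brings this leaf's own b_arrived up to date
      // (nothing waits on it this round, but it must stay in step with the
      // team's generation count for barriers that do), then kmp_flag_oncore
      // (defined elsewhere) is expected to mark the single byte at index
      // offset + 1 inside the parent's b_arrived word, which is what the
      // parent's leaf_state wait is polling for.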
  1395. thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
  1396. kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
  1397. thr_bar->offset + 1);
  1398. flag.set_waiter(other_threads[thr_bar->parent_tid]);
  1399. flag.release();
  1400. }
  1401. } else { // Primary thread needs to update the team's b_arrived value
  1402. team->t.t_bar[bt].b_arrived = new_state;
  1403. KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
  1404. "arrived(%p) = %llu\n",
  1405. gtid, team->t.t_id, tid, team->t.t_id,
  1406. &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  1407. }
  // TODO: For worker threads the team access in the trace below may race with
  // the primary thread freeing the team; is it actually unsafe or just
  // technically invalid?
  1409. KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
  1410. "barrier type %d\n",
  1411. gtid, team->t.t_id, tid, bt));
  1412. }
  1413. static void __kmp_hierarchical_barrier_release(
  1414. enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
  1415. int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  1416. KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
  1417. kmp_team_t *team;
  1418. kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  1419. kmp_uint32 nproc;
  1420. bool team_change = false; // indicates on-core barrier shouldn't be used
  1421. if (KMP_MASTER_TID(tid)) {
  1422. team = __kmp_threads[gtid]->th.th_team;
  1423. KMP_DEBUG_ASSERT(team != NULL);
  1424. KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary "
  1425. "entered barrier type %d\n",
  1426. gtid, team->t.t_id, tid, bt));
  1427. } else { // Worker threads
  1428. // Wait for parent thread to release me
  1429. if (!thr_bar->use_oncore_barrier ||
  1430. __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
  1431. thr_bar->team == NULL) {
  1432. // Use traditional method of waiting on my own b_go flag
  1433. thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
  1434. kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
  1435. flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
  1436. TCW_8(thr_bar->b_go,
  1437. KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
  1438. } else { // Thread barrier data is initialized, this is a leaf, blocktime is
  1439. // infinite, not nested
  1440. // Wait on my "offset" bits on parent's b_go flag
  1441. thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
  1442. kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
  1443. thr_bar->offset + 1, bt,
  1444. this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
  1445. flag.wait(this_thr, TRUE);
  1446. if (thr_bar->wait_flag ==
  1447. KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
  1448. TCW_8(thr_bar->b_go,
  1449. KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
  1450. } else { // Reset my bits on parent's b_go flag
  1451. (RCAST(volatile char *,
  1452. &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
  1453. }
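      // The single-byte store above clears only this thread's slot in the
      // parent's b_go word; the slots of sibling leaves parked on the same
      // word are left untouched.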
  1454. }
  1455. thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
  1456. // Early exit for reaping threads releasing forkjoin barrier
  1457. if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
  1458. return;
  1459. // The worker thread may now assume that the team is valid.
  1460. team = __kmp_threads[gtid]->th.th_team;
  1461. KMP_DEBUG_ASSERT(team != NULL);
  1462. tid = __kmp_tid_from_gtid(gtid);
  1463. KA_TRACE(
  1464. 20,
  1465. ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
  1466. gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
  1467. KMP_MB(); // Flush all pending memory write invalidates.
  1468. }
  1469. nproc = this_thr->th.th_team_nproc;
  1470. int level = team->t.t_level;
  1471. if (team->t.t_threads[0]
  1472. ->th.th_teams_microtask) { // are we inside the teams construct?
  1473. if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
  1474. this_thr->th.th_teams_level == level)
  1475. ++level; // level was not increased in teams construct for team_of_workers
  1476. if (this_thr->th.th_teams_size.nteams > 1)
  1477. ++level; // level was not increased in teams construct for team_of_masters
  1478. }
  1479. if (level == 1)
  1480. thr_bar->use_oncore_barrier = 1;
  1481. else
  1482. thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
  1483. // If the team size has increased, we still communicate with old leaves via
  1484. // oncore barrier.
  1485. unsigned short int old_leaf_kids = thr_bar->leaf_kids;
  1486. kmp_uint64 old_leaf_state = thr_bar->leaf_state;
  1487. team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
  1488. tid, team);
  1489. // But if the entire team changes, we won't use oncore barrier at all
  1490. if (team_change)
  1491. old_leaf_kids = 0;
  1492. #if KMP_BARRIER_ICV_PUSH
  1493. if (propagate_icvs) {
  1494. __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
  1495. FALSE);
  1496. if (KMP_MASTER_TID(
  1497. tid)) { // primary already has copy in final destination; copy
  1498. copy_icvs(&thr_bar->th_fixed_icvs,
  1499. &team->t.t_implicit_task_taskdata[tid].td_icvs);
  1500. } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
  1501. thr_bar->use_oncore_barrier) { // optimization for inf blocktime
  1502. if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
  1503. // leaves (on-core children) pull parent's fixed ICVs directly to local
  1504. // ICV store
  1505. copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
  1506. &thr_bar->parent_bar->th_fixed_icvs);
  1507. // non-leaves will get ICVs piggybacked with b_go via NGO store
  1508. } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
  1509. if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
  1510. // access
  1511. copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
  1512. else // leaves copy parent's fixed ICVs directly to local ICV store
  1513. copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
  1514. &thr_bar->parent_bar->th_fixed_icvs);
  1515. }
  1516. }
  1517. #endif // KMP_BARRIER_ICV_PUSH
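  // Summary of the ICV propagation above: the primary seeds th_fixed_icvs from
  // its implicit task; with infinite blocktime and the on-core barrier, leaves
  // pull the parent's fixed ICVs straight into their implicit task here, while
  // non-leaves receive the ICVs piggybacked with b_go via the NGO store in the
  // release code below; in the finite-blocktime path, non-leaves forward the
  // parent's fixed ICVs into their own fixed ICVs (for their children) and
  // leaves copy them into their implicit task.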
  1518. // Now, release my children
  1519. if (thr_bar->my_level) { // not a leaf
  1520. kmp_int32 child_tid;
  1521. kmp_uint32 last;
  1522. if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
  1523. thr_bar->use_oncore_barrier) {
  1524. if (KMP_MASTER_TID(tid)) { // do a flat release
        // Set local b_go to bump children via NGO store of the cache line
        // containing ICVs and b_go.
  1527. thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
  1528. // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
  1529. // the cache line
  1530. ngo_load(&thr_bar->th_fixed_icvs);
  1531. // This loops over all the threads skipping only the leaf nodes in the
  1532. // hierarchy
  1533. for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
  1534. child_tid += thr_bar->skip_per_level[1]) {
  1535. kmp_bstate_t *child_bar =
  1536. &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
  1537. KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
  1538. "releasing T#%d(%d:%d)"
  1539. " go(%p): %u => %u\n",
  1540. gtid, team->t.t_id, tid,
  1541. __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
  1542. child_tid, &child_bar->b_go, child_bar->b_go,
  1543. child_bar->b_go + KMP_BARRIER_STATE_BUMP));
  1544. // Use ngo store (if available) to both store ICVs and release child
  1545. // via child's b_go
  1546. ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
  1547. }
  1548. ngo_sync();
  1549. }
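      // The flat release above relies on th_fixed_icvs and b_go sharing one
      // cache line (b_go in the last 8 bytes): a single ngo_store_go() per
      // child both deposits the ICVs and bumps that child's b_go, and stepping
      // by skip_per_level[1] visits exactly the non-leaf subtree roots.
      // ngo_sync() presumably orders/flushes the non-globally-ordered stores
      // before the primary moves on.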
  1550. TCW_8(thr_bar->b_go,
  1551. KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
  1552. // Now, release leaf children
  1553. if (thr_bar->leaf_kids) { // if there are any
  1554. // We test team_change on the off-chance that the level 1 team changed.
  1555. if (team_change ||
  1556. old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
  1557. if (old_leaf_kids) { // release old leaf kids
  1558. thr_bar->b_go |= old_leaf_state;
  1559. }
  1560. // Release new leaf kids
  1561. last = tid + thr_bar->skip_per_level[1];
  1562. if (last > nproc)
  1563. last = nproc;
  1564. for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
  1565. ++child_tid) { // skip_per_level[0]=1
  1566. kmp_info_t *child_thr = team->t.t_threads[child_tid];
  1567. kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
  1568. KA_TRACE(
  1569. 20,
  1570. ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
  1571. " T#%d(%d:%d) go(%p): %u => %u\n",
  1572. gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
  1573. team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
  1574. child_bar->b_go + KMP_BARRIER_STATE_BUMP));
  1575. // Release child using child's b_go flag
  1576. kmp_flag_64<> flag(&child_bar->b_go, child_thr);
  1577. flag.release();
  1578. }
  1579. } else { // Release all children at once with leaf_state bits on my own
  1580. // b_go flag
  1581. thr_bar->b_go |= thr_bar->leaf_state;
  1582. }
  1583. }
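      // Leaves already parked on their byte of this thread's b_go are woken
      // simply by OR-ing leaf_state bits into b_go above; only leaves that are
      // new to the team (or all of them after a team change) get a
      // conventional kmp_flag_64 release on their own b_go, because they are
      // not yet waiting on the on-core flag.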
  1584. } else { // Blocktime is not infinite; do a simple hierarchical release
  1585. for (int d = thr_bar->my_level - 1; d >= 0;
  1586. --d) { // Release highest level threads first
  1587. last = tid + thr_bar->skip_per_level[d + 1];
  1588. kmp_uint32 skip = thr_bar->skip_per_level[d];
  1589. if (last > nproc)
  1590. last = nproc;
  1591. for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
  1592. kmp_info_t *child_thr = team->t.t_threads[child_tid];
  1593. kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
  1594. KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
  1595. "releasing T#%d(%d:%d) go(%p): %u => %u\n",
  1596. gtid, team->t.t_id, tid,
  1597. __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
  1598. child_tid, &child_bar->b_go, child_bar->b_go,
  1599. child_bar->b_go + KMP_BARRIER_STATE_BUMP));
  1600. // Release child using child's b_go flag
  1601. kmp_flag_64<> flag(&child_bar->b_go, child_thr);
  1602. flag.release();
  1603. }
  1604. }
  1605. }
  1606. #if KMP_BARRIER_ICV_PUSH
  1607. if (propagate_icvs && !KMP_MASTER_TID(tid))
  1608. // non-leaves copy ICVs from fixed ICVs to local dest
  1609. copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
  1610. &thr_bar->th_fixed_icvs);
  1611. #endif // KMP_BARRIER_ICV_PUSH
  1612. }
  1613. KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
  1614. "barrier type %d\n",
  1615. gtid, team->t.t_id, tid, bt));
  1616. }
  1617. // End of Barrier Algorithms
// Type trait for the cancellable flag:
// if cancellable is true, then is_cancellable behaves as a normal boolean
// variable; if cancellable is false, then is_cancellable is a compile-time
// constant false.
  1621. template <bool cancellable> struct is_cancellable {};
  1622. template <> struct is_cancellable<true> {
  1623. bool value;
  1624. is_cancellable() : value(false) {}
  1625. is_cancellable(bool b) : value(b) {}
  1626. is_cancellable &operator=(bool b) {
  1627. value = b;
  1628. return *this;
  1629. }
  1630. operator bool() const { return value; }
  1631. };
  1632. template <> struct is_cancellable<false> {
  1633. is_cancellable &operator=(bool b) { return *this; }
  1634. constexpr operator bool() const { return false; }
  1635. };
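// Illustrative (hypothetical) use of is_cancellable<>:
//   is_cancellable<false> c;  // stateless; operator bool() is constexpr false,
//   if (c) { ... }            // so this branch can fold away at compile time
//   is_cancellable<true> c2 = true; // ordinary boolean semantics
// This lets __kmp_barrier_template below share one body for both modes without
// paying for the flag when cancellation support is not instantiated.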
  1636. // Internal function to do a barrier.
/* If is_split is true, do a split barrier, otherwise do a plain barrier.
   If reduce is non-NULL, do a split reduction barrier, otherwise do a barrier
   without reduction.
   When cancellable = false,
     Returns 0 if primary thread, 1 if worker thread.
   When cancellable = true,
     Returns 0 if not cancelled, 1 if cancelled. */
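// Usage note grounded in the code below: in a split barrier the primary thread
// (status 0) returns right after the gather without releasing the team, and is
// expected to release the workers later via __kmp_end_split_barrier() (defined
// further down); workers always fall through to the release phase here and
// block in it, so is_split changes behavior only for the primary thread.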
  1644. template <bool cancellable = false>
  1645. static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
  1646. size_t reduce_size, void *reduce_data,
  1647. void (*reduce)(void *, void *)) {
  1648. KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
  1649. KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
  1650. int tid = __kmp_tid_from_gtid(gtid);
  1651. kmp_info_t *this_thr = __kmp_threads[gtid];
  1652. kmp_team_t *team = this_thr->th.th_team;
  1653. int status = 0;
  1654. is_cancellable<cancellable> cancelled;
  1655. #if OMPT_SUPPORT && OMPT_OPTIONAL
  1656. ompt_data_t *my_task_data;
  1657. ompt_data_t *my_parallel_data;
  1658. void *return_address;
  1659. ompt_sync_region_t barrier_kind;
  1660. #endif
  1661. KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
  1662. __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
  1663. #if OMPT_SUPPORT
  1664. if (ompt_enabled.enabled) {
  1665. #if OMPT_OPTIONAL
  1666. my_task_data = OMPT_CUR_TASK_DATA(this_thr);
  1667. my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
  1668. return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
  1669. barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
  1670. if (ompt_enabled.ompt_callback_sync_region) {
  1671. ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
  1672. barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
  1673. return_address);
  1674. }
  1675. if (ompt_enabled.ompt_callback_sync_region_wait) {
  1676. ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
  1677. barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
  1678. return_address);
  1679. }
  1680. #endif
  1681. // It is OK to report the barrier state after the barrier begin callback.
  1682. // According to the OMPT specification, a compliant implementation may
  1683. // even delay reporting this state until the barrier begins to wait.
  1684. this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
  1685. }
  1686. #endif
  1687. if (!team->t.t_serialized) {
  1688. #if USE_ITT_BUILD
  1689. // This value will be used in itt notify events below.
  1690. void *itt_sync_obj = NULL;
  1691. #if USE_ITT_NOTIFY
  1692. if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
  1693. itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
  1694. #endif
  1695. #endif /* USE_ITT_BUILD */
  1696. if (__kmp_tasking_mode == tskm_extra_barrier) {
  1697. __kmp_tasking_barrier(team, this_thr, gtid);
  1698. KA_TRACE(15,
  1699. ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
  1700. __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
  1701. }
  1702. /* Copy the blocktime info to the thread, where __kmp_wait_template() can
  1703. access it when the team struct is not guaranteed to exist. */
  1704. // See note about the corresponding code in __kmp_join_barrier() being
  1705. // performance-critical.
  1706. if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
  1707. #if KMP_USE_MONITOR
  1708. this_thr->th.th_team_bt_intervals =
  1709. team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
  1710. this_thr->th.th_team_bt_set =
  1711. team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
  1712. #else
  1713. this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
  1714. #endif
  1715. }
  1716. #if USE_ITT_BUILD
  1717. if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
  1718. __kmp_itt_barrier_starting(gtid, itt_sync_obj);
  1719. #endif /* USE_ITT_BUILD */
  1720. #if USE_DEBUGGER
  1721. // Let the debugger know: the thread arrived to the barrier and waiting.
  1722. if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct
  1723. team->t.t_bar[bt].b_master_arrived += 1;
  1724. } else {
  1725. this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
  1726. } // if
  1727. #endif /* USE_DEBUGGER */
  1728. if (reduce != NULL) {
  1729. // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
  1730. this_thr->th.th_local.reduce_data = reduce_data;
  1731. }
  1732. if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
  1733. // use 0 to only setup the current team if nthreads > 1
  1734. __kmp_task_team_setup(this_thr, team, 0);
  1735. if (cancellable) {
  1736. cancelled = __kmp_linear_barrier_gather_cancellable(
  1737. bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
  1738. } else {
  1739. switch (__kmp_barrier_gather_pattern[bt]) {
  1740. case bp_dist_bar: {
  1741. __kmp_dist_barrier_gather(bt, this_thr, gtid, tid,
  1742. reduce USE_ITT_BUILD_ARG(itt_sync_obj));
  1743. break;
  1744. }
  1745. case bp_hyper_bar: {
  1746. // don't set branch bits to 0; use linear
  1747. KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
  1748. __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
  1749. reduce USE_ITT_BUILD_ARG(itt_sync_obj));
  1750. break;
  1751. }
  1752. case bp_hierarchical_bar: {
  1753. __kmp_hierarchical_barrier_gather(
  1754. bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
  1755. break;
  1756. }
  1757. case bp_tree_bar: {
  1758. // don't set branch bits to 0; use linear
  1759. KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
  1760. __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
  1761. reduce USE_ITT_BUILD_ARG(itt_sync_obj));
  1762. break;
  1763. }
  1764. default: {
  1765. __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
  1766. reduce USE_ITT_BUILD_ARG(itt_sync_obj));
  1767. }
  1768. }
  1769. }
  1770. KMP_MB();
  1771. if (KMP_MASTER_TID(tid)) {
  1772. status = 0;
  1773. if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
  1774. __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
  1775. }
  1776. #if USE_DEBUGGER
  1777. // Let the debugger know: All threads are arrived and starting leaving the
  1778. // barrier.
  1779. team->t.t_bar[bt].b_team_arrived += 1;
  1780. #endif
  1781. if (__kmp_omp_cancellation) {
  1782. kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
  1783. // Reset cancellation flag for worksharing constructs
  1784. if (cancel_request == cancel_loop ||
  1785. cancel_request == cancel_sections) {
  1786. KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
  1787. }
  1788. }
  1789. #if USE_ITT_BUILD
  1790. /* TODO: In case of split reduction barrier, primary thread may send
  1791. acquired event early, before the final summation into the shared
  1792. variable is done (final summation can be a long operation for array
  1793. reductions). */
  1794. if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
  1795. __kmp_itt_barrier_middle(gtid, itt_sync_obj);
  1796. #endif /* USE_ITT_BUILD */
  1797. #if USE_ITT_BUILD && USE_ITT_NOTIFY
  1798. // Barrier - report frame end (only if active_level == 1)
  1799. if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
  1800. __kmp_forkjoin_frames_mode &&
  1801. (this_thr->th.th_teams_microtask == NULL || // either not in teams
  1802. this_thr->th.th_teams_size.nteams == 1) && // or inside single team
  1803. team->t.t_active_level == 1) {
  1804. ident_t *loc = __kmp_threads[gtid]->th.th_ident;
  1805. kmp_uint64 cur_time = __itt_get_timestamp();
  1806. kmp_info_t **other_threads = team->t.t_threads;
  1807. int nproc = this_thr->th.th_team_nproc;
  1808. int i;
  1809. switch (__kmp_forkjoin_frames_mode) {
  1810. case 1:
  1811. __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
  1812. loc, nproc);
  1813. this_thr->th.th_frame_time = cur_time;
  1814. break;
  1815. case 2: // AC 2015-01-19: currently does not work for hierarchical (to
  1816. // be fixed)
  1817. __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
  1818. 1, loc, nproc);
  1819. break;
  1820. case 3:
  1821. if (__itt_metadata_add_ptr) {
  1822. // Initialize with primary thread's wait time
  1823. kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
  1824. // Set arrive time to zero to be able to check it in
  1825. // __kmp_invoke_task(); the same is done inside the loop below
  1826. this_thr->th.th_bar_arrive_time = 0;
  1827. for (i = 1; i < nproc; ++i) {
  1828. delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
  1829. other_threads[i]->th.th_bar_arrive_time = 0;
  1830. }
  1831. __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
  1832. cur_time, delta,
  1833. (kmp_uint64)(reduce != NULL));
  1834. }
  1835. __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
  1836. loc, nproc);
  1837. this_thr->th.th_frame_time = cur_time;
  1838. break;
  1839. }
  1840. }
  1841. #endif /* USE_ITT_BUILD */
  1842. } else {
  1843. status = 1;
  1844. #if USE_ITT_BUILD
  1845. if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
  1846. __kmp_itt_barrier_middle(gtid, itt_sync_obj);
  1847. #endif /* USE_ITT_BUILD */
  1848. }
  1849. if ((status == 1 || !is_split) && !cancelled) {
  1850. if (cancellable) {
  1851. cancelled = __kmp_linear_barrier_release_cancellable(
  1852. bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
  1853. } else {
  1854. switch (__kmp_barrier_release_pattern[bt]) {
  1855. case bp_dist_bar: {
  1856. KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
  1857. __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
  1858. FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
  1859. break;
  1860. }
  1861. case bp_hyper_bar: {
  1862. KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
  1863. __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
  1864. FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
  1865. break;
  1866. }
  1867. case bp_hierarchical_bar: {
  1868. __kmp_hierarchical_barrier_release(
  1869. bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
  1870. break;
  1871. }
  1872. case bp_tree_bar: {
  1873. KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
  1874. __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
  1875. FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
  1876. break;
  1877. }
  1878. default: {
  1879. __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
  1880. FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
  1881. }
  1882. }
  1883. }
  1884. if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
  1885. __kmp_task_team_sync(this_thr, team);
  1886. }
  1887. }
  1888. #if USE_ITT_BUILD
  1889. /* GEH: TODO: Move this under if-condition above and also include in
  1890. __kmp_end_split_barrier(). This will more accurately represent the actual
  1891. release time of the threads for split barriers. */
  1892. if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
  1893. __kmp_itt_barrier_finished(gtid, itt_sync_obj);
  1894. #endif /* USE_ITT_BUILD */
  1895. } else { // Team is serialized.
  1896. status = 0;
  1897. if (__kmp_tasking_mode != tskm_immediate_exec) {
  1898. if (this_thr->th.th_task_team != NULL) {
  1899. #if USE_ITT_NOTIFY
  1900. void *itt_sync_obj = NULL;
  1901. if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
  1902. itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
  1903. __kmp_itt_barrier_starting(gtid, itt_sync_obj);
  1904. }
  1905. #endif
  1906. KMP_DEBUG_ASSERT(
  1907. this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE ||
  1908. this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered ==
  1909. TRUE);
  1910. __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
  1911. __kmp_task_team_setup(this_thr, team, 0);
  1912. #if USE_ITT_BUILD
  1913. if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
  1914. __kmp_itt_barrier_finished(gtid, itt_sync_obj);
  1915. #endif /* USE_ITT_BUILD */
  1916. }
  1917. }
  1918. }
  1919. KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
  1920. gtid, __kmp_team_from_gtid(gtid)->t.t_id,
  1921. __kmp_tid_from_gtid(gtid), status));
  1922. #if OMPT_SUPPORT
  1923. if (ompt_enabled.enabled) {
  1924. #if OMPT_OPTIONAL
  1925. if (ompt_enabled.ompt_callback_sync_region_wait) {
  1926. ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
  1927. barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
  1928. return_address);
  1929. }
  1930. if (ompt_enabled.ompt_callback_sync_region) {
  1931. ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
  1932. barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
  1933. return_address);
  1934. }
  1935. #endif
  1936. this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
  1937. }
  1938. #endif
  1939. if (cancellable)
  1940. return (int)cancelled;
  1941. return status;
  1942. }
  1943. // Returns 0 if primary thread, 1 if worker thread.
  1944. int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
  1945. size_t reduce_size, void *reduce_data,
  1946. void (*reduce)(void *, void *)) {
  1947. return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
  1948. reduce);
  1949. }
  1950. #if defined(KMP_GOMP_COMPAT)
  1951. // Returns 1 if cancelled, 0 otherwise
  1952. int __kmp_barrier_gomp_cancel(int gtid) {
  1953. if (__kmp_omp_cancellation) {
  1954. int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
  1955. 0, NULL, NULL);
  1956. if (cancelled) {
  1957. int tid = __kmp_tid_from_gtid(gtid);
  1958. kmp_info_t *this_thr = __kmp_threads[gtid];
  1959. if (KMP_MASTER_TID(tid)) {
  1960. // Primary thread does not need to revert anything
  1961. } else {
  1962. // Workers need to revert their private b_arrived flag
  1963. this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
  1964. KMP_BARRIER_STATE_BUMP;
  1965. }
  1966. }
  1967. return cancelled;
  1968. }
  1969. __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
  1970. return FALSE;
  1971. }
  1972. #endif
  1973. void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
  1974. KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
  1975. KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
  1976. KMP_DEBUG_ASSERT(bt < bs_last_barrier);
  1977. int tid = __kmp_tid_from_gtid(gtid);
  1978. kmp_info_t *this_thr = __kmp_threads[gtid];
  1979. kmp_team_t *team = this_thr->th.th_team;
  1980. if (!team->t.t_serialized) {
  1981. if (KMP_MASTER_GTID(gtid)) {
  1982. switch (__kmp_barrier_release_pattern[bt]) {
  1983. case bp_dist_bar: {
  1984. __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
  1985. FALSE USE_ITT_BUILD_ARG(NULL));
  1986. break;
  1987. }
  1988. case bp_hyper_bar: {
  1989. KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
  1990. __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
  1991. FALSE USE_ITT_BUILD_ARG(NULL));
  1992. break;
  1993. }
  1994. case bp_hierarchical_bar: {
  1995. __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
  1996. FALSE USE_ITT_BUILD_ARG(NULL));
  1997. break;
  1998. }
  1999. case bp_tree_bar: {
  2000. KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
  2001. __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
  2002. FALSE USE_ITT_BUILD_ARG(NULL));
  2003. break;
  2004. }
  2005. default: {
  2006. __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
  2007. FALSE USE_ITT_BUILD_ARG(NULL));
  2008. }
  2009. }
  2010. if (__kmp_tasking_mode != tskm_immediate_exec) {
  2011. __kmp_task_team_sync(this_thr, team);
  2012. } // if
  2013. }
  2014. }
  2015. }
  2016. void __kmp_join_barrier(int gtid) {
  2017. KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
  2018. KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
  2019. KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
  2020. kmp_info_t *this_thr = __kmp_threads[gtid];
  2021. kmp_team_t *team;
  2022. int tid;
  2023. #ifdef KMP_DEBUG
  2024. int team_id;
  2025. #endif /* KMP_DEBUG */
  2026. #if USE_ITT_BUILD
  2027. void *itt_sync_obj = NULL;
  2028. #if USE_ITT_NOTIFY
  2029. if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
  2030. // Get object created at fork_barrier
  2031. itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
  2032. #endif
  2033. #endif /* USE_ITT_BUILD */
  2034. #if ((USE_ITT_BUILD && USE_ITT_NOTIFY) || defined KMP_DEBUG)
  2035. int nproc = this_thr->th.th_team_nproc;
  2036. #endif
  2037. KMP_MB();
  2038. // Get current info
  2039. team = this_thr->th.th_team;
  2040. KMP_DEBUG_ASSERT(nproc == team->t.t_nproc);
  2041. tid = __kmp_tid_from_gtid(gtid);
  2042. #ifdef KMP_DEBUG
  2043. team_id = team->t.t_id;
  2044. kmp_info_t *master_thread = this_thr->th.th_team_master;
  2045. if (master_thread != team->t.t_threads[0]) {
  2046. __kmp_print_structure();
  2047. }
  2048. #endif /* KMP_DEBUG */
  2049. KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
  2050. KMP_MB();
  2051. // Verify state
  2052. KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
  2053. KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
  2054. KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
  2055. KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
  2056. gtid, team_id, tid));
  2057. #if OMPT_SUPPORT
  2058. if (ompt_enabled.enabled) {
  2059. #if OMPT_OPTIONAL
  2060. ompt_data_t *my_task_data;
  2061. ompt_data_t *my_parallel_data;
  2062. void *codeptr = NULL;
  2063. int ds_tid = this_thr->th.th_info.ds.ds_tid;
  2064. if (KMP_MASTER_TID(ds_tid) &&
  2065. (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
  2066. ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
  2067. codeptr = team->t.ompt_team_info.master_return_address;
  2068. my_task_data = OMPT_CUR_TASK_DATA(this_thr);
  2069. my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
  2070. if (ompt_enabled.ompt_callback_sync_region) {
  2071. ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
  2072. ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
  2073. my_task_data, codeptr);
  2074. }
  2075. if (ompt_enabled.ompt_callback_sync_region_wait) {
  2076. ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
  2077. ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
  2078. my_task_data, codeptr);
  2079. }
  2080. if (!KMP_MASTER_TID(ds_tid))
  2081. this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
  2082. #endif
  2083. this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
  2084. }
  2085. #endif
  2086. if (__kmp_tasking_mode == tskm_extra_barrier) {
  2087. __kmp_tasking_barrier(team, this_thr, gtid);
  2088. KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n",
  2089. gtid, team_id, tid));
  2090. }
  2091. #ifdef KMP_DEBUG
  2092. if (__kmp_tasking_mode != tskm_immediate_exec) {
  2093. KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
  2094. "%p, th_task_team = %p\n",
  2095. __kmp_gtid_from_thread(this_thr), team_id,
  2096. team->t.t_task_team[this_thr->th.th_task_state],
  2097. this_thr->th.th_task_team));
  2098. if (this_thr->th.th_task_team)
  2099. KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
  2100. team->t.t_task_team[this_thr->th.th_task_state]);
  2101. }
  2102. #endif /* KMP_DEBUG */
  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
     access it when the team struct is not guaranteed to exist. Doing these
     loads causes a cache miss and slows down EPCC parallel by 2x. As a
     workaround, we do not perform the copy if blocktime=infinite, since the
     values are not used by __kmp_wait_template() in that case. */
  2108. if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
  2109. #if KMP_USE_MONITOR
  2110. this_thr->th.th_team_bt_intervals =
  2111. team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
  2112. this_thr->th.th_team_bt_set =
  2113. team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
  2114. #else
  2115. this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
  2116. #endif
  2117. }
  2118. #if USE_ITT_BUILD
  2119. if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
  2120. __kmp_itt_barrier_starting(gtid, itt_sync_obj);
  2121. #endif /* USE_ITT_BUILD */
  2122. switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
  2123. case bp_dist_bar: {
  2124. __kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
  2125. NULL USE_ITT_BUILD_ARG(itt_sync_obj));
  2126. break;
  2127. }
  2128. case bp_hyper_bar: {
  2129. KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
  2130. __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
  2131. NULL USE_ITT_BUILD_ARG(itt_sync_obj));
  2132. break;
  2133. }
  2134. case bp_hierarchical_bar: {
  2135. __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
  2136. NULL USE_ITT_BUILD_ARG(itt_sync_obj));
  2137. break;
  2138. }
  2139. case bp_tree_bar: {
  2140. KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
  2141. __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
  2142. NULL USE_ITT_BUILD_ARG(itt_sync_obj));
  2143. break;
  2144. }
  2145. default: {
  2146. __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
  2147. NULL USE_ITT_BUILD_ARG(itt_sync_obj));
  2148. }
  2149. }
  2150. /* From this point on, the team data structure may be deallocated at any time
  2151. by the primary thread - it is unsafe to reference it in any of the worker
  2152. threads. Any per-team data items that need to be referenced before the
  2153. end of the barrier should be moved to the kmp_task_team_t structs. */
  2154. if (KMP_MASTER_TID(tid)) {
  2155. if (__kmp_tasking_mode != tskm_immediate_exec) {
  2156. __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
  2157. }
  2158. if (__kmp_display_affinity) {
  2159. KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
  2160. }
  2161. #if KMP_STATS_ENABLED
    // Have the primary thread flag the workers to indicate they are now
    // waiting for the next parallel region; also wake them up so they switch
    // their timers to idle.
  2165. for (int i = 0; i < team->t.t_nproc; ++i) {
  2166. kmp_info_t *team_thread = team->t.t_threads[i];
  2167. if (team_thread == this_thr)
  2168. continue;
  2169. team_thread->th.th_stats->setIdleFlag();
  2170. if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
  2171. team_thread->th.th_sleep_loc != NULL)
  2172. __kmp_null_resume_wrapper(team_thread);
  2173. }
  2174. #endif
  2175. #if USE_ITT_BUILD
  2176. if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
  2177. __kmp_itt_barrier_middle(gtid, itt_sync_obj);
  2178. #endif /* USE_ITT_BUILD */
  2179. #if USE_ITT_BUILD && USE_ITT_NOTIFY
  2180. // Join barrier - report frame end
  2181. if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
  2182. __kmp_forkjoin_frames_mode &&
  2183. (this_thr->th.th_teams_microtask == NULL || // either not in teams
  2184. this_thr->th.th_teams_size.nteams == 1) && // or inside single team
  2185. team->t.t_active_level == 1) {
  2186. kmp_uint64 cur_time = __itt_get_timestamp();
  2187. ident_t *loc = team->t.t_ident;
  2188. kmp_info_t **other_threads = team->t.t_threads;
  2189. switch (__kmp_forkjoin_frames_mode) {
  2190. case 1:
  2191. __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
  2192. loc, nproc);
  2193. break;
  2194. case 2:
  2195. __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
  2196. loc, nproc);
  2197. break;
  2198. case 3:
  2199. if (__itt_metadata_add_ptr) {
  2200. // Initialize with primary thread's wait time
  2201. kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
  2202. // Set arrive time to zero to be able to check it in
  2203. // __kmp_invoke_task(); the same is done inside the loop below
  2204. this_thr->th.th_bar_arrive_time = 0;
  2205. for (int i = 1; i < nproc; ++i) {
  2206. delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
  2207. other_threads[i]->th.th_bar_arrive_time = 0;
  2208. }
  2209. __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
  2210. cur_time, delta, 0);
  2211. }
  2212. __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
  2213. loc, nproc);
  2214. this_thr->th.th_frame_time = cur_time;
  2215. break;
  2216. }
  2217. }
  2218. #endif /* USE_ITT_BUILD */
  2219. }
  2220. #if USE_ITT_BUILD
  2221. else {
  2222. if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
  2223. __kmp_itt_barrier_middle(gtid, itt_sync_obj);
  2224. }
  2225. #endif /* USE_ITT_BUILD */
  2226. #if KMP_DEBUG
  2227. if (KMP_MASTER_TID(tid)) {
  2228. KA_TRACE(
  2229. 15,
  2230. ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
  2231. gtid, team_id, tid, nproc));
  2232. }
  2233. #endif /* KMP_DEBUG */
  2234. // TODO now, mark worker threads as done so they may be disbanded
  2235. KMP_MB(); // Flush all pending memory write invalidates.
  2236. KA_TRACE(10,
  2237. ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
  2238. }
  2239. // TODO release worker threads' fork barriers as we are ready instead of all at
  2240. // once
  2241. void __kmp_fork_barrier(int gtid, int tid) {
  2242. KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
  2243. KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
  2244. kmp_info_t *this_thr = __kmp_threads[gtid];
  2245. kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
  2246. #if USE_ITT_BUILD
  2247. void *itt_sync_obj = NULL;
  2248. #endif /* USE_ITT_BUILD */
  2249. if (team)
  2250. KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
  2251. (team != NULL) ? team->t.t_id : -1, tid));
  2252. // th_team pointer only valid for primary thread here
  2253. if (KMP_MASTER_TID(tid)) {
  2254. #if USE_ITT_BUILD && USE_ITT_NOTIFY
  2255. if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
  2256. // Create itt barrier object
  2257. itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
  2258. __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
  2259. }
  2260. #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
  2261. #ifdef KMP_DEBUG
  2262. KMP_DEBUG_ASSERT(team);
  2263. kmp_info_t **other_threads = team->t.t_threads;
  2264. int i;
  2265. // Verify state
  2266. KMP_MB();
  2267. for (i = 1; i < team->t.t_nproc; ++i) {
  2268. KA_TRACE(500,
  2269. ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
  2270. "== %u.\n",
  2271. gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
  2272. team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
  2273. other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
  2274. KMP_DEBUG_ASSERT(
  2275. (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
  2276. ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
  2277. KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
  2278. }
  2279. #endif
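  // Note on the debug checks above: a worker that went to sleep while waiting
  // at the fork barrier may have KMP_BARRIER_SLEEP_STATE set in its b_go, so
  // that bit is masked off before comparing against KMP_INIT_BARRIER_STATE.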
  2280. if (__kmp_tasking_mode != tskm_immediate_exec) {
  2281. // 0 indicates setup current task team if nthreads > 1
  2282. __kmp_task_team_setup(this_thr, team, 0);
  2283. }
  2284. /* The primary thread may have changed its blocktime between join barrier
  2285. and fork barrier. Copy the blocktime info to the thread, where
  2286. __kmp_wait_template() can access it when the team struct is not
  2287. guaranteed to exist. */
  2288. // See note about the corresponding code in __kmp_join_barrier() being
  2289. // performance-critical
  2290. if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
  2291. #if KMP_USE_MONITOR
  2292. this_thr->th.th_team_bt_intervals =
  2293. team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
  2294. this_thr->th.th_team_bt_set =
  2295. team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
  2296. #else
  2297. this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
  2298. #endif
  2299. }
  2300. } // primary thread
  2301. switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
  2302. case bp_dist_bar: {
  2303. __kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
  2304. TRUE USE_ITT_BUILD_ARG(NULL));
  2305. break;
  2306. }
  2307. case bp_hyper_bar: {
  2308. KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
  2309. __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
  2310. TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
  2311. break;
  2312. }
  2313. case bp_hierarchical_bar: {
  2314. __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
  2315. TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
  2316. break;
  2317. }
  2318. case bp_tree_bar: {
  2319. KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
  2320. __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
  2321. TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
  2322. break;
  2323. }
  2324. default: {
  2325. __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
  2326. TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
  2327. }
  2328. }
#if OMPT_SUPPORT
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    ompt_data_t *task_data = (team)
                                 ? OMPT_CUR_TASK_DATA(this_thr)
                                 : &(this_thr->th.ompt_thread_info.task_data);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
#if OMPT_OPTIONAL
    void *codeptr = NULL;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
#endif
    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, task_data, 0, ds_tid,
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
    }
  }
#endif

  // Early exit for reaping threads releasing forkjoin barrier
  if (TCR_4(__kmp_global.g.g_done)) {
    this_thr->th.th_task_team = NULL;

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      if (!KMP_MASTER_TID(tid)) {
        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
        if (itt_sync_obj)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
      }
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
    return;
  }

  /* We can now assume that a valid team structure has been allocated by the
     primary thread and propagated to all worker threads. The current thread,
     however, may not be part of the team, so we can't blindly assume that the
     team pointer is non-null. */
  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
  KMP_DEBUG_ASSERT(team != NULL);
  tid = __kmp_tid_from_gtid(gtid);

#if KMP_BARRIER_ICV_PULL
  /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
     __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
     implicit task has this data before this function is called. We cannot
     modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's
     thread struct, because it is not always the case that the threads arrays
     have been allocated when __kmp_fork_call() is executed. */
  {
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
    if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs
      // Copy the initial ICVs from the primary thread's thread struct to the
      // implicit task for this tid.
      KA_TRACE(10,
               ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
      __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
                               tid, FALSE);
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &team->t.t_threads[0]
                     ->th.th_bar[bs_forkjoin_barrier]
                     .bb.th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PULL
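
  // Make this thread's task team pointer consistent with the task team the
  // primary thread set up for the new parallel region.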
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    __kmp_task_team_sync(this_thr, team);
  }
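
  // Apply the team's affinity policy: balanced affinity for proc_bind_intel
  // (when the team size changed), or an explicit place binding for the other
  // non-false proc-bind policies.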
#if KMP_AFFINITY_SUPPORTED
  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
  if (proc_bind == proc_bind_intel) {
    // Call dynamic affinity settings
    if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
      __kmp_balanced_affinity(this_thr, team->t.t_nproc);
    }
  } else if (proc_bind != proc_bind_false) {
    if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
      KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
                     __kmp_gtid_from_thread(this_thr),
                     this_thr->th.th_current_place));
    } else {
      __kmp_affinity_set_place(gtid);
    }
  }
#endif // KMP_AFFINITY_SUPPORTED
  // Perform the display affinity functionality
  if (__kmp_display_affinity) {
    if (team->t.t_display_affinity
#if KMP_AFFINITY_SUPPORTED
        || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
#endif
    ) {
      // NULL means use the affinity-format-var ICV
      __kmp_aux_display_affinity(gtid, NULL);
      this_thr->th.th_prev_num_threads = team->t.t_nproc;
      this_thr->th.th_prev_level = team->t.t_level;
    }
  }
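
  // Worker threads adopt the team's default memory allocator.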
  if (!KMP_MASTER_TID(tid))
    KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
    if (!KMP_MASTER_TID(tid)) {
      // Get correct barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
    } // (prepare called inside barrier_release)
  }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
                team->t.t_id, tid));
}
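
// Propagate a new set of internal control variables (ICVs) to a (re)formed
// team. Depending on the configured propagation scheme, the ICVs are either
// staged in the primary thread's barrier data for the workers to pull at the
// fork barrier, pushed during the fork barrier release, or copied linearly to
// each worker's implicit task right here.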
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);

  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);

  /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
     __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
     implicit task has this data before this function is called. */
#if KMP_BARRIER_ICV_PULL
  /* Copy ICVs to primary thread's thread structure into th_fixed_icvs (which
     remains untouched), where all of the worker threads can access them and
     make their own copies after the barrier. */
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  copy_icvs(
      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
      new_icvs);
  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
  // The ICVs will be propagated in the fork barrier, so nothing needs to be
  // done here.
  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#else
  // Copy the ICVs to each of the non-primary threads. This takes O(nthreads)
  // time.
  ngo_load(new_icvs);
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread
    // TODO: GEH - pass in better source location info since usually NULL here
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
    __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
    ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
  }
  ngo_sync();
#endif // KMP_BARRIER_ICV_PULL
}