/*
 * kmp_taskdeps.cpp
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

//#define KMP_SUPPORT_GRAPH_OUTPUT 1

#include "kmp.h"
#include "kmp_io.h"
#include "kmp_wait_release.h"
#include "kmp_taskdeps.h"
#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif
// TODO: Improve memory allocation? keep a list of pre-allocated structures?
// allocate in blocks? re-use finished list entries?
// TODO: don't use atomic ref counters for stack-allocated nodes.
// TODO: find an alternate to atomic refs for heap-allocated nodes?
// TODO: Finish graph output support
// TODO: kmp_lock_t seems a tad too big (and heavy weight) for this. Check other
// runtime locks
// TODO: Any ITT support needed?
#ifdef KMP_SUPPORT_GRAPH_OUTPUT
static std::atomic<kmp_int32> kmp_node_id_seed = ATOMIC_VAR_INIT(0);
#endif
static void __kmp_init_node(kmp_depnode_t *node) {
  node->dn.successors = NULL;
  node->dn.task = NULL; // will point to the right task
  // once dependences have been processed
  for (int i = 0; i < MAX_MTX_DEPS; ++i)
    node->dn.mtx_locks[i] = NULL;
  node->dn.mtx_num_locks = 0;
  __kmp_init_lock(&node->dn.lock);
  KMP_ATOMIC_ST_RLX(&node->dn.nrefs, 1); // init creates the first reference
#ifdef KMP_SUPPORT_GRAPH_OUTPUT
  node->dn.id = KMP_ATOMIC_INC(&kmp_node_id_seed);
#endif
#if USE_ITT_BUILD && USE_ITT_NOTIFY
  __itt_sync_create(node, "OMP task dep node", NULL, 0);
#endif
}
static inline kmp_depnode_t *__kmp_node_ref(kmp_depnode_t *node) {
  KMP_ATOMIC_INC(&node->dn.nrefs);
  return node;
}

enum { KMP_DEPHASH_OTHER_SIZE = 97, KMP_DEPHASH_MASTER_SIZE = 997 };

size_t sizes[] = {997, 2003, 4001, 8191, 16001, 32003, 64007, 131071, 270029};
const size_t MAX_GEN = 8;

static inline size_t __kmp_dephash_hash(kmp_intptr_t addr, size_t hsize) {
  // TODO alternate to try: set = (((Addr64)(addrUsefulBits * 9.618)) %
  // m_num_sets );
  return ((addr >> 6) ^ (addr >> 2)) % hsize;
}
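/* Worked example (illustrative only, not part of the runtime): for a
   dependence address of 0x1000 and hsize == KMP_DEPHASH_OTHER_SIZE (97),
     ((0x1000 >> 6) ^ (0x1000 >> 2)) % 97 == (0x40 ^ 0x400) % 97
                                          == 1088 % 97 == 21,
   so the corresponding entry lives in bucket 21. The shifts presumably drop
   the low address bits, which are mostly constant for aligned objects,
   before mixing. */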
static kmp_dephash_t *__kmp_dephash_extend(kmp_info_t *thread,
                                           kmp_dephash_t *current_dephash) {
  kmp_dephash_t *h;

  size_t gen = current_dephash->generation + 1;
  if (gen >= MAX_GEN)
    return current_dephash;
  size_t new_size = sizes[gen];

  size_t size_to_allocate =
      new_size * sizeof(kmp_dephash_entry_t *) + sizeof(kmp_dephash_t);

#if USE_FAST_MEMORY
  h = (kmp_dephash_t *)__kmp_fast_allocate(thread, size_to_allocate);
#else
  h = (kmp_dephash_t *)__kmp_thread_malloc(thread, size_to_allocate);
#endif

  h->size = new_size;
  h->nelements = current_dephash->nelements;
  h->buckets = (kmp_dephash_entry **)(h + 1);
  h->generation = gen;
  h->nconflicts = 0;
  h->last_all = current_dephash->last_all;

  // make sure buckets are properly initialized
  for (size_t i = 0; i < new_size; i++) {
    h->buckets[i] = NULL;
  }

  // insert existing elements in the new table
  for (size_t i = 0; i < current_dephash->size; i++) {
    kmp_dephash_entry_t *next, *entry;
    for (entry = current_dephash->buckets[i]; entry; entry = next) {
      next = entry->next_in_bucket;
      // Compute the new hash using the new size, and insert the entry in
      // the new bucket.
      size_t new_bucket = __kmp_dephash_hash(entry->addr, h->size);
      entry->next_in_bucket = h->buckets[new_bucket];
      if (entry->next_in_bucket) {
        h->nconflicts++;
      }
      h->buckets[new_bucket] = entry;
    }
  }

  // Free old hash table
#if USE_FAST_MEMORY
  __kmp_fast_free(thread, current_dephash);
#else
  __kmp_thread_free(thread, current_dephash);
#endif

  return h;
}
static kmp_dephash_t *__kmp_dephash_create(kmp_info_t *thread,
                                           kmp_taskdata_t *current_task) {
  kmp_dephash_t *h;

  size_t h_size;

  if (current_task->td_flags.tasktype == TASK_IMPLICIT)
    h_size = KMP_DEPHASH_MASTER_SIZE;
  else
    h_size = KMP_DEPHASH_OTHER_SIZE;

  size_t size = h_size * sizeof(kmp_dephash_entry_t *) + sizeof(kmp_dephash_t);

#if USE_FAST_MEMORY
  h = (kmp_dephash_t *)__kmp_fast_allocate(thread, size);
#else
  h = (kmp_dephash_t *)__kmp_thread_malloc(thread, size);
#endif
  h->size = h_size;

  h->generation = 0;
  h->nelements = 0;
  h->nconflicts = 0;
  h->buckets = (kmp_dephash_entry **)(h + 1);
  h->last_all = NULL;

  for (size_t i = 0; i < h_size; i++)
    h->buckets[i] = 0;

  return h;
}
static kmp_dephash_entry *__kmp_dephash_find(kmp_info_t *thread,
                                             kmp_dephash_t **hash,
                                             kmp_intptr_t addr) {
  kmp_dephash_t *h = *hash;
  if (h->nelements != 0 && h->nconflicts / h->size >= 1) {
    *hash = __kmp_dephash_extend(thread, h);
    h = *hash;
  }
  size_t bucket = __kmp_dephash_hash(addr, h->size);

  kmp_dephash_entry_t *entry;
  for (entry = h->buckets[bucket]; entry; entry = entry->next_in_bucket)
    if (entry->addr == addr)
      break;

  if (entry == NULL) {
    // create entry. This is only done by one thread so no locking required
#if USE_FAST_MEMORY
    entry = (kmp_dephash_entry_t *)__kmp_fast_allocate(
        thread, sizeof(kmp_dephash_entry_t));
#else
    entry = (kmp_dephash_entry_t *)__kmp_thread_malloc(
        thread, sizeof(kmp_dephash_entry_t));
#endif
    entry->addr = addr;
    if (!h->last_all) // no predecessor task with omp_all_memory dependence
      entry->last_out = NULL;
    else // else link the omp_all_memory depnode to the new entry
      entry->last_out = __kmp_node_ref(h->last_all);
    entry->last_set = NULL;
    entry->prev_set = NULL;
    entry->last_flag = 0;
    entry->mtx_lock = NULL;
    entry->next_in_bucket = h->buckets[bucket];
    h->buckets[bucket] = entry;
    h->nelements++;
    if (entry->next_in_bucket)
      h->nconflicts++;
  }
  return entry;
}
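/* Note (illustrative): the table grows lazily on lookup. Once the number of
   chained entries reaches the bucket count (h->nconflicts / h->size >= 1),
   __kmp_dephash_extend() rehashes every entry into a table whose bucket
   count is taken from sizes[] for the next generation, stopping after
   MAX_GEN generations. */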
static kmp_depnode_list_t *__kmp_add_node(kmp_info_t *thread,
                                          kmp_depnode_list_t *list,
                                          kmp_depnode_t *node) {
  kmp_depnode_list_t *new_head;

#if USE_FAST_MEMORY
  new_head = (kmp_depnode_list_t *)__kmp_fast_allocate(
      thread, sizeof(kmp_depnode_list_t));
#else
  new_head = (kmp_depnode_list_t *)__kmp_thread_malloc(
      thread, sizeof(kmp_depnode_list_t));
#endif

  new_head->node = __kmp_node_ref(node);
  new_head->next = list;

  return new_head;
}
static inline void __kmp_track_dependence(kmp_int32 gtid,
                                          kmp_depnode_t *source,
                                          kmp_depnode_t *sink,
                                          kmp_task_t *sink_task) {
#ifdef KMP_SUPPORT_GRAPH_OUTPUT
  kmp_taskdata_t *task_source = KMP_TASK_TO_TASKDATA(source->dn.task);
  // do not use sink->dn.task as that is only filled after the dependences
  // are already processed!
  kmp_taskdata_t *task_sink = KMP_TASK_TO_TASKDATA(sink_task);

  __kmp_printf("%d(%s) -> %d(%s)\n", source->dn.id,
               task_source->td_ident->psource, sink->dn.id,
               task_sink->td_ident->psource);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  /* OMPT tracks dependences between task (a=source, b=sink) in which
     task a blocks the execution of b through the ompt_new_dependence_callback
     */
  if (ompt_enabled.ompt_callback_task_dependence) {
    kmp_taskdata_t *task_source = KMP_TASK_TO_TASKDATA(source->dn.task);
    ompt_data_t *sink_data;
    if (sink_task)
      sink_data = &(KMP_TASK_TO_TASKDATA(sink_task)->ompt_task_info.task_data);
    else
      sink_data = &__kmp_threads[gtid]->th.ompt_thread_info.task_data;

    ompt_callbacks.ompt_callback(ompt_callback_task_dependence)(
        &(task_source->ompt_task_info.task_data), sink_data);
  }
#endif /* OMPT_SUPPORT && OMPT_OPTIONAL */
}
static inline kmp_int32
__kmp_depnode_link_successor(kmp_int32 gtid, kmp_info_t *thread,
                             kmp_task_t *task, kmp_depnode_t *node,
                             kmp_depnode_list_t *plist) {
  if (!plist)
    return 0;
  kmp_int32 npredecessors = 0;
  // link node as successor of list elements
  for (kmp_depnode_list_t *p = plist; p; p = p->next) {
    kmp_depnode_t *dep = p->node;
    if (dep->dn.task) {
      KMP_ACQUIRE_DEPNODE(gtid, dep);
      if (dep->dn.task) {
        __kmp_track_dependence(gtid, dep, node, task);
        dep->dn.successors = __kmp_add_node(thread, dep->dn.successors, node);
        KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to "
                      "%p\n",
                      gtid, KMP_TASK_TO_TASKDATA(dep->dn.task),
                      KMP_TASK_TO_TASKDATA(task)));
        npredecessors++;
      }
      KMP_RELEASE_DEPNODE(gtid, dep);
    }
  }
  return npredecessors;
}
static inline kmp_int32 __kmp_depnode_link_successor(kmp_int32 gtid,
                                                     kmp_info_t *thread,
                                                     kmp_task_t *task,
                                                     kmp_depnode_t *source,
                                                     kmp_depnode_t *sink) {
  if (!sink)
    return 0;
  kmp_int32 npredecessors = 0;
  if (sink->dn.task) {
    // synchronously add source to sink's list of successors
    KMP_ACQUIRE_DEPNODE(gtid, sink);
    if (sink->dn.task) {
      __kmp_track_dependence(gtid, sink, source, task);
      sink->dn.successors = __kmp_add_node(thread, sink->dn.successors, source);
      KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to "
                    "%p\n",
                    gtid, KMP_TASK_TO_TASKDATA(sink->dn.task),
                    KMP_TASK_TO_TASKDATA(task)));
      npredecessors++;
    }
    KMP_RELEASE_DEPNODE(gtid, sink);
  }
  return npredecessors;
}
static inline kmp_int32
__kmp_process_dep_all(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t *h,
                      bool dep_barrier, kmp_task_t *task) {
  KA_TRACE(30, ("__kmp_process_dep_all: T#%d processing dep_all, "
                "dep_barrier = %d\n",
                gtid, dep_barrier));
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_int32 npredecessors = 0;

  // process previous omp_all_memory node if any
  npredecessors +=
      __kmp_depnode_link_successor(gtid, thread, task, node, h->last_all);
  __kmp_node_deref(thread, h->last_all);
  if (!dep_barrier) {
    h->last_all = __kmp_node_ref(node);
  } else {
    // if this is a sync point in the serial sequence, then the previous
    // outputs are guaranteed to be completed after the execution of this
    // task so the previous output nodes can be cleared.
    h->last_all = NULL;
  }

  // process all regular dependences
  for (size_t i = 0; i < h->size; i++) {
    kmp_dephash_entry_t *info = h->buckets[i];
    if (!info) // skip empty slots in dephash
      continue;
    for (; info; info = info->next_in_bucket) {
      // for each entry the omp_all_memory works as OUT dependence
      kmp_depnode_t *last_out = info->last_out;
      kmp_depnode_list_t *last_set = info->last_set;
      kmp_depnode_list_t *prev_set = info->prev_set;
      if (last_set) {
        npredecessors +=
            __kmp_depnode_link_successor(gtid, thread, task, node, last_set);
        __kmp_depnode_list_free(thread, last_set);
        __kmp_depnode_list_free(thread, prev_set);
        info->last_set = NULL;
        info->prev_set = NULL;
        info->last_flag = 0; // no sets in this dephash entry
      } else {
        npredecessors +=
            __kmp_depnode_link_successor(gtid, thread, task, node, last_out);
      }
      __kmp_node_deref(thread, last_out);
      if (!dep_barrier) {
        info->last_out = __kmp_node_ref(node);
      } else {
        info->last_out = NULL;
      }
    }
  }
  KA_TRACE(30, ("__kmp_process_dep_all: T#%d found %d predecessors\n", gtid,
                npredecessors));
  return npredecessors;
}
template <bool filter>
static inline kmp_int32
__kmp_process_deps(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t **hash,
                   bool dep_barrier, kmp_int32 ndeps,
                   kmp_depend_info_t *dep_list, kmp_task_t *task) {
  KA_TRACE(30, ("__kmp_process_deps<%d>: T#%d processing %d dependences : "
                "dep_barrier = %d\n",
                filter, gtid, ndeps, dep_barrier));
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_int32 npredecessors = 0;
  for (kmp_int32 i = 0; i < ndeps; i++) {
    const kmp_depend_info_t *dep = &dep_list[i];

    if (filter && dep->base_addr == 0)
      continue; // skip filtered entries

    kmp_dephash_entry_t *info =
        __kmp_dephash_find(thread, hash, dep->base_addr);
    kmp_depnode_t *last_out = info->last_out;
    kmp_depnode_list_t *last_set = info->last_set;
    kmp_depnode_list_t *prev_set = info->prev_set;

    if (dep->flags.out) { // out or inout --> clean lists if any
      if (last_set) {
        npredecessors +=
            __kmp_depnode_link_successor(gtid, thread, task, node, last_set);
        __kmp_depnode_list_free(thread, last_set);
        __kmp_depnode_list_free(thread, prev_set);
        info->last_set = NULL;
        info->prev_set = NULL;
        info->last_flag = 0; // no sets in this dephash entry
      } else {
        npredecessors +=
            __kmp_depnode_link_successor(gtid, thread, task, node, last_out);
      }
      __kmp_node_deref(thread, last_out);
      if (!dep_barrier) {
        info->last_out = __kmp_node_ref(node);
      } else {
        // if this is a sync point in the serial sequence, then the previous
        // outputs are guaranteed to be completed after the execution of this
        // task so the previous output nodes can be cleared.
        info->last_out = NULL;
      }
    } else { // either IN or MTX or SET
      if (info->last_flag == 0 || info->last_flag == dep->flag) {
        // last_set either didn't exist or of same dep kind
        // link node as successor of the last_out if any
        npredecessors +=
            __kmp_depnode_link_successor(gtid, thread, task, node, last_out);
        // link node as successor of all nodes in the prev_set if any
        npredecessors +=
            __kmp_depnode_link_successor(gtid, thread, task, node, prev_set);
        if (dep_barrier) {
          // clean last_out and prev_set if any; don't touch last_set
          __kmp_node_deref(thread, last_out);
          info->last_out = NULL;
          __kmp_depnode_list_free(thread, prev_set);
          info->prev_set = NULL;
        }
      } else { // last_set is of different dep kind, make it prev_set
        // link node as successor of all nodes in the last_set
        npredecessors +=
            __kmp_depnode_link_successor(gtid, thread, task, node, last_set);
        // clean last_out if any
        __kmp_node_deref(thread, last_out);
        info->last_out = NULL;
        // clean prev_set if any
        __kmp_depnode_list_free(thread, prev_set);
        if (!dep_barrier) {
          // move last_set to prev_set, new last_set will be allocated
          info->prev_set = last_set;
        } else {
          info->prev_set = NULL;
          info->last_flag = 0;
        }
        info->last_set = NULL;
      }
      // for dep_barrier last_flag value should remain:
      // 0 if last_set is empty, unchanged otherwise
      if (!dep_barrier) {
        info->last_flag = dep->flag; // store dep kind of the last_set
        info->last_set = __kmp_add_node(thread, info->last_set, node);
      }
      // check if we are processing MTX dependency
      if (dep->flag == KMP_DEP_MTX) {
        if (info->mtx_lock == NULL) {
          info->mtx_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
          __kmp_init_lock(info->mtx_lock);
        }
        KMP_DEBUG_ASSERT(node->dn.mtx_num_locks < MAX_MTX_DEPS);
        kmp_int32 m;
        // Save lock in node's array
        for (m = 0; m < MAX_MTX_DEPS; ++m) {
          // sort pointers in decreasing order to avoid potential livelock
          if (node->dn.mtx_locks[m] < info->mtx_lock) {
            KMP_DEBUG_ASSERT(!node->dn.mtx_locks[node->dn.mtx_num_locks]);
            for (int n = node->dn.mtx_num_locks; n > m; --n) {
              // shift right all lesser non-NULL pointers
              KMP_DEBUG_ASSERT(node->dn.mtx_locks[n - 1] != NULL);
              node->dn.mtx_locks[n] = node->dn.mtx_locks[n - 1];
            }
            node->dn.mtx_locks[m] = info->mtx_lock;
            break;
          }
        }
        KMP_DEBUG_ASSERT(m < MAX_MTX_DEPS); // must break from loop
        node->dn.mtx_num_locks++;
      }
    }
  }
  KA_TRACE(30, ("__kmp_process_deps<%d>: T#%d found %d predecessors\n", filter,
                gtid, npredecessors));
  return npredecessors;
}
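/* Illustrative sketch (not part of the runtime): with the rules above, a
   serial sequence of sibling tasks such as

     task A: depend(out: x)   // A becomes last_out for x
     task B: depend(in: x)    // B links after A and joins last_set (IN)
     task C: depend(in: x)    // C links after A and joins last_set (IN)
     task D: depend(out: x)   // D links after every node in last_set

   produces the graph A -> {B, C} -> D, i.e. the two readers may run
   concurrently between the two writers. */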
#define NO_DEP_BARRIER (false)
#define DEP_BARRIER (true)

// returns true if the task has any outstanding dependence
static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node,
                             kmp_task_t *task, kmp_dephash_t **hash,
                             bool dep_barrier, kmp_int32 ndeps,
                             kmp_depend_info_t *dep_list,
                             kmp_int32 ndeps_noalias,
                             kmp_depend_info_t *noalias_dep_list) {
  int i, n_mtxs = 0, dep_all = 0;
#if KMP_DEBUG
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
#endif
  KA_TRACE(20, ("__kmp_check_deps: T#%d checking dependences for task %p : %d "
                "possibly aliased dependences, %d non-aliased dependences : "
                "dep_barrier=%d .\n",
                gtid, taskdata, ndeps, ndeps_noalias, dep_barrier));

  // Filter deps in dep_list
  // TODO: Different algorithm for large dep_list ( > 10 ? )
  for (i = 0; i < ndeps; i++) {
    if (dep_list[i].base_addr != 0 &&
        dep_list[i].base_addr != (kmp_intptr_t)KMP_SIZE_T_MAX) {
      KMP_DEBUG_ASSERT(
          dep_list[i].flag == KMP_DEP_IN || dep_list[i].flag == KMP_DEP_OUT ||
          dep_list[i].flag == KMP_DEP_INOUT ||
          dep_list[i].flag == KMP_DEP_MTX || dep_list[i].flag == KMP_DEP_SET);
      for (int j = i + 1; j < ndeps; j++) {
        if (dep_list[i].base_addr == dep_list[j].base_addr) {
          if (dep_list[i].flag != dep_list[j].flag) {
            // two different dependences on the same address work identically
            // to OUT
            dep_list[i].flag = KMP_DEP_OUT;
          }
          dep_list[j].base_addr = 0; // Mark j element as void
        }
      }
      if (dep_list[i].flag == KMP_DEP_MTX) {
        // limit number of mtx deps to MAX_MTX_DEPS per node
        if (n_mtxs < MAX_MTX_DEPS && task != NULL) {
          ++n_mtxs;
        } else {
          dep_list[i].flag = KMP_DEP_OUT; // downgrade mutexinoutset to inout
        }
      }
    } else if (dep_list[i].flag == KMP_DEP_ALL ||
               dep_list[i].base_addr == (kmp_intptr_t)KMP_SIZE_T_MAX) {
      // omp_all_memory dependence can be marked by compiler by either
      // (addr=0 && flag=0x80) (flag KMP_DEP_ALL), or (addr=-1).
      // omp_all_memory overrides all other dependences if any
      dep_all = 1;
      break;
    }
  }

  // doesn't need to be atomic as no other thread is going to be accessing this
  // node just yet.
  // npredecessors is set to -1 to ensure that none of the releasing tasks
  // queues this task before we have finished processing all the dependences
  node->dn.npredecessors = -1;

  // used to pack all npredecessors additions into a single atomic operation at
  // the end
  int npredecessors;

  if (!dep_all) { // regular dependences
    npredecessors = __kmp_process_deps<true>(gtid, node, hash, dep_barrier,
                                             ndeps, dep_list, task);
    npredecessors += __kmp_process_deps<false>(
        gtid, node, hash, dep_barrier, ndeps_noalias, noalias_dep_list, task);
  } else { // omp_all_memory dependence
    npredecessors = __kmp_process_dep_all(gtid, node, *hash, dep_barrier, task);
  }

  node->dn.task = task;
  KMP_MB();

  // Account for our initial fake value
  npredecessors++;

  // Update predecessors and obtain current value to check if there are still
  // any outstanding dependences (some tasks may have finished while we
  // processed the dependences)
  npredecessors =
      node->dn.npredecessors.fetch_add(npredecessors) + npredecessors;

  KA_TRACE(20, ("__kmp_check_deps: T#%d found %d predecessors for task %p \n",
                gtid, npredecessors, taskdata));

  // beyond this point the task could be queued (and executed) by a releasing
  // task...
  return npredecessors > 0 ? true : false;
}
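/* Worked example of the counting protocol above (illustrative only): suppose
   two predecessor edges were recorded, so the local npredecessors is 2 plus 1
   for the initial fake value, i.e. 3. node->dn.npredecessors started at -1;
   if one predecessor already finished and decremented it to -2 while we were
   linking, fetch_add(3) returns -2 and the computed value is -2 + 3 = 1 > 0,
   so one dependence is still outstanding. Had both predecessors finished
   (counter at -3), the result would be 0 and the caller may queue the task
   immediately. */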
/*!
@ingroup TASKING
@param loc_ref location of the original task directive
@param gtid Global Thread ID of encountering thread
@param new_task task thunk allocated by __kmp_omp_task_alloc() for the ''new
task''
@param ndeps Number of depend items with possible aliasing
@param dep_list List of depend items with possible aliasing
@param ndeps_noalias Number of depend items with no aliasing
@param noalias_dep_list List of depend items with no aliasing

@return Returns either TASK_CURRENT_NOT_QUEUED if the current task was not
suspended and queued, or TASK_CURRENT_QUEUED if it was suspended and queued

Schedule a non-thread-switchable task with dependences for execution
*/
kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
                                    kmp_task_t *new_task, kmp_int32 ndeps,
                                    kmp_depend_info_t *dep_list,
                                    kmp_int32 ndeps_noalias,
                                    kmp_depend_info_t *noalias_dep_list) {
  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
  KA_TRACE(10, ("__kmpc_omp_task_with_deps(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, new_taskdata));
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *current_task = thread->th.th_current_task;

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
    if (!current_task->ompt_task_info.frame.enter_frame.ptr)
      current_task->ompt_task_info.frame.enter_frame.ptr =
          OMPT_GET_FRAME_ADDRESS(0);
    if (ompt_enabled.ompt_callback_task_create) {
      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
          &(current_task->ompt_task_info.task_data),
          &(current_task->ompt_task_info.frame),
          &(new_taskdata->ompt_task_info.task_data),
          ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 1,
          OMPT_LOAD_OR_GET_RETURN_ADDRESS(gtid));
    }

    new_taskdata->ompt_task_info.frame.enter_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);
  }

#if OMPT_OPTIONAL
  /* OMPT grab all dependences if requested by the tool */
  if (ndeps + ndeps_noalias > 0 && ompt_enabled.ompt_callback_dependences) {
    kmp_int32 i;

    int ompt_ndeps = ndeps + ndeps_noalias;
    ompt_dependence_t *ompt_deps = (ompt_dependence_t *)KMP_OMPT_DEPS_ALLOC(
        thread, (ndeps + ndeps_noalias) * sizeof(ompt_dependence_t));

    KMP_ASSERT(ompt_deps != NULL);

    for (i = 0; i < ndeps; i++) {
      ompt_deps[i].variable.ptr = (void *)dep_list[i].base_addr;
      if (dep_list[i].flags.in && dep_list[i].flags.out)
        ompt_deps[i].dependence_type = ompt_dependence_type_inout;
      else if (dep_list[i].flags.out)
        ompt_deps[i].dependence_type = ompt_dependence_type_out;
      else if (dep_list[i].flags.in)
        ompt_deps[i].dependence_type = ompt_dependence_type_in;
      else if (dep_list[i].flags.mtx)
        ompt_deps[i].dependence_type = ompt_dependence_type_mutexinoutset;
      else if (dep_list[i].flags.set)
        ompt_deps[i].dependence_type = ompt_dependence_type_inoutset;
    }
    for (i = 0; i < ndeps_noalias; i++) {
      ompt_deps[ndeps + i].variable.ptr = (void *)noalias_dep_list[i].base_addr;
      if (noalias_dep_list[i].flags.in && noalias_dep_list[i].flags.out)
        ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_inout;
      else if (noalias_dep_list[i].flags.out)
        ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_out;
      else if (noalias_dep_list[i].flags.in)
        ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_in;
      else if (noalias_dep_list[i].flags.mtx)
        ompt_deps[ndeps + i].dependence_type =
            ompt_dependence_type_mutexinoutset;
      else if (noalias_dep_list[i].flags.set)
        ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_inoutset;
    }
    ompt_callbacks.ompt_callback(ompt_callback_dependences)(
        &(new_taskdata->ompt_task_info.task_data), ompt_deps, ompt_ndeps);
    /* We can now free the allocated memory for the dependences */
    /* For OMPD we might want to delay the free until end of this function */
    KMP_OMPT_DEPS_FREE(thread, ompt_deps);
  }
#endif /* OMPT_OPTIONAL */
#endif /* OMPT_SUPPORT */

  bool serial = current_task->td_flags.team_serial ||
                current_task->td_flags.tasking_ser ||
                current_task->td_flags.final;
  kmp_task_team_t *task_team = thread->th.th_task_team;
  serial = serial &&
           !(task_team && (task_team->tt.tt_found_proxy_tasks ||
                           task_team->tt.tt_hidden_helper_task_encountered));

  if (!serial && (ndeps > 0 || ndeps_noalias > 0)) {
    /* if no dependences have been tracked yet, create the dependence hash */
    if (current_task->td_dephash == NULL)
      current_task->td_dephash = __kmp_dephash_create(thread, current_task);

#if USE_FAST_MEMORY
    kmp_depnode_t *node =
        (kmp_depnode_t *)__kmp_fast_allocate(thread, sizeof(kmp_depnode_t));
#else
    kmp_depnode_t *node =
        (kmp_depnode_t *)__kmp_thread_malloc(thread, sizeof(kmp_depnode_t));
#endif

    __kmp_init_node(node);
    new_taskdata->td_depnode = node;

    if (__kmp_check_deps(gtid, node, new_task, &current_task->td_dephash,
                         NO_DEP_BARRIER, ndeps, dep_list, ndeps_noalias,
                         noalias_dep_list)) {
      KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d task had blocking "
                    "dependences: "
                    "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
                    gtid, loc_ref, new_taskdata));
#if OMPT_SUPPORT
      if (ompt_enabled.enabled) {
        current_task->ompt_task_info.frame.enter_frame = ompt_data_none;
      }
#endif
      return TASK_CURRENT_NOT_QUEUED;
    }
  } else {
    KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d ignored dependences "
                  "for task (serialized) loc=%p task=%p\n",
                  gtid, loc_ref, new_taskdata));
  }

  KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d task had no blocking "
                "dependences : "
                "loc=%p task=%p, transferring to __kmp_omp_task\n",
                gtid, loc_ref, new_taskdata));

  kmp_int32 ret = __kmp_omp_task(gtid, new_task, true);
#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
    current_task->ompt_task_info.frame.enter_frame = ompt_data_none;
  }
#endif
  return ret;
}
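/* Illustrative sketch (not part of the runtime; the exact lowering is
   compiler-specific): for a user construct such as

     #pragma omp task depend(in: x) depend(out: y)
     { ... }

   clang typically allocates the task via __kmpc_omp_task_alloc(), fills an
   array of kmp_depend_info_t entries, roughly

     kmp_depend_info_t deps[2];
     deps[0].base_addr = (kmp_intptr_t)&x; deps[0].len = sizeof(x);
     deps[0].flags.in = 1;
     deps[1].base_addr = (kmp_intptr_t)&y; deps[1].len = sizeof(y);
     deps[1].flags.out = 1;

   and then calls
     __kmpc_omp_task_with_deps(&loc, gtid, new_task, 2, deps, 0, NULL);
   so the function above links the new task into the dependence graph before
   (possibly) deferring it. */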
#if OMPT_SUPPORT
void __ompt_taskwait_dep_finish(kmp_taskdata_t *current_task,
                                ompt_data_t *taskwait_task_data) {
  if (ompt_enabled.ompt_callback_task_schedule) {
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        taskwait_task_data, ompt_taskwait_complete, NULL);
  }
  current_task->ompt_task_info.frame.enter_frame.ptr = NULL;
  *taskwait_task_data = ompt_data_none;
}
#endif /* OMPT_SUPPORT */
/*!
@ingroup TASKING
@param loc_ref location of the original task directive
@param gtid Global Thread ID of encountering thread
@param ndeps Number of depend items with possible aliasing
@param dep_list List of depend items with possible aliasing
@param ndeps_noalias Number of depend items with no aliasing
@param noalias_dep_list List of depend items with no aliasing

Blocks the current task until all specified dependences have been fulfilled.
*/
void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps,
                          kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias,
                          kmp_depend_info_t *noalias_dep_list) {
  KA_TRACE(10, ("__kmpc_omp_wait_deps(enter): T#%d loc=%p\n", gtid, loc_ref));

  if (ndeps == 0 && ndeps_noalias == 0) {
    KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no dependences to "
                  "wait upon : loc=%p\n",
                  gtid, loc_ref));
    return;
  }
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *current_task = thread->th.th_current_task;

#if OMPT_SUPPORT
  // this function represents a taskwait construct with depend clause
  // We signal 4 events:
  //  - creation of the taskwait task
  //  - dependences of the taskwait task
  //  - schedule and finish of the taskwait task
  ompt_data_t *taskwait_task_data = &thread->th.ompt_thread_info.task_data;
  KMP_ASSERT(taskwait_task_data->ptr == NULL);
  if (ompt_enabled.enabled) {
    if (!current_task->ompt_task_info.frame.enter_frame.ptr)
      current_task->ompt_task_info.frame.enter_frame.ptr =
          OMPT_GET_FRAME_ADDRESS(0);
    if (ompt_enabled.ompt_callback_task_create) {
      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
          &(current_task->ompt_task_info.task_data),
          &(current_task->ompt_task_info.frame), taskwait_task_data,
          ompt_task_taskwait | ompt_task_undeferred | ompt_task_mergeable, 1,
          OMPT_LOAD_OR_GET_RETURN_ADDRESS(gtid));
    }
  }

#if OMPT_OPTIONAL
  /* OMPT grab all dependences if requested by the tool */
  if (ndeps + ndeps_noalias > 0 && ompt_enabled.ompt_callback_dependences) {
    kmp_int32 i;

    int ompt_ndeps = ndeps + ndeps_noalias;
    ompt_dependence_t *ompt_deps = (ompt_dependence_t *)KMP_OMPT_DEPS_ALLOC(
        thread, (ndeps + ndeps_noalias) * sizeof(ompt_dependence_t));

    KMP_ASSERT(ompt_deps != NULL);

    for (i = 0; i < ndeps; i++) {
      ompt_deps[i].variable.ptr = (void *)dep_list[i].base_addr;
      if (dep_list[i].flags.in && dep_list[i].flags.out)
        ompt_deps[i].dependence_type = ompt_dependence_type_inout;
      else if (dep_list[i].flags.out)
        ompt_deps[i].dependence_type = ompt_dependence_type_out;
      else if (dep_list[i].flags.in)
        ompt_deps[i].dependence_type = ompt_dependence_type_in;
      else if (dep_list[i].flags.mtx)
        ompt_deps[i].dependence_type = ompt_dependence_type_mutexinoutset;
      else if (dep_list[i].flags.set)
        ompt_deps[i].dependence_type = ompt_dependence_type_inoutset;
    }
    for (i = 0; i < ndeps_noalias; i++) {
      ompt_deps[ndeps + i].variable.ptr = (void *)noalias_dep_list[i].base_addr;
      if (noalias_dep_list[i].flags.in && noalias_dep_list[i].flags.out)
        ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_inout;
      else if (noalias_dep_list[i].flags.out)
        ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_out;
      else if (noalias_dep_list[i].flags.in)
        ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_in;
      else if (noalias_dep_list[i].flags.mtx)
        ompt_deps[ndeps + i].dependence_type =
            ompt_dependence_type_mutexinoutset;
      else if (noalias_dep_list[i].flags.set)
        ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_inoutset;
    }
    ompt_callbacks.ompt_callback(ompt_callback_dependences)(
        taskwait_task_data, ompt_deps, ompt_ndeps);
    /* We can now free the allocated memory for the dependences */
    /* For OMPD we might want to delay the free until end of this function */
    KMP_OMPT_DEPS_FREE(thread, ompt_deps);
    ompt_deps = NULL;
  }
#endif /* OMPT_OPTIONAL */
#endif /* OMPT_SUPPORT */

  // We can return immediately as:
  // - dependences are not computed in serial teams (except with proxy tasks)
  // - if the dephash is not yet created it means we have nothing to wait for
  bool ignore = current_task->td_flags.team_serial ||
                current_task->td_flags.tasking_ser ||
                current_task->td_flags.final;
  ignore =
      ignore && thread->th.th_task_team != NULL &&
      thread->th.th_task_team->tt.tt_found_proxy_tasks == FALSE &&
      thread->th.th_task_team->tt.tt_hidden_helper_task_encountered == FALSE;
  ignore = ignore || current_task->td_dephash == NULL;

  if (ignore) {
    KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no blocking "
                  "dependences : loc=%p\n",
                  gtid, loc_ref));
#if OMPT_SUPPORT
    __ompt_taskwait_dep_finish(current_task, taskwait_task_data);
#endif /* OMPT_SUPPORT */
    return;
  }

  kmp_depnode_t node = {0};
  __kmp_init_node(&node);

  if (!__kmp_check_deps(gtid, &node, NULL, &current_task->td_dephash,
                        DEP_BARRIER, ndeps, dep_list, ndeps_noalias,
                        noalias_dep_list)) {
    KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no blocking "
                  "dependences : loc=%p\n",
                  gtid, loc_ref));
#if OMPT_SUPPORT
    __ompt_taskwait_dep_finish(current_task, taskwait_task_data);
#endif /* OMPT_SUPPORT */
    return;
  }

  int thread_finished = FALSE;
  kmp_flag_32<false, false> flag(
      (std::atomic<kmp_uint32> *)&node.dn.npredecessors, 0U);
  while (node.dn.npredecessors > 0) {
    flag.execute_tasks(thread, gtid, FALSE,
                       &thread_finished USE_ITT_BUILD_ARG(NULL),
                       __kmp_task_stealing_constraint);
  }

#if OMPT_SUPPORT
  __ompt_taskwait_dep_finish(current_task, taskwait_task_data);
#endif /* OMPT_SUPPORT */
  KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d finished waiting : loc=%p\n",
                gtid, loc_ref));
}
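/* Illustrative sketch (not part of the runtime; the lowering is
   compiler-specific): a construct such as

     #pragma omp taskwait depend(in: x)

   is typically lowered by clang to a single kmp_depend_info_t entry for x
   and a call
     __kmpc_omp_wait_deps(&loc, gtid, 1, deps, 0, NULL);
   which, through __kmp_check_deps() with DEP_BARRIER, links a stack-allocated
   depnode after the previously generated sibling tasks that wrote x, and then
   executes other tasks until the node's npredecessors counter drops to
   zero. */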