/*
 * kmp_dispatch.h: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_DISPATCH_H
#define KMP_DISPATCH_H

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
#if KMP_USE_HIER_SCHED
// Forward declarations of some hierarchical scheduling data structures
template <typename T> struct kmp_hier_t;
template <typename T> struct kmp_hier_top_unit_t;
#endif // KMP_USE_HIER_SCHED

template <typename T> struct dispatch_shared_info_template;
template <typename T> struct dispatch_private_info_template;

template <typename T>
extern void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                          dispatch_private_info_template<T> *pr,
                                          enum sched_type schedule, T lb, T ub,
                                          typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                          kmp_uint64 *cur_chunk,
#endif
                                          typename traits_t<T>::signed_t chunk,
                                          T nproc, T unit_id);
template <typename T>
extern int __kmp_dispatch_next_algorithm(
    int gtid, dispatch_private_info_template<T> *pr,
    dispatch_shared_info_template<T> volatile *sh, kmp_int32 *p_last, T *p_lb,
    T *p_ub, typename traits_t<T>::signed_t *p_st, T nproc, T unit_id);

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);

#if KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  UT count; // unsigned
  T ub;
  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
  T lb;
  ST st; // signed
  UT tc; // unsigned
  kmp_lock_t *steal_lock; // lock used for chunk stealing

  /* parm[1-4] are used in different ways by different scheduling algorithms */

  // KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on)
  //    a) parm3 is properly aligned and
  //    b) all parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // if they are in the same line (not measured though).
  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
    T parm1;
    T parm2;
    T parm3;
    T parm4;
  };

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

#else /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  T lb;
  T ub;
  ST st; // signed
  UT tc; // unsigned

  T parm1;
  T parm2;
  T parm3;
  T parm4;

  UT count; // unsigned

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};
#endif /* KMP_STATIC_STEAL_ENABLED */

template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate alignment here, otherwise size of structure is not correct in
  // our compiler
  union KMP_ALIGN_CACHE private_info_tmpl {
    dispatch_private_infoXX_template<T> p;
    dispatch_private_info64_t p64;
  } u;
  enum sched_type schedule; /* scheduling algorithm */
  kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
  std::atomic<kmp_uint32> steal_flag; // static_steal only, state of a buffer
  kmp_uint32 ordered_bumped;
  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
  kmp_uint32 type_size;
#if KMP_USE_HIER_SCHED
  kmp_int32 hier_id;
  kmp_hier_top_unit_t<T> *hier_parent;
  // member functions
  kmp_int32 get_hier_id() const { return hier_id; }
  kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
#endif
  enum cons_type pushed_ws;
};

// replaces dispatch_shared_info{32,64} structures and
// dispatch_shared_info{32,64}_t types
template <typename T> struct dispatch_shared_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  /* chunk index under dynamic, number of idle threads under static-steal;
     iteration index otherwise */
  volatile UT iteration;
  volatile ST num_done;
  volatile UT ordered_iteration;
  // to retain the structure size making ordered_iteration scalar
  UT ordered_dummy[KMP_MAX_ORDERED - 3];
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template <typename T> struct dispatch_shared_info_template {
  typedef typename traits_t<T>::unsigned_t UT;
  // we need union here to keep the structure size
  union shared_info_tmpl {
    dispatch_shared_infoXX_template<UT> s;
    dispatch_shared_info64_t s64;
  } u;
  volatile kmp_uint32 buffer_index;
  volatile kmp_int32 doacross_buf_idx; // teamwise index
  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
  kmp_int32 doacross_num_done; // count finished threads
#if KMP_USE_HIER_SCHED
  kmp_hier_t<T> *hier;
#endif
#if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
  // machines (> 48 cores). Performance analysis showed that a cache thrash
  // was occurring and this padding helps alleviate the problem.
  char padding[64];
#endif
};

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS

// test_then_add template (general template should NOT be used)
template <typename T> static __forceinline T test_then_add(volatile T *p, T d);

template <>
__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
                                                 kmp_int32 d) {
  kmp_int32 r;
  r = KMP_TEST_THEN_ADD32(p, d);
  return r;
}

template <>
__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
                                                 kmp_int64 d) {
  kmp_int64 r;
  r = KMP_TEST_THEN_ADD64(p, d);
  return r;
}

// test_then_inc_acq template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC_ACQ32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC_ACQ64(p);
  return r;
}

// test_then_inc template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC64(p);
  return r;
}

// compare_and_swap template (general template should NOT be used)
template <typename T>
static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
                                                    kmp_int32 c, kmp_int32 s) {
  return KMP_COMPARE_AND_STORE_REL32(p, c, s);
}

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
                                                    kmp_int64 c, kmp_int64 s) {
  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
}
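
/* Note (documentation only): the general templates above are intentionally
   declaration-only, so only the kmp_int32/kmp_int64 specializations can be
   used; any other type has no definition to link against. A minimal usage
   sketch with hypothetical values:

     volatile kmp_int32 counter = 0;
     kmp_int32 old = test_then_inc<kmp_int32>(&counter); // old == 0, counter == 1
     // Atomically replace 1 with 5 iff counter still equals 1;
     // returns nonzero on success.
     kmp_int32 ok = compare_and_swap<kmp_int32>(&counter, 1, 5);
*/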
template <typename T> kmp_uint32 __kmp_ge(T value, T checker) {
  return value >= checker;
}
template <typename T> kmp_uint32 __kmp_eq(T value, T checker) {
  return value == checker;
}

/*
    Spin wait loop that pauses between checks.
    Waits until function returns non-zero when called with *spinner and check.
    Does NOT put threads to sleep.
    Arguments:
        UT is unsigned 4- or 8-byte type
        spinner - memory location to check value
        checker - value which spinner is >, <, ==, etc.
        pred - predicate function to perform binary comparison of some sort
#if USE_ITT_BUILD
        obj -- is higher-level synchronization object to report to ittnotify.
            It is used to report locks consistently. For example, if lock is
            acquired immediately, its address is reported to ittnotify via
            KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
            immediately and the lock routine calls KMP_WAIT(), the latter
            should report the same address, not the address of the low-level
            spinner.
#endif // USE_ITT_BUILD
    TODO: make inline function (move to header file for icl)
*/
template <typename UT>
static UT __kmp_wait(volatile UT *spinner, UT checker,
                     kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(void *obj)) {
  // note: we may not belong to a team at this point
  volatile UT *spin = spinner;
  UT check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(UT, UT) = pred;
  kmp_uint64 time;
  UT r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
  KMP_INIT_YIELD(spins);
  KMP_INIT_BACKOFF(time);
  // main wait spin loop
  while (!f(r = *spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split.
       It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */
    // If oversubscribed, or have waited a bit then yield.
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}
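
/* Illustrative sketch (documentation only, not part of the runtime): how a
   caller uses __kmp_wait() with one of the predicates above. The variable
   names flag/target/seen are hypothetical; the call shape mirrors the real
   use in __kmp_dispatch_deo() below.

     volatile kmp_uint32 flag = 0; // shared location updated by other threads
     kmp_uint32 target = 5;        // wait until flag >= target
     // Spins (with yield/backoff) until __kmp_ge(flag, target) is non-zero,
     // then returns the observed value of the spinner.
     kmp_uint32 seen =
         __kmp_wait<kmp_uint32>(&flag, target,
                                __kmp_ge<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
*/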
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

template <typename UT>
void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    UT lower;

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }
    lower = pr->u.p.ordered_lower;

#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB();
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
    __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                   __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
    KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
  }
  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
}

template <typename UT>
void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
      __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }

    KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped != 0) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        /* How to test it? - OM */
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB(); /* Flush all pending memory write invalidates. */

    pr->ordered_bumped += 1;

    KD_TRACE(1000,
             ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
              gtid, pr->ordered_bumped));

    KMP_MB(); /* Flush all pending memory write invalidates. */

    /* TODO use general release procedure? */
    test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);

    KMP_MB(); /* Flush all pending memory write invalidates. */
  }
  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
}

/* Computes and returns x to the power of y, where y must be a non-negative
   integer */
template <typename UT>
static __forceinline long double __kmp_pow(long double x, UT y) {
  long double s = 1.0L;

  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
  while (y) {
    if (y & 1)
      s *= x;
    x *= x;
    y >>= 1;
  }
  return s;
}

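/* Worked example (documentation only): __kmp_pow uses binary exponentiation,
   so the loop runs O(log2(y)) times instead of y times. For x = 0.5, y = 5
   (binary 101):
     y = 101b: low bit set   -> s = 1.0 * 0.5   = 0.5;     x becomes 0.25
     y =  10b: low bit clear ->                             x becomes 0.0625
     y =   1b: low bit set   -> s = 0.5 * 0.0625 = 0.03125 = 0.5^5
   The assertion reflects that callers only pass a shrinking ratio strictly
   between 0 and 1. */
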
/* Computes and returns the number of unassigned iterations after idx chunks
   have been assigned
   (the total number of unassigned iterations in chunks with index greater than
   or equal to idx).
   __forceinline seems to be broken so that if we __forceinline this function,
   the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
*/
template <typename T>
static __inline typename traits_t<T>::unsigned_t
__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
                                typename traits_t<T>::unsigned_t idx) {
  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
     least for ICL 8.1, long double arithmetic may not really have
     long double precision, even with /Qlong_double. Currently, we
     workaround that in the caller code, by manipulating the FPCW for
     Windows* OS on IA-32 architecture. The lack of precision is not
     expected to be a correctness issue, though.
  */
  typedef typename traits_t<T>::unsigned_t UT;

  long double x = tc * __kmp_pow<UT>(base, idx);
  UT r = (UT)x;
  if (x == r)
    return r;
  return r + 1;
}

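/* Worked example (documentation only): the function returns
     ceil(tc * base^idx),
   i.e. the iterations still unassigned once idx chunks have been handed out,
   assuming each chunk consumes the fraction (1 - base) of what remains.
   For instance, with illustrative values tc = 1000 and base = 0.75, after
   idx = 2 chunks:
     1000 * 0.75^2 = 562.5  ->  563 iterations remain. */
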
// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
//   p3 = 1 / ( n * nproc )         // remaining iterations multiplier
// by default n = 2. For example, with n = 3 the chunk distribution will be
// flatter.
// With n = 1 the first chunk is the same as for a static schedule, i.e.
// trip_count / nproc.
static const int guided_int_param = 2;
static const double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
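
/* Illustrative numbers (not computed by the runtime itself): with nproc = 4,
   chunk = 7 and the default n = 2, the formulas above give
     p2 = 2 * 4 * (7 + 1) = 64    // while remaining > p2: guided-sized chunks
     p3 = 1.0 / (2 * 4)   = 0.125 // each guided chunk ~ remaining * p3
   Once the remaining iteration count drops to p2 or below, the algorithm
   switches to dynamic scheduling with the specified chunk size, as noted in
   the comment above. */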

#endif // KMP_DISPATCH_H