/*
 * kmp_barrier.h
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_BARRIER_H
#define KMP_BARRIER_H

#include "kmp.h"
#include "kmp_i18n.h"
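
// Pick an aligned-allocation primitive for the barrier data. Preference
// order: _mm_malloc, aligned_alloc, posix_memalign, _aligned_malloc; if none
// is available, fall back to the runtime's internal malloc, which ignores the
// alignment argument and so does not guarantee the requested alignment.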
#if KMP_HAVE_XMMINTRIN_H && KMP_HAVE__MM_MALLOC
#include <xmmintrin.h>
#define KMP_ALIGNED_ALLOCATE(size, alignment) _mm_malloc(size, alignment)
#define KMP_ALIGNED_FREE(ptr) _mm_free(ptr)
#elif KMP_HAVE_ALIGNED_ALLOC
#define KMP_ALIGNED_ALLOCATE(size, alignment) aligned_alloc(alignment, size)
#define KMP_ALIGNED_FREE(ptr) free(ptr)
#elif KMP_HAVE_POSIX_MEMALIGN
static inline void *KMP_ALIGNED_ALLOCATE(size_t size, size_t alignment) {
  // posix_memalign leaves ptr unspecified on failure; initialize it so the
  // cleanup check below is well-defined.
  void *ptr = nullptr;
  int n = posix_memalign(&ptr, alignment, size);
  if (n != 0) {
    if (ptr)
      free(ptr);
    return nullptr;
  }
  return ptr;
}
#define KMP_ALIGNED_FREE(ptr) free(ptr)
#elif KMP_HAVE__ALIGNED_MALLOC
#include <malloc.h>
#define KMP_ALIGNED_ALLOCATE(size, alignment) _aligned_malloc(size, alignment)
#define KMP_ALIGNED_FREE(ptr) _aligned_free(ptr)
#else
#define KMP_ALIGNED_ALLOCATE(size, alignment) KMP_INTERNAL_MALLOC(size)
#define KMP_ALIGNED_FREE(ptr) KMP_INTERNAL_FREE(ptr)
#endif

// Use four cache lines: MLC tends to prefetch the next or previous cache line
// creating a possible fake conflict between cores, so this is the only way to
// guarantee that no such prefetch can happen.
#ifndef KMP_FOURLINE_ALIGN_CACHE
#define KMP_FOURLINE_ALIGN_CACHE KMP_ALIGN(4 * CACHE_LINE)
#endif

#define KMP_OPTIMIZE_FOR_REDUCTIONS 0

class distributedBarrier {
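  // Each nested struct wraps a single synchronization field aligned to four
  // cache lines (KMP_FOURLINE_ALIGN_CACHE), so adjacent-line prefetching on
  // one field cannot create sharing with another.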
  struct flags_s {
    kmp_uint32 volatile KMP_FOURLINE_ALIGN_CACHE stillNeed;
  };

  struct go_s {
    std::atomic<kmp_uint64> KMP_FOURLINE_ALIGN_CACHE go;
  };

  struct iter_s {
    kmp_uint64 volatile KMP_FOURLINE_ALIGN_CACHE iter;
  };

  struct sleep_s {
    std::atomic<bool> KMP_FOURLINE_ALIGN_CACHE sleep;
  };

  void init(size_t nthr);
  void resize(size_t nthr);
  void computeGo(size_t n);
  void computeVarsForN(size_t n);

public:
  enum {
    MAX_ITERS = 3,
    MAX_GOS = 8,
    IDEAL_GOS = 4,
    IDEAL_CONTENTION = 16,
  };

  flags_s *flags[MAX_ITERS];
  go_s *go;
  iter_s *iter;
  sleep_s *sleep;

  size_t KMP_ALIGN_CACHE num_threads; // number of threads in barrier
  size_t KMP_ALIGN_CACHE max_threads; // size of arrays in data structure
  // number of go signals each requiring one write per iteration
  size_t KMP_ALIGN_CACHE num_gos;
  // number of groups of gos
  size_t KMP_ALIGN_CACHE num_groups;
  // threads per go signal
  size_t KMP_ALIGN_CACHE threads_per_go;
  bool KMP_ALIGN_CACHE fix_threads_per_go;
  // threads per group
  size_t KMP_ALIGN_CACHE threads_per_group;
  // number of go signals in a group
  size_t KMP_ALIGN_CACHE gos_per_group;
  void *team_icvs;

  distributedBarrier() = delete;
  ~distributedBarrier() = delete;

  // Used instead of constructor to create aligned data
  static distributedBarrier *allocate(int nThreads) {
    distributedBarrier *d = (distributedBarrier *)KMP_ALIGNED_ALLOCATE(
        sizeof(distributedBarrier), 4 * CACHE_LINE);
    if (!d) {
      KMP_FATAL(MemoryAllocFailed);
    }
    d->num_threads = 0;
    d->max_threads = 0;
    for (int i = 0; i < MAX_ITERS; ++i)
      d->flags[i] = NULL;
    d->go = NULL;
    d->iter = NULL;
    d->sleep = NULL;
    d->team_icvs = NULL;
    d->fix_threads_per_go = false;
    // calculate gos and groups ONCE on base size
    d->computeGo(nThreads);
    d->init(nThreads);
    return d;
  }

  static void deallocate(distributedBarrier *db) { KMP_ALIGNED_FREE(db); }

  void update_num_threads(size_t nthr) { init(nthr); }

  bool need_resize(size_t new_nthr) { return (new_nthr > max_threads); }
  size_t get_num_threads() { return num_threads; }
  kmp_uint64 go_release();
  void go_reset();
};
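
// Usage sketch (illustrative only, not part of this header): because the
// constructor and destructor are deleted, a barrier instance is obtained and
// released through the static allocate()/deallocate() pair, which keeps the
// object itself aligned to 4 * CACHE_LINE. For a hypothetical team size
// `nThreads`:
//
//   distributedBarrier *bar = distributedBarrier::allocate(nThreads);
//   // ... use bar as the team's barrier; if the team size changes,
//   // bar->update_num_threads(new_nthr) re-runs init() for the new count ...
//   distributedBarrier::deallocate(bar);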

#endif // KMP_BARRIER_H