LLVM OpenMP* Runtime Library
kmp_affinity.h
1 /*
2  * kmp_affinity.h -- header for affinity management
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef KMP_AFFINITY_H
14 #define KMP_AFFINITY_H
15 
16 #include "kmp.h"
17 #include "kmp_os.h"
18 
19 #if KMP_AFFINITY_SUPPORTED
20 #if KMP_USE_HWLOC
21 class KMPHwlocAffinity : public KMPAffinity {
22 public:
23  class Mask : public KMPAffinity::Mask {
24  hwloc_cpuset_t mask;
25 
26  public:
27  Mask() {
28  mask = hwloc_bitmap_alloc();
29  this->zero();
30  }
31  ~Mask() { hwloc_bitmap_free(mask); }
32  void set(int i) override { hwloc_bitmap_set(mask, i); }
33  bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
34  void clear(int i) override { hwloc_bitmap_clr(mask, i); }
35  void zero() override { hwloc_bitmap_zero(mask); }
36  void copy(const KMPAffinity::Mask *src) override {
37  const Mask *convert = static_cast<const Mask *>(src);
38  hwloc_bitmap_copy(mask, convert->mask);
39  }
40  void bitwise_and(const KMPAffinity::Mask *rhs) override {
41  const Mask *convert = static_cast<const Mask *>(rhs);
42  hwloc_bitmap_and(mask, mask, convert->mask);
43  }
44  void bitwise_or(const KMPAffinity::Mask *rhs) override {
45  const Mask *convert = static_cast<const Mask *>(rhs);
46  hwloc_bitmap_or(mask, mask, convert->mask);
47  }
48  void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
49  int begin() const override { return hwloc_bitmap_first(mask); }
50  int end() const override { return -1; }
51  int next(int previous) const override {
52  return hwloc_bitmap_next(mask, previous);
53  }
54  int get_system_affinity(bool abort_on_error) override {
55  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
56  "Illegal get affinity operation when not capable");
57  long retval =
58  hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
59  if (retval >= 0) {
60  return 0;
61  }
62  int error = errno;
63  if (abort_on_error) {
64  __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
65  }
66  return error;
67  }
68  int set_system_affinity(bool abort_on_error) const override {
69  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
70  "Illegal set affinity operation when not capable");
71  long retval =
72  hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
73  if (retval >= 0) {
74  return 0;
75  }
76  int error = errno;
77  if (abort_on_error) {
78  __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
79  }
80  return error;
81  }
82 #if KMP_OS_WINDOWS
83  int set_process_affinity(bool abort_on_error) const override {
84  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
85  "Illegal set process affinity operation when not capable");
86  int error = 0;
87  const hwloc_topology_support *support =
88  hwloc_topology_get_support(__kmp_hwloc_topology);
89  if (support->cpubind->set_proc_cpubind) {
90  int retval;
91  retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
92  HWLOC_CPUBIND_PROCESS);
93  if (retval >= 0)
94  return 0;
95  error = errno;
96  if (abort_on_error)
97  __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
98  }
99  return error;
100  }
101 #endif
102  int get_proc_group() const override {
103  int group = -1;
104 #if KMP_OS_WINDOWS
105  if (__kmp_num_proc_groups == 1) {
106  return 1;
107  }
108  for (int i = 0; i < __kmp_num_proc_groups; i++) {
109  // On Windows, the long type is always 32 bits
110  unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
111  unsigned long second_32_bits =
112  hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
113  if (first_32_bits == 0 && second_32_bits == 0) {
114  continue;
115  }
116  if (group >= 0) {
117  return -1;
118  }
119  group = i;
120  }
121 #endif /* KMP_OS_WINDOWS */
122  return group;
123  }
124  };
125  void determine_capable(const char *var) override {
126  const hwloc_topology_support *topology_support;
127  if (__kmp_hwloc_topology == NULL) {
128  if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
129  __kmp_hwloc_error = TRUE;
130  if (__kmp_affinity_verbose)
131  KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
132  }
133  if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
134  __kmp_hwloc_error = TRUE;
135  if (__kmp_affinity_verbose)
136  KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
137  }
138  }
139  topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
140  // Is the system capable of setting/getting this thread's affinity?
141  // Also, is topology discovery possible? (pu indicates ability to discover
142  // processing units). And finally, were there no errors when calling any
143  // hwloc_* API functions?
144  if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
145  topology_support->cpubind->get_thisthread_cpubind &&
146  topology_support->discovery->pu && !__kmp_hwloc_error) {
147  // enables affinity according to KMP_AFFINITY_CAPABLE() macro
148  KMP_AFFINITY_ENABLE(TRUE);
149  } else {
150  // indicate that hwloc didn't work and disable affinity
151  __kmp_hwloc_error = TRUE;
152  KMP_AFFINITY_DISABLE();
153  }
154  }
155  void bind_thread(int which) override {
156  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
157  "Illegal set affinity operation when not capable");
158  KMPAffinity::Mask *mask;
159  KMP_CPU_ALLOC_ON_STACK(mask);
160  KMP_CPU_ZERO(mask);
161  KMP_CPU_SET(which, mask);
162  __kmp_set_system_affinity(mask, TRUE);
163  KMP_CPU_FREE_FROM_STACK(mask);
164  }
165  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
166  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
167  KMPAffinity::Mask *allocate_mask_array(int num) override {
168  return new Mask[num];
169  }
170  void deallocate_mask_array(KMPAffinity::Mask *array) override {
171  Mask *hwloc_array = static_cast<Mask *>(array);
172  delete[] hwloc_array;
173  }
174  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
175  int index) override {
176  Mask *hwloc_array = static_cast<Mask *>(array);
177  return &(hwloc_array[index]);
178  }
179  api_type get_api_type() const override { return HWLOC; }
180 };
181 #endif /* KMP_USE_HWLOC */
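// ---------------------------------------------------------------------------
// Editorial sketch (not part of the runtime): the Mask wrapper above reduces
// to plain hwloc calls. A minimal standalone equivalent of "bind the calling
// thread to logical CPU 2" looks roughly like this, assuming hwloc is
// installed; cleanup on the error paths is elided for brevity.
#if 0
#include <hwloc.h>

int bind_self_to_cpu2(void) {
  hwloc_topology_t topo;
  if (hwloc_topology_init(&topo) < 0 || hwloc_topology_load(topo) < 0)
    return -1;
  hwloc_cpuset_t set = hwloc_bitmap_alloc(); // same call Mask::Mask() makes
  hwloc_bitmap_zero(set);                    // Mask::zero()
  hwloc_bitmap_set(set, 2);                  // Mask::set(2)
  // Mask::set_system_affinity() wraps this call:
  int rc = hwloc_set_cpubind(topo, set, HWLOC_CPUBIND_THREAD);
  hwloc_bitmap_free(set);
  hwloc_topology_destroy(topo);
  return rc;
}
#endif
// ---------------------------------------------------------------------------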
182 
183 #if KMP_OS_LINUX || KMP_OS_FREEBSD
184 #if KMP_OS_LINUX
185 /* On some of the older OSes that we build on, these constants aren't present
186  in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
187  all systems of the same arch where they are defined, and they cannot change;
188  they are set in stone forever. */
189 #include <sys/syscall.h>
190 #if KMP_ARCH_X86 || KMP_ARCH_ARM
191 #ifndef __NR_sched_setaffinity
192 #define __NR_sched_setaffinity 241
193 #elif __NR_sched_setaffinity != 241
194 #error Wrong code for setaffinity system call.
195 #endif /* __NR_sched_setaffinity */
196 #ifndef __NR_sched_getaffinity
197 #define __NR_sched_getaffinity 242
198 #elif __NR_sched_getaffinity != 242
199 #error Wrong code for getaffinity system call.
200 #endif /* __NR_sched_getaffinity */
201 #elif KMP_ARCH_AARCH64
202 #ifndef __NR_sched_setaffinity
203 #define __NR_sched_setaffinity 122
204 #elif __NR_sched_setaffinity != 122
205 #error Wrong code for setaffinity system call.
206 #endif /* __NR_sched_setaffinity */
207 #ifndef __NR_sched_getaffinity
208 #define __NR_sched_getaffinity 123
209 #elif __NR_sched_getaffinity != 123
210 #error Wrong code for getaffinity system call.
211 #endif /* __NR_sched_getaffinity */
212 #elif KMP_ARCH_X86_64
213 #ifndef __NR_sched_setaffinity
214 #define __NR_sched_setaffinity 203
215 #elif __NR_sched_setaffinity != 203
216 #error Wrong code for setaffinity system call.
217 #endif /* __NR_sched_setaffinity */
218 #ifndef __NR_sched_getaffinity
219 #define __NR_sched_getaffinity 204
220 #elif __NR_sched_getaffinity != 204
221 #error Wrong code for getaffinity system call.
222 #endif /* __NR_sched_getaffinity */
223 #elif KMP_ARCH_PPC64
224 #ifndef __NR_sched_setaffinity
225 #define __NR_sched_setaffinity 222
226 #elif __NR_sched_setaffinity != 222
227 #error Wrong code for setaffinity system call.
228 #endif /* __NR_sched_setaffinity */
229 #ifndef __NR_sched_getaffinity
230 #define __NR_sched_getaffinity 223
231 #elif __NR_sched_getaffinity != 223
232 #error Wrong code for getaffinity system call.
233 #endif /* __NR_sched_getaffinity */
234 #elif KMP_ARCH_MIPS
235 #ifndef __NR_sched_setaffinity
236 #define __NR_sched_setaffinity 4239
237 #elif __NR_sched_setaffinity != 4239
238 #error Wrong code for setaffinity system call.
239 #endif /* __NR_sched_setaffinity */
240 #ifndef __NR_sched_getaffinity
241 #define __NR_sched_getaffinity 4240
242 #elif __NR_sched_getaffinity != 4240
243 #error Wrong code for getaffinity system call.
244 #endif /* __NR_sched_getaffinity */
245 #elif KMP_ARCH_MIPS64
246 #ifndef __NR_sched_setaffinity
247 #define __NR_sched_setaffinity 5195
248 #elif __NR_sched_setaffinity != 5195
249 #error Wrong code for setaffinity system call.
250 #endif /* __NR_sched_setaffinity */
251 #ifndef __NR_sched_getaffinity
252 #define __NR_sched_getaffinity 5196
253 #elif __NR_sched_getaffinity != 5196
254 #error Wrong code for getaffinity system call.
255 #endif /* __NR_sched_getaffinity */
256 #else
257 #error Unknown or unsupported architecture
258 #endif /* KMP_ARCH_* */
259 #elif KMP_OS_FREEBSD
260 #include <pthread.h>
261 #include <pthread_np.h>
262 #endif
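// ---------------------------------------------------------------------------
// Editorial sketch (not part of the runtime): on Linux, KMPNativeAffinity
// below issues sched_getaffinity/sched_setaffinity directly via syscall(2),
// using the syscall numbers pinned above and a mask of __kmp_affin_mask_size
// bytes. A minimal standalone equivalent using the glibc wrappers (assumes
// glibc with _GNU_SOURCE):
#if 0
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void) {
  cpu_set_t set;
  CPU_ZERO(&set);
  // pid 0 means "the calling thread", matching the runtime's usage.
  if (sched_getaffinity(0, sizeof(set), &set) != 0) {
    perror("sched_getaffinity");
    return 1;
  }
  for (int cpu = 0; cpu < CPU_SETSIZE; ++cpu)
    if (CPU_ISSET(cpu, &set))
      printf("logical CPU %d is in the affinity mask\n", cpu);
  return 0;
}
#endif
// ---------------------------------------------------------------------------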
263 class KMPNativeAffinity : public KMPAffinity {
264  class Mask : public KMPAffinity::Mask {
265  typedef unsigned long mask_t;
266  typedef decltype(__kmp_affin_mask_size) mask_size_type;
267  static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
268  static const mask_t ONE = 1;
269  mask_size_type get_num_mask_types() const {
270  return __kmp_affin_mask_size / sizeof(mask_t);
271  }
272 
273  public:
274  mask_t *mask;
275  Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
276  ~Mask() {
277  if (mask)
278  __kmp_free(mask);
279  }
280  void set(int i) override {
281  mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
282  }
283  bool is_set(int i) const override {
284  return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
285  }
286  void clear(int i) override {
287  mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
288  }
289  void zero() override {
290  mask_size_type e = get_num_mask_types();
291  for (mask_size_type i = 0; i < e; ++i)
292  mask[i] = (mask_t)0;
293  }
294  void copy(const KMPAffinity::Mask *src) override {
295  const Mask *convert = static_cast<const Mask *>(src);
296  mask_size_type e = get_num_mask_types();
297  for (mask_size_type i = 0; i < e; ++i)
298  mask[i] = convert->mask[i];
299  }
300  void bitwise_and(const KMPAffinity::Mask *rhs) override {
301  const Mask *convert = static_cast<const Mask *>(rhs);
302  mask_size_type e = get_num_mask_types();
303  for (mask_size_type i = 0; i < e; ++i)
304  mask[i] &= convert->mask[i];
305  }
306  void bitwise_or(const KMPAffinity::Mask *rhs) override {
307  const Mask *convert = static_cast<const Mask *>(rhs);
308  mask_size_type e = get_num_mask_types();
309  for (mask_size_type i = 0; i < e; ++i)
310  mask[i] |= convert->mask[i];
311  }
312  void bitwise_not() override {
313  mask_size_type e = get_num_mask_types();
314  for (mask_size_type i = 0; i < e; ++i)
315  mask[i] = ~(mask[i]);
316  }
317  int begin() const override {
318  int retval = 0;
319  while (retval < end() && !is_set(retval))
320  ++retval;
321  return retval;
322  }
323  int end() const override {
324  int e;
325  __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
326  return e;
327  }
328  int next(int previous) const override {
329  int retval = previous + 1;
330  while (retval < end() && !is_set(retval))
331  ++retval;
332  return retval;
333  }
334  int get_system_affinity(bool abort_on_error) override {
335  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
336  "Illegal get affinity operation when not capable");
337 #if KMP_OS_LINUX
338  long retval =
339  syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
340 #elif KMP_OS_FREEBSD
341  int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
342  reinterpret_cast<cpuset_t *>(mask));
343  int retval = (r == 0 ? 0 : -1);
344 #endif
345  if (retval >= 0) {
346  return 0;
347  }
348  int error = errno;
349  if (abort_on_error) {
350  __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
351  }
352  return error;
353  }
354  int set_system_affinity(bool abort_on_error) const override {
355  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
356  "Illegal set affinity operation when not capable");
357 #if KMP_OS_LINUX
358  long retval =
359  syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
360 #elif KMP_OS_FREEBSD
361  int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
362  reinterpret_cast<cpuset_t *>(mask));
363  int retval = (r == 0 ? 0 : -1);
364 #endif
365  if (retval >= 0) {
366  return 0;
367  }
368  int error = errno;
369  if (abort_on_error) {
370  __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
371  }
372  return error;
373  }
374  };
375  void determine_capable(const char *env_var) override {
376  __kmp_affinity_determine_capable(env_var);
377  }
378  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
379  KMPAffinity::Mask *allocate_mask() override {
380  KMPNativeAffinity::Mask *retval = new Mask();
381  return retval;
382  }
383  void deallocate_mask(KMPAffinity::Mask *m) override {
384  KMPNativeAffinity::Mask *native_mask =
385  static_cast<KMPNativeAffinity::Mask *>(m);
386  delete native_mask;
387  }
388  KMPAffinity::Mask *allocate_mask_array(int num) override {
389  return new Mask[num];
390  }
391  void deallocate_mask_array(KMPAffinity::Mask *array) override {
392  Mask *linux_array = static_cast<Mask *>(array);
393  delete[] linux_array;
394  }
395  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
396  int index) override {
397  Mask *linux_array = static_cast<Mask *>(array);
398  return &(linux_array[index]);
399  }
400  api_type get_api_type() const override { return NATIVE_OS; }
401 };
402 #endif /* KMP_OS_LINUX || KMP_OS_FREEBSD */
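// ---------------------------------------------------------------------------
// Editorial sketch (not part of the runtime): the Mask above stores the CPU
// set as an array of mask_t words and locates CPU i with i / BITS_PER_MASK_T
// (word index) and i % BITS_PER_MASK_T (bit index). Worked example with
// 64-bit words: CPU 70 lands in word 1, bit 6.
#if 0
#include <climits>
#include <cstdio>

int main() {
  typedef unsigned long mask_t;                     // 64 bits on LP64 systems
  const unsigned BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
  mask_t words[2] = {0, 0};                         // two words cover 128 CPUs
  const unsigned cpu = 70;
  words[cpu / BITS_PER_MASK_T] |= (mask_t)1 << (cpu % BITS_PER_MASK_T); // set()
  bool is_set =
      (words[cpu / BITS_PER_MASK_T] >> (cpu % BITS_PER_MASK_T)) & 1;    // is_set()
  std::printf("cpu %u -> word %u, bit %u, set=%d\n", cpu,
              cpu / BITS_PER_MASK_T, cpu % BITS_PER_MASK_T, (int)is_set);
  return 0;
}
#endif
// ---------------------------------------------------------------------------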
403 
404 #if KMP_OS_WINDOWS
405 class KMPNativeAffinity : public KMPAffinity {
406  class Mask : public KMPAffinity::Mask {
407  typedef ULONG_PTR mask_t;
408  static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
409  mask_t *mask;
410 
411  public:
412  Mask() {
413  mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
414  }
415  ~Mask() {
416  if (mask)
417  __kmp_free(mask);
418  }
419  void set(int i) override {
420  mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
421  }
422  bool is_set(int i) const override {
423  return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
424  }
425  void clear(int i) override {
426  mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
427  }
428  void zero() override {
429  for (int i = 0; i < __kmp_num_proc_groups; ++i)
430  mask[i] = 0;
431  }
432  void copy(const KMPAffinity::Mask *src) override {
433  const Mask *convert = static_cast<const Mask *>(src);
434  for (int i = 0; i < __kmp_num_proc_groups; ++i)
435  mask[i] = convert->mask[i];
436  }
437  void bitwise_and(const KMPAffinity::Mask *rhs) override {
438  const Mask *convert = static_cast<const Mask *>(rhs);
439  for (int i = 0; i < __kmp_num_proc_groups; ++i)
440  mask[i] &= convert->mask[i];
441  }
442  void bitwise_or(const KMPAffinity::Mask *rhs) override {
443  const Mask *convert = static_cast<const Mask *>(rhs);
444  for (int i = 0; i < __kmp_num_proc_groups; ++i)
445  mask[i] |= convert->mask[i];
446  }
447  void bitwise_not() override {
448  for (int i = 0; i < __kmp_num_proc_groups; ++i)
449  mask[i] = ~(mask[i]);
450  }
451  int begin() const override {
452  int retval = 0;
453  while (retval < end() && !is_set(retval))
454  ++retval;
455  return retval;
456  }
457  int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
458  int next(int previous) const override {
459  int retval = previous + 1;
460  while (retval < end() && !is_set(retval))
461  ++retval;
462  return retval;
463  }
464  int set_process_affinity(bool abort_on_error) const override {
465  if (__kmp_num_proc_groups <= 1) {
466  if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
467  DWORD error = GetLastError();
468  if (abort_on_error) {
469  __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
470  __kmp_msg_null);
471  }
472  return error;
473  }
474  }
475  return 0;
476  }
477  int set_system_affinity(bool abort_on_error) const override {
478  if (__kmp_num_proc_groups > 1) {
479  // Check for a valid mask.
480  GROUP_AFFINITY ga;
481  int group = get_proc_group();
482  if (group < 0) {
483  if (abort_on_error) {
484  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
485  }
486  return -1;
487  }
488  // Transform the bit vector into a GROUP_AFFINITY struct
489  // and make the system call to set affinity.
490  ga.Group = group;
491  ga.Mask = mask[group];
492  ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
493 
494  KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
495  if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
496  DWORD error = GetLastError();
497  if (abort_on_error) {
498  __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
499  __kmp_msg_null);
500  }
501  return error;
502  }
503  } else {
504  if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
505  DWORD error = GetLastError();
506  if (abort_on_error) {
507  __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
508  __kmp_msg_null);
509  }
510  return error;
511  }
512  }
513  return 0;
514  }
515  int get_system_affinity(bool abort_on_error) override {
516  if (__kmp_num_proc_groups > 1) {
517  this->zero();
518  GROUP_AFFINITY ga;
519  KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
520  if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
521  DWORD error = GetLastError();
522  if (abort_on_error) {
523  __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
524  KMP_ERR(error), __kmp_msg_null);
525  }
526  return error;
527  }
528  if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
529  (ga.Mask == 0)) {
530  return -1;
531  }
532  mask[ga.Group] = ga.Mask;
533  } else {
534  mask_t newMask, sysMask, retval;
535  if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
536  DWORD error = GetLastError();
537  if (abort_on_error) {
538  __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
539  KMP_ERR(error), __kmp_msg_null);
540  }
541  return error;
542  }
543  retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
544  if (!retval) {
545  DWORD error = GetLastError();
546  if (abort_on_error) {
547  __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
548  KMP_ERR(error), __kmp_msg_null);
549  }
550  return error;
551  }
552  newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
553  if (!newMask) {
554  DWORD error = GetLastError();
555  if (abort_on_error) {
556  __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
557  KMP_ERR(error), __kmp_msg_null);
558  }
559  }
560  *mask = retval;
561  }
562  return 0;
563  }
564  int get_proc_group() const override {
565  int group = -1;
566  if (__kmp_num_proc_groups == 1) {
567  return 1;
568  }
569  for (int i = 0; i < __kmp_num_proc_groups; i++) {
570  if (mask[i] == 0)
571  continue;
572  if (group >= 0)
573  return -1;
574  group = i;
575  }
576  return group;
577  }
578  };
579  void determine_capable(const char *env_var) override {
580  __kmp_affinity_determine_capable(env_var);
581  }
582  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
583  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
584  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
585  KMPAffinity::Mask *allocate_mask_array(int num) override {
586  return new Mask[num];
587  }
588  void deallocate_mask_array(KMPAffinity::Mask *array) override {
589  Mask *windows_array = static_cast<Mask *>(array);
590  delete[] windows_array;
591  }
592  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
593  int index) override {
594  Mask *windows_array = static_cast<Mask *>(array);
595  return &(windows_array[index]);
596  }
597  api_type get_api_type() const override { return NATIVE_OS; }
598 };
599 #endif /* KMP_OS_WINDOWS */
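// ---------------------------------------------------------------------------
// Editorial sketch (not part of the runtime): on Windows systems with more
// than one processor group, set_system_affinity() above fills a GROUP_AFFINITY
// and calls SetThreadGroupAffinity. A minimal standalone equivalent
// (Windows 7 or later; group/processor indices are illustrative):
#if 0
#include <windows.h>
#include <cstdio>

int main() {
  GROUP_AFFINITY ga;
  ZeroMemory(&ga, sizeof(ga));       // Reserved[] must be zero
  ga.Group = 1;                      // processor group index
  ga.Mask = (KAFFINITY)1 << 3;       // logical processor 3 within that group
  if (!SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL)) {
    std::printf("SetThreadGroupAffinity failed: %lu\n", GetLastError());
    return 1;
  }
  return 0;
}
#endif
// ---------------------------------------------------------------------------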
600 #endif /* KMP_AFFINITY_SUPPORTED */
601 
602 class kmp_hw_thread_t {
603 public:
604  static const int UNKNOWN_ID = -1;
605  static int compare_ids(const void *a, const void *b);
606  static int compare_compact(const void *a, const void *b);
607  int ids[KMP_HW_LAST];
608  int sub_ids[KMP_HW_LAST];
609  bool leader;
610  int os_id;
611  void print() const;
612  void clear() {
613  for (int i = 0; i < (int)KMP_HW_LAST; ++i)
614  ids[i] = UNKNOWN_ID;
615  leader = false;
616  }
617 };
618 
619 class kmp_topology_t {
620 
621  struct flags_t {
622  int uniform : 1;
623  int reserved : 31;
624  };
625 
626  int depth;
627 
628  // The following arrays are all 'depth' long
629 
630  // Ordered array of the types in the topology
631  kmp_hw_t *types;
632 
633  // Quick topology ratios. For non-uniform topologies,
634  // each ratio holds the maximum number of itemAs per itemB,
635  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
636  int *ratio;
637 
638  // Storage containing the absolute number of each topology layer
639  int *count;
640 
641  // The hardware threads array
642  // hw_threads is num_hw_threads long
643  // Each hw_thread's ids and sub_ids are depth deep
644  int num_hw_threads;
645  kmp_hw_thread_t *hw_threads;
646 
647  // Equivalence hash where the key is the hardware topology item
648  // and the value is the equivalent hardware topology type in the
649  // types[] array, if the value is KMP_HW_UNKNOWN, then there is no
650  // known equivalence for the topology type
651  kmp_hw_t equivalent[KMP_HW_LAST];
652 
653  // Flags describing the topology
654  flags_t flags;
655 
656  // Count each item & get the num x's per y
657  // e.g., get the number of cores and the number of threads per core
658  // for each (x, y) in (KMP_HW_* , KMP_HW_*)
659  void _gather_enumeration_information();
660 
661  // Remove layers that don't add information to the topology.
662  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
663  void _remove_radix1_layers();
664 
665  // Find out if the topology is uniform
666  void _discover_uniformity();
667 
668  // Set all the sub_ids for each hardware thread
669  void _set_sub_ids();
670 
671  // Set global affinity variables describing the number of threads per
672  // core, the number of packages, the number of cores per package, and
673  // the number of cores.
674  void _set_globals();
675 
676  // Set the last level cache equivalent type
677  void _set_last_level_cache();
678 
679 public:
680  // Force use of allocate()/deallocate()
681  kmp_topology_t() = delete;
682  kmp_topology_t(const kmp_topology_t &t) = delete;
683  kmp_topology_t(kmp_topology_t &&t) = delete;
684  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
685  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;
686 
687  static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
688  static void deallocate(kmp_topology_t *);
689 
690  // Functions used in create_map() routines
691  kmp_hw_thread_t &at(int index) {
692  KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
693  return hw_threads[index];
694  }
695  const kmp_hw_thread_t &at(int index) const {
696  KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
697  return hw_threads[index];
698  }
699  int get_num_hw_threads() const { return num_hw_threads; }
700  void sort_ids() {
701  qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
702  kmp_hw_thread_t::compare_ids);
703  }
704  // Check whether the hardware ids are unique.
705  // Returns true if they are, false otherwise.
706  bool check_ids() const;
707 
708  // Function to call after the create_map() routine
709  void canonicalize();
710  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);
711 
712  // Functions used after canonicalize() called
713  bool filter_hw_subset();
714  bool is_close(int hwt1, int hwt2, int level) const;
715  bool is_uniform() const { return flags.uniform; }
716  // Return the topology's equivalent type for the given type;
717  // returns KMP_HW_UNKNOWN when the type has no equivalent in the topology
718  kmp_hw_t get_equivalent_type(kmp_hw_t type) const { return equivalent[type]; }
719  // Set type1 = type2
720  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
721  KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
722  KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
723  kmp_hw_t real_type2 = equivalent[type2];
724  if (real_type2 == KMP_HW_UNKNOWN)
725  real_type2 = type2;
726  equivalent[type1] = real_type2;
727  // This loop is required since any of the types may have been set to
728  // be equivalent to type1. They all must be checked and reset to type2.
729  KMP_FOREACH_HW_TYPE(type) {
730  if (equivalent[type] == type1) {
731  equivalent[type] = real_type2;
732  }
733  }
734  }
735  // Calculate the number of objects at level1 per object at level2
736  // (e.g., the number of threads per core)
737  int calculate_ratio(int level1, int level2) const {
738  KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
739  KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
740  int r = 1;
741  for (int level = level1; level > level2; --level)
742  r *= ratio[level];
743  return r;
744  }
745  int get_ratio(int level) const {
746  KMP_DEBUG_ASSERT(level >= 0 && level < depth);
747  return ratio[level];
748  }
749  int get_depth() const { return depth; }
750  kmp_hw_t get_type(int level) const {
751  KMP_DEBUG_ASSERT(level >= 0 && level < depth);
752  return types[level];
753  }
754  int get_level(kmp_hw_t type) const {
755  KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
756  int eq_type = equivalent[type];
757  if (eq_type == KMP_HW_UNKNOWN)
758  return -1;
759  for (int i = 0; i < depth; ++i)
760  if (types[i] == eq_type)
761  return i;
762  return -1;
763  }
764  int get_count(int level) const {
765  KMP_DEBUG_ASSERT(level >= 0 && level < depth);
766  return count[level];
767  }
768 #if KMP_AFFINITY_SUPPORTED
769  void sort_compact() {
770  qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
771  kmp_hw_thread_t::compare_compact);
772  }
773 #endif
774  void print(const char *env_var = "KMP_AFFINITY") const;
775  void dump() const;
776 };
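// ---------------------------------------------------------------------------
// Worked example (editorial sketch): for the uniform topology
//   [ 4 packages | 6 cores / package | 2 threads / core ]
// the per-level arrays would be
//   ratio = { 4, 6, 2 }    // items per parent (ratio[0] is the package count)
//   count = { 4, 24, 48 }  // absolute number of items at each level
// and calculate_ratio(thread_level, package_level) multiplies the ratios
// strictly below the package level: 6 * 2 = 12 hardware threads per package.
#if 0
#include <cstdio>
int main() {
  const int depth = 3;
  const int ratio[depth] = {4, 6, 2};
  int level1 = 2 /* threads */, level2 = 0 /* packages */;
  int r = 1;
  for (int level = level1; level > level2; --level) // same loop as calculate_ratio()
    r *= ratio[level];
  std::printf("threads per package: %d\n", r);      // prints 12
  return 0;
}
#endif
// ---------------------------------------------------------------------------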
777 
778 class kmp_hw_subset_t {
779 public:
780  struct item_t {
781  int num;
782  kmp_hw_t type;
783  int offset;
784  };
785 
786 private:
787  int depth;
788  int capacity;
789  item_t *items;
790  kmp_uint64 set;
791  bool absolute;
792  // The set must be able to handle up to KMP_HW_LAST number of layers
793  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
794 
795 public:
796  // Force use of allocate()/deallocate()
797  kmp_hw_subset_t() = delete;
798  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
799  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
800  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
801  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;
802 
803  static kmp_hw_subset_t *allocate() {
804  int initial_capacity = 5;
805  kmp_hw_subset_t *retval =
806  (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
807  retval->depth = 0;
808  retval->capacity = initial_capacity;
809  retval->set = 0ull;
810  retval->absolute = false;
811  retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
812  return retval;
813  }
814  static void deallocate(kmp_hw_subset_t *subset) {
815  __kmp_free(subset->items);
816  __kmp_free(subset);
817  }
818  void set_absolute() { absolute = true; }
819  bool is_absolute() const { return absolute; }
820  void push_back(int num, kmp_hw_t type, int offset) {
821  if (depth == capacity - 1) {
822  capacity *= 2;
823  item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
824  for (int i = 0; i < depth; ++i)
825  new_items[i] = items[i];
826  __kmp_free(items);
827  items = new_items;
828  }
829  items[depth].num = num;
830  items[depth].type = type;
831  items[depth].offset = offset;
832  depth++;
833  set |= (1ull << type);
834  }
835  int get_depth() const { return depth; }
836  const item_t &at(int index) const {
837  KMP_DEBUG_ASSERT(index >= 0 && index < depth);
838  return items[index];
839  }
840  item_t &at(int index) {
841  KMP_DEBUG_ASSERT(index >= 0 && index < depth);
842  return items[index];
843  }
844  void remove(int index) {
845  KMP_DEBUG_ASSERT(index >= 0 && index < depth);
846  set &= ~(1ull << items[index].type);
847  for (int j = index + 1; j < depth; ++j) {
848  items[j - 1] = items[j];
849  }
850  depth--;
851  }
852  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
853  void dump() const {
854  printf("**********************\n");
855  printf("*** kmp_hw_subset: ***\n");
856  printf("* depth: %d\n", depth);
857  printf("* items:\n");
858  for (int i = 0; i < depth; ++i) {
859  printf("num: %d, type: %s, offset: %d\n", items[i].num,
860  __kmp_hw_get_keyword(items[i].type), items[i].offset);
861  }
862  printf("* set: 0x%llx\n", (unsigned long long)set);
863  printf("* absolute: %d\n", absolute);
864  printf("**********************\n");
865  }
866 };
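// ---------------------------------------------------------------------------
// Editorial sketch (not part of the runtime): kmp_hw_subset_t keeps one bit
// per KMP_HW_* layer in 'set', so specified() is a single AND and remove()
// clears the layer's bit. A standalone illustration of the same bit-set idiom
// (the enum values here are placeholders, not the real KMP_HW_* values):
#if 0
#include <cstdint>
#include <cstdio>
int main() {
  enum layer_t { LAYER_SOCKET = 0, LAYER_CORE = 1, LAYER_THREAD = 2 };
  std::uint64_t set = 0;
  set |= 1ull << LAYER_CORE;                          // as in push_back()
  set |= 1ull << LAYER_THREAD;
  bool has_core = (set & (1ull << LAYER_CORE)) != 0;  // as in specified()
  set &= ~(1ull << LAYER_THREAD);                     // as in remove()
  std::printf("core specified: %d\n", (int)has_core);
  return 0;
}
#endif
// ---------------------------------------------------------------------------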
867 
868 extern kmp_topology_t *__kmp_topology;
869 extern kmp_hw_subset_t *__kmp_hw_subset;
870 
871 /* A structure for holding machine-specific hierarchy info to be computed once
872  at init. This structure represents a mapping of threads to the actual machine
873  hierarchy, or to our best guess at what the hierarchy might be, for the
874  purpose of performing an efficient barrier. In the worst case, when there is
875  no machine hierarchy information, it produces a tree suitable for a barrier,
876  similar to the tree used in the hyper barrier. */
877 class hierarchy_info {
878 public:
879  /* Good default values for number of leaves and branching factor, given no
880  affinity information. Behaves a bit like hyper barrier. */
881  static const kmp_uint32 maxLeaves = 4;
882  static const kmp_uint32 minBranch = 4;
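 // Allocated number of levels in numPerLevel/skipPerLevel. Set to 7 in init()
 // and grown in resize() when the thread count outgrows the existing hierarchy.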
888  kmp_uint32 maxLevels;
889 
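 // Number of hierarchy levels currently in use, i.e., the depth of the tree
 // actually built for the barrier.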
894  kmp_uint32 depth;
895  kmp_uint32 base_num_threads;
896  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
897  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
898  // 2=initialization in progress
899  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
900 
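 /* Level 0 is closest to the leaves (the hardware threads). numPerLevel[i] is
    the branching factor at level i; skipPerLevel[i] is the number of leaves
    spanned by a subtree rooted at level i (the running product of the
    numPerLevel entries below it, doubled at oversubscription levels). */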
905  kmp_uint32 *numPerLevel;
906  kmp_uint32 *skipPerLevel;
907 
908  void deriveLevels() {
909  int hier_depth = __kmp_topology->get_depth();
910  for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
911  numPerLevel[level] = __kmp_topology->get_ratio(i);
912  }
913  }
914 
915  hierarchy_info()
916  : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
917 
918  void fini() {
919  if (!uninitialized && numPerLevel) {
920  __kmp_free(numPerLevel);
921  numPerLevel = NULL;
922  uninitialized = not_initialized;
923  }
924  }
925 
926  void init(int num_addrs) {
927  kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
928  &uninitialized, not_initialized, initializing);
929  if (bool_result == 0) { // Wait for initialization
930  while (TCR_1(uninitialized) != initialized)
931  KMP_CPU_PAUSE();
932  return;
933  }
934  KMP_DEBUG_ASSERT(bool_result == 1);
935 
936  /* Explicitly initialize the data fields here to prevent use of dirty values
937  observed when the static library is re-initialized multiple times (e.g.,
938  when a non-OpenMP thread repeatedly launches/joins a thread that uses
939  OpenMP). */
940  depth = 1;
941  resizing = 0;
942  maxLevels = 7;
943  numPerLevel =
944  (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
945  skipPerLevel = &(numPerLevel[maxLevels]);
946  for (kmp_uint32 i = 0; i < maxLevels;
947  ++i) { // init numPerLevel[*] to 1 item per level
948  numPerLevel[i] = 1;
949  skipPerLevel[i] = 1;
950  }
951 
952  // Derive the hierarchy levels from the machine topology, if one was detected
953  if (__kmp_topology && __kmp_topology->get_depth() > 0) {
954  deriveLevels();
955  } else {
956  numPerLevel[0] = maxLeaves;
957  numPerLevel[1] = num_addrs / maxLeaves;
958  if (num_addrs % maxLeaves)
959  numPerLevel[1]++;
960  }
961 
962  base_num_threads = num_addrs;
963  for (int i = maxLevels - 1; i >= 0;
964  --i) // count non-empty levels to get depth
965  if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
966  depth++;
967 
968  kmp_uint32 branch = minBranch;
969  if (numPerLevel[0] == 1)
970  branch = num_addrs / maxLeaves;
971  if (branch < minBranch)
972  branch = minBranch;
973  for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
974  while (numPerLevel[d] > branch ||
975  (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
976  if (numPerLevel[d] & 1)
977  numPerLevel[d]++;
978  numPerLevel[d] = numPerLevel[d] >> 1;
979  if (numPerLevel[d + 1] == 1)
980  depth++;
981  numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
982  }
983  if (numPerLevel[0] == 1) {
984  branch = branch >> 1;
985  if (branch < 4)
986  branch = minBranch;
987  }
988  }
989 
990  for (kmp_uint32 i = 1; i < depth; ++i)
991  skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
992  // Fill in hierarchy in the case of oversubscription
993  for (kmp_uint32 i = depth; i < maxLevels; ++i)
994  skipPerLevel[i] = 2 * skipPerLevel[i - 1];
995 
996  uninitialized = initialized; // One writer
997  }
998 
999  // Resize the hierarchy if nproc changes to something larger than before
1000  void resize(kmp_uint32 nproc) {
1001  kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1002  while (bool_result == 0) { // someone else is trying to resize
1003  KMP_CPU_PAUSE();
1004  if (nproc <= base_num_threads) // happy with other thread's resize
1005  return;
1006  else // try to resize
1007  bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1008  }
1009  KMP_DEBUG_ASSERT(bool_result != 0);
1010  if (nproc <= base_num_threads)
1011  return; // happy with other thread's resize
1012 
1013  // Calculate new maxLevels
1014  kmp_uint32 old_sz = skipPerLevel[depth - 1];
1015  kmp_uint32 incs = 0, old_maxLevels = maxLevels;
1016  // First see if old maxLevels is enough to contain new size
1017  for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
1018  skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1019  numPerLevel[i - 1] *= 2;
1020  old_sz *= 2;
1021  depth++;
1022  }
1023  if (nproc > old_sz) { // Not enough space, need to expand hierarchy
1024  while (nproc > old_sz) {
1025  old_sz *= 2;
1026  incs++;
1027  depth++;
1028  }
1029  maxLevels += incs;
1030 
1031  // Resize arrays
1032  kmp_uint32 *old_numPerLevel = numPerLevel;
1033  kmp_uint32 *old_skipPerLevel = skipPerLevel;
1034  numPerLevel = skipPerLevel = NULL;
1035  numPerLevel =
1036  (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1037  skipPerLevel = &(numPerLevel[maxLevels]);
1038 
1039  // Copy old elements from old arrays
1040  for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
1041  // init numPerLevel[*] to 1 item per level
1042  numPerLevel[i] = old_numPerLevel[i];
1043  skipPerLevel[i] = old_skipPerLevel[i];
1044  }
1045 
1046  // Init new elements in arrays to 1
1047  for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
1048  // init numPerLevel[*] to 1 item per level
1049  numPerLevel[i] = 1;
1050  skipPerLevel[i] = 1;
1051  }
1052 
1053  // Free old arrays
1054  __kmp_free(old_numPerLevel);
1055  }
1056 
1057  // Fill in oversubscription levels of hierarchy
1058  for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
1059  skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1060 
1061  base_num_threads = nproc;
1062  resizing = 0; // One writer
1063  }
1064 };
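// ---------------------------------------------------------------------------
// Worked example (editorial sketch): for the topology
//   [ 4 packages | 6 cores / package | 2 threads / core ]
// deriveLevels() reverses the ratios so that level 0 is closest to the leaves,
// giving numPerLevel = { 2, 6, 4, 1, 1, 1, 1 }. skipPerLevel is then the
// running product of those entries, doubled at the oversubscription levels.
// (init() may further rebalance wide levels; this only shows the recurrence.)
#if 0
#include <cstdio>
int main() {
  const unsigned maxLevels = 7, depth = 3;
  unsigned numPerLevel[maxLevels] = {2, 6, 4, 1, 1, 1, 1};
  unsigned skipPerLevel[maxLevels] = {1, 1, 1, 1, 1, 1, 1};
  for (unsigned i = 1; i < depth; ++i)          // same recurrence as init()
    skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
  for (unsigned i = depth; i < maxLevels; ++i)  // oversubscription levels
    skipPerLevel[i] = 2 * skipPerLevel[i - 1];
  for (unsigned i = 0; i < maxLevels; ++i)      // prints 1, 2, 12, 24, 48, 96, 192
    std::printf("level %u: numPerLevel=%u skipPerLevel=%u\n", i, numPerLevel[i],
                skipPerLevel[i]);
  return 0;
}
#endif
// ---------------------------------------------------------------------------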
1065 #endif // KMP_AFFINITY_H