MP-Gadget  5.0.1.dev1-76bc7d4726-dirty
domain.c
1 #include <mpi.h>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <stddef.h>
5 #include <string.h>
6 #include <math.h>
7 #include <stdint.h>
8 #include <omp.h>
9 
10 #include "utils.h"
11 
12 #include "domain.h"
13 #include "timestep.h"
14 #include "timebinmgr.h"
15 #include "exchange.h"
16 #include "slotsmanager.h"
17 #include "partmanager.h"
18 #include "walltime.h"
19 #include "utils/paramset.h"
20 #include "utils/peano.h"
21 #include "utils/mpsort.h"
22 
23 #define TAG_GRAV_A 18
24 #define TAG_GRAV_B 19
25 
43 static DomainParams domain_params;
49 typedef struct {
50  double TopNodeAllocFactor;
51  int UseGlobalSort;
52  MPI_Comm GlobalSortComm;
53  int SubSampleDistance;
54  int PreSort;
55  int NTopLeaves;
56 } DomainDecompositionPolicy;
57 
58 /* It is important for the stability of the code that this struct is 64-bit aligned!*/
59 struct local_topnode_data
60 {
61  /*These members are copied into topnode_data*/
62  peano_t StartKey;
63  int Shift;
64  int Daughter;
65  /*Below members are only used in this file*/
66  int Parent;
67  int64_t Count; /* the number of 'subsample' particles in this top-level node */
68  int64_t Cost; /* the cost of 'subsample' particle in this top-level node */
69 };
70 
71 struct local_particle_data
72 {
73  peano_t Key;
74  int64_t Cost;
75 };
76 
77 /*This is a helper for the tests*/
78 void set_domain_par(DomainParams dp)
79 {
80  domain_params = dp;
81 }
82 
83 /*Set the parameters of the domain module*/
84 void set_domain_params(ParameterSet * ps)
85 {
86  int ThisTask;
87  MPI_Comm_rank(MPI_COMM_WORLD, &ThisTask);
88  if(ThisTask == 0) {
89  domain_params.DomainOverDecompositionFactor = param_get_int(ps, "DomainOverDecompositionFactor");
90  /* Create one domain per thread. This helps the balance and makes the treebuild merge faster*/
92  domain_params.DomainOverDecompositionFactor = omp_get_max_threads();
95  domain_params.TopNodeAllocFactor = param_get_double(ps, "TopNodeAllocFactor");
96  domain_params.DomainUseGlobalSorting = param_get_int(ps, "DomainUseGlobalSorting");
98  if((param_get_int(ps, "StarformationOn") && param_get_double(ps, "QuickLymanAlphaProbability") == 0.)
99  || param_get_int(ps, "BlackHoleOn"))
101  }
102  MPI_Bcast(&domain_params, sizeof(DomainParams), MPI_BYTE, 0, MPI_COMM_WORLD);
103 }
104 
105 static int
106 order_by_key(const void *a, const void *b);
107 static void
108 mp_order_by_key(const void * data, void * radix, void * arg);
109 
110 static void
111 domain_assign_balanced(DomainDecomp * ddecomp, int64_t * cost, const int NsegmentPerTask);
112 
113 static int domain_allocate(DomainDecomp * ddecomp, DomainDecompositionPolicy * policy);
114 
115 static int
116 domain_check_memory_bound(const DomainDecomp * ddecomp, int64_t *TopLeafWork, int64_t *TopLeafCount);
117 
118 static int domain_attempt_decompose(DomainDecomp * ddecomp, DomainDecompositionPolicy * policy, const int MaxTopNodes);
119 
120 static int
121 domain_balance(DomainDecomp * ddecomp);
122 
123 static int domain_determine_global_toptree(DomainDecompositionPolicy * policy, struct local_topnode_data * topTree, int * topTreeSize, const int MaxTopNodes, MPI_Comm DomainComm);
124 
125 static void
126 domain_compute_costs(const DomainDecomp * ddecomp, int64_t *TopLeafWork, int64_t *TopLeafCount);
127 
128 static void
129 domain_toptree_merge(struct local_topnode_data *treeA, struct local_topnode_data *treeB, int noA, int noB, int * treeASize, const int MaxTopNodes);
130 
131 static int domain_check_for_local_refine_subsample(
132  DomainDecompositionPolicy * policy,
133  struct local_topnode_data * topTree, int * topTreeSize, const int MaxTopNodes
134  );
135 
136 static int
137 domain_global_refine(struct local_topnode_data * topTree, int * topTreeSize, const int MaxTopNodes, int64_t countlimit, int64_t costlimit);
138 
139 static void
140 domain_create_topleaves(DomainDecomp * ddecomp, int no, int * next);
141 
142 static int domain_layoutfunc(int n, const void * userdata);
143 
144 static int
145 domain_policies_init(DomainDecompositionPolicy policies[],
146  const int NincreaseAlloc,
147  const int SwitchToGlobal);
148 
155 void domain_decompose_full(DomainDecomp * ddecomp)
156 {
157  static DomainDecompositionPolicy policies[16];
158  static int Npolicies = 0;
159 
160  /* start from last successful policy to avoid retries */
161  static int LastSuccessfulPolicy = 0;
162 
163  if (Npolicies == 0) {
164  const int NincreaseAlloc = 16;
165  Npolicies = domain_policies_init(policies, NincreaseAlloc, 4);
166  }
167 
168  walltime_measure("/Misc");
169 
170  message(0, "domain decomposition... (presently allocated=%g MB)\n", mymalloc_usedbytes() / (1024.0 * 1024.0));
171 
172  int decompose_failed = 1;
173  int i;
174  for(i = LastSuccessfulPolicy; i < Npolicies; i ++)
175  {
176  domain_free(ddecomp);
177 
178 #ifdef DEBUG
179  domain_test_id_uniqueness(PartManager);
180 #endif
181  int MaxTopNodes = domain_allocate(ddecomp, &policies[i]);
182 
183  message(0, "Attempting new domain decomposition policy: TopNodeAllocFactor=%g, UseGlobalSort=%d, SubSampleDistance=%d UsePreSort=%d\n",
184  policies[i].TopNodeAllocFactor, policies[i].UseGlobalSort, policies[i].SubSampleDistance, policies[i].PreSort);
185 
186  decompose_failed = domain_attempt_decompose(ddecomp, &policies[i], MaxTopNodes);
187  decompose_failed = MPIU_Any(decompose_failed, ddecomp->DomainComm);
188 
189  if(decompose_failed)
190  continue;
191 
192  LastSuccessfulPolicy = i;
193  if(domain_balance(ddecomp) == 0)
194  break;
195  }
196 
197  if(decompose_failed) {
198  endrun(0, "No suitable domain decomposition policy worked for this particle distribution\n");
199  }
200 
201  /* copy the used nodes from temp to the true. */
202  struct topleaf_data * OldTopLeaves = ddecomp->TopLeaves;
203  struct topnode_data * OldTopNodes = ddecomp->TopNodes;
204 
205  ddecomp->TopNodes = (struct topnode_data *) mymalloc2("TopNodes", sizeof(ddecomp->TopNodes[0]) * ddecomp->NTopNodes);
206  /* add 1 extra to mark the end of TopLeaves; see assign */
207  ddecomp->TopLeaves = (struct topleaf_data *) mymalloc2("TopLeaves", sizeof(ddecomp->TopLeaves[0]) * (ddecomp->NTopLeaves + 1));
208 
209  memcpy(ddecomp->TopLeaves, OldTopLeaves, ddecomp->NTopLeaves* sizeof(ddecomp->TopLeaves[0]));
210  memcpy(ddecomp->TopNodes, OldTopNodes, ddecomp->NTopNodes * sizeof(ddecomp->TopNodes[0]));
211 
212  /* no longer useful */
213  myfree(OldTopLeaves);
214  myfree(OldTopNodes);
215 
216  if(domain_exchange(domain_layoutfunc, ddecomp, 0, NULL, PartManager, SlotsManager, 10000, ddecomp->DomainComm))
217  endrun(1929,"Could not exchange particles\n");
218 
219  /*Do a garbage collection so that the slots are ordered
220  *the same as the particles, garbage is at the end and all particles are in peano order.*/
221  slots_gc_sorted(PartManager, SlotsManager);
222 
223  /*Ensure collective*/
224  MPIU_Barrier(ddecomp->DomainComm);
225  message(0, "Domain decomposition done.\n");
226 
227  report_memory_usage("DOMAIN");
228 
229  walltime_measure("/Domain/PeanoSort");
230 }
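/* Illustrative usage sketch, not part of the original domain.c: one plausible
 * calling sequence for this module, assuming MPI is initialised, the particle
 * data in PartManager has been loaded and set_domain_params() has been called.
 * A zero-initialised DomainDecomp is assumed to be sufficient as input;
 * domain_free() releases the persistent TopNodes/TopLeaves/Tasks tables. */
static void
example_domain_usage(void)
{
    DomainDecomp ddecomp[1] = {{0}};
    domain_decompose_full(ddecomp);   /* build the top-level tree and exchange particles */
    /* ... build force trees / advance the timestep using ddecomp ... */
    domain_free(ddecomp);             /* free TopLeaves, TopNodes and Tasks */
}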
231 
232 /* This is a cut-down version of the domain decomposition that leaves the
233  * domain grid intact, but exchanges the particles and rebuilds the tree */
234 void domain_maintain(DomainDecomp * ddecomp, struct DriftData * drift)
235 {
236  message(0, "Attempting a domain exchange\n");
237 
238  walltime_measure("/Misc");
239 
240  /* Try a domain exchange.
241  * If we have no memory for the particles,
242  * bail and do a full domain*/
243  if(0 != domain_exchange(domain_layoutfunc, ddecomp, 0, drift, PartManager, SlotsManager, 10000, ddecomp->DomainComm)) {
244  domain_decompose_full(ddecomp);
245  return;
246  }
247 }
248 
249 /* this function generates several domain decomposition policies to attempt
250  * when creating the domain decomposition. */
251 static int
252 domain_policies_init(DomainDecompositionPolicy policies[],
253  const int NincreaseAlloc,
254  const int SwitchToGlobal)
255 {
256  int NTask;
257  MPI_Comm_size(MPI_COMM_WORLD, &NTask);
258 
259  int i;
260  for(i = 0; i < NincreaseAlloc; i ++) {
261  policies[i].TopNodeAllocFactor = domain_params.TopNodeAllocFactor * pow(1.3, i);
262  /* Global sorting is slower than a local sort, but tends to produce a more
263  * balanced domain tree that is easier to merge.
264  * */
265  policies[i].UseGlobalSort = 0;
266  if(i >= SwitchToGlobal)
267  policies[i].UseGlobalSort = 1;
268  /* The extent of the global sorting may be different from the extent of the Domain communicator*/
269  policies[i].GlobalSortComm = MPI_COMM_WORLD;
270  /* Global sorting of particles is slow, so we add a presort to even out the local
271  * particle distribution before subsampling; this improves the balance, too. */
272  policies[i].PreSort = 0;
273  if(i >= 1)
274  policies[i].PreSort = 1;
275  policies[i].SubSampleDistance = 16;
276  /* Desired number of TopLeaves should scale like the total number of processors*/
277  policies[i].NTopLeaves = domain_params.DomainOverDecompositionFactor * NTask;
278  }
279 
280  return NincreaseAlloc;
281 }
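/* Illustrative sketch, not part of the original domain.c: the escalation rule
 * encoded above. Failed attempt i retries with a top-node allocation factor
 * enlarged by 1.3^i, and (with SwitchToGlobal = 4, as domain_decompose_full()
 * requests) attempts 4 and later also enable the global sort. */
static double
example_alloc_factor_for_attempt(double TopNodeAllocFactor, int attempt)
{
    /* e.g. attempt 3 with a configured factor of 0.1 retries with ~0.22 */
    return TopNodeAllocFactor * pow(1.3, attempt);
}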
282 
284 static int
285 domain_allocate(DomainDecomp * ddecomp, DomainDecompositionPolicy * policy)
286 {
287  size_t bytes, all_bytes = 0;
288 
289  int MaxTopNodes = (int) (policy->TopNodeAllocFactor * PartManager->MaxPart + 1);
290 
291  /* Build the domain over the global all-processors communicator.
292  * We use a symbol in case we want to do fancy things in the future.*/
293  ddecomp->DomainComm = MPI_COMM_WORLD;
294 
295  int NTask;
296  MPI_Comm_size(ddecomp->DomainComm, &NTask);
297 
298  /* Add a tail item to avoid special treatments */
299  ddecomp->Tasks = (struct task_data *) mymalloc2("Tasks", bytes = ((NTask + 1)* sizeof(ddecomp->Tasks[0])));
300 
301  all_bytes += bytes;
302 
303  ddecomp->TopNodes = (struct topnode_data *) mymalloc("TopNodes",
304  bytes = (MaxTopNodes * (sizeof(ddecomp->TopNodes[0]))));
305 
306  all_bytes += bytes;
307 
308  ddecomp->TopLeaves = (struct topleaf_data *) mymalloc("TopLeaves",
309  bytes = (MaxTopNodes * sizeof(ddecomp->TopLeaves[0])));
310 
311  all_bytes += bytes;
312 
313  message(0, "Allocated %g MByte for top-level domain structure\n", all_bytes / (1024.0 * 1024.0));
314 
315  ddecomp->domain_allocated_flag = 1;
316 
317  return MaxTopNodes;
318 }
319 
320 void domain_free(DomainDecomp * ddecomp)
321 {
322  if(ddecomp->domain_allocated_flag)
323  {
324  myfree(ddecomp->TopLeaves);
325  myfree(ddecomp->TopNodes);
326  myfree(ddecomp->Tasks);
327  ddecomp->domain_allocated_flag = 0;
328  }
329 }
330 
337 static int
338 domain_attempt_decompose(DomainDecomp * ddecomp, DomainDecompositionPolicy * policy, const int MaxTopNodes)
339 {
340  int i;
341 
342  size_t bytes, all_bytes = 0;
343 
344  /* points to the root node of the top-level tree */
345  struct local_topnode_data *topTree = (struct local_topnode_data *) mymalloc("LocaltopTree", bytes =
346  (MaxTopNodes * sizeof(struct local_topnode_data)));
347  memset(topTree, 0, sizeof(topTree[0]) * MaxTopNodes);
348  all_bytes += bytes;
349 
350  message(0, "use of %g MB of temporary storage for domain decomposition... (presently allocated=%g MB)\n",
351  all_bytes / (1024.0 * 1024.0), mymalloc_usedbytes() / (1024.0 * 1024.0));
352 
353  report_memory_usage("DOMAIN");
354 
355  walltime_measure("/Domain/Decompose/Misc");
356 
357  if(domain_determine_global_toptree(policy, topTree, &ddecomp->NTopNodes, MaxTopNodes, ddecomp->DomainComm)) {
358  myfree(topTree);
359  return 1;
360  }
361 
362  /* copy what we need for the topnodes */
363  for(i = 0; i < ddecomp->NTopNodes; i++)
364  {
365  ddecomp->TopNodes[i].StartKey = topTree[i].StartKey;
366  ddecomp->TopNodes[i].Shift = topTree[i].Shift;
367  ddecomp->TopNodes[i].Daughter = topTree[i].Daughter;
368  ddecomp->TopNodes[i].Leaf = -1; /* will be assigned by create_topleaves*/
369  }
370 
371  myfree(topTree);
372 
373  ddecomp->NTopLeaves = 0;
374  domain_create_topleaves(ddecomp, 0, &ddecomp->NTopLeaves);
375 
376  message(0, "NTopLeaves= %d NTopNodes=%d (space for %d)\n", ddecomp->NTopLeaves, ddecomp->NTopNodes, MaxTopNodes);
377 
378  walltime_measure("/Domain/DetermineTopTree/CreateLeaves");
379 
380  if(ddecomp->NTopLeaves < policy->NTopLeaves) {
381  message(0, "Number of Topleaves is less than required over decomposition");
382  }
383 
384  int NTask;
385  MPI_Comm_size(ddecomp->DomainComm, &NTask);
386 
387  /* this is fatal */
388  if(ddecomp->NTopLeaves < NTask) {
389  endrun(0, "Number of Topleaves is less than NTask");
390  }
391 
392  return 0;
393 }
394 
399 static int
400 domain_balance(DomainDecomp * ddecomp)
401 {
403  int64_t * TopLeafCount = (int64_t *) mymalloc("TopLeafCount", ddecomp->NTopLeaves * sizeof(TopLeafCount[0]));
404 
405  domain_compute_costs(ddecomp, NULL, TopLeafCount);
406 
407  walltime_measure("/Domain/Decompose/Sumcost");
408 
409  /* first try work balance */
410  domain_assign_balanced(ddecomp, TopLeafCount, 1);
411 
412  walltime_measure("/Domain/Decompose/assignbalance");
413 
414  int status = domain_check_memory_bound(ddecomp, NULL, TopLeafCount);
415  if(status != 0)
416  message(0, "Domain decomposition is outside memory bounds.\n");
417 
418  walltime_measure("/Domain/Decompose/memorybound");
419 
420  myfree(TopLeafCount);
421 
422  return status;
423 }
424 
425 static int
426 domain_check_memory_bound(const DomainDecomp * ddecomp, int64_t *TopLeafWork, int64_t *TopLeafCount)
427 {
428  int NTask;
429  MPI_Comm_size(ddecomp->DomainComm, &NTask);
430 
431  /*Only used if the memory bound is not met */
432  int64_t * list_load = ta_malloc("list_load",int64_t, NTask);
433  int64_t * list_work = ta_malloc("list_work",int64_t, NTask);
434 
435  int64_t max_work = 0, max_load = 0, sumload = 0, sumwork = 0;
436  int ta;
437 
438  #pragma omp parallel for reduction(+: sumwork) reduction(+: sumload) reduction(max: max_load) reduction(max:max_work)
439  for(ta = 0; ta < NTask; ta++)
440  {
441  int64_t load = 0;
442  int64_t work = 0;
443  int i;
444  for(i = ddecomp->Tasks[ta].StartLeaf; i < ddecomp->Tasks[ta].EndLeaf; i ++)
445  {
446  load += TopLeafCount[i];
447  if(TopLeafWork)
448  work += TopLeafWork[i];
449  }
450 
451  list_load[ta] = load;
452  list_work[ta] = work;
453 
454  sumwork += work;
455  sumload += load;
456 
457  if(load > max_load)
458  max_load = load;
459  if(work > max_work)
460  max_work = work;
461  }
462 
463  if(TopLeafWork)
464  message(0, "Largest load: work=%g particle=%g\n",
465  max_work / ((double)sumwork / NTask), max_load / (((double) sumload) / NTask));
466  else
467  message(0, "Largest particle load particle=%g\n", max_load / (((double) sumload) / NTask));
468 
469  /*Leave a small number of particles for star formation */
470  if(max_load > domain_params.SetAsideFactor * PartManager->MaxPart)
471  {
472  message(0, "desired memory imbalance=%g (limit=%g, needed=%ld)\n",
473  (max_load * ((double) sumload ) / NTask ) / PartManager->MaxPart, domain_params.SetAsideFactor * PartManager->MaxPart, max_load);
474  message(0, "Balance breakdown:\n");
475  int i;
476  for(i = 0; i < NTask; i++)
477  {
478  message(0, "Task: [%3d] work=%8.4f particle load=%8.4f\n", i,
479  list_work[i] / ((double) sumwork / NTask), list_load[i] / (((double) sumload) / NTask));
480  }
481  return 1;
482  }
483  ta_free(list_work);
484  ta_free(list_load);
485  return 0;
486 }
487 
488 
489 /* Data of TopTree leaf nodes that are used for assignment to tasks */
490 struct topleaf_extdata {
491  peano_t Key;
492  int Task;
493  int topnode;
494  int64_t cost;
495 };
496 
497 static int
498 topleaf_ext_order_by_task_and_key(const void * c1, const void * c2)
499 {
500  const struct topleaf_extdata * p1 = (const struct topleaf_extdata *) c1;
501  const struct topleaf_extdata * p2 = (const struct topleaf_extdata *) c2;
502  if(p1->Task < p2->Task) return -1;
503  if(p1->Task > p2->Task) return 1;
504  if(p1->Key < p2->Key) return -1;
505  if(p1->Key > p2->Key) return 1;
506  return 0;
507 }
508 
509 static int
510 topleaf_ext_order_by_key(const void * c1, const void * c2)
511 {
512  const struct topleaf_extdata * p1 = (const struct topleaf_extdata *) c1;
513  const struct topleaf_extdata * p2 = (const struct topleaf_extdata *) c2;
514  if(p1->Key < p2->Key) return -1;
515  if(p1->Key > p2->Key) return 1;
516  return 0;
517 }
518 
519 
532 static void
533 domain_assign_balanced(DomainDecomp * ddecomp, int64_t * cost, const int NsegmentPerTask)
534 {
535  int NTask;
536  MPI_Comm_size(ddecomp->DomainComm, &NTask);
537 
538  int Nsegment = NTask * NsegmentPerTask;
539  /* we work with TopLeafExt then replace TopLeaves*/
540 
541  struct topleaf_extdata * TopLeafExt;
542 
543  /* A Segment is a subset of the TopLeaf nodes */
544 
545  TopLeafExt = (struct topleaf_extdata *) mymalloc("TopLeafExt", ddecomp->NTopLeaves * sizeof(TopLeafExt[0]));
546 
547  /* copy the data over */
548  int i;
549  for(i = 0; i < ddecomp->NTopLeaves; i ++) {
550  TopLeafExt[i].topnode = ddecomp->TopLeaves[i].topnode;
551  TopLeafExt[i].Key = ddecomp->TopNodes[ddecomp->TopLeaves[i].topnode].StartKey;
552  TopLeafExt[i].Task = -1;
553  TopLeafExt[i].cost = cost[i];
554  }
555 
556  /* make sure TopLeaves are sorted by Key for locality of segments -
557  * likely not necessary because when this function
558  * is called it is already true */
559  qsort_openmp(TopLeafExt, ddecomp->NTopLeaves, sizeof(TopLeafExt[0]), topleaf_ext_order_by_key);
560 
561  int64_t totalcost = 0;
562  #pragma omp parallel for reduction(+ : totalcost)
563  for(i = 0; i < ddecomp->NTopLeaves; i ++) {
564  totalcost += TopLeafExt[i].cost;
565  }
566  int64_t totalcostLeft = totalcost;
567 
568  /* start the assignment; We try to create segments that are of the
569  * mean_expected cost, then assign them to Tasks in a round-robin fashion.
570  * The default is that there is one segment per task, which makes the assignment trivial.
571  */
572 
573  double mean_expected = 1.0 * totalcost / Nsegment;
574  double mean_task = 1.0 * totalcost /NTask;
575  int curleaf = 0;
576  int curseg = 0;
577  int curtask = 0; /* between 0 and NTask - 1*/
578  int nrounds = 0; /*Number of times we looped*/
579  int64_t curload = 0; /* cumulative load for the current segment */
580  int64_t curtaskload = 0; /* cumulative load for current task */
581  int64_t maxleafcost = 0;
582 
583  message(0, "Expected segment cost %g\n", mean_expected);
584  /* We maintain the invariant that after each iteration curleaf is the number of
585  * leaves scanned and curseg is the number of segments created.
586  * */
587  while(nrounds < ddecomp->NTopLeaves) {
588  int append = 0;
589  int advance = 0;
590  if(curleaf == ddecomp->NTopLeaves) {
591  /* to maintain the invariant */
592  advance = 1;
593  } else if(ddecomp->NTopLeaves - curleaf == Nsegment - curseg) {
594  /* just enough for one segment per leaf; this line ensures
595  * at least Nsegment segments are created. */
596  append = 1;
597  advance = 1;
598  } else {
599  /* append a leaf to the segment if there is room left.
600  * Calculate room left based on a rolling average of the total so far..*/
601  int64_t totalassigned = (totalcost - totalcostLeft) + curload;
602  if((mean_expected * (curseg +1) - totalassigned > 0.5 * TopLeafExt[curleaf].cost)
603  || curload == 0 /* but at least add one leaf */
604  ) {
605  append = 1;
606  } else {
607  /* will be too big of a segment, cut it */
608  advance = 1;
609  }
610  }
611  if(append) {
612  /* assign the leaf to the task */
613  curload += TopLeafExt[curleaf].cost;
614  TopLeafExt[curleaf].Task = curtask;
615  curleaf ++;
616  }
617 
618  if(advance) {
619  /*Add this segment to the current task*/
620  curtaskload += curload;
621  /* If we have allocated enough segments to fill this processor,
622  * or if we have one segment per task left, proceed to the next task.
623  * We do not use round robin, so that neighbouring (by key)
624  * topTree leaves are on the same processor.*/
625  if((mean_task - curtaskload < 0.5 * mean_expected) || (Nsegment - curseg <= NTask - curtask)
626  ){
627  curtaskload = 0;
628  curtask ++;
629  }
630  /* move on to the next segment.*/
631  totalcostLeft -= curload;
632  curload = 0;
633  curseg ++;
634 
635  /* finished a round for all tasks */
636  if(curtask == NTask) {
637  /*Back to task zero*/
638  curtask = 0;
639  /* Need a new mean_expected value: we want to
640  * divide the remaining segments evenly between the processors.*/
641  mean_expected = 1.0 * totalcostLeft / Nsegment;
642  mean_task = 1.0 * totalcostLeft / NTask;
643  nrounds++;
644  }
645  if(curleaf == ddecomp->NTopLeaves)
646  break;
647  if(TopLeafExt[curleaf].cost > maxleafcost)
648  maxleafcost = TopLeafExt[curleaf].cost;
649  }
650  //message(0, "curleaf = %d advance = %d append = %d, curload = %d cost=%ld left=%ld\n", curleaf, advance, append, curload, TopLeafExt[curleaf].cost, totalcostLeft);
651  }
652 
653  /*In most 'normal' cases nrounds == 1 here*/
654  message(0, "Created %d segments in %d rounds. Max leaf cost: %g\n", curseg, nrounds, (1.0*maxleafcost)/(totalcost) * Nsegment);
655 
656  if(curseg < Nsegment) {
657  endrun(0, "Not enough segments were created (%d instead of %d). This should not happen.\n", curseg, Nsegment);
658  }
659 
660  if(totalcostLeft != 0) {
661  endrun(0, "Assertion failed. Total cost is not fully assigned to all ranks\n");
662  }
663 
664  /* lets rearrange the TopLeafExt by task, such that we can build the Tasks table */
665  qsort_openmp(TopLeafExt, ddecomp->NTopLeaves, sizeof(TopLeafExt[0]), topleaf_ext_order_by_task_and_key);
666  for(i = 0; i < ddecomp->NTopLeaves; i ++) {
667  ddecomp->TopNodes[TopLeafExt[i].topnode].Leaf = i;
668  ddecomp->TopLeaves[i].Task = TopLeafExt[i].Task;
669  ddecomp->TopLeaves[i].topnode = TopLeafExt[i].topnode;
670  }
671 
672  myfree(TopLeafExt);
673  /* here we reduce the number of code branches by adding an item to the end. */
674  ddecomp->TopLeaves[ddecomp->NTopLeaves].Task = NTask;
675  ddecomp->TopLeaves[ddecomp->NTopLeaves].topnode = -1;
676 
677  int ta = 0;
678  ddecomp->Tasks[ta].StartLeaf = 0;
679  for(i = 0; i <= ddecomp->NTopLeaves; i ++) {
680 
681  if(ddecomp->TopLeaves[i].Task == ta) continue;
682 
683  ddecomp->Tasks[ta].EndLeaf = i;
684  ta ++;
685  while(ta < ddecomp->TopLeaves[i].Task) {
686  ddecomp->Tasks[ta].EndLeaf = i;
687  ddecomp->Tasks[ta].StartLeaf = i;
688  ta ++;
689  }
690  /* the last item will set Tasks[NTask], but we allocated memory for it already */
691  ddecomp->Tasks[ta].StartLeaf = i;
692  }
693  if(ta != NTask) {
694  endrun(0, "Assertion failed: not all tasks are assigned. This indicates a bug.\n");
695  }
696 }
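/* Illustrative sketch, not part of the original domain.c: after
 * domain_assign_balanced() the Tasks table gives every rank a contiguous
 * [StartLeaf, EndLeaf) range of TopLeaves. Summing a per-leaf cost array over
 * that range (as domain_check_memory_bound() does above) recovers the load
 * the balancer assigned to rank ta. */
static int64_t
example_task_assigned_cost(const DomainDecomp * ddecomp, const int64_t * leafcost, int ta)
{
    int64_t load = 0;
    int i;
    for(i = ddecomp->Tasks[ta].StartLeaf; i < ddecomp->Tasks[ta].EndLeaf; i++)
        load += leafcost[i];
    return load;
}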
697 
706 static int
707 domain_layoutfunc(int n, const void * userdata) {
708  const DomainDecomp * ddecomp = (const DomainDecomp *) userdata;
709  peano_t key = P[n].Key;
710  int no = domain_get_topleaf(key, ddecomp);
711  return ddecomp->TopLeaves[no].Task;
712 }
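/* Minimal sketch, not part of the original domain.c: the end-to-end
 * key-to-task mapping performed by domain_layoutfunc() above, written out
 * under the assumption that domain_get_topleaf() (defined in domain.h)
 * descends the persistent TopNodes by Peano key in the same way that
 * domain_toptree_get_subnode() below descends the local tree, returning the
 * TopNodes[no].Leaf index, from which the owning task is read off. */
static int
example_key_to_task(peano_t key, const DomainDecomp * ddecomp)
{
    int no = 0;
    while(ddecomp->TopNodes[no].Daughter >= 0)
        no = ddecomp->TopNodes[no].Daughter
            + ((key - ddecomp->TopNodes[no].StartKey) >> (ddecomp->TopNodes[no].Shift - 3));
    return ddecomp->TopLeaves[ddecomp->TopNodes[no].Leaf].Task;
}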
713 
720 static void
721 domain_create_topleaves(DomainDecomp * ddecomp, int no, int * next)
722 {
723  int i;
724  if(ddecomp->TopNodes[no].Daughter == -1)
725  {
726  ddecomp->TopNodes[no].Leaf = *next;
727  ddecomp->TopLeaves[*next].topnode = no;
728  (*next)++;
729  }
730  else
731  {
732  for(i = 0; i < 8; i++)
733  domain_create_topleaves(ddecomp, ddecomp->TopNodes[no].Daughter + i, next);
734  }
735 }
736 
737 static int
738 domain_toptree_get_subnode(struct local_topnode_data * topTree,
739  peano_t key)
740 {
741  int no = 0;
742  while(topTree[no].Daughter >= 0) {
743  no = topTree[no].Daughter + ((key - topTree[no].StartKey) >> (topTree[no].Shift - 3));
744  }
745  return no;
746 }
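/* Illustrative sketch, not part of the original domain.c: the StartKey/Shift
 * arithmetic used by domain_toptree_get_subnode() above. A node of shift s
 * covers 1L << s Peano keys, each of its 8 daughters covers 1L << (s - 3),
 * so the daughter index is the key offset divided by that span. */
static int
example_daughter_index(peano_t key, peano_t StartKey, int Shift)
{
    /* e.g. Shift = 9: daughters span 64 keys each, so an offset of 130
     * selects daughter 130 >> 6 == 2. */
    return (int) ((key - StartKey) >> (Shift - 3));
}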
747 
748 static int
749 domain_toptree_insert(struct local_topnode_data * topTree,
750  peano_t Key, int64_t cost)
751 {
752  /* insert */
753  int leaf = domain_toptree_get_subnode(topTree, Key);
754  topTree[leaf].Count ++;
755  topTree[leaf].Cost += cost;
756  return leaf;
757 }
758 
759 static int
760 domain_toptree_split(struct local_topnode_data * topTree, int * topTreeSize, const int MaxTopNodes,
761  int i)
762 {
763  int j;
764  /* we ran out of top nodes and must get more.*/
765  if((*topTreeSize + 8) > MaxTopNodes)
766  return 1;
767 
768  if(topTree[i].Shift < 3) {
769  endrun(1, "Failed to build a TopTree -- particles overly clustered.\n");
770  }
771 
772  /*Make a new topnode section attached to this node*/
773  topTree[i].Daughter = *topTreeSize;
774  (*topTreeSize) += 8;
775 
776  /* Initialise this topnode with new sub nodes*/
777  for(j = 0; j < 8; j++)
778  {
779  const int sub = topTree[i].Daughter + j;
780  /* The new sub nodes have this node as parent
781  * and no daughters.*/
782  topTree[sub].Daughter = -1;
783  topTree[sub].Parent = i;
784  /* Shorten the peano key by a factor 8, reflecting the oct-tree level.*/
785  topTree[sub].Shift = topTree[i].Shift - 3;
786  /* This is the region of peanospace covered by this node.*/
787  topTree[sub].StartKey = topTree[i].StartKey + j * (1L << topTree[sub].Shift);
788  /* We will compute the cost in the node below.*/
789  topTree[sub].Count = 0;
790  topTree[sub].Cost = 0;
791  }
792  return 0;
793 }
794 
795 static void
796 domain_toptree_update_cost(struct local_topnode_data * topTree, int start)
797 {
798  if(topTree[start].Daughter == -1) return;
799 
800  int j = 0;
801  for(j = 0; j < 8; j ++) {
802  int sub = topTree[start].Daughter + j;
803  domain_toptree_update_cost(topTree, sub);
804  topTree[start].Count += topTree[sub].Count;
805  topTree[start].Cost += topTree[sub].Cost;
806  }
807 }
808 
809 /* This function recursively identifies and terminates tree branches that are cheap.*/
810 static void
811 domain_toptree_truncate_r(struct local_topnode_data * topTree, int start, int64_t countlimit, int64_t costlimit)
812 {
813  if(topTree[start].Daughter == -1) return;
814 
815  if(topTree[start].Count < countlimit &&
816  topTree[start].Cost < costlimit) {
817  /* truncate here */
818  topTree[start].Daughter = -1;
819  return;
820  }
821 
822  int j;
823  for(j = 0; j < 8; j ++) {
824  int sub = topTree[start].Daughter + j;
825  domain_toptree_truncate_r(topTree, sub, countlimit, costlimit);
826  }
827 }
828 
829 /* remove the nodes that are no longer useful after the truncation.
830  *
831  * We walk the topTree top-down to collect useful nodes, and move them to
832  * the head of the topTree list.
833  *
834  * This works because any child node is stored after the parent in the list
835  * -- we are destroying the old tree just slowly enough
836  * */
837 
838 static void
839 domain_toptree_garbage_collection(struct local_topnode_data * topTree, int start, int * last_free)
840 {
841 
842  if(topTree[start].Daughter == -1) return;
843  int j;
844 
845  int oldd = topTree[start].Daughter;
846  int newd = *last_free;
847 
848  topTree[start].Daughter = newd;
849 
850  (*last_free) += 8;
851 
852  /* copy first in case oldd + j is overwritten by the recursed gc
853  * if last_free is less than oldd */
854  for(j = 0; j < 8; j ++) {
855  topTree[newd + j] = topTree[oldd + j];
856  topTree[newd + j].Parent = start;
857  }
858 
859  for(j = 0; j < 8; j ++) {
860  domain_toptree_garbage_collection(topTree, newd + j, last_free);
861  }
862 }
863 
864 static void
865 domain_toptree_truncate(
866  struct local_topnode_data * topTree, int * topTreeSize,
867  int64_t countlimit, int64_t costlimit)
868 {
869 
870  /* first terminate the tree.*/
871  domain_toptree_truncate_r(topTree, 0, countlimit, costlimit);
872 
873  /* then remove the unused nodes from the topTree storage. This is important
874  * for efficient global merge */
875  *topTreeSize = 1; /* keep the root node -- it is never garbage. */
876  domain_toptree_garbage_collection(topTree, 0, topTreeSize);
877 }
878 
894 static int
895 domain_check_for_local_refine_subsample(
896  DomainDecompositionPolicy * policy,
897  struct local_topnode_data * topTree, int * topTreeSize, const int MaxTopNodes
898  )
899 {
900 
901  int i;
902 
903  struct local_particle_data * LP = (struct local_particle_data*) mymalloc("LocalParticleData", PartManager->NumPart * sizeof(LP[0]));
904 
905  /* Watchout : Peano/Morton ordering is required by the tree
906  * building algorithm in local_refine.
907  *
908  * We can either use a global or a local sorting here; the code will run
909  * without crashing.
910  *
911  * A global sorting is chosen to ensure the local topTrees are really local
912  * and the leaves almost disjoint. This makes the merged topTree a more accurate
913  * representation of the true cost / load distribution, since the merging
914  * and secondary refinement steps are only approximate.
915  *
916  * A local sorting may be faster, but it makes the tree less accurate because
917  * the local topTrees are more likely to overlap.
918  * */
919 
920  int Nsample = PartManager->NumPart / policy->SubSampleDistance;
921 
922  if(Nsample == 0 && PartManager->NumPart != 0) Nsample = 1;
923 
924 #pragma omp parallel for
925  for(i = 0; i < PartManager->NumPart; i ++)
926  {
927  LP[i].Key = P[i].Key;
928  LP[i].Cost = 1;
929  }
930 
931  /* First sort to ensure spatially 'even' subsamples; FIXME: This can probably
932  * be omitted in most cases. Usually the particles in memory won't be very far off
933  * from a peano order. */
934  if(policy->PreSort)
935  qsort_openmp(LP, PartManager->NumPart, sizeof(struct local_particle_data), order_by_key);
936 
937  /* now subsample */
938  for(i = 0; i < Nsample; i ++)
939  {
940  LP[i].Key = LP[i * policy->SubSampleDistance].Key;
941  LP[i].Cost = LP[i * policy->SubSampleDistance].Cost;
942  }
943 
944  if(policy->UseGlobalSort) {
945  mpsort_mpi(LP, Nsample, sizeof(struct local_particle_data), mp_order_by_key, 8, NULL, policy->GlobalSortComm);
946  } else {
947  qsort_openmp(LP, Nsample, sizeof(struct local_particle_data), order_by_key);
948  }
949 
950  walltime_measure("/Domain/DetermineTopTree/Sort");
951 
952  *topTreeSize = 1;
953  topTree[0].Daughter = -1;
954  topTree[0].Parent = -1;
955  topTree[0].Shift = BITS_PER_DIMENSION * 3;
956  topTree[0].StartKey = 0;
957  topTree[0].Count = 0;
958  topTree[0].Cost = 0;
959 
960  /* The tree building algorithm here requires the LP structure to
961  * be sorted by key, in which case we either refine
962  * the node the last particle lives in, or jump into a new empty node,
963  * as the particles are scanned through.
964  *
965  * I unfortunately cannot find the direct reference of this with
966  * the proof. I found it around 2012~2013 when writing psphray2
967  * and didn't cite it properly then!
968  *
969  * Here is a blog link that is related:
970  *
971  * https://devblogs.nvidia.com/parallelforall/thinking-parallel-part-iii-tree-construction-gpu/
972  *
973  * A few interesting papers that may make this faster and cheaper.
974  *
975  * http://sc07.supercomputing.org/schedule/pdf/pap117.pdf
976  *
977  * */
978 
979  /* 1. create a skeleton of the toptree.
980  * We do not use all particles to avoid excessive memory consumption.
981  *
982  * */
983 
984 
985  /* During the loop, topTree[?].Count is a flag to indicate if
986  * the node has been finished. We either refine the last leaf node
987  * or create a new leaf because of the peano/morton sorting.
988  * */
989  peano_t last_key = -1;
990  int last_leaf = -1;
991  i = 0;
992  while(i < Nsample) {
993 
994  int leaf = domain_toptree_get_subnode(topTree, LP[i].Key);
995 
996  if (leaf == last_leaf && topTree[leaf].Shift >= 3) {
997  /* two particles in a node? need refinement if possible. */
998  if(0 != domain_toptree_split(topTree, topTreeSize, MaxTopNodes, leaf)) {
999  /* out of memory, retry */
1000  myfree(LP);
1001  return 1;
1002  }
1003  /* pop the last particle and reinsert it */
1004  topTree[leaf].Count = 0;
1005 
1006  last_leaf = domain_toptree_insert(topTree, last_key, 0);
1007  /* retry the current particle. */
1008  continue;
1009  } else {
1010  if(topTree[leaf].Count != 0 && leaf != last_leaf) {
1011  /* Meeting a node that has already been finished?
1012  * This should not happen when the keys are sorted,
1013  * hence we abort here.
1014  *
1015  * Note: if the Key has not changed sufficiently with this new particle we will see the last leaf again.
1016  * Normally we would refine, but if Shift == 0 we have no space to do so;
1017  * in that case we simply add the current particle sample to the last toptree node.
1018  */
1019  endrun(10, "toptree[%d].Count=%d, shift %d, last_leaf=%d key = %ld i= %d Nsample = %d\n",
1020  leaf, topTree[leaf].Count, topTree[leaf].Shift, last_leaf,LP[i].Key, i, Nsample);
1021  }
1022  /* this will create a new node. */
1023  last_key = LP[i].Key;
1024  last_leaf = domain_toptree_insert(topTree, last_key, 0);
1025  i += policy->SubSampleDistance;
1026  continue;
1027  }
1028  }
1029  /* Alternatively we could have kept last_cost in the above loop and avoided
1030  * the second and third steps: */
1031 
1032  /* 2. Remove the skeleton particles from the tree to make it a pure skeleton;
1033  * note that we never bothered to add Cost when the skeleton was built,
1034  * otherwise we would have to clear it here too.*/
1035  for(i = 0; i < *topTreeSize; i ++ ) {
1036  topTree[i].Count = 0;
1037  }
1038 
1039  /* 3. insert all particles to the skeleton tree; Count will be correct because we cleaned them.*/
1040  for(i = 0; i < Nsample; i ++ ) {
1041  domain_toptree_insert(topTree, LP[i].Key, LP[i].Cost);
1042  }
1043 
1044  myfree(LP);
1045 
1046  /* then compute the costs of the internal nodes. */
1047  domain_toptree_update_cost(topTree, 0);
1048 
1049  /* we leave truncation in another function, for costlimit and countlimit must be
1050  * used in secondary refinement*/
1051  return 0;
1052 }
1053 
1054 /* Combine the toptree. Returns a (collective) error code which is non-zero if an error occured*/
1055 static int
1056 domain_nonrecursively_combine_topTree(struct local_topnode_data * topTree, int * topTreeSize, const int MaxTopNodes, MPI_Comm DomainComm)
1057 {
1058  /*
1059  * Combine the topTree non-recursively; ranks are paired up within groups and
1060  * the merged result is broadcast at the end. It should be quite a bit faster
1061  * (~ x2) than the old recursive scheme.
1062  *
1063  * It takes less time at higher sep.
1064  *
1065  * The communication could have been done with an MPI inter-communicator,
1066  * but I couldn't figure out how to do it that way.
1066  * */
1067  int sep = 1;
1068  int errorflag = 0;
1069  /*Number of tasks to decompose to*/
1070  int NTask;
1071  MPI_Comm_size(DomainComm, &NTask);
1072  /*Which task is this?*/
1073  int ThisTask;
1074  MPI_Comm_rank(DomainComm, &ThisTask);
1075 
1076  for(sep = 1; sep < NTask; sep *=2) {
1077  /* build the subcommunicators for broadcasting */
1078  int Color = ThisTask / sep;
1079  int Key = ThisTask % sep;
1080 
1081  /* non leaders will skip exchanges */
1082  if(Key != 0)
1083  continue;
1084 
1085  /* leaders of even color will combine nodes from next odd color,
1086  * so that when sep is increased eventually rank 0 will have all
1087  * nodes */
1088  if(Color % 2 == 0) {
1089  /* even guys recv */
1090  int recvTask = ThisTask + sep;
1091  int ntopnodes_import = 0;
1092  if(recvTask < NTask) {
1093  MPI_Recv(&ntopnodes_import, 1, MPI_INT, recvTask, TAG_GRAV_A,
1094  DomainComm, MPI_STATUS_IGNORE);
1095  if(ntopnodes_import < 0) {
1096  endrun(1, "severe domain error: using an unintended rank\n");
1097  }
1098  int mergesize = ntopnodes_import;
1099  if(ntopnodes_import < *topTreeSize)
1100  mergesize = *topTreeSize;
1101  struct local_topnode_data * topTree_import = (struct local_topnode_data *) mymalloc("topTree_import",
1102  mergesize * sizeof(struct local_topnode_data));
1103 
1104  MPI_Recv(topTree_import,
1105  ntopnodes_import * sizeof(struct local_topnode_data), MPI_BYTE,
1106  recvTask, TAG_GRAV_B,
1107  DomainComm, MPI_STATUS_IGNORE);
1108 
1109  if((*topTreeSize + ntopnodes_import) > MaxTopNodes) {
1110  errorflag = 1;
1111  } else {
1112  if(ntopnodes_import > 0 ) {
1113  domain_toptree_merge(topTree, topTree_import, 0, 0, topTreeSize, MaxTopNodes);
1114  }
1115  }
1116  myfree(topTree_import);
1117  }
1118  } else {
1119  /* odd guys send */
1120  int recvTask = ThisTask - sep;
1121  if(recvTask >= 0) {
1122  MPI_Send(topTreeSize, 1, MPI_INT, recvTask, TAG_GRAV_A, DomainComm);
1123  MPI_Send(topTree, (*topTreeSize) * sizeof(struct local_topnode_data), MPI_BYTE,
1124  recvTask, TAG_GRAV_B, DomainComm);
1125  }
1126  *topTreeSize = -1;
1127  }
1128  }
1129 
1130  MPI_Bcast(topTreeSize, 1, MPI_INT, 0, DomainComm);
1131  /* Check that the merge succeeded*/
1132  if(*topTreeSize < 0 || *topTreeSize >= MaxTopNodes) {
1133  errorflag = 1;
1134  }
1135  int errorflagall = MPIU_Any(errorflag, DomainComm);
1136  if(errorflagall == 0)
1137  MPI_Bcast(topTree, (*topTreeSize) * sizeof(struct local_topnode_data), MPI_BYTE, 0, DomainComm);
1138  return errorflagall;
1139 }
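/* Illustrative sketch, not part of the original domain.c: the communication
 * pattern of the reduction above. At separation sep only ranks with
 * ThisTask % sep == 0 take part; of those, ranks with an odd ThisTask / sep
 * send their tree to ThisTask - sep and the others receive from
 * ThisTask + sep, so after the doublings of sep rank 0 holds the fully merged
 * tree. Returns the peer of 'rank' at this step, or -1 if it is idle. */
static int
example_merge_peer(int rank, int sep, int NTask)
{
    if(rank % sep != 0)
        return -1;                                      /* not a group leader: idle this round */
    if((rank / sep) % 2 == 0)
        return (rank + sep < NTask) ? rank + sep : -1;  /* receives, if the peer exists */
    return rank - sep;                                  /* sends */
}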
1140 
1146 static int
1147 domain_determine_global_toptree(DomainDecompositionPolicy * policy,
1148  struct local_topnode_data * topTree, int * topTreeSize, int MaxTopNodes, MPI_Comm DomainComm)
1149 {
1150  walltime_measure("/Domain/DetermineTopTree/Misc");
1151 
1152  /*
1153  * Build local refinement with a subsample of particles
1154  * 1/16 is used because each local topTree node takes about 32 bytes.
1155  **/
1156 
1157  int local_refine_failed = domain_check_for_local_refine_subsample(policy, topTree, topTreeSize, MaxTopNodes);
1158 
1159  walltime_measure("/Domain/DetermineTopTree/LocalRefine/Init");
1160 
1161  if(MPIU_Any(local_refine_failed, DomainComm)) {
1162  message(0, "We are out of Topnodes. \n");
1163  return 1;
1164  }
1165 
1166  int64_t TotCost = 0, TotCount = 0;
1167  int64_t costlimit, countlimit;
1168 
1169  MPI_Allreduce(&topTree[0].Cost, &TotCost, 1, MPI_INT64, MPI_SUM, DomainComm);
1170  MPI_Allreduce(&topTree[0].Count, &TotCount, 1, MPI_INT64, MPI_SUM, DomainComm);
1171 
1172  costlimit = TotCost / (policy->NTopLeaves);
1173  countlimit = TotCount / (policy->NTopLeaves);
1174 
1175  domain_toptree_truncate(topTree, topTreeSize, countlimit, costlimit);
1176 
1177  walltime_measure("/Domain/DetermineTopTree/LocalRefine/truncate");
1178 
1179  if(*topTreeSize > 10 * policy->NTopLeaves) {
1180  message(1, "local TopTree Size =%d >> expected = %d; Usually this indicates very bad imbalance, due to a giant density peak.\n",
1181  *topTreeSize, policy->NTopLeaves);
1182  }
1183 
1184 #if 0
1185  char buf[1000];
1186  sprintf(buf, "topnodes.bin.%d", ThisTask);
1187  FILE * fd = fopen(buf, "w");
1188 
1189  fwrite(topTree, sizeof(struct local_topnode_data), *topTreeSize, fd);
1190  fclose(fd);
1191 
1192  //MPI_Barrier(DomainComm);
1193  //MPI_Abort(DomainComm, 0);
1194 #endif
1195 
1196  /* we now need to exchange tree parts and combine them as needed */
1197  int combine_failed = domain_nonrecursively_combine_topTree(topTree, topTreeSize, MaxTopNodes, DomainComm);
1198 
1199  walltime_measure("/Domain/DetermineTopTree/Combine");
1200 
1201  if(combine_failed)
1202  {
1203  message(0, "can't combine trees due to lack of storage. Will try again.\n");
1204  return 1;
1205  }
1206 
1207  /* now let's see whether we still need more refinement, based on the estimated cumulative cost/count in each cell */
1208 
1209  int global_refine_failed = domain_global_refine(topTree, topTreeSize, MaxTopNodes, countlimit, costlimit);
1210 
1211  walltime_measure("/Domain/DetermineTopTree/Addnodes");
1212 
1213  if(MPIU_Any(global_refine_failed, DomainComm)) {
1214  message(0, "Global refine failed: toptreeSize = %d, MaxTopNodes = %d\n", *topTreeSize, MaxTopNodes);
1215  return 1;
1216  }
1217 
1218  message(0, "Final local topTree size = %d per segment = %g.\n", *topTreeSize, 1.0 * (*topTreeSize) / (policy->NTopLeaves));
1219 
1220  return 0;
1221 }
1222 
1223 static int
1224 domain_global_refine(
1225  struct local_topnode_data * topTree, int * topTreeSize, const int MaxTopNodes,
1226  int64_t countlimit, int64_t costlimit)
1227 {
1228  int i;
1229 
1230  /* At this point we have refined the local particle tree so that each
1231  * topNode contains a Cost and Count below the cost threshold. We have then
1232  * done a global merge of the particle tree. Some of our topTree may now contain
1233  * more particles than the Cost threshold, but repeating refinement using the local
1234  * algorithm is complicated - particles inside any particular topNode may be
1235  * on another processor. So we do a local volume based refinement here. This
1236  * just cuts each topNode above the threshold into 8 equal-sized portions by
1237  * subdividing the peano key.
1238  *
1239  * NOTE: Just like the merge, this does not correctly preserve costs!
1240  * Costs are just divided by 8, because recomputing them for the daughter nodes
1241  * will be expensive.
1242  * In practice this seems to work fine, probably because the cost distribution
1243  * is not that unbalanced. */
1244 
1245  message(0, "local topTree size before appending=%d\n", *topTreeSize);
1246 
1247  /*Note that *topTreeSize will change inside the loop*/
1248  for(i = 0; i < *topTreeSize; i++)
1249  {
1250  /*If this node has no children and non-zero size*/
1251  if(topTree[i].Daughter >= 0 || topTree[i].Shift <= 0) continue;
1252 
1253  /*If this node is also more costly than the limit*/
1254  if(topTree[i].Count < countlimit && topTree[i].Cost < costlimit) continue;
1255 
1256  /*If we have no space for another 8 topTree, exit */
1257  if((*topTreeSize + 8) > MaxTopNodes) {
1258  return 1;
1259  }
1260 
1261  topTree[i].Daughter = *topTreeSize;
1262  int j;
1263  for(j = 0; j < 8; j++)
1264  {
1265  int sub = topTree[i].Daughter + j;
1266  topTree[sub].Shift = topTree[i].Shift - 3;
1267  topTree[sub].Count = topTree[i].Count / 8;
1268  topTree[sub].Cost = topTree[i].Cost / 8;
1269  topTree[sub].Daughter = -1;
1270  topTree[sub].Parent = i;
1271  topTree[sub].StartKey = topTree[i].StartKey + j * (1L << topTree[sub].Shift);
1272  }
1273  (*topTreeSize) += 8;
1274  }
1275  return 0;
1276 }
1277 
1278 
1279 static void
1280 domain_compute_costs(const DomainDecomp * ddecomp, int64_t *TopLeafWork, int64_t *TopLeafCount)
1281 {
1282  int i;
1283  int NumThreads = omp_get_max_threads();
1284  int64_t * local_TopLeafWork = NULL;
1285  if(TopLeafWork) {
1286  local_TopLeafWork = (int64_t *) mymalloc("local_TopLeafWork", NumThreads * ddecomp->NTopLeaves * sizeof(local_TopLeafWork[0]));
1287  memset(local_TopLeafWork, 0, NumThreads * ddecomp->NTopLeaves * sizeof(local_TopLeafWork[0]));
1288  }
1289  int64_t * local_TopLeafCount = (int64_t *) mymalloc("local_TopLeafCount", NumThreads * ddecomp->NTopLeaves * sizeof(local_TopLeafCount[0]));
1290  memset(local_TopLeafCount, 0, NumThreads * ddecomp->NTopLeaves * sizeof(local_TopLeafCount[0]));
1291 
1292 #pragma omp parallel
1293  {
1294  int tid = omp_get_thread_num();
1295  int n;
1296 
1297  #pragma omp for
1298  for(n = 0; n < PartManager->NumPart; n++)
1299  {
1300  /* Skip garbage particles: they have zero work
1301  * and can be removed by exchange if under memory pressure.*/
1302  if(P[n].IsGarbage)
1303  continue;
1304  int no = domain_get_topleaf(P[n].Key, ddecomp);
1305 
1306  if(local_TopLeafWork)
1307  local_TopLeafWork[no + tid * ddecomp->NTopLeaves] += 1;
1308 
1309  local_TopLeafCount[no + tid * ddecomp->NTopLeaves] += 1;
1310  }
1311  }
1312 
1313 #pragma omp parallel for
1314  for(i = 0; i < ddecomp->NTopLeaves; i++)
1315  {
1316  int tid;
1317  if(local_TopLeafWork)
1318  for(tid = 1; tid < NumThreads; tid++) {
1319  local_TopLeafWork[i] += local_TopLeafWork[i + tid * ddecomp->NTopLeaves];
1320  }
1321 
1322  for(tid = 1; tid < NumThreads; tid++) {
1323  local_TopLeafCount[i] += local_TopLeafCount[i + tid * ddecomp->NTopLeaves];
1324  }
1325  }
1326 
1327  if(local_TopLeafWork) {
1328  MPI_Allreduce(local_TopLeafWork, TopLeafWork, ddecomp->NTopLeaves, MPI_INT64, MPI_SUM, ddecomp->DomainComm);
1329  myfree(local_TopLeafWork);
1330  }
1331 
1332  MPI_Allreduce(local_TopLeafCount, TopLeafCount, ddecomp->NTopLeaves, MPI_INT64, MPI_SUM, ddecomp->DomainComm);
1333  myfree(local_TopLeafCount);
1334 }
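/* Illustrative sketch, not part of the original domain.c: the per-thread
 * scratch layout used by domain_compute_costs() above. Thread tid accumulates
 * leaf no at offset no + tid * NTopLeaves, so the scratch arrays have
 * NumThreads * NTopLeaves entries and thread 0's slice is the per-rank total
 * that feeds the MPI_Allreduce. */
static size_t
example_scratch_index(int no, int tid, int NTopLeaves)
{
    return (size_t) no + (size_t) tid * (size_t) NTopLeaves;
}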
1335 
1352 static void
1353 domain_toptree_merge(struct local_topnode_data *treeA,
1354  struct local_topnode_data *treeB,
1355  int noA, int noB, int * treeASize, const int MaxTopNodes)
1356 {
1357  int j, sub;
1358  int64_t count;
1359  int64_t cost;
1360 
1361  if(treeB[noB].Shift < treeA[noA].Shift)
1362  {
1363  /* Add B to A */
1364  /* Create a daughter to a, since we will merge B to A's daughter*/
1365  if(treeA[noA].Daughter < 0)
1366  {
1367  if((*treeASize + 8) >= MaxTopNodes) {
1368  endrun(88, "Too many Topnodes; this shall not happen because we ensure there is enough and bailed earlier than this\n");
1369  }
1370  /* noB must have a parent if we are here, since noB is lower than noA;
1371  * noA must have already merged in the cost of noB's parent.
1372  * This is the first time we create these children in A, thus
1373  * we shall evenly divide the non-B part of the cost among these children.
1374  * */
1375 
1376  count = treeA[noA].Count - treeB[treeB[noB].Parent].Count;
1377  cost = treeA[noA].Cost - treeB[treeB[noB].Parent].Cost;
1378 
1379  treeA[noA].Daughter = *treeASize;
1380  for(j = 0; j < 8; j++)
1381  {
1382 
1383  sub = treeA[noA].Daughter + j;
1384  treeA[sub].Shift = treeA[noA].Shift - 3;
1385  treeA[sub].Count = (j + 1) * count / 8 - j * count / 8;
1386  treeA[sub].Cost = (j + 1) * cost / 8 - j * cost / 8;
1387  treeA[sub].Daughter = -1;
1388  treeA[sub].Parent = noA;
1389  treeA[sub].StartKey = treeA[noA].StartKey + j * (1L << treeA[sub].Shift);
1390  }
1391  (*treeASize) += 8;
1392  }
1393 
1394  /* find the sub node in A for me and merge, this would bring noB and sub on the same shift, drop to next case */
1395  sub = treeA[noA].Daughter + ((treeB[noB].StartKey - treeA[noA].StartKey) >> (treeA[noA].Shift - 3));
1396  domain_toptree_merge(treeA, treeB, sub, noB, treeASize, MaxTopNodes);
1397  }
1398  else if(treeB[noB].Shift == treeA[noA].Shift)
1399  {
1400  treeA[noA].Count += treeB[noB].Count;
1401  treeA[noA].Cost += treeB[noB].Cost;
1402 
1403  /* Prefer to go down B; this would trigger the previous case */
1404  if(treeB[noB].Daughter >= 0)
1405  {
1406  for(j = 0; j < 8; j++)
1407  {
1408  sub = treeB[noB].Daughter + j;
1409  if(treeB[sub].Shift >= treeB[noB].Shift) {
1410  endrun(1, "Child node %d has shift %d, parent %d has shift %d. treeB is corrupt. \n",
1411  sub, treeB[sub].Shift, noB, treeB[noB].Shift);
1412  }
1413  domain_toptree_merge(treeA, treeB, noA, sub, treeASize, MaxTopNodes);
1414  }
1415  }
1416  else
1417  {
1418  /* We can't divide by B so we do it for A, this may trigger the next branch, since
1419  * we are lowering A */
1420  if(treeA[noA].Daughter >= 0) {
1421  for(j = 0; j < 8; j++) {
1422  sub = treeA[noA].Daughter + j;
1423  domain_toptree_merge(treeA, treeB, sub, noB, treeASize, MaxTopNodes);
1424  }
1425  }
1426  }
1427  }
1428  else if(treeB[noB].Shift > treeA[noA].Shift)
1429  {
1430  /* Since we only know how to split A, here we simply add a spatial average to A */
1431  uint64_t n = 1L << (treeB[noB].Shift - treeA[noA].Shift);
1432 
1433  if(treeB[noB].Shift - treeA[noA].Shift > 60) {
1434  message(1, "Warning: Refusing to merge two tree nodes of wildly different depth: %d %d;\n ", treeB[noB].Shift, treeA[noA].Shift);
1435  n = 0;
1436  }
1437 
1438  count = treeB[noB].Count;
1439  cost = treeB[noB].Cost;
1440 
1441  if (n > 0) {
1442  /* this is no longer conserving total cost but it should be fine .. */
1443  treeA[noA].Count += count / n;
1444  treeA[noA].Cost += cost / n;
1445 
1446  // message(1, "adding cost to %d %td, %td\n", noA, count / n, cost / n);
1447  if(treeA[noA].Daughter >= 0) {
1448  for(j = 0; j < 8; j++) {
1449  sub = treeA[noA].Daughter + j;
1450  domain_toptree_merge(treeA, treeB, sub, noB, treeASize, MaxTopNodes);
1451  }
1452  }
1453  }
1454  }
1455 }
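/* Illustrative sketch, not part of the original domain.c: the integer split
 * used in the merge above. Giving daughter j the share
 * (j + 1) * count / 8 - j * count / 8 distributes an int64 count over the 8
 * daughters so that the shares telescope and sum to exactly count, even when
 * count is not divisible by 8. */
static int64_t
example_daughter_share(int64_t count, int j)
{
    return (j + 1) * count / 8 - j * count / 8;
}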
1456 
1457 static int
1458 order_by_key(const void *a, const void *b)
1459 {
1460  const struct local_particle_data * pa = (const struct local_particle_data *) a;
1461  const struct local_particle_data * pb = (const struct local_particle_data *) b;
1462  if(pa->Key < pb->Key)
1463  return -1;
1464 
1465  if(pa->Key > pb->Key)
1466  return +1;
1467 
1468  return 0;
1469 }
1470 
1471 static void
1472 mp_order_by_key(const void * data, void * radix, void * arg)
1473 {
1474  const struct local_particle_data * pa = (const struct local_particle_data *) data;
1475  ((uint64_t *) radix)[0] = pa->Key;
1476 }
static DomainParams domain_params
Definition: domain.c:43
static void mp_order_by_key(const void *data, void *radix, void *arg)
Definition: domain.c:1472
static int domain_determine_global_toptree(DomainDecompositionPolicy *policy, struct local_topnode_data *topTree, int *topTreeSize, const int MaxTopNodes, MPI_Comm DomainComm)
Definition: domain.c:1147
static void domain_toptree_merge(struct local_topnode_data *treeA, struct local_topnode_data *treeB, int noA, int noB, int *treeASize, const int MaxTopNodes)
Definition: domain.c:1353
static int domain_check_for_local_refine_subsample(DomainDecompositionPolicy *policy, struct local_topnode_data *topTree, int *topTreeSize, const int MaxTopNodes)
Definition: domain.c:895
static int order_by_key(const void *a, const void *b)
Definition: domain.c:1458
static int topleaf_ext_order_by_task_and_key(const void *c1, const void *c2)
Definition: domain.c:498
static int domain_toptree_insert(struct local_topnode_data *topTree, peano_t Key, int64_t cost)
Definition: domain.c:749
static int domain_check_memory_bound(const DomainDecomp *ddecomp, int64_t *TopLeafWork, int64_t *TopLeafCount)
Definition: domain.c:426
static void domain_toptree_truncate(struct local_topnode_data *topTree, int *topTreeSize, int64_t countlimit, int64_t costlimit)
Definition: domain.c:865
void domain_free(DomainDecomp *ddecomp)
Definition: domain.c:320
static void domain_toptree_update_cost(struct local_topnode_data *topTree, int start)
Definition: domain.c:796
void domain_decompose_full(DomainDecomp *ddecomp)
Definition: domain.c:155
static void domain_create_topleaves(DomainDecomp *ddecomp, int no, int *next)
Definition: domain.c:721
static int domain_toptree_get_subnode(struct local_topnode_data *topTree, peano_t key)
Definition: domain.c:738
static int domain_toptree_split(struct local_topnode_data *topTree, int *topTreeSize, const int MaxTopNodes, int i)
Definition: domain.c:760
static int domain_balance(DomainDecomp *ddecomp)
Definition: domain.c:400
static int domain_nonrecursively_combine_topTree(struct local_topnode_data *topTree, int *topTreeSize, const int MaxTopNodes, MPI_Comm DomainComm)
Definition: domain.c:1056
static void domain_toptree_garbage_collection(struct local_topnode_data *topTree, int start, int *last_free)
Definition: domain.c:839
static int domain_attempt_decompose(DomainDecomp *ddecomp, DomainDecompositionPolicy *policy, const int MaxTopNodes)
Definition: domain.c:338
static void domain_compute_costs(const DomainDecomp *ddecomp, int64_t *TopLeafWork, int64_t *TopLeafCount)
Definition: domain.c:1280
static int domain_layoutfunc(int n, const void *userdata)
Definition: domain.c:707
static void domain_toptree_truncate_r(struct local_topnode_data *topTree, int start, int64_t countlimit, int64_t costlimit)
Definition: domain.c:811
#define TAG_GRAV_B
Definition: domain.c:24
void set_domain_params(ParameterSet *ps)
Definition: domain.c:84
#define TAG_GRAV_A
Definition: domain.c:23
void domain_maintain(DomainDecomp *ddecomp, struct DriftData *drift)
Definition: domain.c:234
static int domain_global_refine(struct local_topnode_data *topTree, int *topTreeSize, const int MaxTopNodes, int64_t countlimit, int64_t costlimit)
Definition: domain.c:1224
static int topleaf_ext_order_by_key(const void *c1, const void *c2)
Definition: domain.c:510
static int domain_allocate(DomainDecomp *ddecomp, DomainDecompositionPolicy *policy)
Definition: domain.c:285
void set_domain_par(DomainParams dp)
Definition: domain.c:78
static int domain_policies_init(DomainDecompositionPolicy policies[], const int NincreaseAlloc, const int SwitchToGlobal)
Definition: domain.c:252
static void domain_assign_balanced(DomainDecomp *ddecomp, int64_t *cost, const int NsegmentPerTask)
Definition: domain.c:533
static int domain_get_topleaf(const peano_t key, const DomainDecomp *ddecomp)
Definition: domain.h:74
void message(int where, const char *fmt,...)
Definition: endrun.c:175
void endrun(int where, const char *fmt,...)
Definition: endrun.c:147
void domain_test_id_uniqueness(struct part_manager_type *pman)
Definition: exchange.c:570
int domain_exchange(ExchangeLayoutFunc layoutfunc, const void *layout_userdata, int do_gc, struct DriftData *drift, struct part_manager_type *pman, struct slots_manager_type *sman, int maxiter, MPI_Comm Comm)
Definition: exchange.c:103
#define mpsort_mpi(base, nmemb, elsize, radix, rsize, arg, comm)
Definition: mpsort.h:26
#define mymalloc(name, size)
Definition: mymalloc.h:15
#define ta_malloc(name, type, nele)
Definition: mymalloc.h:25
#define ta_free(p)
Definition: mymalloc.h:28
#define mymalloc2(name, size)
Definition: mymalloc.h:16
#define mymalloc_usedbytes()
Definition: mymalloc.h:32
#define myfree(x)
Definition: mymalloc.h:19
#define report_memory_usage(x)
Definition: mymalloc.h:30
double param_get_double(ParameterSet *ps, const char *name)
Definition: paramset.c:336
int param_get_int(ParameterSet *ps, const char *name)
Definition: paramset.c:368
struct part_manager_type PartManager[1]
Definition: partmanager.c:11
#define P
Definition: partmanager.h:88
uint64_t peano_t
Definition: peano.h:7
#define BITS_PER_DIMENSION
Definition: peano.h:9
struct slots_manager_type SlotsManager[1]
Definition: slotsmanager.c:7
void slots_gc_sorted(struct part_manager_type *pman, struct slots_manager_type *sman)
Definition: slotsmanager.c:442
struct topnode_data * TopNodes
Definition: domain.h:36
struct task_data * Tasks
Definition: domain.h:41
MPI_Comm DomainComm
Definition: domain.h:44
int NTopNodes
Definition: domain.h:39
struct topleaf_data * TopLeaves
Definition: domain.h:37
int NTopLeaves
Definition: domain.h:40
int domain_allocated_flag
Definition: domain.h:34
MPI_Comm GlobalSortComm
Definition: domain.c:52
int DomainOverDecompositionFactor
Definition: domain.h:53
int DomainUseGlobalSorting
Definition: domain.h:55
double TopNodeAllocFactor
Definition: domain.h:57
double SetAsideFactor
Definition: domain.h:59
int64_t Cost
Definition: domain.c:74
peano_t StartKey
Definition: domain.c:62
int64_t Count
Definition: domain.c:67
int64_t Cost
Definition: domain.c:68
int EndLeaf
Definition: domain.h:30
int StartLeaf
Definition: domain.h:29
int topnode
Definition: domain.h:23
int Task
Definition: domain.h:21
peano_t Key
Definition: domain.c:491
int64_t cost
Definition: domain.c:494
peano_t StartKey
Definition: domain.h:14
int Leaf
Definition: domain.h:17
int Daughter
Definition: domain.h:15
int Shift
Definition: domain.h:16
int MPIU_Any(int condition, MPI_Comm comm)
Definition: system.c:545
#define MPIU_Barrier(comm)
Definition: system.h:103
#define MPI_INT64
Definition: system.h:12
int ThisTask
Definition: test_exchange.c:23
int NTask
Definition: test_exchange.c:23
#define qsort_openmp
Definition: test_exchange.c:14
#define walltime_measure(name)
Definition: walltime.h:8