87 MPI_Comm_rank(MPI_COMM_WORLD, &ThisTask);
146 const int NincreaseAlloc,
147 const int SwitchToGlobal);
158 static int Npolicies = 0;
161 static int LastSuccessfulPolicy = 0;
163 if (Npolicies == 0) {
164 const int NincreaseAlloc = 16;
172 int decompose_failed = 1;
174 for(i = LastSuccessfulPolicy; i < Npolicies; i ++)
183 message(0, "Attempting new domain decomposition policy: TopNodeAllocFactor=%g, UseGlobalSort=%d, SubSampleDistance=%d, UsePreSort=%d\n",
184 policies[i].TopNodeAllocFactor, policies[i].UseGlobalSort, policies[i].SubSampleDistance, policies[i].PreSort);
192 LastSuccessfulPolicy = i;
197 if(decompose_failed) {
198 endrun(0, "No suitable domain decomposition policy worked for this particle distribution\n");
217 endrun(1929, "Could not exchange particles\n");
225 message(0, "Domain decomposition done.\n");
236 message(0, "Attempting a domain exchange\n");
253 const int NincreaseAlloc,
254 const int SwitchToGlobal)
257 MPI_Comm_size(MPI_COMM_WORLD, &NTask);
260 for(i = 0; i < NincreaseAlloc; i ++) {
266 if(i >= SwitchToGlobal)
280 return NincreaseAlloc;
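/* Descriptive note: domain_policies_init() fills NincreaseAlloc policy
 * slots and returns that count; policies with index >= SwitchToGlobal
 * enable the global sort (the i >= SwitchToGlobal test above). */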
287 size_t bytes, all_bytes = 0;
304 bytes = (MaxTopNodes * (sizeof(ddecomp->TopNodes[0]))));
309 bytes = (MaxTopNodes * sizeof(ddecomp->TopLeaves[0])));
313 message(0, "Allocated %g MByte for top-level domain structure\n", all_bytes / (1024.0 * 1024.0));
342 size_t bytes, all_bytes = 0;
347 memset(topTree, 0, sizeof(topTree[0]) * MaxTopNodes);
350 message(0, "use of %g MB of temporary storage for domain decomposition... (presently allocated=%g MB)\n",
381 message(0, "Number of Topleaves is less than the required over-decomposition");
389 endrun(0, "Number of Topleaves is less than NTask");
403 int64_t * TopLeafCount = (int64_t *) mymalloc("TopLeafCount", ddecomp->NTopLeaves * sizeof(TopLeafCount[0]));
416 message(0, "Domain decomposition is outside memory bounds.\n");
435 int64_t max_work = 0, max_load = 0, sumload = 0, sumwork = 0;
438 #pragma omp parallel for reduction(+: sumwork) reduction(+: sumload) reduction(max: max_load) reduction(max:max_work)
439 for(ta = 0; ta < NTask; ta++)
444 for(i = ddecomp->Tasks[ta].StartLeaf; i < ddecomp->Tasks[ta].EndLeaf; i ++)
446 load += TopLeafCount[i];
448 work += TopLeafWork[i];
451 list_load[ta] = load;
452 list_work[ta] = work;
464 message(0, "Largest load: work=%g particle=%g\n",
465 max_work / ((double) sumwork / NTask), max_load / (((double) sumload) / NTask));
467 message(0, "Largest particle load particle=%g\n", max_load / (((double) sumload) / NTask));
472 message(0, "desired memory imbalance=%g (limit=%g, needed=%ld)\n",
474 message(0, "Balance breakdown:\n");
476 for(i = 0; i < NTask; i++)
478 message(0, "Task: [%3d] work=%8.4f particle load=%8.4f\n", i,
479 list_work[i] / ((double) sumwork / NTask), list_load[i] / (((double) sumload) / NTask));
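/* Hedged sketch (not from this file): the "work" and "particle" figures
 * printed above are simply the largest per-task value divided by the
 * per-task mean, so 1.0 is perfect balance and 2.0 means the busiest
 * rank carries twice the average. The helper below is illustrative. */
#include <stdint.h>

static double imbalance(int64_t largest, int64_t total, int ntask)
{
    /* largest per-rank value over the mean per-rank value */
    return (double) largest / ((double) total / ntask);
}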
504 if(p1->Key < p2->Key) return -1;
505 if(p1->Key > p2->Key) return 1;
514 if(p1->Key < p2->Key) return -1;
515 if(p1->Key > p2->Key) return 1;
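/* Hedged usage sketch: the two comparators above are ordinary
 * qsort()-style functions that order records by their Peano key.
 * The struct and names below are illustrative only. */
#include <stdlib.h>
#include <stdint.h>

struct keyed_item { uint64_t Key; };

static int keyed_item_cmp(const void *a, const void *b)
{
    const struct keyed_item * p1 = (const struct keyed_item *) a;
    const struct keyed_item * p2 = (const struct keyed_item *) b;
    if(p1->Key < p2->Key) return -1;
    if(p1->Key > p2->Key) return 1;
    return 0;
}

/* e.g.: qsort(items, nitems, sizeof(items[0]), keyed_item_cmp); */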
538 int Nsegment = NTask * NsegmentPerTask;
552 TopLeafExt[i].Task = -1;
561 int64_t totalcost = 0;
562 #pragma omp parallel for reduction(+ : totalcost)
564 totalcost += TopLeafExt[i].cost;
566 int64_t totalcostLeft = totalcost;
573 double mean_expected = 1.0 * totalcost / Nsegment;
574 double mean_task = 1.0 * totalcost / NTask;
580 int64_t curtaskload = 0;
581 int64_t maxleafcost = 0;
583 message(0, "Expected segment cost %g\n", mean_expected);
587 while(nrounds < ddecomp->NTopLeaves) {
593 } else if(ddecomp->NTopLeaves - curleaf == Nsegment - curseg) {
601 int64_t totalassigned = (totalcost - totalcostLeft) + curload;
602 if((mean_expected * (curseg +1) - totalassigned > 0.5 * TopLeafExt[curleaf].cost)
613 curload += TopLeafExt[curleaf].cost;
614 TopLeafExt[curleaf].Task = curtask;
620 curtaskload += curload;
625 if((mean_task - curtaskload < 0.5 * mean_expected) || (Nsegment - curseg <= NTask - curtask)
631 totalcostLeft -= curload;
636 if(curtask == NTask) {
641 mean_expected = 1.0 * totalcostLeft / Nsegment;
642 mean_task = 1.0 * totalcostLeft / NTask;
647 if(TopLeafExt[curleaf].cost > maxleafcost)
648 maxleafcost = TopLeafExt[curleaf].cost;
654 message(0, "Created %d segments in %d rounds. Max leaf cost: %g\n", curseg, nrounds, (1.0*maxleafcost)/(totalcost) * Nsegment);
656 if(curseg < Nsegment) {
657 endrun(0, "Not enough segments were created (%d instead of %d). This should not happen.\n", curseg, Nsegment);
660 if(totalcostLeft != 0) {
661 endrun(0, "Assertion failed. Total cost is not fully assigned to all ranks\n");
685 while(ta < ddecomp->TopLeaves[i].Task) {
694 endrun(0, "Assertion failed: not all tasks are assigned. This indicates a bug.\n");
732 for(i = 0; i < 8; i++)
742 while(topTree[no].Daughter >= 0) {
743 no = topTree[no].Daughter + ((key - topTree[no].StartKey) >> (topTree[no].Shift - 3));
754 topTree[leaf].Count ++;
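/* Hedged sketch of the descent used above: a top-tree node covers
 * 2^Shift consecutive Peano keys starting at StartKey, and its eight
 * daughters each cover 2^(Shift-3) keys, so the daughter holding a key
 * is picked by the next three key bits. Field and type names below are
 * illustrative, not this file's definitions. */
#include <stdint.h>

struct toy_topnode {
    uint64_t StartKey;  /* first key covered by this node */
    int Shift;          /* node covers 2^Shift keys */
    int Daughter;       /* index of the first of 8 children, or -1 for a leaf */
    int Leaf;           /* leaf number, valid when Daughter == -1 */
};

static int toy_get_leaf(const struct toy_topnode * tree, uint64_t key)
{
    int no = 0;
    while(tree[no].Daughter >= 0)
        no = tree[no].Daughter + (int) ((key - tree[no].StartKey) >> (tree[no].Shift - 3));
    return tree[no].Leaf;
}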
765 if((*topTreeSize + 8) > MaxTopNodes)
768 if(topTree[i].Shift < 3) {
769 endrun(1, "Failed to build a TopTree -- particles overly clustered.\n");
777 for(j = 0; j < 8; j++)
779 const int sub = topTree[i].Daughter + j;
789 topTree[sub].Count = 0;
790 topTree[sub].Cost = 0;
798 if(topTree[start].Daughter == -1) return;
801 for(j = 0; j < 8; j ++) {
802 int sub = topTree[start].Daughter + j;
805 topTree[start].Cost += topTree[sub].Cost;
813 if(topTree[start].Daughter == -1) return;
815 if(topTree[start].Count < countlimit &&
816 topTree[start].Cost < costlimit) {
823 for(j = 0; j < 8; j ++) {
824 int sub = topTree[start].Daughter + j;
842 if(topTree[start].Daughter == -1) return;
846 int newd = *last_free;
854 for(j = 0; j < 8; j ++) {
855 topTree[newd + j] = topTree[oldd + j];
856 topTree[newd + j].Parent = start;
859 for(j = 0; j < 8; j ++) {
867 int64_t countlimit, int64_t costlimit)
924 #pragma omp parallel for
927 LP[i].Key = P[i].Key;
938 for(i = 0; i < Nsample; i ++)
957 topTree[0].Count = 0;
996 if (leaf == last_leaf && topTree[leaf].Shift >= 3) {
1004 topTree[leaf].Count = 0;
1010 if(topTree[leaf].Count != 0 && leaf != last_leaf) {
1019 endrun(10, "topTree[%d].Count=%d, shift=%d, last_leaf=%d, key=%ld, i=%d, Nsample=%d\n",
1020 leaf, topTree[leaf].Count, topTree[leaf].Shift, last_leaf, LP[i].Key, i, Nsample);
1023 last_key = LP[i].Key;
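/* Descriptive note: the local refinement above works on a key-sorted
 * particle subsample (LP[i].Key); a leaf that would receive a second
 * distinct sample key is split while its Shift still allows it, and the
 * endrun() above fires if a full leaf cannot be split further. The
 * loops below then zero the per-node counts and run over the Nsample
 * samples once more, presumably to accumulate the final per-leaf counts. */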
1035 for(i = 0; i < *topTreeSize; i ++ ) {
1036 topTree[i].Count = 0;
1040 for(i = 0; i < Nsample; i ++ ) {
1071 MPI_Comm_size(DomainComm, &NTask);
1074 MPI_Comm_rank(DomainComm, &ThisTask);
1076 for(sep = 1; sep < NTask; sep *= 2) {
1088 if(Color % 2 == 0) {
1091 int ntopnodes_import = 0;
1092 if(recvTask < NTask) {
1093 MPI_Recv(&ntopnodes_import, 1, MPI_INT, recvTask, TAG_GRAV_A,
1094 DomainComm, MPI_STATUS_IGNORE);
1095 if(ntopnodes_import < 0) {
1096 endrun(1, "severe domain error using an unintended rank\n");
1098 int mergesize = ntopnodes_import;
1099 if(ntopnodes_import < *topTreeSize)
1100 mergesize = *topTreeSize;
1104 MPI_Recv(topTree_import,
1107 DomainComm, MPI_STATUS_IGNORE);
1109 if((*topTreeSize + ntopnodes_import) > MaxTopNodes) {
1112 if(ntopnodes_import > 0 ) {
1122 MPI_Send(topTreeSize, 1, MPI_INT, recvTask, TAG_GRAV_A, DomainComm);
1130 MPI_Bcast(topTreeSize, 1, MPI_INT, 0, DomainComm);
1132 if(*topTreeSize < 0 || *topTreeSize >= MaxTopNodes) {
1135 int errorflagall = MPIU_Any(errorflag, DomainComm);
1136 if(errorflagall == 0)
1137 MPI_Bcast(topTree, (*topTreeSize) * sizeof(struct local_topnode_data), MPI_BYTE, 0, DomainComm);
1138 return errorflagall;
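/* Descriptive note on the combine above: ranks are paired in a binary
 * tree (sep = 1, 2, 4, ...); in each round the partner with odd Color
 * sends its local top tree, the partner with even Color receives and
 * merges it, and rank 0 finally broadcasts the combined tree with the
 * MPI_Bcast above, provided no rank exceeded MaxTopNodes. */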
1148 struct local_topnode_data * topTree, int * topTreeSize, int MaxTopNodes, MPI_Comm DomainComm)
1161 if(MPIU_Any(local_refine_failed, DomainComm)) {
1162 message(0, "We are out of Topnodes.\n");
1166 int64_t TotCost = 0, TotCount = 0;
1167 int64_t costlimit, countlimit;
1169 MPI_Allreduce(&topTree[0].Cost, &TotCost, 1, MPI_INT64, MPI_SUM, DomainComm);
1170 MPI_Allreduce(&topTree[0].Count, &TotCount, 1, MPI_INT64, MPI_SUM, DomainComm);
1173 countlimit = TotCount / (policy->NTopLeaves);
1180 message(1, "local TopTree size = %d >> expected = %d; usually this indicates very bad imbalance, due to a giant density peak.\n",
1186 sprintf(buf, "topnodes.bin.%d", ThisTask);
1187 FILE * fd = fopen(buf, "w");
1203 message(0, "can't combine trees due to lack of storage. Will try again.\n");
1209 int global_refine_failed = domain_global_refine(topTree, topTreeSize, MaxTopNodes, countlimit, costlimit);
1213 if(MPIU_Any(global_refine_failed, DomainComm)) {
1214 message(0, "Global refine failed: toptreeSize = %d, MaxTopNodes = %d\n", *topTreeSize, MaxTopNodes);
1218 message(0, "Final local topTree size = %d per segment = %g.\n", *topTreeSize, 1.0 * (*topTreeSize) / (policy->NTopLeaves));
1226 int64_t countlimit, int64_t costlimit)
1245 message(0, "local topTree size before appending=%d\n", *topTreeSize);
1248 for(i = 0; i < *topTreeSize; i++)
1251 if(topTree[i].Daughter >= 0 || topTree[i].Shift <= 0) continue;
1254 if(topTree[i].Count < countlimit && topTree[i].Cost < costlimit) continue;
1257 if((*topTreeSize + 8) > MaxTopNodes) {
1261 topTree[i].Daughter = *topTreeSize;
1263 for(j = 0; j < 8; j++)
1268 topTree[sub].Cost = topTree[i].Cost / 8;
1273 (*topTreeSize) += 8;
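/* Descriptive note: global refinement splits a node only if eight more
 * entries still fit under MaxTopNodes, the node has no daughters, its
 * Shift is positive, and it exceeds countlimit or costlimit; the parent
 * cost is then shared evenly among the eight new children (Cost / 8). */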
1283 int NumThreads = omp_get_max_threads();
1284 int64_t * local_TopLeafWork = NULL;
1286 local_TopLeafWork = (int64_t *) mymalloc("local_TopLeafWork", NumThreads * ddecomp->NTopLeaves * sizeof(local_TopLeafWork[0]));
1287 memset(local_TopLeafWork, 0, NumThreads * ddecomp->NTopLeaves * sizeof(local_TopLeafWork[0]));
1289 int64_t * local_TopLeafCount = (int64_t *) mymalloc("local_TopLeafCount", NumThreads * ddecomp->NTopLeaves * sizeof(local_TopLeafCount[0]));
1290 memset(local_TopLeafCount, 0, NumThreads * ddecomp->NTopLeaves * sizeof(local_TopLeafCount[0]));
1292 #pragma omp parallel
1294 int tid = omp_get_thread_num();
1306 if(local_TopLeafWork)
1307 local_TopLeafWork[no + tid * ddecomp->NTopLeaves] += 1;
1309 local_TopLeafCount[no + tid * ddecomp->NTopLeaves] += 1;
1313 #pragma omp parallel for
1317 if(local_TopLeafWork)
1318 for(tid = 1; tid < NumThreads; tid++) {
1319 local_TopLeafWork[i] += local_TopLeafWork[i + tid * ddecomp->NTopLeaves];
1322 for(tid = 1; tid < NumThreads; tid++) {
1323 local_TopLeafCount[i] += local_TopLeafCount[i + tid * ddecomp->NTopLeaves];
1327 if(local_TopLeafWork) {
1329 myfree(local_TopLeafWork);
1333 myfree(local_TopLeafCount);
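/* Hedged sketch of the reduction pattern used above: every OpenMP thread
 * owns its own stride of NTopLeaves counters, so the parallel region
 * needs no atomics, and the per-thread strides are summed into stride 0
 * afterwards. The helper below is illustrative, not this file's code. */
#include <stdlib.h>
#include <stdint.h>
#include <omp.h>

static void count_per_leaf(const int * leaf_of_item, size_t nitems,
                           int64_t * count, int nleaves)
{
    int nthreads = omp_get_max_threads();
    int64_t * local = calloc((size_t) nthreads * nleaves, sizeof(int64_t));

    #pragma omp parallel
    {
        int tid = omp_get_thread_num();
        #pragma omp for
        for(size_t i = 0; i < nitems; i++)
            local[leaf_of_item[i] + tid * nleaves] += 1;   /* thread-private stride */
    }

    for(int i = 0; i < nleaves; i++) {
        count[i] = 0;
        for(int tid = 0; tid < nthreads; tid++)            /* cross-thread sum */
            count[i] += local[i + tid * nleaves];
    }
    free(local);
}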
1355 int noA, int noB, int * treeASize, const int MaxTopNodes)
1367 if((*treeASize + 8) >= MaxTopNodes) {
1368 endrun(88, "Too many Topnodes; this should not happen because we check that there is enough storage and bail out earlier.\n");
1380 for(j = 0; j < 8; j++)
1385 treeA[sub].Count = (j + 1) * count / 8 - j * count / 8;
1386 treeA[sub].Cost = (j + 1) * cost / 8 - j * cost / 8;
1398 else if(treeB[noB].Shift == treeA[noA].Shift)
1401 treeA[noA].Cost += treeB[noB].Cost;
1406 for(j = 0; j < 8; j++)
1410 endrun(1, "Child node %d has shift %d, parent %d has shift %d. treeB is corrupt.\n",
1411 sub, treeB[sub].Shift, noB, treeB[noB].Shift);
1421 for(j = 0; j < 8; j++) {
1431 uint64_t n = 1L << (treeB[noB].Shift - treeA[noA].Shift);
1433 if(treeB[noB].Shift - treeA[noA].Shift > 60) {
1434 message(1, "Warning: Refusing to merge two tree nodes of wildly different depth: %d %d;\n", treeB[noB].Shift, treeA[noA].Shift);
1438 count = treeB[noB].Count;
1439 cost = treeB[noB].Cost;
1443 treeA[noA].Count += count / n;
1444 treeA[noA].Cost += cost / n;
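/* Descriptive note: when the incoming node treeB[noB] is coarser than
 * the local node treeA[noA], it spans n = 2^(ShiftB - ShiftA) local key
 * ranges, so only a 1/n share of its count and cost is credited to the
 * finer node (the two lines above); absurd depth differences (> 60 bits)
 * are refused with the warning above, presumably to keep the 64-bit
 * shift from overflowing. */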
1448 for(j = 0; j < 8; j++) {
1475 ((uint64_t *) radix)[0] = pa->Key;
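/* Descriptive note: this is the radix extractor passed to mpsort_mpi();
 * it copies the particle's 64-bit key into the radix buffer so the
 * parallel sort can order particles by key across MPI ranks. */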
static DomainParams domain_params
static void mp_order_by_key(const void *data, void *radix, void *arg)
static int domain_determine_global_toptree(DomainDecompositionPolicy *policy, struct local_topnode_data *topTree, int *topTreeSize, const int MaxTopNodes, MPI_Comm DomainComm)
static void domain_toptree_merge(struct local_topnode_data *treeA, struct local_topnode_data *treeB, int noA, int noB, int *treeASize, const int MaxTopNodes)
static int domain_check_for_local_refine_subsample(DomainDecompositionPolicy *policy, struct local_topnode_data *topTree, int *topTreeSize, const int MaxTopNodes)
static int order_by_key(const void *a, const void *b)
static int topleaf_ext_order_by_task_and_key(const void *c1, const void *c2)
static int domain_toptree_insert(struct local_topnode_data *topTree, peano_t Key, int64_t cost)
static int domain_check_memory_bound(const DomainDecomp *ddecomp, int64_t *TopLeafWork, int64_t *TopLeafCount)
static void domain_toptree_truncate(struct local_topnode_data *topTree, int *topTreeSize, int64_t countlimit, int64_t costlimit)
void domain_free(DomainDecomp *ddecomp)
static void domain_toptree_update_cost(struct local_topnode_data *topTree, int start)
void domain_decompose_full(DomainDecomp *ddecomp)
static void domain_create_topleaves(DomainDecomp *ddecomp, int no, int *next)
static int domain_toptree_get_subnode(struct local_topnode_data *topTree, peano_t key)
static int domain_toptree_split(struct local_topnode_data *topTree, int *topTreeSize, const int MaxTopNodes, int i)
static int domain_balance(DomainDecomp *ddecomp)
static int domain_nonrecursively_combine_topTree(struct local_topnode_data *topTree, int *topTreeSize, const int MaxTopNodes, MPI_Comm DomainComm)
static void domain_toptree_garbage_collection(struct local_topnode_data *topTree, int start, int *last_free)
static int domain_attempt_decompose(DomainDecomp *ddecomp, DomainDecompositionPolicy *policy, const int MaxTopNodes)
static void domain_compute_costs(const DomainDecomp *ddecomp, int64_t *TopLeafWork, int64_t *TopLeafCount)
static int domain_layoutfunc(int n, const void *userdata)
static void domain_toptree_truncate_r(struct local_topnode_data *topTree, int start, int64_t countlimit, int64_t costlimit)
void set_domain_params(ParameterSet *ps)
void domain_maintain(DomainDecomp *ddecomp, struct DriftData *drift)
static int domain_global_refine(struct local_topnode_data *topTree, int *topTreeSize, const int MaxTopNodes, int64_t countlimit, int64_t costlimit)
static int topleaf_ext_order_by_key(const void *c1, const void *c2)
static int domain_allocate(DomainDecomp *ddecomp, DomainDecompositionPolicy *policy)
void set_domain_par(DomainParams dp)
static int domain_policies_init(DomainDecompositionPolicy policies[], const int NincreaseAlloc, const int SwitchToGlobal)
static void domain_assign_balanced(DomainDecomp *ddecomp, int64_t *cost, const int NsegmentPerTask)
static int domain_get_topleaf(const peano_t key, const DomainDecomp *ddecomp)
void message(int where, const char *fmt,...)
void endrun(int where, const char *fmt,...)
void domain_test_id_uniqueness(struct part_manager_type *pman)
int domain_exchange(ExchangeLayoutFunc layoutfunc, const void *layout_userdata, int do_gc, struct DriftData *drift, struct part_manager_type *pman, struct slots_manager_type *sman, int maxiter, MPI_Comm Comm)
#define mpsort_mpi(base, nmemb, elsize, radix, rsize, arg, comm)
#define mymalloc(name, size)
#define ta_malloc(name, type, nele)
#define mymalloc2(name, size)
#define mymalloc_usedbytes()
#define report_memory_usage(x)
double param_get_double(ParameterSet *ps, const char *name)
int param_get_int(ParameterSet *ps, const char *name)
struct part_manager_type PartManager[1]
#define BITS_PER_DIMENSION
struct slots_manager_type SlotsManager[1]
void slots_gc_sorted(struct part_manager_type *pman, struct slots_manager_type *sman)
struct topnode_data * TopNodes
struct topleaf_data * TopLeaves
int domain_allocated_flag
double TopNodeAllocFactor
int DomainOverDecompositionFactor
int DomainUseGlobalSorting
double TopNodeAllocFactor
int MPIU_Any(int condition, MPI_Comm comm)
#define MPIU_Barrier(comm)
#define walltime_measure(name)