MP-Gadget  5.0.1.dev1-76bc7d4726-dirty
forcetree.c
Go to the documentation of this file.
1 #include <mpi.h>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5 #include <math.h>
6 #include <time.h>
7 #include <omp.h>
8 
9 #include "slotsmanager.h"
10 #include "partmanager.h"
11 #include "domain.h"
12 #include "forcetree.h"
13 #include "checkpoint.h"
14 #include "walltime.h"
15 #include "utils/endrun.h"
16 
31 static struct forcetree_params
32 {
33  /* Each processor allocates a number of nodes which is TreeAllocFactor times
34  the maximum(!) number of particles. Note: A typical local tree for N
35  particles needs usually about ~0.65*N nodes.
36  If the allocated memory is not sufficient, this parameter will be increased.*/
41 
42 void
43 init_forcetree_params(const int FastParticleType)
44 {
45  /* This was increased due to the extra nodes created by subtrees*/
47  ForceTreeParams.FastParticleType = FastParticleType;
48 }
49 
50 static ForceTree
51 force_tree_build(int npart, int mask, DomainDecomp * ddecomp, const int HybridNuGrav, const int DoMoments, const char * EmergencyOutputDir);
52 
53 static void
54 force_treeupdate_pseudos(int no, const ForceTree * tree);
55 
56 static void
57 force_create_node_for_topnode(int no, int topnode, struct NODE * Nodes, const DomainDecomp * ddecomp, int bits, int x, int y, int z, int *nextfree, const int lastnode);
58 
59 static void
60 force_exchange_pseudodata(ForceTree * tree, const DomainDecomp * ddecomp);
61 
62 static void
63 force_insert_pseudo_particles(const ForceTree * tree, const DomainDecomp * ddecomp);
64 
65 static void
66 add_particle_moment_to_node(struct NODE * pnode, int i);
67 
68 #ifdef DEBUG
69 /* Walk the constructed tree, validating sibling and nextnode as we go*/
70 static void force_validate_nextlist(const ForceTree * tree)
71 {
72  int no = tree->firstnode;
73  while(no != -1)
74  {
75  struct NODE * current = &tree->Nodes[no];
76  if(current->sibling != -1 && !node_is_node(current->sibling, tree))
77  endrun(5, "Node %d (type %d) has sibling %d next %d father %d first %d final %d last %d ntop %d\n", no, current->f.ChildType, current->sibling, current->s.suns[0], current->father, tree->firstnode, tree->firstnode + tree->numnodes, tree->lastnode, tree->NTopLeaves);
78 
79  if(current->f.ChildType == PSEUDO_NODE_TYPE) {
80  /* pseudo particle: nextnode should be a pseudo particle, sibling should be a node. */
81  if(!node_is_pseudo_particle(current->s.suns[0], tree))
82  endrun(5, "Pseudo Node %d has next node %d sibling %d father %d first %d final %d last %d ntop %d\n", no, current->s.suns[0], current->sibling, current->father, tree->firstnode, tree->firstnode + tree->numnodes, tree->lastnode, tree->NTopLeaves);
83  }
84  else if(current->f.ChildType == NODE_NODE_TYPE) {
85  /* Next node should be another node */
86  if(!node_is_node(current->s.suns[0], tree))
87  endrun(5, "Node Node %d has next node which is particle %d sibling %d father %d first %d final %d last %d ntop %d\n", no, current->s.suns[0], current->sibling, current->father, tree->firstnode, tree->firstnode + tree->numnodes, tree->lastnode, tree->NTopLeaves);
88  no = current->s.suns[0];
89  continue;
90  }
91  no = current->sibling;
92  }
93  /* Every node should have a valid father: collect those that do not.*/
94  for(no = tree->firstnode; no < tree->firstnode + tree->numnodes; no++)
95  {
96  if(!node_is_node(tree->Nodes[no].father, tree) && tree->Nodes[no].father >= 0) {
97  struct NODE *current = &tree->Nodes[no];
98  message(1, "Danger! no %d has father %d, next %d sib %d, (ptype = %d) len %g center (%g %g %g) mass %g cofm %g %g %g TL %d DLM %d ITL %d nocc %d suns %d %d %d %d\n", no, current->father, current->s.suns[0], current->sibling, current->f.ChildType,
99  current->len, current->center[0], current->center[1], current->center[2],
100  current->mom.mass, current->mom.cofm[0], current->mom.cofm[1], current->mom.cofm[2],
101  current->f.TopLevel, current->f.DependsOnLocalMass, current->f.InternalTopLevel, current->s.noccupied,
102  current->s.suns[0], current->s.suns[1], current->s.suns[2], current->s.suns[3]);
103  }
104  }
105 
106 }
107 #endif
108 
109 static int
110 force_tree_eh_slots_fork(EIBase * event, void * userdata)
111 {
112  /* after a fork, we will attach the new particle to the force tree. */
113  EISlotsFork * ev = (EISlotsFork*) event;
114  int parent = ev->parent;
115  int child = ev->child;
116  ForceTree * tree = (ForceTree * ) userdata;
117  int no = force_get_father(parent, tree);
118  struct NODE * nop = &tree->Nodes[no];
119  /* FIXME: We lose particles if the node is full.
120  * At the moment this does not matter, because
121  * the only new particles are stars, which do not
122  * participate in the SPH tree walk.*/
123  if(nop->s.noccupied < NMAXCHILD) {
124  nop->s.suns[nop->s.noccupied] = child;
125  nop->s.Types += P[child].Type << (3*nop->s.noccupied);
126  nop->s.noccupied++;
127  }
128  if(child < tree->nfather && child >= 0)
129  tree->Father[child] = no;
130  return 0;
131 }
132 
133 int
135 {
136  return tree->tree_allocated_flag;
137 }
138 
139 void
140 force_tree_rebuild(ForceTree * tree, DomainDecomp * ddecomp, const int HybridNuGrav, const int DoMoments, const char * EmergencyOutputDir)
141 {
142  MPIU_Barrier(MPI_COMM_WORLD);
143  message(0, "Tree construction. (presently allocated=%g MB)\n", mymalloc_usedbytes() / (1024.0 * 1024.0));
144 
145  if(force_tree_allocated(tree)) {
146  force_tree_free(tree);
147  }
148  walltime_measure("/Misc");
149 
150  *tree = force_tree_build(PartManager->NumPart, ALLMASK, ddecomp, HybridNuGrav, DoMoments, EmergencyOutputDir);
151 
153  walltime_measure("/Tree/Build/Moments");
154 
155  message(0, "Tree constructed (moments: %d). First node %d, number of nodes %d, first pseudo %d. NTopLeaves %d\n",
156  tree->moments_computed_flag, tree->firstnode, tree->numnodes, tree->lastnode, tree->NTopLeaves);
157  MPIU_Barrier(MPI_COMM_WORLD);
158 }
159 
160 void
161 force_tree_rebuild_mask(ForceTree * tree, DomainDecomp * ddecomp, int mask, const int HybridNuGrav, const char * EmergencyOutputDir)
162 {
163  MPIU_Barrier(MPI_COMM_WORLD);
164  message(0, "Tree construction for types: %d.\n", mask);
165 
166  if(force_tree_allocated(tree)) {
167  force_tree_free(tree);
168  }
169  walltime_measure("/Misc");
170 
171  /* No moments*/
172  *tree = force_tree_build(PartManager->NumPart, mask, ddecomp, HybridNuGrav, 0, EmergencyOutputDir);
173  walltime_measure("/SPH/Build");
174 
175  message(0, "Tree constructed (type mask: %d). First node %d, number of nodes %d, first pseudo %d. NTopLeaves %d\n",
176  mask, tree->firstnode, tree->numnodes, tree->lastnode, tree->NTopLeaves);
177  MPIU_Barrier(MPI_COMM_WORLD);
178 }
179 
191 ForceTree force_tree_build(int npart, int mask, DomainDecomp * ddecomp, const int HybridNuGrav, const int DoMoments, const char * EmergencyOutputDir)
192 {
193  ForceTree tree;
194 
195  int TooManyNodes = 0;
196  int64_t maxnodes = ForceTreeParams.TreeAllocFactor * PartManager->NumPart + ddecomp->NTopNodes;
197  int64_t maxmaxnodes;
198  MPI_Reduce(&maxnodes, &maxmaxnodes, 1, MPI_INT64, MPI_MAX,0, MPI_COMM_WORLD);
199  message(0, "Treebuild: Largest is %g MByte for %ld tree nodes. firstnode %ld. (presently allocated %g MB)\n",
200  maxmaxnodes * sizeof(struct NODE) / (1024.0 * 1024.0), maxmaxnodes, PartManager->MaxPart,
201  mymalloc_usedbytes() / (1024.0 * 1024.0));
202 
203  do
204  {
205  /* Allocate memory. */
206  tree = force_treeallocate(maxnodes, PartManager->MaxPart, ddecomp);
207 
208  tree.BoxSize = PartManager->BoxSize;
209  tree.numnodes = force_tree_create_nodes(tree, npart, mask, ddecomp, HybridNuGrav);
210  if(tree.numnodes >= tree.lastnode - tree.firstnode)
211  {
212  message(1, "Not enough tree nodes (%ld) for %d particles. Created %d\n", maxnodes, npart, tree.numnodes);
213  force_tree_free(&tree);
215  message(1, "TreeAllocFactor from %g to %g now %ld tree nodes\n", ForceTreeParams.TreeAllocFactor, ForceTreeParams.TreeAllocFactor*1.15, maxnodes);
217  if(ForceTreeParams.TreeAllocFactor > 3.0) {
218  TooManyNodes = 1;
219  break;
220  }
221  }
222  }
223  while(tree.numnodes >= tree.lastnode - tree.firstnode);
224 
225  tree.mask = mask;
226 
227  if(MPIU_Any(TooManyNodes, MPI_COMM_WORLD)) {
228  /* Assume scale factor = 1 for dump as position is not affected.*/
229  if(EmergencyOutputDir) {
230  Cosmology CP = {0};
231  CP.Omega0 = 0.3;
232  CP.OmegaLambda = 0.7;
233  CP.HubbleParam = 0.7;
234  dump_snapshot("FORCETREE-DUMP", 1, &CP, EmergencyOutputDir);
235  }
236  endrun(2, "Required too many nodes, snapshot dumped\n");
237  }
238  if(mask == ALLMASK)
239  walltime_measure("/Tree/Build/Nodes");
240 #ifdef DEBUG
241  force_validate_nextlist(&tree);
242 #endif
243  /* insert the pseudo particles that represent the mass distribution of other ddecomps */
244  force_insert_pseudo_particles(&tree, ddecomp);
245 #ifdef DEBUG
246  force_validate_nextlist(&tree);
247 #endif
248 
249  tree.moments_computed_flag = 0;
250 
251  if(DoMoments) {
252  /* now compute the multipole moments recursively */
253  force_update_node_parallel(&tree, ddecomp);
254 #ifdef DEBUG
255  force_validate_nextlist(&tree);
256 #endif
257 
258  /* Exchange the pseudo-data*/
259  force_exchange_pseudodata(&tree, ddecomp);
260 
262  tree.moments_computed_flag = 1;
263  tree.hmax_computed_flag = 1;
264  }
265  tree.Nodes_base = (struct NODE *) myrealloc(tree.Nodes_base, (tree.numnodes +1) * sizeof(struct NODE));
266 
267  /*Update the oct-tree struct so it knows about the memory change*/
268  tree.Nodes = tree.Nodes_base - tree.firstnode;
269 #ifdef DEBUG
270  force_validate_nextlist(&tree);
271 #endif
272  return tree;
273 }
274 
275 /* Get the subnode for a given particle and parent node.
276  * This splits a parent node into 8 subregions depending on the particle position.
277  * node is the parent node to split, p_i is the index of the particle we
278  * are currently inserting
279  * Returns a value between 0 and 7.
280  * */
281 int get_subnode(const struct NODE * node, const int p_i)
282 {
283  /*Loop is unrolled to help out the compiler,which normally only manages it at -O3*/
284  return (P[p_i].Pos[0] > node->center[0]) +
285  ((P[p_i].Pos[1] > node->center[1]) << 1) +
286  ((P[p_i].Pos[2] > node->center[2]) << 2);
287 }
288 
289 /*Check whether a particle is inside the volume covered by a node,
290  * by checking whether each dimension is close enough to center (L1 metric).*/
291 static inline int inside_node(const struct NODE * node, const int p_i)
292 {
293  /*One can also use a loop, but the compiler unrolls it only at -O3,
294  *so this is a little faster*/
295  int inside =
296  (fabs(2*(P[p_i].Pos[0] - node->center[0])) <= node->len) *
297  (fabs(2*(P[p_i].Pos[1] - node->center[1])) <= node->len) *
298  (fabs(2*(P[p_i].Pos[2] - node->center[2])) <= node->len);
299  return inside;
300 }
301 
302 /*Initialise an internal node at nfreep. The parent is assumed to be locked, and
303  * we have assured that nothing else will change nfreep while we are here.*/
304 static void init_internal_node(struct NODE *nfreep, struct NODE *parent, int subnode)
305 {
306  int j;
307  const MyFloat lenhalf = 0.25 * parent->len;
308  nfreep->len = 0.5 * parent->len;
309  nfreep->sibling = -10;
310  nfreep->father = -10;
311  nfreep->f.TopLevel = 0;
312  nfreep->f.InternalTopLevel = 0;
313  nfreep->f.DependsOnLocalMass = 0;
314  nfreep->f.ChildType = PARTICLE_NODE_TYPE;
315  nfreep->f.unused = 0;
316 
317  for(j = 0; j < 3; j++) {
318  /* Detect which quadrant we are in by testing the bits of subnode:
319  * if (subnode & [1,2,4]) is true we add lenhalf, otherwise subtract lenhalf*/
320  const int sign = (subnode & (1 << j)) ? 1 : -1;
321  nfreep->center[j] = parent->center[j] + sign*lenhalf;
322  }
323  for(j = 0; j < NMAXCHILD; j++)
324  nfreep->s.suns[j] = -1;
325  nfreep->s.noccupied = 0;
326  nfreep->s.Types = 0;
327  memset(&(nfreep->mom.cofm),0,3*sizeof(MyFloat));
328  nfreep->mom.mass = 0;
329  nfreep->mom.hmax = 0;
330 }
331 
332 /* Size of the free Node thread cache.
333  * 12 8-node rows (works out at 8kB) was found
334  * to be optimal for an Intel skylake and
335  * an AMD Zen2 with 12 threads.*/
336 #define NODECACHE_SIZE (8*12)
337 
338 /*Structure containing thread-local parameters of the tree build*/
339 struct NodeCache {
342 };
343 
344 /*Get a pointer to memory for 8 free nodes, from our node cache. */
345 int get_freenode(int * nnext, struct NodeCache *nc)
346 {
347  /*Get memory for an extra node from our cache.*/
348  if(nc->nrem_thread < 8) {
351  }
352  const int ninsert = nc->nnext_thread;
353  nc->nnext_thread += 8;
354  nc->nrem_thread -= 8;
355  return ninsert;
356 }
357 
358 /* Add a particle to a node in a known empty location.
359  * Parent is assumed to be locked.*/
360 static int
361 modify_internal_node(int parent, int subnode, int p_toplace, const ForceTree tb, const int HybridNuGrav)
362 {
363  tb.Father[p_toplace] = parent;
364  tb.Nodes[parent].s.suns[subnode] = p_toplace;
365  /* Encode the type in the Types array*/
366  tb.Nodes[parent].s.Types += P[p_toplace].Type << (3*subnode);
367  if(!HybridNuGrav || P[p_toplace].Type != ForceTreeParams.FastParticleType)
368  add_particle_moment_to_node(&tb.Nodes[parent], p_toplace);
369  return 0;
370 }
371 
372 
373 /* Create a new layer of nodes beneath the current node, and place the particle.
374  * Must have node lock.*/
375 static int
376 create_new_node_layer(int firstparent, int p_toplace,
377  const int HybridNuGrav, const ForceTree tb, int *nnext, struct NodeCache *nc)
378 {
379  /* This is so we can defer changing
380  * the type of the existing node until the end.*/
381  int parent = firstparent;
382 
383  do {
384  int i;
385  struct NODE *nprnt = &tb.Nodes[parent];
386 
387  /* Braces to scope oldsuns and newsuns*/
388  {
389  int newsuns[NMAXCHILD];
390 
391  int * oldsuns = nprnt->s.suns;
392 
393  /*We have two particles here, so create a new child node to store them both.*/
394  /* if we are here the node must be large enough, thus contain exactly one child. */
395  /* The parent is already a leaf, need to split */
396  /* Get memory for 8 extra nodes from our cache.*/
397  newsuns[0] = get_freenode(nnext, nc);
398  /*If we already have too many nodes, exit loop.*/
399  if(nc->nnext_thread >= tb.lastnode) {
400  /* This means that we have > NMAXCHILD particles in the same place,
401  * which usually indicates a bug in the particle evolution. Print some helpful debug information.*/
402  message(1, "Failed placing %d at %g %g %g, type %d, ID %ld. Others were %d (%g %g %g, t %d ID %ld) and %d (%g %g %g, t %d ID %ld). next %d last %d\n",
403  p_toplace, P[p_toplace].Pos[0], P[p_toplace].Pos[1], P[p_toplace].Pos[2], P[p_toplace].Type, P[p_toplace].ID,
404  oldsuns[0], P[oldsuns[0]].Pos[0], P[oldsuns[0]].Pos[1], P[oldsuns[0]].Pos[2], P[oldsuns[0]].Type, P[oldsuns[0]].ID,
405  oldsuns[1], P[oldsuns[1]].Pos[0], P[oldsuns[1]].Pos[1], P[oldsuns[1]].Pos[2], P[oldsuns[1]].Type, P[oldsuns[1]].ID
406  );
407  nc->nnext_thread = tb.lastnode + 10 * NODECACHE_SIZE;
408  /* If this is not the first layer created,
409  * we need to mark the overall parent as a node node
410  * while marking this one as a particle node */
411  if(firstparent != parent)
412  {
413  nprnt->f.ChildType = PARTICLE_NODE_TYPE;
414  nprnt->s.noccupied = NMAXCHILD;
415  tb.Nodes[firstparent].f.ChildType = NODE_NODE_TYPE;
416  tb.Nodes[firstparent].s.noccupied = (1<<16);
417  }
418  return 1;
419  }
420  for(i=0; i<8; i++) {
421  newsuns[i] = newsuns[0] + i;
422  struct NODE *nfreep = &tb.Nodes[newsuns[i]];
423  /* We create a new leaf node.*/
424  init_internal_node(nfreep, nprnt, i);
425  /*Set father of new node*/
426  nfreep->father = parent;
427  }
428  /*Initialize the remaining entries to empty*/
429  for(i=8; i<NMAXCHILD;i++)
430  newsuns[i] = -1;
431 
432  for(i=0; i < NMAXCHILD; i++) {
433  /* Re-attach each particle to the appropriate new leaf.
434  * Notice that since we have NMAXCHILD slots on each child and NMAXCHILD particles,
435  * we will always have a free slot. */
436  int subnode = get_subnode(nprnt, oldsuns[i]);
437  int child = newsuns[subnode];
438  struct NODE * nchild = &tb.Nodes[child];
439  modify_internal_node(child, nchild->s.noccupied, oldsuns[i], tb, HybridNuGrav);
440  nchild->s.noccupied++;
441  }
442  /* Copy the new node array into the node*/
443  memcpy(nprnt->s.suns, newsuns, NMAXCHILD * sizeof(int));
444  } /* After this brace oldsuns and newsuns are invalid*/
445 
446  /* Set sibling for the new rank. Since empty at this point, point onwards.*/
447  for(i=0; i<7; i++) {
448  int child = nprnt->s.suns[i];
449  struct NODE * nchild = &tb.Nodes[child];
450  nchild->sibling = nprnt->s.suns[i+1];
451  }
452  /* Final child needs special handling: set to the parent's sibling.*/
453  tb.Nodes[nprnt->s.suns[7]].sibling = nprnt->sibling;
454  /* Zero the momenta for the parent*/
455  memset(&nprnt->mom, 0, sizeof(nprnt->mom));
456 
457  /* Now try again to add the new particle*/
458  int subnode = get_subnode(nprnt, p_toplace);
459  int child = nprnt->s.suns[subnode];
460  struct NODE * nchild = &tb.Nodes[child];
461  if(nchild->s.noccupied < NMAXCHILD) {
462  modify_internal_node(child, nchild->s.noccupied, p_toplace, tb, HybridNuGrav);
463  nchild->s.noccupied++;
464  break;
465  }
466  /* The attached particles are already within one subnode of the new node.
467  * Iterate, creating a new layer beneath.*/
468  else {
469  /* The current child is going to have new nodes created beneath it,
470  * so mark it a Node-containing node. It cannot be accessed until
471  * we mark the top-level parent, so no need for atomics.*/
472  tb.Nodes[child].f.ChildType = NODE_NODE_TYPE;
473  tb.Nodes[child].s.noccupied = (1<<16);
474  parent = child;
475  }
476  } while(1);
477 
478  /* A new node is created. Mark the (original) parent as an internal node with node children.
479  * This goes last so that we don't access the child before it is constructed.*/
480  tb.Nodes[firstparent].f.ChildType = NODE_NODE_TYPE;
481  tb.Nodes[firstparent].s.noccupied = (1<<16);
482  return 0;
483 }
484 
485 /* Add a particle to the tree, extending the tree as necessary. Locking is done,
486  * so may be called from a threaded context*/
487 int add_particle_to_tree(int i, int cur_start, const ForceTree tb, const int HybridNuGrav, struct NodeCache *nc, int* nnext)
488 {
489  int child, nocc;
490  int cur = cur_start;
491  /*Walk the main tree until we get something that isn't an internal node.*/
492  do
493  {
494  /*No lock needed: if we have an internal node here it will be stable*/
495  nocc = tb.Nodes[cur].s.noccupied;
496 
497  /* This node still has space for a particle (or needs conversion)*/
498  if(nocc < (1 << 16))
499  break;
500 
501  /* This node has child subnodes: find them.*/
502  int subnode = get_subnode(&tb.Nodes[cur], i);
503  /*No lock needed: if we have an internal node here it will be stable*/
504  child = tb.Nodes[cur].s.suns[subnode];
505 
506  if(child > tb.lastnode || child < tb.firstnode)
507  endrun(1,"Corruption in tree build: N[%d].[%d] = %d > lastnode (%d)\n",cur, subnode, child, tb.lastnode);
508  cur = child;
509  }
510  while(child >= tb.firstnode);
511 
512  /* We have a guaranteed spot.*/
513  nocc = tb.Nodes[cur].s.noccupied;
514  tb.Nodes[cur].s.noccupied++;
515 
516  /* Now we have something that isn't an internal node. We can place the particle! */
517  if(nocc < NMAXCHILD)
518  modify_internal_node(cur, nocc, i, tb, HybridNuGrav);
519  /* In this case we need to create a new layer of nodes beneath this one*/
520  else if(nocc < 1<<16) {
521  if(create_new_node_layer(cur, i, HybridNuGrav, tb, nnext, nc))
522  return -1;
523  } else
524  endrun(2, "Tried to convert already converted node %d with nocc = %d\n", cur, nocc);
525  return cur;
526 }
527 
528 /* Merge two partial trees together. Trees are walked simultaneously.
529  * A merge is done when a particle node is encountered in one of the side trees.
530  * The merge rule is that the node node is attached to the
531  * left-most old parent and the particles are re-attached to the node node*/
532 int
533 merge_partial_force_trees(int left, int right, struct NodeCache * nc, int * nnext, const struct ForceTree tb, int HybridNuGrav)
534 {
535  int this_left = left;
536  int this_right = right;
537  const int left_end = tb.Nodes[left].sibling;
538  const int right_end = tb.Nodes[right].sibling;
539 // message(5, "Ends: %d %d\n", left_end, right_end);
540  while(this_left != left_end && this_right != right_end)
541  {
542  if(this_left < tb.firstnode || this_right < tb.firstnode)
543  endrun(10, "Encountered invalid node: %d %d < first %d\n", this_left, this_right, tb.firstnode);
544  struct NODE * nleft = &tb.Nodes[this_left];
545  struct NODE * nright = &tb.Nodes[this_right];
546  if(nc->nnext_thread >= tb.lastnode)
547  return 1;
548 #ifdef DEBUG
549  /* Stop when we reach another topnode*/
550  if((nleft->f.TopLevel && this_left != left) || (nright->f.TopLevel && this_right != right))
551  endrun(6, "Encountered another topnode: left %d == right %d! type %d\n", this_left, this_right, nleft->f.ChildType);
552  if(this_left == this_right)
553  endrun(6, "Odd: left %d == right %d! type %d\n", this_left, this_right, nleft->f.ChildType);
554 // message(1, "left %d right %d\n", this_left, this_right);
555  /* Trees should be synced*/
556  if(fabs(nleft->len / nright->len-1) > 1e-6)
557  endrun(6, "Merge unsynced trees: %d %d len %g %g\n", this_left, this_right, nleft->len, nright->len);
558 #endif
559  /* Two node nodes: keep walking down*/
560  if(nleft->f.ChildType == NODE_NODE_TYPE && nright->f.ChildType == NODE_NODE_TYPE) {
561  if(tb.Nodes[nleft->s.suns[0]].father < 0 || tb.Nodes[nright->s.suns[0]].father < 0)
562  endrun(7, "Walking to nodes (%d %d) from (%d %d) fathers (%d %d)\n",
563  nleft->s.suns[0], nright->s.suns[0], this_left, this_right, tb.Nodes[nleft->s.suns[0]].father, tb.Nodes[nright->s.suns[0]].father);
564  this_left = nleft->s.suns[0];
565  this_right = nright->s.suns[0];
566  continue;
567  }
568  /* If the right node has particles, add them to the left node, go to sibling on right and left.*/
569  else if(nright->f.ChildType == PARTICLE_NODE_TYPE) {
570  int i;
571  for(i = 0; i < nright->s.noccupied; i++) {
572  if(nright->s.suns[i] >= tb.firstnode)
573  endrun(8, "Bad child %d of %d\n", i, nright->s.suns[i], this_right);
574  if(add_particle_to_tree(nright->s.suns[i], this_left, tb, HybridNuGrav, nc, nnext) < 0)
575  return 1;
576  }
577  /* Make sure that nodes which have
578  * this_right as a sibling (there will
579  * be a max of one, as it is a particle node
580  * with no children) point to the replacement
581  * on the left*/
582  /* This condition is checking for the root node, which has no siblings*/
583  if(this_right > right) {
584  /* Find the father, then the next child*/
585  struct NODE * fat = &tb.Nodes[nright->father];
586  /* Find the position of this child in the father*/
587  int sunloc = 0;
588  for(i = 0; i < 8; i++)
589  {
590  if(fat->s.suns[i] == this_right) {
591  sunloc = i;
592  break;
593  }
594  }
595  /* Change the sibling of the child next to this one*/
596  if(sunloc > 0) {
597  if(tb.Nodes[fat->s.suns[i-1]].sibling == this_right)
598  tb.Nodes[fat->s.suns[i-1]].sibling = this_left;
599  }
600  }
601  /* Mark the right node as now invalid*/
602  nright->father = -5;
603  /* Now go to sibling*/
604  this_left = nleft->sibling;
605  this_right = nright->sibling;
606  continue;
607  }
608  /* If the left node has particles, add them to the right node,
609  * then copy the right node over the left node and go to (old) sibling on right and left.*/
610  else if(nleft->f.ChildType == PARTICLE_NODE_TYPE && nright->f.ChildType == NODE_NODE_TYPE) {
611  /* Add the left particles to the right*/
612  int i;
613  for(i = 0; i < nleft->s.noccupied; i++) {
614  if(nleft->s.suns[i] >= tb.firstnode)
615  endrun(8, "Bad child %d of %d\n", i, nleft->s.suns[i], this_left);
616  if(add_particle_to_tree(nleft->s.suns[i], this_right, tb, HybridNuGrav, nc, nnext) < 0)
617  return 1;
618  }
619  /* Copy the right node over the left*/
620  memmove(&nleft->s, &nright->s, sizeof(nleft->s));
621  nleft->f.ChildType = NODE_NODE_TYPE;
622  /* Zero the momenta for the parent*/
623  memset(&nleft->mom, 0, sizeof(nleft->mom));
624  /* Reset children to the new parent:
625  * this assumes nright is a NODE NODE*/
626  for(i = 0; i < 8; i++) {
627  int child = nleft->s.suns[i];
628  tb.Nodes[child].father = this_left;
629  }
630  /* Make sure final child points to the parent's sibling.*/
631 #ifdef DEBUG
632  int oldsib = tb.Nodes[nleft->s.suns[7]].sibling;
633 #endif
634  /* Walk downwards making sure all the children point to the new sibling.
635  * Note also changes last particle node child. */
636  int nn = this_left;
637  while(tb.Nodes[nn].f.ChildType == NODE_NODE_TYPE) {
638  nn = tb.Nodes[nn].s.suns[7];
639 #ifdef DEBUG
640  if(tb.Nodes[nn].sibling != oldsib)
641  endrun(20, "Not the expected sibling %d != %d\n",tb.Nodes[nn].sibling, oldsib);
642 #endif
643  tb.Nodes[nn].sibling = nleft->sibling;
644  }
645  /* Mark the right node as now invalid*/
646  nright->father = -5;
647  /* Next iteration is going to sibling*/
648  this_left = nleft->sibling;
649  this_right = nright->sibling;
650  continue;
651  }
652  else
653  endrun(6, "Nodes %d %d have unexpected type %d %d\n", this_left, this_right, nleft->f.ChildType, nright->f.ChildType);
654  }
655  return 0;
656 }
657 
661 int force_tree_create_nodes(const ForceTree tb, const int npart, int mask, DomainDecomp * ddecomp, const int HybridNuGrav)
662 {
663  int nnext = tb.firstnode; /* index of first free node */
664 
665  /* create an empty root node */
666  {
667  int i;
668  struct NODE *nfreep = &tb.Nodes[nnext]; /* select first node */
669 
670  nfreep->len = PartManager->BoxSize*1.001;
671  for(i = 0; i < 3; i++)
672  nfreep->center[i] = PartManager->BoxSize/2.;
673  for(i = 0; i < NMAXCHILD; i++)
674  nfreep->s.suns[i] = -1;
675  nfreep->s.Types = 0;
676  nfreep->s.noccupied = 0;
677  nfreep->father = -1;
678  nfreep->sibling = -1;
679  nfreep->f.TopLevel = 1;
680  nfreep->f.InternalTopLevel = 0;
681  nfreep->f.DependsOnLocalMass = 0;
682  nfreep->f.ChildType = PARTICLE_NODE_TYPE;
683  nfreep->f.unused = 0;
684  memset(&(nfreep->mom.cofm),0,3*sizeof(MyFloat));
685  nfreep->mom.mass = 0;
686  nfreep->mom.hmax = 0;
687  nnext++;
688  /* create a set of empty nodes corresponding to the top-level ddecomp
689  * grid. We need to generate these nodes first to make sure that we have a
690  * complete top-level tree which allows the easy insertion of the
691  * pseudo-particles in the right place */
692  force_create_node_for_topnode(tb.firstnode, 0, tb.Nodes, ddecomp, 1, 0, 0, 0, &nnext, tb.lastnode);
693  }
694  /* Set up thread-local copies of the topnodes to anchor the subtrees. */
695  int ThisTask, j, t;
696  MPI_Comm_rank(MPI_COMM_WORLD, &ThisTask);
697  const int StartLeaf = ddecomp->Tasks[ThisTask].StartLeaf;
698  const int EndLeaf = ddecomp->Tasks[ThisTask].EndLeaf;
699  const int nthr = omp_get_max_threads();
700  int * topnodes = ta_malloc("topnodes", int, (EndLeaf - StartLeaf) * nthr);
701  /* Topnodes for each thread. For tid 0, just use the real tree. Saves copying the tree back.*/
702  for(j = 0; j < EndLeaf - StartLeaf; j++)
703  topnodes[j] = ddecomp->TopLeaves[j + StartLeaf].treenode;
704  /* Other threads need a copy*/
705  for(t = 1; t < nthr; t++) {
706  for(j = 0; j < EndLeaf - StartLeaf; j++) {
707  /* Make a local copy*/
708  topnodes[j + t * (EndLeaf - StartLeaf)] = nnext;
709  memmove(&tb.Nodes[nnext], &tb.Nodes[topnodes[j]], sizeof(struct NODE));
710  nnext++;
711  }
712  }
713 
714 /* double tstart = second(); */
715  const int first_free = nnext;
716  /* Increment nnext for the threads we are about to initialise.*/
717  nnext += NODECACHE_SIZE * nthr;
718  /* now we insert all particles */
719  #pragma omp parallel
720  {
721  int i;
722  /* Local topnodes*/
723  int tid = omp_get_thread_num();
724  const int * const local_topnodes = topnodes + tid * (EndLeaf - StartLeaf);
725 
726  /* This implements a small thread-local free Node cache.
727  * The cache ensures that Nodes from the same (or close) particles
728  * are created close to each other on the Node list and thus
729  * helps cache locality. I tried each thread getting a separate
730  * part of the tree, and it wasted too much memory. */
731  struct NodeCache nc;
732  nc.nnext_thread = first_free + tid * NODECACHE_SIZE;
734 
735  /* Stores the last-seen node on this thread.
736  * Since most particles are close to each other, this should save a number of tree walks.*/
737  int this_acc = local_topnodes[0];
738  // message(1, "Topnodes %d real %d\n", local_topnodes[0], topnodes[0]);
739 
740  /* The default schedule is static with a chunk 1/4 the total.
741  * However, particles are sorted by type and then by peano order.
742  * Since we need to merge trees, it is advantageous to have all particles
743  * spatially close be processed by the same thread. This means that the threads should
744  * process particles at a constant offset from the start of the type.
745  * We do this with a static schedule. */
746  int chnksz = PartManager->NumPart/nthr;
747  if(SlotsManager->info[0].enabled && SlotsManager->info[0].size > 0)
748  chnksz = SlotsManager->info[0].size/nthr;
749  if(chnksz < 1000)
750  chnksz = 1000;
751  #pragma omp for schedule(static, chnksz)
752  for(i = 0; i < npart; i++)
753  {
754  /*Can't break from openmp for*/
755  if(nc.nnext_thread >= tb.lastnode)
756  continue;
757 
758  /* Do not add types that do not have their mask bit set.*/
759  if(!((1<<P[i].Type) & mask)) {
760  continue;
761  }
762  /* Do not add garbage/swallowed particles to the tree*/
763  if(P[i].IsGarbage || (P[i].Swallowed && P[i].Type==5))
764  continue;
765 
766  if(P[i].Mass == 0)
767  endrun(12, "Zero mass particle %d type %d id %ld pos %g %g %g\n", i, P[i].Type, P[i].ID, P[i].Pos[0], P[i].Pos[1], P[i].Pos[2]);
768  /*First find the Node for the TopLeaf */
769  int cur;
770  if(inside_node(&tb.Nodes[this_acc], i)) {
771  cur = this_acc;
772  } else {
773  /* Get the topnode to which a particle belongs. Each local tree
774  * has a local set of treenodes copying the global topnodes, except tid 0
775  * which has the real topnodes.*/
776  const int topleaf = domain_get_topleaf(P[i].Key, ddecomp);
777  //int treenode = ddecomp->TopLeaves[topleaf].treenode;
778  cur = local_topnodes[topleaf - StartLeaf];
779  }
780 
781  this_acc = add_particle_to_tree(i, cur, tb, HybridNuGrav, &nc, &nnext);
782  }
783  /* The implicit omp-barrier is important here!*/
784 /* double tend = second(); */
785 /* message(0, "Initial insertion: %.3g ms. First node %d\n", (tend - tstart)*1000, local_topnodes[0]); */
786 
787  /* Merge each topnode separately, using a for loop.
788  * This wastes threads if NTHREAD > NTOPNODES, but it
789  * means only one merge is done per subtree and
790  * it requires no locking.*/
791  #pragma omp for schedule(static, 1)
792  for(i = 0; i < EndLeaf - StartLeaf; i++) {
793  int t;
794  /* These are the addresses of the real topnodes*/
795  const int target = topnodes[i];
796  if(nc.nnext_thread >= tb.lastnode)
797  continue;
798  for(t = 1; t < nthr; t++) {
799  const int righttop = topnodes[i + t * (EndLeaf - StartLeaf)];
800 // message(1, "tid = %d i = %d t = %d Merging %d to %d addresses are %lx - %lx end is %lx\n", omp_get_thread_num(), i, t, righttop, target, &tb.Nodes[righttop], &tb.Nodes[target], &tb.Nodes[nnext]);
801  if(merge_partial_force_trees(target, righttop, &nc, &nnext, tb, HybridNuGrav))
802  break;
803  }
804  }
805  }
806 
807  ta_free(topnodes);
808  return nnext - tb.firstnode;
809 }
810 
818 void force_create_node_for_topnode(int no, int topnode, struct NODE * Nodes, const DomainDecomp * ddecomp, int bits, int x, int y, int z, int *nextfree, const int lastnode)
819 {
820  int i, j, k;
821 
822  /*We reached the leaf of the toptree*/
823  if(ddecomp->TopNodes[topnode].Daughter < 0)
824  return;
825 
826  for(i = 0; i < 2; i++)
827  for(j = 0; j < 2; j++)
828  for(k = 0; k < 2; k++)
829  {
830  int sub = 7 & peano_hilbert_key((x << 1) + i, (y << 1) + j, (z << 1) + k, bits);
831 
832  int count = i + 2 * j + 4 * k;
833 
834  Nodes[no].s.Types = 0;
835  Nodes[no].s.suns[count] = *nextfree;
836  /*We are an internal top level node as we now have a child top level.*/
837  Nodes[no].f.InternalTopLevel = 1;
838  Nodes[no].f.ChildType = NODE_NODE_TYPE;
839  Nodes[no].s.noccupied = (1<<16);
840 
841  /* We create a new leaf node.*/
842  init_internal_node(&Nodes[*nextfree], &Nodes[no], count);
843  /*Set father of new node*/
844  Nodes[*nextfree].father = no;
845  /*All nodes here are top level nodes*/
846  Nodes[*nextfree].f.TopLevel = 1;
847 
848  const struct topnode_data curtopnode = ddecomp->TopNodes[ddecomp->TopNodes[topnode].Daughter + sub];
849  if(curtopnode.Daughter == -1) {
850  ddecomp->TopLeaves[curtopnode.Leaf].treenode = *nextfree;
851  }
852 
853  (*nextfree)++;
854 
855  if(*nextfree >= lastnode)
856  endrun(11, "Not enough force nodes to topnode grid: need %d\n",lastnode);
857  }
858  /* Set sibling on the child*/
859  for(j=0; j<7; j++) {
860  int chld = Nodes[no].s.suns[j];
861  Nodes[chld].sibling = Nodes[no].s.suns[j+1];
862  }
863  Nodes[Nodes[no].s.suns[7]].sibling = Nodes[no].sibling;
864  for(i = 0; i < 2; i++)
865  for(j = 0; j < 2; j++)
866  for(k = 0; k < 2; k++)
867  {
868  int sub = 7 & peano_hilbert_key((x << 1) + i, (y << 1) + j, (z << 1) + k, bits);
869  int count = i + 2 * j + 4 * k;
870  force_create_node_for_topnode(Nodes[no].s.suns[count], ddecomp->TopNodes[topnode].Daughter + sub, Nodes, ddecomp,
871  bits + 1, 2 * x + i, 2 * y + j, 2 * z + k, nextfree, lastnode);
872  }
873 
874 }
875 
882 static void
884 {
885  int i, index;
886  const int firstpseudo = tree->lastnode;
887  int ThisTask;
888  MPI_Comm_rank(MPI_COMM_WORLD, &ThisTask);
889 
890  for(i = 0; i < ddecomp->NTopLeaves; i++)
891  {
892  index = ddecomp->TopLeaves[i].treenode;
893  if(ddecomp->TopLeaves[i].Task != ThisTask) {
894  if(tree->Nodes[index].s.noccupied != 0)
895  endrun(5, "In node %d, overwriting %d child particles (i = %d etc) with pseudo particle %d\n",
896  index, tree->Nodes[index].s.noccupied, tree->Nodes[index].s.suns[0], i);
897  tree->Nodes[index].f.ChildType = PSEUDO_NODE_TYPE;
898  /* This node points to the pseudo particle*/
899  tree->Nodes[index].s.suns[0] = firstpseudo + i;
900  }
901  }
902 }
903 
904 int
905 force_get_father(int no, const ForceTree * tree)
906 {
907  if(no >= tree->firstnode)
908  return tree->Nodes[no].father;
909  else
910  return tree->Father[no];
911 }
912 
913 static void
914 add_particle_moment_to_node(struct NODE * pnode, int i)
915 {
916  int k;
917  pnode->mom.mass += (P[i].Mass);
918  for(k=0; k<3; k++)
919  pnode->mom.cofm[k] += (P[i].Mass * P[i].Pos[k]);
920 
921  if(P[i].Type == 0)
922  {
923  int j;
924  /* Maximal distance any of the member particles peek out from the side of the node.
925  * May be at most hmax, as |Pos - Center| < len.*/
926  for(j = 0; j < 3; j++) {
927  pnode->mom.hmax = DMAX(pnode->mom.hmax, fabs(P[i].Pos[j] - pnode->center[j]) + P[i].Hsml - pnode->len);
928  }
929  }
930 }
931 
932 /*Get the sibling of a node, using the suns array. Only to be used in the tree build, before update_node_recursive is called.*/
933 static int
934 force_get_sibling(const int sib, const int j, const int * suns)
935 {
936  /* check if we have a sibling on the same level */
937  int jj;
938  int nextsib = sib;
939  for(jj = j + 1; jj < 8; jj++) {
940  if(suns[jj] >= 0) {
941  nextsib = suns[jj];
942  break;
943  }
944  }
945  return nextsib;
946 }
947 
948 /* Set the center of mass of the current node*/
949 static void
951 {
952 #ifdef DEBUG
953  if(tree->Nodes[no].f.ChildType != PARTICLE_NODE_TYPE)
954  endrun(3, "force_update_particle_node called on node %d of wrong type!\n", no);
955 #endif
956  int j;
957  /*Set the center of mass moments*/
958  const double mass = tree->Nodes[no].mom.mass;
959  /* Be careful about empty nodes*/
960  if(mass > 0) {
961  for(j = 0; j < 3; j++)
962  tree->Nodes[no].mom.cofm[j] /= mass;
963  }
964  else {
965  for(j = 0; j < 3; j++)
966  tree->Nodes[no].mom.cofm[j] = tree->Nodes[no].center[j];
967  }
968 }
969 
981 static int
982 force_update_node_recursive(int no, int sib, int level, const ForceTree * tree)
983 {
984 #ifdef DEBUG
985  if(tree->Nodes[no].f.ChildType != NODE_NODE_TYPE)
986  endrun(3, "force_update_node_recursive called on node %d of type %d != %d!\n", no, tree->Nodes[no].f.ChildType, NODE_NODE_TYPE);
987 #endif
988  int j;
989  int * suns = tree->Nodes[no].s.suns;
990 
991  int childcnt = 0;
992  /* Remove any empty children, moving the suns array around
993  * so non-empty entries are contiguous at the beginning of the array.
994  * This sharply reduces the size of the tree.
995  * Also count the node children for thread balancing.*/
996  int jj = 0;
997  for(j=0; j < 8; j++, jj++) {
998  /* Never remove empty top-level nodes so we don't
999  * mess up the pseudo-data exchange.
1000  * This may happen for a pseudo particle host or, in very rare cases,
1001  * when one of the local domains is empty. */
1002  while(jj < 8 && !tree->Nodes[suns[jj]].f.TopLevel &&
1003  tree->Nodes[suns[jj]].f.ChildType == PARTICLE_NODE_TYPE &&
1004  tree->Nodes[suns[jj]].s.noccupied == 0) {
1005  jj++;
1006  }
1007  if(jj < 8)
1008  suns[j] = suns[jj];
1009  else
1010  suns[j] = -1;
1011  if(suns[j] >= 0 && tree->Nodes[suns[j]].f.ChildType == NODE_NODE_TYPE)
1012  childcnt++;
1013  }
1014 
1015  /*First do the children*/
1016  for(j = 0; j < 8; j++)
1017  {
1018  int p = suns[j];
1019  /*Empty slot*/
1020  if(p < 0)
1021  continue;
1022  const int nextsib = force_get_sibling(sib, j, suns);
1023  /* This is set in create_nodes but needed because we may remove empty nodes above.*/
1024  tree->Nodes[p].sibling = nextsib;
1025  /* Nodes containing particles or pseudo-particles*/
1026  if(tree->Nodes[p].f.ChildType == PARTICLE_NODE_TYPE)
1027  force_update_particle_node(p, tree);
1028  if(tree->Nodes[p].f.ChildType == NODE_NODE_TYPE) {
1029  /* Don't spawn a new task if we are deep enough that we already spawned a lot.*/
1030  if(childcnt > 1 && level < 512) {
1031  #pragma omp task default(none) shared(level, childcnt, tree) firstprivate(nextsib, p)
1032  force_update_node_recursive(p, nextsib, level*childcnt, tree);
1033  }
1034  else
1035  force_update_node_recursive(p, nextsib, level, tree);
1036  }
1037  }
1038 
1039  /*Make sure all child nodes are done*/
1040  #pragma omp taskwait
1041 
1042  /*Now we do the moments*/
1043  for(j = 0; j < 8; j++)
1044  {
1045  const int p = suns[j];
1046  if(p < 0)
1047  continue;
1048  tree->Nodes[no].mom.mass += (tree->Nodes[p].mom.mass);
1049  tree->Nodes[no].mom.cofm[0] += (tree->Nodes[p].mom.mass * tree->Nodes[p].mom.cofm[0]);
1050  tree->Nodes[no].mom.cofm[1] += (tree->Nodes[p].mom.mass * tree->Nodes[p].mom.cofm[1]);
1051  tree->Nodes[no].mom.cofm[2] += (tree->Nodes[p].mom.mass * tree->Nodes[p].mom.cofm[2]);
1052  if(tree->Nodes[p].mom.hmax > tree->Nodes[no].mom.hmax)
1053  tree->Nodes[no].mom.hmax = tree->Nodes[p].mom.hmax;
1054  }
1055 
1056  /*Set the center of mass moments*/
1057  const double mass = tree->Nodes[no].mom.mass;
1058  /* In principle all the children could be pseudo-particles*/
1059  if(mass > 0) {
1060  tree->Nodes[no].mom.cofm[0] /= mass;
1061  tree->Nodes[no].mom.cofm[1] /= mass;
1062  tree->Nodes[no].mom.cofm[2] /= mass;
1063  }
1064 
1065  return -1;
1066 }
1067 
1081 void force_update_node_parallel(const ForceTree * tree, const DomainDecomp * ddecomp)
1082 {
1083  int ThisTask;
1084  MPI_Comm_rank(MPI_COMM_WORLD, &ThisTask);
1085 
1086 #pragma omp parallel
1087 #pragma omp single nowait
1088  {
1089  int i;
1090  for(i = ddecomp->Tasks[ThisTask].StartLeaf; i < ddecomp->Tasks[ThisTask].EndLeaf; i ++) {
1091  const int no = ddecomp->TopLeaves[i].treenode;
1092  /* Nodes containing other nodes: the overwhelmingly likely case.*/
1093  if(tree->Nodes[no].f.ChildType == NODE_NODE_TYPE) {
1094  #pragma omp task default(none) shared(tree) firstprivate(no)
1095  force_update_node_recursive(no, tree->Nodes[no].sibling, 1, tree);
1096  }
1097  else if(tree->Nodes[no].f.ChildType == PARTICLE_NODE_TYPE)
1098  force_update_particle_node(no, tree);
1099  else if(tree->Nodes[no].f.ChildType == PSEUDO_NODE_TYPE)
1100  endrun(5, "Error, found pseudo node %d but domain entry %d says on task %d\n", no, i, ThisTask);
1101  }
1102  }
1103 }
1104 
1109 void force_exchange_pseudodata(ForceTree * tree, const DomainDecomp * ddecomp)
1110 {
1111  int NTask, ThisTask;
1112  int i, no, ta, recvTask;
1113  int *recvcounts, *recvoffset;
1114  struct topleaf_momentsdata
1115  {
1116  MyFloat s[3];
1117  MyFloat mass;
1118  MyFloat hmax;
1119  }
1120  *TopLeafMoments;
1121 
1122  MPI_Comm_size(MPI_COMM_WORLD, &NTask);
1123  MPI_Comm_rank(MPI_COMM_WORLD, &ThisTask);
1124 
1125  TopLeafMoments = (struct topleaf_momentsdata *) mymalloc("TopLeafMoments", ddecomp->NTopLeaves * sizeof(TopLeafMoments[0]));
1126  memset(&TopLeafMoments[0], 0, sizeof(TopLeafMoments[0]) * ddecomp->NTopLeaves);
1127 
1128  for(i = ddecomp->Tasks[ThisTask].StartLeaf; i < ddecomp->Tasks[ThisTask].EndLeaf; i ++) {
1129  no = ddecomp->TopLeaves[i].treenode;
1130  if(ddecomp->TopLeaves[i].Task != ThisTask)
1131  endrun(131231231, "TopLeaf %d Task table is corrupted: task is %d\n", i, ddecomp->TopLeaves[i].Task);
1132 
1133  /* read out the multipole moments from the local base cells */
1134  TopLeafMoments[i].s[0] = tree->Nodes[no].mom.cofm[0];
1135  TopLeafMoments[i].s[1] = tree->Nodes[no].mom.cofm[1];
1136  TopLeafMoments[i].s[2] = tree->Nodes[no].mom.cofm[2];
1137  TopLeafMoments[i].mass = tree->Nodes[no].mom.mass;
1138  TopLeafMoments[i].hmax = tree->Nodes[no].mom.hmax;
1139 
1140  /*Set the local base nodes dependence on local mass*/
1141  while(no >= 0)
1142  {
1143 #ifdef DEBUG
1144  if(tree->Nodes[no].f.ChildType == PSEUDO_NODE_TYPE)
1145  endrun(333, "Pseudo node %d parent of a leaf on this processor %d\n", no, ddecomp->TopLeaves[i].treenode);
1146 #endif
1147  if(tree->Nodes[no].f.DependsOnLocalMass)
1148  break;
1149 
1150  tree->Nodes[no].f.DependsOnLocalMass = 1;
1151 
1152  no = tree->Nodes[no].father;
1153  }
1154  }
1155 
1156  /* share the pseudo-particle data across CPUs */
1157 
1158  recvcounts = (int *) mymalloc("recvcounts", sizeof(int) * NTask);
1159  recvoffset = (int *) mymalloc("recvoffset", sizeof(int) * NTask);
1160 
1161  for(recvTask = 0; recvTask < NTask; recvTask++)
1162  {
1163  recvoffset[recvTask] = ddecomp->Tasks[recvTask].StartLeaf * sizeof(TopLeafMoments[0]);
1164  recvcounts[recvTask] = (ddecomp->Tasks[recvTask].EndLeaf - ddecomp->Tasks[recvTask].StartLeaf) * sizeof(TopLeafMoments[0]);
1165  }
1166 
1167  MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
1168  &TopLeafMoments[0], recvcounts, recvoffset,
1169  MPI_BYTE, MPI_COMM_WORLD);
1170 
1171  myfree(recvoffset);
1172  myfree(recvcounts);
1173 
1174 
1175  for(ta = 0; ta < NTask; ta++) {
1176  if(ta == ThisTask) continue; /* bypass ThisTask since it is already up to date */
1177 
1178  for(i = ddecomp->Tasks[ta].StartLeaf; i < ddecomp->Tasks[ta].EndLeaf; i ++) {
1179  no = ddecomp->TopLeaves[i].treenode;
1180 
1181  tree->Nodes[no].mom.cofm[0] = TopLeafMoments[i].s[0];
1182  tree->Nodes[no].mom.cofm[1] = TopLeafMoments[i].s[1];
1183  tree->Nodes[no].mom.cofm[2] = TopLeafMoments[i].s[2];
1184  tree->Nodes[no].mom.mass = TopLeafMoments[i].mass;
1185  tree->Nodes[no].mom.hmax = TopLeafMoments[i].hmax;
1186  }
1187  }
1188  myfree(TopLeafMoments);
1189 }
1190 
1191 
1195 void force_treeupdate_pseudos(int no, const ForceTree * tree)
1196 {
1197  int j, p;
1198  MyFloat hmax;
1199  MyFloat s[3], mass;
1200 
1201  mass = 0;
1202  s[0] = 0;
1203  s[1] = 0;
1204  s[2] = 0;
1205  hmax = 0;
1206 
1207  /* This happens if we have a trivial domain with only one entry*/
1208  if(!tree->Nodes[no].f.InternalTopLevel)
1209  return;
1210 
1211  p = tree->Nodes[no].s.suns[0];
1212 
1213  /* since we are dealing with top-level nodes, we know that there are 8 consecutive daughter nodes */
1214  for(j = 0; j < 8; j++)
1215  {
1216  /*This may not happen as we are an internal top level node*/
1217  if(p < tree->firstnode || p >= tree->lastnode)
1218  endrun(6767, "Updating pseudos: %d -> %d which is not an internal node between %d and %d.",no, p, tree->firstnode, tree->lastnode);
1219 #ifdef DEBUG
1220  /* Check we don't move to another part of the tree*/
1221  if(tree->Nodes[p].father != no)
1222  endrun(6767, "Tried to update toplevel node %d with parent %d != expected %d\n", p, tree->Nodes[p].father, no);
1223 #endif
1224 
1225  if(tree->Nodes[p].f.InternalTopLevel)
1226  force_treeupdate_pseudos(p, tree);
1227 
1228  mass += (tree->Nodes[p].mom.mass);
1229  s[0] += (tree->Nodes[p].mom.mass * tree->Nodes[p].mom.cofm[0]);
1230  s[1] += (tree->Nodes[p].mom.mass * tree->Nodes[p].mom.cofm[1]);
1231  s[2] += (tree->Nodes[p].mom.mass * tree->Nodes[p].mom.cofm[2]);
1232 
1233  if(tree->Nodes[p].mom.hmax > hmax)
1234  hmax = tree->Nodes[p].mom.hmax;
1235 
1236  p = tree->Nodes[p].sibling;
1237  }
1238 
1239  if(mass)
1240  {
1241  s[0] /= mass;
1242  s[1] /= mass;
1243  s[2] /= mass;
1244  }
1245  else
1246  {
1247  s[0] = tree->Nodes[no].center[0];
1248  s[1] = tree->Nodes[no].center[1];
1249  s[2] = tree->Nodes[no].center[2];
1250  }
1251 
1252  tree->Nodes[no].mom.cofm[0] = s[0];
1253  tree->Nodes[no].mom.cofm[1] = s[1];
1254  tree->Nodes[no].mom.cofm[2] = s[2];
1255  tree->Nodes[no].mom.mass = mass;
1256 
1257  tree->Nodes[no].mom.hmax = hmax;
1258 }
1259 
1271 void force_update_hmax(int * activeset, int size, ForceTree * tree, DomainDecomp * ddecomp)
1272 {
1273  int NTask, ThisTask, recvTask;
1274  int i, ta;
1275  int *recvcounts, *recvoffset;
1276 
1277  walltime_measure("/Misc");
1278 
1279  /* If hmax has not yet been computed, do all particles*/
1280  if(!tree->hmax_computed_flag) {
1281  activeset = NULL;
1282  size = PartManager->NumPart;
1283  }
1284  MPI_Comm_size(MPI_COMM_WORLD, &NTask);
1285  MPI_Comm_rank(MPI_COMM_WORLD, &ThisTask);
1286 
1287  #pragma omp parallel for
1288  for(i = 0; i < size; i++)
1289  {
1290  const int p_i = activeset ? activeset[i] : i;
1291 
1292  if(P[p_i].Type != 0 || P[p_i].IsGarbage)
1293  continue;
1294 
1295  int no = tree->Father[p_i];
1296 
1297  while(no >= 0)
1298  {
1299  /* How much does this particle peek beyond this node?
1300  * Note len does not change so we can read it without a lock or atomic. */
1301  MyFloat readhmax, newhmax = 0;
1302  int j, done = 0;
1303  for(j = 0; j < 3; j++) {
1304  /* Compute each direction independently and take the maximum.
1305  * This is the largest possible distance away from node center within a cube bounding hsml.
1306  * Note that because Pos - Center < len, the maximum value this can have is Hsml.*/
1307  newhmax = DMAX(newhmax, fabs(P[p_i].Pos[j] - tree->Nodes[no].center[j]) + P[p_i].Hsml - tree->Nodes[no].len);
1308  }
1309  /* Most particles will lie fully inside a node. No need then for the atomic! */
1310  if(newhmax <= 0)
1311  break;
1312 
1313  #pragma omp atomic read
1314  readhmax = tree->Nodes[no].mom.hmax;
1315  do {
1316  if(newhmax <= readhmax) {
1317  done = 1;
1318  break;
1319  }
1320  /* Swap in the new hmax only if the old one hasn't changed. */
1321  } while(!__atomic_compare_exchange(&(tree->Nodes[no].mom.hmax), &readhmax, &newhmax, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED));
1322 
1323  if(done)
1324  break;
1325  no = tree->Nodes[no].father;
1326  }
1327  }
1328 
1329  double * TopLeafhmax = (double *) mymalloc("TopLeafMoments", ddecomp->NTopLeaves * sizeof(double));
1330  memset(&TopLeafhmax[0], 0, sizeof(double) * ddecomp->NTopLeaves);
1331 
1332  for(i = ddecomp->Tasks[ThisTask].StartLeaf; i < ddecomp->Tasks[ThisTask].EndLeaf; i ++) {
1333  int no = ddecomp->TopLeaves[i].treenode;
1334  TopLeafhmax[i] = tree->Nodes[no].mom.hmax;
1335  }
1336 
1337  /* share the hmax-data of the dirty nodes accross CPUs */
1338  recvcounts = (int *) mymalloc("recvcounts", sizeof(int) * NTask);
1339  recvoffset = (int *) mymalloc("recvoffset", sizeof(int) * NTask);
1340 
1341  for(recvTask = 0; recvTask < NTask; recvTask++)
1342  {
1343  recvoffset[recvTask] = ddecomp->Tasks[recvTask].StartLeaf;
1344  recvcounts[recvTask] = ddecomp->Tasks[recvTask].EndLeaf - ddecomp->Tasks[recvTask].StartLeaf;
1345  }
1346 
1347  MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
1348  &TopLeafhmax[0], recvcounts, recvoffset,
1349  MPI_DOUBLE, MPI_COMM_WORLD);
1350 
1351  myfree(recvoffset);
1352  myfree(recvcounts);
1353 
1354  for(ta = 0; ta < NTask; ta++) {
1355  if(ta == ThisTask)
1356  continue; /* bypass ThisTask since it is already up to date */
1357  for(i = ddecomp->Tasks[ta].StartLeaf; i < ddecomp->Tasks[ta].EndLeaf; i ++) {
1358  int no = ddecomp->TopLeaves[i].treenode;
1359  tree->Nodes[no].mom.hmax = TopLeafhmax[i];
1360 
1361  while(no >= 0)
1362  {
1363  if(TopLeafhmax[i] <= tree->Nodes[no].mom.hmax)
1364  break;
1365  tree->Nodes[no].mom.hmax = TopLeafhmax[i];
1366  no = tree->Nodes[no].father;
1367  }
1368  }
1369  }
1370  myfree(TopLeafhmax);
1371 
1372  tree->hmax_computed_flag = 1;
1373  walltime_measure("/Tree/HmaxUpdate");
1374 }
1375 
1381 ForceTree force_treeallocate(int64_t maxnodes, int64_t maxpart, DomainDecomp * ddecomp)
1382 {
1383  ForceTree tb;
1384 
1385  tb.Father = (int *) mymalloc("Father", maxpart * sizeof(int));
1386  tb.nfather = maxpart;
1387 #ifdef DEBUG
1388  memset(tb.Father, -1, maxpart * sizeof(int));
1389 #endif
1390  tb.Nodes_base = (struct NODE *) mymalloc("Nodes_base", (maxnodes + 1) * sizeof(struct NODE));
1391 #ifdef DEBUG
1392  memset(tb.Nodes_base, -1, (maxnodes + 1) * sizeof(struct NODE));
1393 #endif
1394  tb.firstnode = maxpart;
1395  tb.lastnode = maxpart + maxnodes;
1396  if(maxpart + maxnodes >= 1L<<30)
1397  endrun(5, "Size of tree overflowed for maxpart = %ld, maxnodes = %ld!\n", maxpart, maxnodes);
1398  tb.numnodes = 0;
1399  tb.Nodes = tb.Nodes_base - maxpart;
1400  tb.tree_allocated_flag = 1;
1401  tb.NTopLeaves = ddecomp->NTopLeaves;
1402  tb.TopLeaves = ddecomp->TopLeaves;
1403  return tb;
1404 }
1405 
1410 {
1412 
1413  if(!force_tree_allocated(tree))
1414  return;
1415  myfree(tree->Nodes_base);
1416  myfree(tree->Father);
1417  tree->tree_allocated_flag = 0;
1418 }
void dump_snapshot(const char *dump, const double Time, const Cosmology *CP, const char *OutputDir)
Definition: checkpoint.c:52
static int domain_get_topleaf(const peano_t key, const DomainDecomp *ddecomp)
Definition: domain.h:74
void message(int where, const char *fmt,...)
Definition: endrun.c:175
void endrun(int where, const char *fmt,...)
Definition: endrun.c:147
int event_listen(EventSpec *eh, eventfunc func, void *userdata)
Definition: event.c:6
int event_unlisten(EventSpec *eh, eventfunc func, void *userdata)
Definition: event.c:26
EventSpec EventSlotsFork
Definition: event.c:54
static void init_internal_node(struct NODE *nfreep, struct NODE *parent, int subnode)
Definition: forcetree.c:304
static ForceTree force_tree_build(int npart, int mask, DomainDecomp *ddecomp, const int HybridNuGrav, const int DoMoments, const char *EmergencyOutputDir)
Definition: forcetree.c:191
static void force_treeupdate_pseudos(int no, const ForceTree *tree)
Definition: forcetree.c:1195
int merge_partial_force_trees(int left, int right, struct NodeCache *nc, int *nnext, const struct ForceTree tb, int HybridNuGrav)
Definition: forcetree.c:533
static void force_create_node_for_topnode(int no, int topnode, struct NODE *Nodes, const DomainDecomp *ddecomp, int bits, int x, int y, int z, int *nextfree, const int lastnode)
Definition: forcetree.c:818
void force_tree_rebuild(ForceTree *tree, DomainDecomp *ddecomp, const int HybridNuGrav, const int DoMoments, const char *EmergencyOutputDir)
Definition: forcetree.c:140
int force_tree_create_nodes(const ForceTree tb, const int npart, int mask, DomainDecomp *ddecomp, const int HybridNuGrav)
Definition: forcetree.c:661
static void add_particle_moment_to_node(struct NODE *pnode, int i)
Definition: forcetree.c:914
int force_tree_allocated(const ForceTree *tree)
Definition: forcetree.c:134
int get_freenode(int *nnext, struct NodeCache *nc)
Definition: forcetree.c:345
ForceTree force_treeallocate(int64_t maxnodes, int64_t maxpart, DomainDecomp *ddecomp)
Definition: forcetree.c:1381
static int force_tree_eh_slots_fork(EIBase *event, void *userdata)
Definition: forcetree.c:110
void force_update_node_parallel(const ForceTree *tree, const DomainDecomp *ddecomp)
Definition: forcetree.c:1081
int get_subnode(const struct NODE *node, const int p_i)
Definition: forcetree.c:281
static int force_get_sibling(const int sib, const int j, const int *suns)
Definition: forcetree.c:934
void force_update_hmax(int *activeset, int size, ForceTree *tree, DomainDecomp *ddecomp)
Definition: forcetree.c:1271
static void force_exchange_pseudodata(ForceTree *tree, const DomainDecomp *ddecomp)
Definition: forcetree.c:1109
void force_tree_rebuild_mask(ForceTree *tree, DomainDecomp *ddecomp, int mask, const int HybridNuGrav, const char *EmergencyOutputDir)
Definition: forcetree.c:161
static int create_new_node_layer(int firstparent, int p_toplace, const int HybridNuGrav, const ForceTree tb, int *nnext, struct NodeCache *nc)
Definition: forcetree.c:376
void init_forcetree_params(const int FastParticleType)
Definition: forcetree.c:43
int add_particle_to_tree(int i, int cur_start, const ForceTree tb, const int HybridNuGrav, struct NodeCache *nc, int *nnext)
Definition: forcetree.c:487
void force_tree_free(ForceTree *tree)
Definition: forcetree.c:1409
static void force_insert_pseudo_particles(const ForceTree *tree, const DomainDecomp *ddecomp)
Definition: forcetree.c:883
static int inside_node(const struct NODE *node, const int p_i)
Definition: forcetree.c:291
int force_get_father(int no, const ForceTree *tree)
Definition: forcetree.c:905
static void force_update_particle_node(int no, const ForceTree *tree)
Definition: forcetree.c:950
static int force_update_node_recursive(int no, int sib, int level, const ForceTree *tree)
Definition: forcetree.c:982
static int modify_internal_node(int parent, int subnode, int p_toplace, const ForceTree tb, const int HybridNuGrav)
Definition: forcetree.c:361
#define NODECACHE_SIZE
Definition: forcetree.c:336
static struct forcetree_params ForceTreeParams
#define ALLMASK
Definition: forcetree.h:20
#define NODE_NODE_TYPE
Definition: forcetree.h:16
static int node_is_node(int no, const ForceTree *tree)
Definition: forcetree.h:145
#define PARTICLE_NODE_TYPE
Definition: forcetree.h:15
#define PSEUDO_NODE_TYPE
Definition: forcetree.h:17
static int node_is_pseudo_particle(int no, const ForceTree *tree)
Definition: forcetree.h:133
#define NMAXCHILD
Definition: forcetree.h:12
#define mymalloc(name, size)
Definition: mymalloc.h:15
#define ta_malloc(name, type, nele)
Definition: mymalloc.h:25
#define myrealloc(ptr, size)
Definition: mymalloc.h:18
#define ta_free(p)
Definition: mymalloc.h:28
#define mymalloc_usedbytes()
Definition: mymalloc.h:32
#define myfree(x)
Definition: mymalloc.h:19
struct part_manager_type PartManager[1]
Definition: partmanager.c:11
#define P
Definition: partmanager.h:88
peano_t peano_hilbert_key(int x, int y, int z, int bits)
Definition: peano.c:119
static Cosmology * CP
Definition: power.c:27
struct slots_manager_type SlotsManager[1]
Definition: slotsmanager.c:7
double Omega0
Definition: cosmology.h:10
double HubbleParam
Definition: cosmology.h:20
double OmegaLambda
Definition: cosmology.h:14
struct topnode_data * TopNodes
Definition: domain.h:36
struct task_data * Tasks
Definition: domain.h:41
int NTopNodes
Definition: domain.h:39
struct topleaf_data * TopLeaves
Definition: domain.h:37
int NTopLeaves
Definition: domain.h:40
Definition: event.h:8
int64_t child
Definition: slotsmanager.h:151
int nfather
Definition: forcetree.h:104
int NTopLeaves
Definition: forcetree.h:94
int moments_computed_flag
Definition: forcetree.h:82
int tree_allocated_flag
Definition: forcetree.h:78
double BoxSize
Definition: forcetree.h:106
int mask
Definition: forcetree.h:90
struct NODE * Nodes_base
Definition: forcetree.h:101
int * Father
Definition: forcetree.h:103
int hmax_computed_flag
Definition: forcetree.h:80
struct topleaf_data * TopLeaves
Definition: forcetree.h:92
struct NODE * Nodes
Definition: forcetree.h:97
int firstnode
Definition: forcetree.h:84
int numnodes
Definition: forcetree.h:88
int lastnode
Definition: forcetree.h:86
Definition: forcetree.h:39
int sibling
Definition: forcetree.h:40
struct NODE::@9 mom
unsigned int DependsOnLocalMass
Definition: forcetree.h:48
MyFloat center[3]
Definition: forcetree.h:43
unsigned int ChildType
Definition: forcetree.h:49
MyFloat mass
Definition: forcetree.h:56
unsigned int unused
Definition: forcetree.h:51
MyFloat hmax
Definition: forcetree.h:57
int father
Definition: forcetree.h:41
MyFloat cofm[3]
Definition: forcetree.h:55
unsigned int TopLevel
Definition: forcetree.h:47
struct NodeChild s
Definition: forcetree.h:66
unsigned int InternalTopLevel
Definition: forcetree.h:46
struct NODE::@8 f
MyFloat len
Definition: forcetree.h:42
int nrem_thread
Definition: forcetree.c:341
int nnext_thread
Definition: forcetree.c:340
int noccupied
Definition: forcetree.h:35
int suns[NMAXCHILD]
Definition: forcetree.h:32
unsigned int Types
Definition: forcetree.h:30
double TreeAllocFactor
Definition: forcetree.c:37
int FastParticleType
Definition: forcetree.c:39
int64_t size
Definition: slotsmanager.h:12
struct slot_info info[6]
Definition: slotsmanager.h:112
int EndLeaf
Definition: domain.h:30
int StartLeaf
Definition: domain.h:29
int Task
Definition: domain.h:21
int treenode
Definition: domain.h:24
int Leaf
Definition: domain.h:17
int Daughter
Definition: domain.h:15
int MPIU_Any(int condition, MPI_Comm comm)
Definition: system.c:545
static int atomic_fetch_and_add(int *ptr, int value)
Definition: system.h:78
#define MPIU_Barrier(comm)
Definition: system.h:103
#define MPI_INT64
Definition: system.h:12
int ThisTask
Definition: test_exchange.c:23
int NTask
Definition: test_exchange.c:23
#define DMAX(x, y)
Definition: test_interp.c:11
LOW_PRECISION MyFloat
Definition: types.h:19
#define walltime_measure(name)
Definition: walltime.h:8