MP-Gadget  5.0.1.dev1-76bc7d4726-dirty
petapm.c
1 #include <mpi.h>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5 #include <math.h>
6 /* do NOT use complex.h: it breaks the code */
7 
8 #include "types.h"
9 #include "petapm.h"
10 
11 #include "utils.h"
12 #include "walltime.h"
13 
14 static void
15 layout_prepare(PetaPM * pm,
16  struct Layout * L,
17  double * meshbuf,
18  PetaPMRegion * regions,
19  const int Nregions,
20  MPI_Comm comm);
21 static void layout_finish(struct Layout * L);
22 static void layout_build_and_exchange_cells_to_pfft(PetaPM * pm, struct Layout * L, double * meshbuf, double * real);
23 static void layout_build_and_exchange_cells_to_local(PetaPM * pm, struct Layout * L, double * meshbuf, double * real);
24 
25 /* cell_iterator needs to be thread safe !*/
26 typedef void (* cell_iterator)(double * cell_value, double * comm_buffer);
27 static void layout_iterate_cells(PetaPM * pm, struct Layout * L, cell_iterator iter, double * real);
28 
29 struct Pencil { /* a pencil starting at offset, with length len */
30  int offset[3];
31  int len;
32  int first;
33  int meshbuf_first; /* first pixel in meshbuf */
34  int task;
35 };
36 static int pencil_cmp_target(const void * v1, const void * v2);
37 static int pos_get_target(PetaPM * pm, const int pos[2]);
38 
39 /* FIXME: move this to MPIU_. */
40 static int64_t reduce_int64(int64_t input, MPI_Comm comm);
41 #ifdef DEBUG
42 /* for debugging */
43 static void verify_density_field(PetaPM * pm, double * real, double * meshbuf, const size_t meshsize);
44 #endif
45 
46 static MPI_Datatype MPI_PENCIL;
47 
48 /*Used only in MP-GenIC*/
49 pfft_complex *
50 petapm_alloc_rhok(PetaPM * pm)
51 {
52  pfft_complex * rho_k = (pfft_complex * ) mymalloc("PMrho_k", pm->priv->fftsize * sizeof(double));
53  memset(rho_k, 0, pm->priv->fftsize * sizeof(double));
54  return rho_k;
55 }
56 
57 static void pm_init_regions(PetaPM * pm, PetaPMRegion * regions, const int Nregions);
58 
59 static PetaPMParticleStruct * CPS; /* stored by petapm_force, how to access the P array */
60 #define POS(i) ((double*) (&((char*)CPS->Parts)[CPS->elsize * (i) + CPS->offset_pos]))
61 #define MASS(i) ((float*) (&((char*)CPS->Parts)[CPS->elsize * (i) + CPS->offset_mass]))
62 #define INACTIVE(i) (CPS->active && !CPS->active(i))
63 
64 PetaPMRegion * petapm_get_fourier_region(PetaPM * pm) {
65  return &pm->fourier_space_region;
66 }
67 PetaPMRegion * petapm_get_real_region(PetaPM * pm) {
68  return &pm->real_space_region;
69 }
70 int petapm_mesh_to_k(PetaPM * pm, int i) {
71  /*Return the position of this point on the Fourier mesh*/
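 /* e.g. with Nmesh = 8, i = 0..4 stays 0..4 while i = 5, 6, 7 maps to -3, -2, -1 */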
72  return i<=pm->Nmesh/2 ? i : (i-pm->Nmesh);
73 }
74 int * petapm_get_thistask2d(PetaPM * pm) {
75  return pm->ThisTask2d;
76 }
77 int * petapm_get_ntask2d(PetaPM * pm) {
78  return pm->NTask2d;
79 }
80 
81 void
82 petapm_module_init(int Nthreads)
83 {
84  pfft_init();
85 
86  pfft_plan_with_nthreads(Nthreads);
87 
88  /* initialize the MPI Datatype of pencil */
89  MPI_Type_contiguous(sizeof(struct Pencil), MPI_BYTE, &MPI_PENCIL);
90  MPI_Type_commit(&MPI_PENCIL);
91 }
92 
93 void
94 petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_Comm comm)
95 {
96  /* define the global long / short range force cut */
97  pm->BoxSize = BoxSize;
98  pm->Asmth = Asmth;
99  pm->Nmesh = Nmesh;
100  pm->G = G;
101  pm->CellSize = BoxSize / Nmesh;
102  pm->comm = comm;
103 
104  ptrdiff_t n[3] = {Nmesh, Nmesh, Nmesh};
105  ptrdiff_t np[2];
106 
107  int ThisTask;
108  int NTask;
109 
110  pm->Mesh2Task[0] = (int *) mymalloc2("Mesh2Task", 2*sizeof(int) * Nmesh);
111  pm->Mesh2Task[1] = pm->Mesh2Task[0] + Nmesh;
112 
113  MPI_Comm_rank(comm, &ThisTask);
114  MPI_Comm_size(comm, &NTask);
115 
116  /* try to find a square 2d decomposition */
117  int i;
118  int k;
119  for(i = sqrt(NTask) + 1; i >= 0; i --) {
120  if(NTask % i == 0) break;
121  }
122  np[0] = i;
123  np[1] = NTask / i;
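 /* e.g. NTask = 12 yields np = {4, 3}; a prime NTask falls through to i = 1, i.e. a 1 x NTask slab decomposition */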
124 
125  message(0, "Using 2D Task mesh %td x %td \n", np[0], np[1]);
126  if( pfft_create_procmesh_2d(comm, np[0], np[1], &pm->priv->comm_cart_2d) ){
127  endrun(0, "Error: could not create a %td x %td 2D process mesh.\n", np[0], np[1]);
128  }
129 
130  int periods_unused[2];
131  MPI_Cart_get(pm->priv->comm_cart_2d, 2, pm->NTask2d, periods_unused, pm->ThisTask2d);
132 
133  if(pm->NTask2d[0] != np[0]) abort();
134  if(pm->NTask2d[1] != np[1]) abort();
135 
136  pm->priv->fftsize = 2 * pfft_local_size_dft_r2c_3d(n, pm->priv->comm_cart_2d,
137  PFFT_TRANSPOSED_OUT,
138  pm->real_space_region.size, pm->real_space_region.offset,
139  pm->fourier_space_region.size, pm->fourier_space_region.offset);
140 
141  /*
142  * In fourier space, the transposed array is ordered
143  * in (y, z, x). The strides and sizes returned
144  * from local size are in (Nx, Ny, Nz), hence we roll them once
145  * so that the strides will give correct linear indexing for
146  * integer coordinates given in order of (y, z, x).
147  * */
148 
149 #define ROLL(a, N, j) { \
150  typeof(a[0]) tmp[N]; \
151  ptrdiff_t k; \
152  for(k = 0; k < N; k ++) tmp[k] = a[k]; \
153  for(k = 0; k < N; k ++) a[k] = tmp[(k + j)% N]; \
154  }
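 /* ROLL(a, 3, 1) maps {a[0], a[1], a[2]} to {a[1], a[2], a[0]}, i.e. (x, y, z) order becomes (y, z, x) */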
155 
156  ROLL(pm->fourier_space_region.offset, 3, 1);
157  ROLL(pm->fourier_space_region.size, 3, 1);
158 
159 #undef ROLL
160 
161  /* calculate the strides */
162  petapm_region_init_strides(&pm->real_space_region);
163  petapm_region_init_strides(&pm->fourier_space_region);
164 
165  /* planning the fft; need temporary arrays */
166 
167  double * real = (double * ) mymalloc("PMreal", pm->priv->fftsize * sizeof(double));
168  pfft_complex * rho_k = (pfft_complex * ) mymalloc("PMrho_k", pm->priv->fftsize * sizeof(double));
169  pfft_complex * complx = (pfft_complex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
170 
171  pm->priv->plan_forw = pfft_plan_dft_r2c_3d(
172  n, real, rho_k, pm->priv->comm_cart_2d, PFFT_FORWARD,
173  PFFT_TRANSPOSED_OUT | PFFT_ESTIMATE | PFFT_TUNE | PFFT_DESTROY_INPUT);
174  pm->priv->plan_back = pfft_plan_dft_c2r_3d(
175  n, complx, real, pm->priv->comm_cart_2d, PFFT_BACKWARD,
176  PFFT_TRANSPOSED_IN | PFFT_ESTIMATE | PFFT_TUNE | PFFT_DESTROY_INPUT);
177 
178  myfree(complx);
179  myfree(rho_k);
180  myfree(real);
181 
182  /* now let's fill up the mesh2task arrays */
183 
184 #if 0
185  message(1, "ThisTask = %d (%td %td %td) - (%td %td %td)\n", ThisTask,
186  pm->real_space_region.offset[0],
187  pm->real_space_region.offset[1],
188  pm->real_space_region.offset[2],
189  pm->real_space_region.size[0],
190  pm->real_space_region.size[1],
191  pm->real_space_region.size[2]);
192 #endif
193 
194  int * tmp = (int *) mymalloc("tmp", sizeof(int) * Nmesh);
195  for(k = 0; k < 2; k ++) {
196  for(i = 0; i < Nmesh; i ++) {
197  tmp[i] = 0;
198  }
199  for(i = 0; i < pm->real_space_region.size[k]; i ++) {
200  tmp[i + pm->real_space_region.offset[k]] = pm->ThisTask2d[k];
201  }
202  /* which column / row hosts this tile? */
203  /* FIXME: this is very inefficient */
204  MPI_Allreduce(tmp, pm->Mesh2Task[k], Nmesh, MPI_INT, MPI_MAX, comm);
205  /*
206  for(i = 0; i < Nmesh; i ++) {
207  message(0, "Mesh2Task[%d][%d] == %d\n", k, i, Mesh2Task[k][i]);
208  }
209  */
210  }
211  myfree(tmp);
212 }
213 
214 void
215 petapm_destroy(PetaPM * pm)
216 {
217  pfft_destroy_plan(pm->priv->plan_forw);
218  pfft_destroy_plan(pm->priv->plan_back);
219  MPI_Comm_free(&pm->priv->comm_cart_2d);
220  myfree(pm->Mesh2Task[0]);
221 }
222 
223 /*
224  * read out the field value to particle i; no need to be thread safe
225  * (the same particle is never handled by two threads at once)
226  * */
227 typedef void (* pm_iterator)(PetaPM * pm, int i, double * mesh, double weight);
228 static void pm_iterate(PetaPM * pm, pm_iterator iterator, PetaPMRegion * regions, const int Nregions);
229 /* apply transfer function to value, kpos array is in x, y, z order */
230 static void pm_apply_transfer_function(PetaPM * pm,
231  pfft_complex * src,
232  pfft_complex * dst, petapm_transfer_func H);
233 
234 static void put_particle_to_mesh(PetaPM * pm, int i, double * mesh, double weight);
235 
236 /*
237  * 1. calls prepare to build the Regions covering particles
238  * 2. CIC the particles
239  * 3. Transform to rho_k
240  * 4. apply global_transfer (if not NULL --
241  *    this is the place to fill in gaussian seeds;
242  *    the global transfer is stacked onto all following transfers)
243  * 5. for each (transfer, readout) pair in functions:
244  * 6.    apply the transfer: rho_k -> complex
245  * 7.    transform complex to real
246  * 8.    read the real-space field out to the particles
247  * 9. free regions
248  * */
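/*
 * A minimal usage sketch (illustrative names; the real tables live in
 * gravpm.c and glass.c). The caller supplies a NULL-terminated table of
 * {name, transfer, readout} entries plus the global functions:
 *
 *     static PetaPMFunctions functions[] = {
 *         {"Potential", potential_transfer, readout_potential},
 *         {NULL, NULL, NULL},
 *     };
 *     static PetaPMGlobalFunctions global_functions = {NULL, NULL, force_transfer};
 *     petapm_force(pm, prepare, &global_functions, functions, &pstruct, userdata);
 */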
249 
250 PetaPMRegion *
251 petapm_force_init(
252  PetaPM * pm,
253  petapm_prepare_func prepare,
254  PetaPMParticleStruct * pstruct,
255  int * Nregions,
256  void * userdata) {
257  CPS = pstruct;
258 
259  *Nregions = 0;
260  PetaPMRegion * regions = prepare(pm, pstruct, userdata, Nregions);
261  pm_init_regions(pm, regions, *Nregions);
262 
263  walltime_measure("/PMgrav/Misc");
264  pm_iterate(pm, put_particle_to_mesh, regions, *Nregions);
265  walltime_measure("/PMgrav/cic");
266 
267  layout_prepare(pm, &pm->priv->layout, pm->priv->meshbuf, regions, *Nregions, pm->comm);
268 
269  walltime_measure("/PMgrav/comm");
270  return regions;
271 }
272 
273 pfft_complex * petapm_force_r2c(PetaPM * pm,
274  PetaPMGlobalFunctions * global_functions
275  ) {
276  /* call pfft: rho_k is the CFT of rho */
277 
278  /* this is because
279  *
280  * CFT = DFT * dx **3
281  * CFT[rho] = DFT [rho * dx **3] = DFT[CIC]
282  * */
283  double * real = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
284  memset(real, 0, sizeof(double) * pm->priv->fftsize);
285  layout_build_and_exchange_cells_to_pfft(pm, &pm->priv->layout, pm->priv->meshbuf, real);
286  walltime_measure("/PMgrav/comm2");
287 
288 #ifdef DEBUG
289  verify_density_field(pm, real, pm->priv->meshbuf, pm->priv->meshbufsize);
290  walltime_measure("/PMgrav/Misc");
291 #endif
292 
293  pfft_complex * complx = (pfft_complex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
294  pfft_execute_dft_r2c(pm->priv->plan_forw, real, complx);
295  myfree(real);
296 
297  pfft_complex * rho_k = (pfft_complex * ) mymalloc2("PMrho_k", pm->priv->fftsize * sizeof(double));
298 
299  /*Do any analysis that may be required before the transfer function is applied*/
300  petapm_transfer_func global_readout = global_functions->global_readout;
301  if(global_readout)
302  pm_apply_transfer_function(pm, complx, rho_k, global_readout);
303  if(global_functions->global_analysis)
304  global_functions->global_analysis(pm);
305  /*Apply the transfer function*/
306  petapm_transfer_func global_transfer = global_functions->global_transfer;
307  pm_apply_transfer_function(pm, complx, rho_k, global_transfer);
308  walltime_measure("/PMgrav/r2c");
309 
310  report_memory_usage("PetaPM");
311 
312  myfree(complx);
313  return rho_k;
314 }
315 
316 void
317 petapm_force_c2r(PetaPM * pm,
318  pfft_complex * rho_k,
319  PetaPMRegion * regions,
320  const int Nregions,
321  PetaPMFunctions * functions)
322 {
323 
324  PetaPMFunctions * f;
325  for (f = functions; f->name; f ++) {
326  petapm_transfer_func transfer = f->transfer;
327  petapm_readout_func readout = f->readout;
328 
329  pfft_complex * complx = (pfft_complex *) mymalloc("PMcomplex", pm->priv->fftsize * sizeof(double));
330  /* apply the green's function to turn rho_k into the potential in fourier space */
331  pm_apply_transfer_function(pm, rho_k, complx, transfer);
332  walltime_measure("/PMgrav/calc");
333 
334  double * real = (double * ) mymalloc2("PMreal", pm->priv->fftsize * sizeof(double));
335  pfft_execute_dft_c2r(pm->priv->plan_back, complx, real);
336  walltime_measure("/PMgrav/c2r");
337  myfree(complx);
338  /* read out the potential: this will copy and free real.*/
339  layout_build_and_exchange_cells_to_local(pm, &pm->priv->layout, pm->priv->meshbuf, real);
340  walltime_measure("/PMgrav/comm");
341 
342  pm_iterate(pm, readout, regions, Nregions);
343  walltime_measure("/PMgrav/readout");
344  }
345  walltime_measure("/PMgrav/Misc");
346 
347 }
348 void petapm_force_finish(PetaPM * pm) {
349  layout_finish(&pm->priv->layout);
350  myfree(pm->priv->meshbuf);
351 }
352 
353 void petapm_force(PetaPM * pm, petapm_prepare_func prepare,
354  PetaPMGlobalFunctions * global_functions, //petapm_transfer_func global_transfer,
355  PetaPMFunctions * functions,
356  PetaPMParticleStruct * pstruct,
357  void * userdata) {
358  int Nregions;
359  PetaPMRegion * regions = petapm_force_init(pm, prepare, pstruct, &Nregions, userdata);
360  pfft_complex * rho_k = petapm_force_r2c(pm, global_functions);
361  if(functions)
362  petapm_force_c2r(pm, rho_k, regions, Nregions, functions);
363  myfree(rho_k);
364  if(CPS->RegionInd)
365  myfree(CPS->RegionInd);
366  myfree(regions);
367  petapm_force_finish(pm);
368 }
369 
370 /* build a communication layout */
371 
372 static void layout_build_pencils(PetaPM * pm, struct Layout * L, double * meshbuf, PetaPMRegion * regions, const int Nregions);
373 static void layout_exchange_pencils(struct Layout * L);
374 static void
375 layout_prepare(PetaPM * pm,
376  struct Layout * L,
377  double * meshbuf,
378  PetaPMRegion * regions,
379  const int Nregions,
380  MPI_Comm comm)
381 {
382  int r;
383  int i;
384  int NTask;
385  L->comm = comm;
386 
387  MPI_Comm_size(L->comm, &NTask);
388 
389  L->ibuffer = (int *) mymalloc("PMlayout", sizeof(int) * NTask * 8);
390 
391  memset(L->ibuffer, 0, sizeof(int) * NTask * 8);
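 /* carve ibuffer into eight NTask-long arrays: Np and Nc are per-task pencil and
 * cell counts, Dp and Dc the matching Alltoallv displacements, for Send and Recv */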
392  L->NpSend = &L->ibuffer[NTask * 0];
393  L->NpRecv = &L->ibuffer[NTask * 1];
394  L->NcSend = &L->ibuffer[NTask * 2];
395  L->NcRecv = &L->ibuffer[NTask * 3];
396  L->DcSend = &L->ibuffer[NTask * 4];
397  L->DcRecv = &L->ibuffer[NTask * 5];
398  L->DpSend = &L->ibuffer[NTask * 6];
399  L->DpRecv = &L->ibuffer[NTask * 7];
400 
401  L->NpExport = 0;
402  L->NcExport = 0;
403  L->NpImport = 0;
404  L->NcImport = 0;
405 
406  int NpAlloc = 0;
407  /* count the pencils: each region contributes size[0] * size[1] of them */
408  for (r = 0; r < Nregions; r ++) {
409  NpAlloc += regions[r].size[0] * regions[r].size[1];
410  }
411 
412  L->PencilSend = (struct Pencil *) mymalloc("PencilSend", NpAlloc * sizeof(struct Pencil));
413 
414  layout_build_pencils(pm, L, meshbuf, regions, Nregions);
415 
416  /* sort the pencils by the target rank for ease of next step */
417  qsort_openmp(L->PencilSend, NpAlloc, sizeof(struct Pencil), pencil_cmp_target);
418  /* zero length pencils are moved to the tail */
419 
420  /* now shrink NpExport*/
421  L->NpExport = NpAlloc;
422  while(L->NpExport > 0 && L->PencilSend[L->NpExport - 1].len == 0) {
423  L->NpExport --;
424  }
425 
426  /* count total number of cells to be exported */
427  int NcExport = 0;
428  for(i = 0; i < L->NpExport; i++) {
429  int task = L->PencilSend[i].task;
430  L->NcSend[task] += L->PencilSend[i].len;
431  NcExport += L->PencilSend[i].len;
432  L->NpSend[task] ++;
433  }
434  L->NcExport = NcExport;
435 
436  MPI_Alltoall(L->NpSend, 1, MPI_INT, L->NpRecv, 1, MPI_INT, L->comm);
437  MPI_Alltoall(L->NcSend, 1, MPI_INT, L->NcRecv, 1, MPI_INT, L->comm);
438 
439  /* build the displacement array; why doesn't MPI build these automatically? */
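 /* D[i] is the prefix sum of N[0..i-1], as MPI_Alltoallv expects */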
440  L->DpSend[0] = 0; L->DpRecv[0] = 0;
441  L->DcSend[0] = 0; L->DcRecv[0] = 0;
442  for(i = 1; i < NTask; i ++) {
443  L->DpSend[i] = L->NpSend[i - 1] + L->DpSend[i - 1];
444  L->DpRecv[i] = L->NpRecv[i - 1] + L->DpRecv[i - 1];
445  L->DcSend[i] = L->NcSend[i - 1] + L->DcSend[i - 1];
446  L->DcRecv[i] = L->NcRecv[i - 1] + L->DcRecv[i - 1];
447  }
448  L->NpImport = L->DpRecv[NTask -1] + L->NpRecv[NTask -1];
449  L->NcImport = L->DcRecv[NTask -1] + L->NcRecv[NTask -1];
450 
451  /* some checks */
452  if(L->DpSend[NTask - 1] + L->NpSend[NTask -1] != L->NpExport) {
453  endrun(1, "NpExport = %d NpSend=%d DpSend=%d\n", L->NpExport, L->NpSend[NTask -1], L->DpSend[NTask - 1]);
454  }
455  if(L->DcSend[NTask - 1] + L->NcSend[NTask -1] != L->NcExport) {
456  endrun(1, "NcExport = %d NcSend=%d DcSend=%d\n", L->NcExport, L->NcSend[NTask -1], L->DcSend[NTask - 1]);
457  }
458  int64_t totNpAlloc = reduce_int64(NpAlloc, L->comm);
459  int64_t totNpExport = reduce_int64(L->NpExport, L->comm);
460  int64_t totNcExport = reduce_int64(L->NcExport, L->comm);
461  int64_t totNpImport = reduce_int64(L->NpImport, L->comm);
462  int64_t totNcImport = reduce_int64(L->NcImport, L->comm);
463 
464  if(totNpExport != totNpImport) {
465  endrun(1, "totNpExport = %ld\n", totNpExport);
466  }
467  if(totNcExport != totNcImport) {
468  endrun(1, "totNcExport = %ld\n", totNcExport);
469  }
470 
471  /* exchange the pencils */
472  message(0, "PetaPM: %010ld/%010ld Pencils and %010ld Cells\n", totNpExport, totNpAlloc, totNcExport);
473  L->PencilRecv = (struct Pencil *) mymalloc("PencilRecv", L->NpImport * sizeof(struct Pencil));
474  memset(L->PencilRecv, 0xfc, L->NpImport * sizeof(struct Pencil));
475  layout_exchange_pencils(L);
476 }
477 
478 static void
479 layout_build_pencils(PetaPM * pm,
480  struct Layout * L,
481  double * meshbuf,
482  PetaPMRegion * regions,
483  const int Nregions)
484 {
485  /* now build pencils to be exported */
486  int p0 = 0;
487  int r;
488  for (r = 0; r < Nregions; r++) {
489  int ix;
490 #pragma omp parallel for private(ix)
491  for(ix = 0; ix < regions[r].size[0]; ix++) {
492  int iy;
493  for(iy = 0; iy < regions[r].size[1]; iy++) {
494  int poffset = ix * regions[r].size[1] + iy;
495  struct Pencil * p = &L->PencilSend[p0 + poffset];
496 
497  p->offset[0] = ix + regions[r].offset[0];
498  p->offset[1] = iy + regions[r].offset[1];
499  p->offset[2] = regions[r].offset[2];
500  p->len = regions[r].size[2];
501  p->meshbuf_first = (regions[r].buffer - meshbuf) +
502  regions[r].strides[0] * ix +
503  regions[r].strides[1] * iy;
504  /* now compress the pencil: drop zero cells from the tail, then from the head */
505  while((p->len > 0) && (meshbuf[p->meshbuf_first + p->len - 1] == 0.0)) {
506  p->len --;
507  }
508  while((p->len > 0) && (meshbuf[p->meshbuf_first] == 0.0)) {
509  p->len --;
510  p->meshbuf_first++;
511  p->offset[2] ++;
512  }
513 
514  p->task = pos_get_target(pm, p->offset);
515  }
516  }
517  p0 += regions[r].size[0] * regions[r].size[1];
518  }
519 
520 }
521 
522 static void layout_exchange_pencils(struct Layout * L) {
523  int i;
524  int offset;
525  int NTask;
526  MPI_Comm_size(L->comm, &NTask);
527  /* build the first pointers to refer to the correct relative buffer locations */
528  /* note that the buffer hasn't been assembled yet */
529  offset = 0;
530  for(i = 0; i < NTask; i ++) {
531  int j;
532  struct Pencil * p = &L->PencilSend[offset];
533  if(L->NpSend[i] == 0) continue;
534  p->first = 0;
535  for(j = 1; j < L->NpSend[i]; j++) {
536  p[j].first = p[j - 1].first + p[j - 1].len;
537  }
538  offset += L->NpSend[i];
539  }
540 
541  MPI_Alltoallv(
542  L->PencilSend, L->NpSend, L->DpSend, MPI_PENCIL,
543  L->PencilRecv, L->NpRecv, L->DpRecv, MPI_PENCIL,
544  L->comm);
545 
546  /* set first to point to absolute position in the full import cell buffer */
547  offset = 0;
548  for(i = 0; i < NTask; i ++) {
549  struct Pencil * p = &L->PencilRecv[offset];
550  int j;
551  for(j = 0; j < L->NpRecv[i]; j++) {
552  p[j].first += L->DcRecv[i];
553  }
554  offset += L->NpRecv[i];
555  }
556 
557  /* set first to point to absolute position in the full export cell buffer */
558  offset = 0;
559  for(i = 0; i < NTask; i ++) {
560  struct Pencil * p = &L->PencilSend[offset];
561  int j;
562  for(j = 0; j < L->NpSend[i]; j++) {
563  p[j].first += L->DcSend[i];
564  }
565  offset += L->NpSend[i];
566  }
567 }
568 
569 static void layout_finish(struct Layout * L) {
570  myfree(L->PencilRecv);
571  myfree(L->PencilSend);
572  myfree(L->ibuffer);
573 }
574 
575 /* exchange cells to their pfft host, then reduce the cells into the pfft
576  * array; the add in to_pfft is atomic since imported pencils may overlap a cell */
577 static void to_pfft(double * cell, double * buf) {
578 #pragma omp atomic update
579  cell[0] += buf[0];
580 }
581 
582 static void
583 layout_build_and_exchange_cells_to_pfft(
584  PetaPM * pm,
585  struct Layout * L,
586  double * meshbuf,
587  double * real)
588 {
589  L->BufSend = (double *) mymalloc("PMBufSend", L->NcExport * sizeof(double));
590  L->BufRecv = (double *) mymalloc("PMBufRecv", L->NcImport * sizeof(double));
591 
592  int i;
593  int offset;
594 
595  /* collect all cells into the send buffer */
596  offset = 0;
597  for(i = 0; i < L->NpExport; i ++) {
598  struct Pencil * p = &L->PencilSend[i];
599  memcpy(L->BufSend + offset, &meshbuf[p->meshbuf_first],
600  sizeof(double) * p->len);
601  offset += p->len;
602  }
603 
604  /* receive cells */
605  MPI_Alltoallv(
606  L->BufSend, L->NcSend, L->DcSend, MPI_DOUBLE,
607  L->BufRecv, L->NcRecv, L->DcRecv, MPI_DOUBLE,
608  L->comm);
609 
610 #if 0
611  double massExport = 0;
612  for(i = 0; i < L->NcExport; i ++) {
613  massExport += L->BufSend[i];
614  }
615 
616  double massImport = 0;
617  for(i = 0; i < L->NcImport; i ++) {
618  massImport += L->BufRecv[i];
619  }
620  double totmassExport;
621  double totmassImport;
622  MPI_Allreduce(&massExport, &totmassExport, 1, MPI_DOUBLE, MPI_SUM, L->comm);
623  MPI_Allreduce(&massImport, &totmassImport, 1, MPI_DOUBLE, MPI_SUM, L->comm);
624  message(0, "totmassExport = %g totmassImport = %g\n", totmassExport, totmassImport);
625 #endif
626 
627  layout_iterate_cells(pm, L, to_pfft, real);
628  myfree(L->BufRecv);
629  myfree(L->BufSend);
630 }
631 
632 /* readout cells on their pfft host, then exchange the cells to the domain
633  * host */
634 static void to_region(double * cell, double * region) {
635  *region = *cell;
636 }
637 
638 static void
639 layout_build_and_exchange_cells_to_local(
640  PetaPM * pm,
641  struct Layout * L,
642  double * meshbuf,
643  double * real)
644 {
645  L->BufRecv = (double *) mymalloc("PMBufRecv", L->NcImport * sizeof(double));
646  int i;
647  int offset;
648 
649  /*layout_iterate_cells transfers real to L->BufRecv*/
650  layout_iterate_cells(pm, L, to_region, real);
651 
652  /*Real is done now: reuse the memory for BufSend*/
653  myfree(real);
654  /*Now allocate BufSend, which is confusingly used to receive data*/
655  L->BufSend = (double *) mymalloc("PMBufSend", L->NcExport * sizeof(double));
656 
657  /* exchange cells */
658  /* notice the order is reversed from to_pfft */
659  MPI_Alltoallv(
660  L->BufRecv, L->NcRecv, L->DcRecv, MPI_DOUBLE,
661  L->BufSend, L->NcSend, L->DcSend, MPI_DOUBLE,
662  L->comm);
663 
664  /* distribute BufSend to meshbuf */
665  offset = 0;
666  for(i = 0; i < L->NpExport; i ++) {
667  struct Pencil * p = &L->PencilSend[i];
668  memcpy(&meshbuf[p->meshbuf_first],
669  L->BufSend + offset,
670  sizeof(double) * p->len);
671  offset += p->len;
672  }
673  myfree(L->BufSend);
674  myfree(L->BufRecv);
675 }
676 
677 /* iterate over the pairs of real field cells and RecvBuf cells
678  *
679  * !!! iter has to be thread safe. !!!
680  * */
681 static void
682 layout_iterate_cells(PetaPM * pm,
683  struct Layout * L,
684  cell_iterator iter,
685  double * real)
686 {
687  int i;
688 #pragma omp parallel for
689  for(i = 0; i < L->NpImport; i ++) {
690  struct Pencil * p = &L->PencilRecv[i];
691  int k;
692  ptrdiff_t linear0 = 0;
693  for(k = 0; k < 2; k ++) {
694  int ix = p->offset[k];
695  while(ix < 0) ix += pm->Nmesh;
696  while(ix >= pm->Nmesh) ix -= pm->Nmesh;
697  ix -= pm->real_space_region.offset[k];
698  if(ix >= pm->real_space_region.size[k]) {
699  /* serious problem: the assumption about the pfft layout was wrong */
700  endrun(1, "bad pfft: original k: %d ix: %d, cur ix: %d, region: off %d size %d\n", k, p->offset[k], ix, pm->real_space_region.offset[k], pm->real_space_region.size[k]);
701  }
702  linear0 += ix * pm->real_space_region.strides[k];
703  }
704  int j;
705  for(j = 0; j < p->len; j ++) {
706  int iz = p->offset[2] + j;
707  while(iz < 0) iz += pm->Nmesh;
708  while(iz >= pm->Nmesh) iz -= pm->Nmesh;
709  if(iz >= pm->real_space_region.size[2]) {
710  /* serious problem: the assumption about the pfft layout was wrong */
711  endrun(1, "bad pfft: original iz: %d, cur iz: %d, region: off %d size %d\n", p->offset[2], iz, pm->real_space_region.offset[2], pm->real_space_region.size[2]);
712  }
713  ptrdiff_t linear = iz * pm->real_space_region.strides[2] + linear0;
714  /*
715  * operate on the pencil, either modifying real or BufRecv
716  * */
717  iter(&real[linear], &L->BufRecv[p->first + j]);
718  }
719  }
720 }
721 
722 static void
723 pm_init_regions(PetaPM * pm, PetaPMRegion * regions, const int Nregions)
724 {
725  if(regions) {
726  int i;
727  size_t size = 0;
728  for(i = 0 ; i < Nregions; i ++) {
729  size += regions[i].totalsize;
730  }
731  pm->priv->meshbufsize = size;
732  if ( size == 0 ) return;
733  pm->priv->meshbuf = (double *) mymalloc("PMmesh", size * sizeof(double));
734  /* this takes care of the padding */
735  memset(pm->priv->meshbuf, 0, size * sizeof(double));
736  size = 0;
737  for(i = 0 ; i < Nregions; i ++) {
738  regions[i].buffer = pm->priv->meshbuf + size;
739  size += regions[i].totalsize;
740  }
741  }
742 }
743 
744 
745 static void
746 pm_iterate_one(PetaPM * pm,
747  int i,
748  pm_iterator iterator,
749  PetaPMRegion * regions,
750  const int Nregions)
751 {
752  int k;
753  int iCell[3]; /* integer coordinate on the regional mesh */
754  double Res[3]; /* residual*/
755  double * Pos = POS(i);
756  const int RegionInd = CPS->RegionInd ? CPS->RegionInd[i] : 0;
757 
758  /* Skip swallowed particles, which are marked with a negative region index (-2). */
759  if(RegionInd < 0)
760  return;
761  /* This should never happen: it is pure paranoia and to avoid icc being crazy*/
762  if(RegionInd >= Nregions)
763  endrun(1, "Particle %d has region %d out of bounds %d\n", i, RegionInd, Nregions);
764 
765  PetaPMRegion * region = &regions[RegionInd];
766  for(k = 0; k < 3; k++) {
767  double tmp = Pos[k] / pm->CellSize;
768  iCell[k] = floor(tmp);
769  Res[k] = tmp - iCell[k];
770  iCell[k] -= region->offset[k];
771  /* seriously?! particles are supposed to be contained in cells */
772  if(iCell[k] >= region->size[k] - 1 || iCell[k] < 0) {
773  endrun(1, "particle out of cell better stop %d (k=%d) %g %g %g region: %td %td\n", iCell[k],k,
774  Pos[0], Pos[1], Pos[2],
775  region->offset[k], region->size[k]);
776  }
777  }
778 
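 /* Cloud-in-cell: spread the particle over its 2x2x2 neighbouring cells.
 * Bit k of 'connection' selects the near (0) or far (1) cell along axis k,
 * and the weight is the product of the per-axis linear weights. */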
779  int connection;
780  for(connection = 0; connection < 8; connection++) {
781  double weight = 1.0;
782  size_t linear = 0;
783  for(k = 0; k < 3; k++) {
784  int offset = (connection >> k) & 1;
785  int tmp = iCell[k] + offset;
786  linear += tmp * region->strides[k];
787  weight *= offset?
788  /* offset == 1*/ (Res[k]) :
789  /* offset == 0*/ (1 - Res[k]);
790  }
791  if(linear >= region->totalsize) {
792  endrun(1, "particle linear index out of cell better stop\n");
793  }
794  iterator(pm, i, &region->buffer[linear], weight);
795  }
796 }
797 
798 /*
799  * iterate over all particle / mesh pairs and call the iterator
800  * function. The iterator must be aware of thread safety:
801  * no two threads run on the same particle at the same time, but several
802  * may access the same mesh point at the same time.
803  * */
804 static void pm_iterate(PetaPM * pm, pm_iterator iterator, PetaPMRegion * regions, const int Nregions) {
805  int i;
806 #pragma omp parallel for
807  for(i = 0; i < CPS->NumPart; i ++) {
808  pm_iterate_one(pm, i, iterator, regions, Nregions);
809  }
810  MPIU_Barrier(pm->comm);
811 }
812 
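 /* Set up row-major (C-order) strides: strides = {size[1]*size[2], size[2], 1},
 * totalsize = size[0]*size[1]*size[2]. */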
813 void petapm_region_init_strides(PetaPMRegion * region) {
814  int k;
815  size_t rt = 1;
816  for(k = 2; k >= 0; k --) {
817  region->strides[k] = rt;
818  rt = region->size[k] * rt;
819  }
820  region->totalsize = rt;
821  region->buffer = NULL;
822 }
823 
824 static int pos_get_target(PetaPM * pm, const int pos[2]) {
825  int k;
826  int task2d[2];
827  int rank;
828  for(k = 0; k < 2; k ++) {
829  int ix = pos[k];
830  while(ix < 0) ix += pm->Nmesh;
831  while(ix >= pm->Nmesh) ix -= pm->Nmesh;
832  task2d[k] = pm->Mesh2Task[k][ix];
833  }
834  MPI_Cart_rank(pm->priv->comm_cart_2d, task2d, &rank);
835  return rank;
836 }
837 static int pencil_cmp_target(const void * v1, const void * v2) {
838  const struct Pencil * p1 = (const struct Pencil *) v1;
839  const struct Pencil * p2 = (const struct Pencil *) v2;
840  /* move zero length pencils to the end */
841  if(p2->len == 0) return -1;
842  if(p1->len == 0) return 1;
843  int t1 = p1->task;
844  int t2 = p2->task;
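 /* order primarily by target task, then by meshbuf offset; the factor 2 keeps the task comparison dominant */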
845  return ((t2 < t1) - (t1 < t2)) * 2 +
846  ((p2->meshbuf_first < p1->meshbuf_first) - (p1->meshbuf_first < p2->meshbuf_first));
847 }
848 
849 #ifdef DEBUG
850 static void verify_density_field(PetaPM * pm, double * real, double * meshbuf, const size_t meshsize) {
851  /* verify the density field */
852  double mass_Part = 0;
853  int j;
854 #pragma omp parallel for reduction(+: mass_Part)
855  for(j = 0; j < CPS->NumPart; j ++) {
856  double Mass = *MASS(j);
857  mass_Part += Mass;
858  }
859  double totmass_Part = 0;
860  MPI_Allreduce(&mass_Part, &totmass_Part, 1, MPI_DOUBLE, MPI_SUM, pm->comm);
861 
862  double mass_Region = 0;
863  size_t i;
864 
865 #pragma omp parallel for reduction(+: mass_Region)
866  for(i = 0; i < meshsize; i ++) {
867  mass_Region += meshbuf[i];
868  }
869  double totmass_Region = 0;
870  MPI_Allreduce(&mass_Region, &totmass_Region, 1, MPI_DOUBLE, MPI_SUM, pm->comm);
871  double mass_CIC = 0;
872 #pragma omp parallel for reduction(+: mass_CIC)
873  for(i = 0; i < pm->real_space_region.totalsize; i ++) {
874  mass_CIC += real[i];
875  }
876  double totmass_CIC = 0;
877  MPI_Allreduce(&mass_CIC, &totmass_CIC, 1, MPI_DOUBLE, MPI_SUM, pm->comm);
878 
879  message(0, "total Region mass err = %g CIC mass err = %g Particle mass = %g\n", totmass_Region / totmass_Part - 1, totmass_CIC / totmass_Part - 1, totmass_Part);
880 }
881 #endif
882 
883 static void pm_apply_transfer_function(PetaPM * pm,
884  pfft_complex * src,
885  pfft_complex * dst, petapm_transfer_func H
886  ){
887  size_t ip = 0;
888 
889  PetaPMRegion * region = &pm->fourier_space_region;
890 
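 /* walk the local fourier-space region cell by cell, decode the (transposed)
 * integer coordinates from the linear index ip, and apply H(k2, kpos) */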
891 #pragma omp parallel for
892  for(ip = 0; ip < region->totalsize; ip ++) {
893  ptrdiff_t tmp = ip;
894  int pos[3];
895  int kpos[3];
896  int64_t k2 = 0;
897  int k;
898  for(k = 0; k < 3; k ++) {
899  pos[k] = tmp / region->strides[k];
900  tmp -= pos[k] * region->strides[k];
901  /* let's get the absolute position on the grid */
902  pos[k] += region->offset[k];
903  /* check */
904  if(pos[k] >= pm->Nmesh) {
905  endrun(1, "position didn't make sense\n");
906  }
907  kpos[k] = petapm_mesh_to_k(pm, pos[k]);
908  /* watch out for the cast: accumulate k2 in 64-bit to avoid overflow */
909  k2 += ((int64_t)kpos[k]) * kpos[k];
910  }
911  /* the fourier-space region is transposed, so kpos is in (y, z, x) order; */
912  /* roll it back to (x, y, z) before calling the transfer function */
913  pos[0] = kpos[2];
914  pos[1] = kpos[0];
915  pos[2] = kpos[1];
916  dst[ip][0] = src[ip][0];
917  dst[ip][1] = src[ip][1];
918  if(H) {
919  H(pm, k2, pos, &dst[ip]);
920  }
921  }
922 
923 }
924 
925 
926 /**************
927  * functions iterating over particle / mesh pairs
928  ***************/
929 static void put_particle_to_mesh(PetaPM * pm, int i, double * mesh, double weight) {
930  double Mass = *MASS(i);
931  if(INACTIVE(i))
932  return;
933 #pragma omp atomic update
934  mesh[0] += weight * Mass;
935 }
936 static int64_t reduce_int64(int64_t input, MPI_Comm comm) {
937  int64_t result = 0;
938  MPI_Allreduce(&input, &result, 1, MPI_INT64, MPI_SUM, comm);
939  return result;
940 }
941 
void message(int where, const char *fmt,...)
Definition: endrun.c:175
void endrun(int where, const char *fmt,...)
Definition: endrun.c:147
static PetaPMGlobalFunctions global_functions
Definition: glass.c:33
static PetaPMFunctions functions[]
Definition: gravpm.c:32
#define mymalloc(name, size)
Definition: mymalloc.h:15
#define mymalloc2(name, size)
Definition: mymalloc.h:16
#define myfree(x)
Definition: mymalloc.h:19
#define report_memory_usage(x)
Definition: mymalloc.h:30
void petapm_module_init(int Nthreads)
Definition: petapm.c:82
PetaPMRegion * petapm_get_real_region(PetaPM *pm)
Definition: petapm.c:67
void petapm_init(PetaPM *pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_Comm comm)
Definition: petapm.c:94
#define MASS(i)
Definition: petapm.c:61
static void pm_init_regions(PetaPM *pm, PetaPMRegion *regions, const int Nregions)
Definition: petapm.c:723
#define ROLL(a, N, j)
#define INACTIVE(i)
Definition: petapm.c:62
static void layout_iterate_cells(PetaPM *pm, struct Layout *L, cell_iterator iter, double *real)
Definition: petapm.c:682
void petapm_destroy(PetaPM *pm)
Definition: petapm.c:215
static void layout_exchange_pencils(struct Layout *L)
Definition: petapm.c:522
int petapm_mesh_to_k(PetaPM *pm, int i)
Definition: petapm.c:70
void petapm_force_finish(PetaPM *pm)
Definition: petapm.c:348
static void pm_apply_transfer_function(PetaPM *pm, pfft_complex *src, pfft_complex *dst, petapm_transfer_func H)
Definition: petapm.c:883
static void pm_iterate(PetaPM *pm, pm_iterator iterator, PetaPMRegion *regions, const int Nregions)
Definition: petapm.c:804
static void layout_build_and_exchange_cells_to_pfft(PetaPM *pm, struct Layout *L, double *meshbuf, double *real)
Definition: petapm.c:583
static void pm_iterate_one(PetaPM *pm, int i, pm_iterator iterator, PetaPMRegion *regions, const int Nregions)
Definition: petapm.c:746
pfft_complex * petapm_force_r2c(PetaPM *pm, PetaPMGlobalFunctions *global_functions)
Definition: petapm.c:273
#define POS(i)
Definition: petapm.c:60
static int pencil_cmp_target(const void *v1, const void *v2)
Definition: petapm.c:837
static void to_region(double *cell, double *region)
Definition: petapm.c:634
static void to_pfft(double *cell, double *buf)
Definition: petapm.c:577
static void layout_prepare(PetaPM *pm, struct Layout *L, double *meshbuf, PetaPMRegion *regions, const int Nregions, MPI_Comm comm)
Definition: petapm.c:375
static int64_t reduce_int64(int64_t input, MPI_Comm comm)
Definition: petapm.c:936
static MPI_Datatype MPI_PENCIL
Definition: petapm.c:46
int * petapm_get_ntask2d(PetaPM *pm)
Definition: petapm.c:77
static void layout_build_and_exchange_cells_to_local(PetaPM *pm, struct Layout *L, double *meshbuf, double *real)
Definition: petapm.c:639
void petapm_region_init_strides(PetaPMRegion *region)
Definition: petapm.c:813
void(* pm_iterator)(PetaPM *pm, int i, double *mesh, double weight)
Definition: petapm.c:227
void petapm_force_c2r(PetaPM *pm, pfft_complex *rho_k, PetaPMRegion *regions, const int Nregions, PetaPMFunctions *functions)
Definition: petapm.c:317
pfft_complex * petapm_alloc_rhok(PetaPM *pm)
Definition: petapm.c:50
static void layout_finish(struct Layout *L)
Definition: petapm.c:569
PetaPMRegion * petapm_get_fourier_region(PetaPM *pm)
Definition: petapm.c:64
static void layout_build_pencils(PetaPM *pm, struct Layout *L, double *meshbuf, PetaPMRegion *regions, const int Nregions)
Definition: petapm.c:479
static PetaPMParticleStruct * CPS
Definition: petapm.c:59
void petapm_force(PetaPM *pm, petapm_prepare_func prepare, PetaPMGlobalFunctions *global_functions, PetaPMFunctions *functions, PetaPMParticleStruct *pstruct, void *userdata)
Definition: petapm.c:353
PetaPMRegion * petapm_force_init(PetaPM *pm, petapm_prepare_func prepare, PetaPMParticleStruct *pstruct, int *Nregions, void *userdata)
Definition: petapm.c:251
void(* cell_iterator)(double *cell_value, double *comm_buffer)
Definition: petapm.c:26
static void put_particle_to_mesh(PetaPM *pm, int i, double *mesh, double weight)
Definition: petapm.c:929
static int pos_get_target(PetaPM *pm, const int pos[2])
Definition: petapm.c:824
int * petapm_get_thistask2d(PetaPM *pm)
Definition: petapm.c:74
PetaPMRegion *(* petapm_prepare_func)(PetaPM *pm, PetaPMParticleStruct *pstruct, void *data, int *Nregions)
Definition: petapm.h:91
void(* petapm_transfer_func)(PetaPM *pm, int64_t k2, int kpos[3], pfft_complex *value)
Definition: petapm.h:89
void(* petapm_readout_func)(PetaPM *pm, int i, double *mesh, double weight)
Definition: petapm.h:90
Definition: petapm.h:24
int NpExport
Definition: petapm.h:26
int * DcSend
Definition: petapm.h:39
int * ibuffer
Definition: petapm.h:44
struct Pencil * PencilSend
Definition: petapm.h:32
double * BufRecv
Definition: petapm.h:43
int NcExport
Definition: petapm.h:35
int * NpSend
Definition: petapm.h:28
int * NcRecv
Definition: petapm.h:38
int NcImport
Definition: petapm.h:36
int * NpRecv
Definition: petapm.h:29
int NpImport
Definition: petapm.h:27
int * NcSend
Definition: petapm.h:37
MPI_Comm comm
Definition: petapm.h:25
int * DcRecv
Definition: petapm.h:40
struct Pencil * PencilRecv
Definition: petapm.h:33
int * DpSend
Definition: petapm.h:30
int * DpRecv
Definition: petapm.h:31
double * BufSend
Definition: petapm.h:42
Definition: petapm.c:29
int task
Definition: petapm.c:34
int first
Definition: petapm.c:32
int len
Definition: petapm.c:31
int meshbuf_first
Definition: petapm.c:33
int offset[3]
Definition: petapm.c:30
petapm_readout_func readout
Definition: petapm.h:96
petapm_transfer_func transfer
Definition: petapm.h:95
const char * name
Definition: petapm.h:94
petapm_transfer_func global_transfer
Definition: petapm.h:104
void(* global_analysis)(PetaPM *pm)
Definition: petapm.h:103
petapm_transfer_func global_readout
Definition: petapm.h:102
int64_t NumPart
Definition: petapm.h:86
pfft_plan plan_back
Definition: petapm.h:53
struct Layout layout
Definition: petapm.h:59
int fftsize
Definition: petapm.h:51
double * meshbuf
Definition: petapm.h:57
pfft_plan plan_forw
Definition: petapm.h:52
size_t meshbufsize
Definition: petapm.h:58
MPI_Comm comm_cart_2d
Definition: petapm.h:54
Definition: petapm.h:62
int Nmesh
Definition: petapm.h:68
PetaPMPriv priv[1]
Definition: petapm.h:72
MPI_Comm comm
Definition: petapm.h:64
double CellSize
Definition: petapm.h:67
double G
Definition: petapm.h:71
double BoxSize
Definition: petapm.h:70
double Asmth
Definition: petapm.h:69
int ThisTask2d[2]
Definition: petapm.h:73
int NTask2d[2]
Definition: petapm.h:74
PetaPMRegion real_space_region
Definition: petapm.h:65
PetaPMRegion fourier_space_region
Definition: petapm.h:66
int * Mesh2Task[2]
Definition: petapm.h:75
Definition: petapm.h:7
size_t totalsize
Definition: petapm.h:12
ptrdiff_t size[3]
Definition: petapm.h:10
ptrdiff_t offset[3]
Definition: petapm.h:9
double * buffer
Definition: petapm.h:13
ptrdiff_t strides[3]
Definition: petapm.h:11
#define MPIU_Barrier(comm)
Definition: system.h:103
#define MPI_INT64
Definition: system.h:12
int ThisTask
Definition: test_exchange.c:23
int NTask
Definition: test_exchange.c:23
#define qsort_openmp
Definition: test_exchange.c:14
static const double G
Definition: test_gravity.c:35
#define walltime_measure(name)
Definition: walltime.h:8