MP-Gadget  5.0.1.dev1-76bc7d4726-dirty
hci.c
Go to the documentation of this file.
1 #include <mpi.h>
2 
3 #include <stdio.h>
4 #include <stdlib.h>
5 #include <string.h>
6 #include <unistd.h>
7 #include "utils.h"
8 #include "hci.h"
9 
10 static double
12 {
13  double e;
14  if(manager->OVERRIDE_NOW) {
15  e = manager->_now;
16  } else {
17  e = MPI_Wtime();
18  }
19  /* must be consistent between all ranks. */
20  MPI_Bcast(&e, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
21  return e;
22 }
23 
/*
 * Initialise an HCIManager with the run-time limits supplied by the caller.
 *
 * The visible statements store the wall-clock limit, the auto-checkpoint
 * interval and the FOF flag into the manager.
 *
 * NOTE(review): listing lines 27-29 and 34 are absent from this rendering, so
 * the handling of `prefix` and of the manager's timer fields cannot be seen
 * here -- confirm against the real source before relying on this description.
 */
24 void
25 hci_init(HCIManager * manager, char * prefix, double WallClockTimeLimit, double AutoCheckPointTime, int FOFEnabled)
26 {
30 
31  manager->WallClockTimeLimit = WallClockTimeLimit;
32  manager->AutoCheckPointTime = AutoCheckPointTime;
33  manager->FOFEnabled = FOFEnabled;
35 }
36 
37 void
39 {
40  action->type = HCI_NO_ACTION;
41  action->write_snapshot = 0;
42  action->write_fof = 0;
43 }
44 
45 /* override the result of hci_now; for unit testing -- we can't rely on MPI_Wtime there!
46  * this function can be called before hci_init. */
47 void
49 {
50  manager->_now = now;
51  manager->OVERRIDE_NOW = 1;
52 }
53 
54 static double
56 {
57  double e = hci_now(manager) - manager->timer_begin;
58  return e;
59 }
60 
/*
 * Query-timer bookkeeping: reads the current time with hci_now and computes
 * `g`, the time elapsed since manager->timer_query_begin.
 *
 * NOTE(review): the declarator (listing line 62) and listing lines 66-67 and
 * 69 are absent from this rendering. Presumably they use `g` to update
 * LongestTimeBetweenQueries and restart timer_query_begin -- confirm against
 * the real source.
 */
61 static
63 {
64  double e = hci_now(manager);
65  double g = e - manager->timer_query_begin;
68 
70 }
71 
72 /*
73  * query the filesystem for HCI commands;
74  * returns the content of the file or NULL; collectively
75  * */
76 int
77 hci_query_filesystem(HCIManager * manager, const char * filename, char ** request)
78 {
79  int ThisTask;
80  int NTask;
81  MPI_Comm_size(MPI_COMM_WORLD, &NTask);
82  MPI_Comm_rank(MPI_COMM_WORLD, &ThisTask);
83  int size = 0;
84  char * content = NULL;
85  if(ThisTask == 0) {
86  char * fullname = fastpm_strdup_printf("%s/%s", manager->prefix, filename);
87  content = fastpm_file_get_content(fullname);
88  if(content) {
89  size = strlen(content);
90  remove(fullname);
91  } else {
92  size = -1;
93  }
94  myfree(fullname);
95  }
96  MPI_Bcast(&size, 1, MPI_INT, 0, MPI_COMM_WORLD);
97 
98  if(size != -1) {
99  if(ThisTask != 0) {
100  content = ta_malloc("hcicontent", char, size + 1);
101  }
102  MPI_Bcast(content, size+1, MPI_BYTE, 0, MPI_COMM_WORLD);
103  } else {
104  content = NULL;
105  }
106 
107  *request = content;
108  return *request != NULL;
109 }
110 
/*
 * Decide whether the run has to stop because the wall-clock limit is near.
 *
 * Reads the elapsed time, sets *request to NULL, and returns 0 (keep running)
 * or 1 (timeout).
 *
 * NOTE(review): the declarator (listing line 112) and the `if` condition
 * (listing line 125) are absent from this rendering. The existing comment
 * suggests the test compares elapsed time against 0.9 of the wall-clock limit,
 * allowing for the time one more query may take -- confirm against the real
 * source.
 */
111 static int
113 {
114  /* this function is collective because we take care to ensure manager is
115  * collective */
116  double now = hci_get_elapsed_time(manager);
117  /*
118  * factor 0.9 is a safety tolerance
119  * for possible inconsistency between measured time and the true wallclock
120  *
121  * If there likely isn't time for a new query, then we shall timeout as well.
122  * */
123 
124  *request = NULL;
126  return 0;
127  }
128 
129  /* any freeable string would work. */
130  return 1;
131 }
132 
/*
 * Decide whether an automatic checkpoint is due.
 *
 * Returns 0 immediately when AutoCheckPointTime is not positive (feature
 * disabled); otherwise reads the elapsed time and returns 1 or 0.
 *
 * NOTE(review): the declarator (listing line 134) and the `if` condition
 * (listing line 142) are absent from this rendering. Presumably the test
 * compares the time since the last checkpoint against AutoCheckPointTime --
 * confirm against the real source.
 */
133 static int
135 {
136  /* this function is collective because we take care to ensure manager is
137  * collective */
138  if(manager->AutoCheckPointTime <= 0) return 0;
139 
140  /* How long since the last checkpoint? */
141  double now = hci_get_elapsed_time(manager);
143  return 1;
144  }
145  return 0;
146 }
147 
148 /*
149  * the return value is non-zero if the mainloop shall break.
150  * */
/*
 * Resets `action`, then checks the possible triggers in priority order and
 * fills `action` for the first one that fires:
 *
 *   wall-clock timeout   -> HCI_TIMEOUT, snapshot (+FOF if enabled), returns 1
 *   file "reconfigure"   -> only logs that this is unsupported,      returns 0
 *   file "checkpoint"    -> HCI_CHECKPOINT, snapshot (+FOF),          returns 0
 *   file "stop"          -> HCI_STOP, snapshot,                       returns 1
 *   file "terminate"     -> HCI_TERMINATE, no snapshot,               returns 1
 *   auto checkpoint due  -> HCI_AUTO_CHECKPOINT, snapshot (+FOF),     returns 0
 *   none of the above    -> action stays HCI_NO_ACTION,               returns 0
 *
 * Each branch returns, so at most one trigger is handled per call; in
 * particular a "reconfigure" file hides any other request file until the
 * next call.
 *
 * NOTE(review): the declarator (listing line 152) and listing lines 157, 196
 * and 231 are absent from this rendering -- confirm what they do against the
 * real source.
 */
151 int
153 {
154  hci_action_init(action);
155 
156  /* measure time since last query */
158 
159  /* Check whether we need to interrupt the run */
160 
161  char * request;
162 
163  /* Will we run out of time by the query ? highest priority.
164  */
165  if(hci_query_timeout(manager, &request)) {
 /* hci_query_timeout sets request to NULL, so there is nothing to free here */
166  message(0, "HCI: Stopping due to TimeLimitCPU, dumping a CheckPoint.\n");
167  action->type = HCI_TIMEOUT;
168  action->write_snapshot = 1;
169  if(manager->FOFEnabled)
170  action->write_fof = 1;
171  return 1;
172  }
173 
174  if(hci_query_filesystem(manager, "reconfigure", &request))
175  {
176  /* FIXME: This is not implemented
177  * it shall reread the configuration file and update the parameters of
178  * the module listed in the request.
179  * see the comment about update_IO_params
180  * */
181  message(0, "HCI: updating io parameters, this is not supported yet.\n");
182  myfree(request);
183  return 0;
184  }
185 
186  if(hci_query_filesystem(manager, "checkpoint", &request))
187  {
188  message(0, "HCI: human controlled stop with checkpoint at next PM.\n");
189  action->type = HCI_CHECKPOINT;
190  /* will write checkpoint in this PM timestep */
191  action->write_snapshot = 1;
192  /* Write fof as well*/
193  if(manager->FOFEnabled)
194  action->write_fof = 1;
195  myfree(request);
197  return 0;
198  }
199 
200  /* Is the stop-file present? If yes, interrupt the run with a snapshot. */
201  if(hci_query_filesystem(manager, "stop", &request))
202  {
203  /* will write checkpoint in this PM timestep, then stop */
204  action->type = HCI_STOP;
205  action->write_snapshot = 1;
206  myfree(request);
207  return 1;
208  }
209 
210  /* Is the terminate-file present? If yes, interrupt the run immediately. */
211  if(hci_query_filesystem(manager, "terminate", &request))
212  {
213  message(0, "HCI: human triggered termination.\n");
214  /* the caller shall take care of immediate termination.
215  * This action is better than KILL as it avoids corrupt/incomplete snapshot files.*/
216  action->type = HCI_TERMINATE;
217  action->write_snapshot = 0;
218  myfree(request);
219  return 1;
220  }
221 
222  /* lower priority */
223  if(hci_query_auto_checkpoint(manager, &request))
224  {
225  message(0, "HCI: Auto checkpoint due to AutoCheckPointTime.\n");
226  action->type = HCI_AUTO_CHECKPOINT;
227  /* Write when the PM timestep completes*/
228  action->write_snapshot = 1;
229  if(manager->FOFEnabled)
230  action->write_fof = 1;
232  return 0;
233  }
234 
235  message(0, "HCI: Nothing happened. \n");
236  return 0;
237 }
238 
239 /*
240  * FIXME: rewrite update_IO_params with
241  * the parser infrastructure. It probably shall occur
242  * after we decentralize the initialization of the parser
243  * to different modules.
244  * */
245 
246 #if 0
247 static void
248 update_IO_params(const char * ioctlfname)
249 {
250  if(ThisTask == 0) {
251  FILE * fd = fopen(ioctlfname, "r");
252  /* there is an ioctl file, parse it and update
253  * All.NumPartPerFile
254  * All.NumWriters
255  */
256  size_t n = 0;
257  char * line = NULL;
258  while(-1 != getline(&line, &n, fd)) {
259  sscanf(line, "BytesPerFile %lu", &All.IO.BytesPerFile);
260  sscanf(line, "NumWriters %d", &All.IO.NumWriters);
261  }
262  myfree(line);
263  fclose(fd);
264  }
265 
266  MPI_Bcast(&All.IO, sizeof(All.IO), MPI_BYTE, 0, MPI_COMM_WORLD);
267  message(0, "New IO parameter recieved from %s:"
268  "NumPartPerfile %d"
269  "NumWriters %d\n",
270  ioctlfname,
271  All.IO.BytesPerFile,
272  All.IO.NumWriters);
273 }
274 #endif
void message(int where, const char *fmt,...)
Definition: endrun.c:175
static void hci_update_query_timer(HCIManager *manager)
Definition: hci.c:62
static double hci_now(HCIManager *manager)
Definition: hci.c:11
static double hci_get_elapsed_time(HCIManager *manager)
Definition: hci.c:55
static int hci_query_timeout(HCIManager *manager, char **request)
Definition: hci.c:112
int hci_query_filesystem(HCIManager *manager, const char *filename, char **request)
Definition: hci.c:77
void hci_init(HCIManager *manager, char *prefix, double WallClockTimeLimit, double AutoCheckPointTime, int FOFEnabled)
Definition: hci.c:25
int hci_query(HCIManager *manager, HCIAction *action)
Definition: hci.c:152
void hci_override_now(HCIManager *manager, double now)
Definition: hci.c:48
static int hci_query_auto_checkpoint(HCIManager *manager, char **request)
Definition: hci.c:134
void hci_action_init(HCIAction *action)
Definition: hci.c:38
@ HCI_TERMINATE
Definition: hci.h:26
@ HCI_AUTO_CHECKPOINT
Definition: hci.h:24
@ HCI_CHECKPOINT
Definition: hci.h:25
@ HCI_TIMEOUT
Definition: hci.h:23
@ HCI_NO_ACTION
Definition: hci.h:21
@ HCI_STOP
Definition: hci.h:22
#define ta_malloc(name, type, nele)
Definition: mymalloc.h:25
#define myfree(x)
Definition: mymalloc.h:19
static struct run_params All
char * fastpm_file_get_content(const char *filename)
Definition: string.c:14
char * fastpm_strdup_printf(const char *fmt,...)
Definition: string.c:41
Definition: hci.h:31
enum HCIActionType type
Definition: hci.h:32
int write_snapshot
Definition: hci.h:33
int write_fof
Definition: hci.h:34
Definition: hci.h:4
double timer_query_begin
Definition: hci.h:12
char * prefix
Definition: hci.h:6
double timer_begin
Definition: hci.h:13
double AutoCheckPointTime
Definition: hci.h:9
double WallClockTimeLimit
Definition: hci.h:11
double TimeLastCheckPoint
Definition: hci.h:8
double LongestTimeBetweenQueries
Definition: hci.h:10
double _now
Definition: hci.h:17
int FOFEnabled
Definition: hci.h:7
int OVERRIDE_NOW
Definition: hci.h:16
int ThisTask
Definition: test_exchange.c:23
int NTask
Definition: test_exchange.c:23
char prefix[1024]
Definition: test_hci.c:13
HCIManager manager[1]
Definition: test_hci.c:37