MP-Gadget  5.0.1.dev1-76bc7d4726-dirty
endrun.c
Go to the documentation of this file.
#include <mpi.h>
#include <stdio.h>

#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <errno.h>
#include <signal.h>

#include "endrun.h"
#include "system.h"

#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <execinfo.h>
16 
/* Write all `len` bytes of `data` to `fd`, retrying after short writes and
 * EINTR. Any other error is silently dropped: this is best-effort crash
 * output and there is nowhere left to report a failure. */
static void
bt_write_all(int fd, const char *data, size_t len)
{
    while (len > 0) {
        const ssize_t n = write(fd, data, len);
        if (n > 0) {
            data += n;
            len -= (size_t) n;
            continue;
        }
        if (n < 0 && errno == EINTR)
            continue;
        return;
    }
}

/* Print a stack trace of the calling process to stdout using fork/exec.
 *
 * Based on xorg_backtrace_pstack, extracted from xorg-server/os/backtrace.c.
 *
 * An external tool (pstack, then eu-stack) is spawned to inspect the stack
 * of the calling process; its stdout and stderr are sent back through a pipe
 * and copied to our stdout. There are different opinions about calling fork
 * in a signal handler, but we are already crashing anyway if we land in a
 * signal.
 *
 * If no external tool can be executed, the child exits with EXIT_FAILURE and
 * we fall back to glibc's backtrace(), which may not contain all symbols.
 *
 * Returns 0 if the child did not exit with EXIT_FAILURE; returns -1 if the
 * pipe or fork failed, or if the glibc fallback was used.
 */
static int
show_backtrace(void)
{
    int pipefd[2];

    if (pipe(pipefd) != 0) {
        return -1;
    }

    const pid_t kidpid = fork();

    if (kidpid == -1) {
        /* ERROR: release both pipe ends so repeated failures do not leak fds. */
        close(pipefd[0]);
        close(pipefd[1]);
        return -1;
    }

    if (kidpid == 0) {
        /* CHILD */
        char parent[16];
        char buf[512];

        close(pipefd[0]);
        close(STDIN_FILENO);
        /* dup2 atomically replaces the target descriptor, so no separate
         * close of stdout is needed beforehand. */
        dup2(pipefd[1], STDOUT_FILENO);
        dup2(pipefd[1], STDERR_FILENO);
        if (pipefd[1] > STDERR_FILENO)
            close(pipefd[1]);

        snprintf(parent, sizeof(parent), "%d", (int) getppid());

        /* YF: xorg didn't have the last NULL; which seems to be wrong;
         * causing random failures in execle.
         * The first NULL terminates the argument list, the second one is
         * the envp argument that execle requires. */

        /* try a few tools in order; exec only returns on failure. */
        execle("/usr/bin/pstack", "pstack", parent, (char *) NULL, (char **) NULL);
        execle("/usr/bin/eu-stack", "eu-stack", "-p", parent, (char *) NULL, (char **) NULL);

        snprintf(buf, sizeof(buf),
                 "No tools to pretty print a stack trace for pid %d.\n"
                 "Fallback to glibc backtrace which may not contain all symbols.\n "
                 "run eu-addr2line to pretty print the output.\n", (int) getppid());

        bt_write_all(STDOUT_FILENO, buf, strlen(buf));
        /* _exit, not exit: the forked child must not run the parent's atexit
         * handlers or flush a copy of the parent's stdio buffers. */
        _exit(EXIT_FAILURE);
    }

    /* PARENT */
    char btline[256];
    int kidstat = 0;

    close(pipefd[1]);

    for (;;) {
        const ssize_t bytesread = read(pipefd[0], btline, sizeof(btline));

        if (bytesread > 0) {
            /* Forward exactly what was read; the tool output is not
             * guaranteed to be free of NUL bytes. */
            bt_write_all(STDOUT_FILENO, btline, (size_t) bytesread);
        } else if (bytesread == 0) {
            /* EOF: the child closed its end of the pipe. */
            break;
        } else if (errno != EINTR && errno != EAGAIN) {
            /* Unrecoverable read error. */
            break;
        }
    }
    close(pipefd[0]);

    /* Reap the child; retry if interrupted by a signal. */
    while (waitpid(kidpid, &kidstat, 0) == -1 && errno == EINTR) {
    }

    if (WIFEXITED(kidstat) && (WEXITSTATUS(kidstat) == EXIT_FAILURE)) {
        void * frames[100];
        /* backtrace() must fill the array before it can be symbolized. */
        const int nframes = backtrace(frames, 100);
        backtrace_symbols_fd(frames, nframes, STDOUT_FILENO);
        return -1;
    }
    return 0;
}
99 
/* Non-zero if a stack trace should be printed before aborting.
 * Set by init_endrun(); read by endrun() and by the signal handler, hence
 * volatile sig_atomic_t rather than a plain int. */
static volatile sig_atomic_t ShowBacktrace;
101 
102 static void
104 {
105  const char btline[] = "Killed by Signal %d\n";
106  char buf[128];
107  sprintf(buf, btline, no);
108  write(STDOUT_FILENO, buf, strlen(buf));
109  if(ShowBacktrace)
110  show_backtrace();
111  MPI_Abort(MPI_COMM_WORLD, no);
112 }
113 
114 static void
116 {
117  struct sigaction act, oact;
118 
119  int siglist[] = { SIGSEGV, SIGQUIT, SIGILL, SIGFPE, SIGBUS, 0};
120  sigemptyset(&act.sa_mask);
121 
122  act.sa_handler = OsSigHandler;
123  act.sa_flags = 0;
124 
125  int i;
126  for(i = 0; siglist[i] != 0; i ++) {
127  sigaction(siglist[i], &act, &oact);
128  }
129 }
130 
131 void
132 init_endrun(int backtrace)
133 {
134  ShowBacktrace = backtrace;
135  init_stacktrace();
136 }
137 
138 /* This function aborts the simulation.
139  *
140  * if where > 0, a stacktrace is printed per rank calling endrun.
141  * if where <= 0, the function shall be called by all ranks collectively.
142  * and only the root rank prints the error.
143  *
144  * No barrier is applied.
145  */
146 void
147 endrun(int where, const char * fmt, ...)
148 {
149 
150  va_list va;
151  va_start(va, fmt);
152  MPIU_Tracev(MPI_COMM_WORLD, where, 1, fmt, va);
153  va_end(va);
154  int ThisTask;
155  MPI_Comm_rank(MPI_COMM_WORLD, &ThisTask);
156  if(ThisTask == 0 || where > 0) {
157  if(ShowBacktrace)
158  show_backtrace();
159  MPI_Abort(MPI_COMM_WORLD, where);
160  }
161  /* This is here so the compiler knows this
162  * function never returns. */
163  exit(1);
164 }
165 
166 
167 /* This function writes a message.
168  *
169  * if where > 0, the message is uncollective.
170  * if where <= 0, the message is 'collective', only the root rank prints the message.
171  *
172  * No barrier is applied.
173  */
174 
175 void message(int where, const char * fmt, ...)
176 {
177  va_list va;
178  va_start(va, fmt);
179  MPIU_Tracev(MPI_COMM_WORLD, where, 0, fmt, va);
180  va_end(va);
181 }
182 
static void init_stacktrace(void)
Definition: endrun.c:115
static int show_backtrace(void)
Definition: endrun.c:29
void message(int where, const char *fmt,...)
Definition: endrun.c:175
void init_endrun(int backtrace)
Definition: endrun.c:132
static void OsSigHandler(int no)
Definition: endrun.c:103
static int ShowBacktrace
Definition: endrun.c:100
void endrun(int where, const char *fmt,...)
Definition: endrun.c:147
void MPIU_Tracev(MPI_Comm comm, int where, int error, const char *fmt, va_list va)
Definition: system.c:149
int ThisTask
Definition: test_exchange.c:23