Directory: | cvmfs/ |
---|---|
File: | cvmfs/monitor.cc |
Date: | 2025-04-20 02:34:28 |
Exec | Total | Coverage | |
---|---|---|---|
Lines: | 85 | 313 | 27.2% |
Branches: | 60 | 500 | 12.0% |
Line | Branch | Exec | Source |
---|---|---|---|
1 | /** | ||
2 | * This file is part of the CernVM File System. | ||
3 | * | ||
4 | * This module forks a watchdog process that listens on | ||
5 | * a pipe and prints a stacktrace into syslog when cvmfs | ||
6 | * fails. | ||
7 | * | ||
8 | * Also, it handles getting and setting the maximum number of file descriptors. | ||
9 | */ | ||
10 | |||
11 | |||
12 | #include "monitor.h" | ||
13 | |||
14 | #include <errno.h> | ||
15 | #include <execinfo.h> | ||
16 | #include <poll.h> | ||
17 | #include <pthread.h> | ||
18 | #include <signal.h> | ||
19 | #include <sys/resource.h> | ||
20 | #include <sys/types.h> | ||
21 | #ifdef __APPLE__ | ||
22 | #include <sys/ucontext.h> | ||
23 | #else | ||
24 | #include <ucontext.h> | ||
25 | #endif | ||
26 | #include <sys/uio.h> | ||
27 | #include <sys/wait.h> | ||
28 | #include <syslog.h> | ||
29 | #include <time.h> | ||
30 | #include <unistd.h> | ||
31 | |||
32 | #include <cassert> | ||
33 | #include <cstdio> | ||
34 | #include <cstdlib> | ||
35 | #include <cstring> | ||
36 | #include <map> | ||
37 | #include <set> | ||
38 | #include <string> | ||
39 | #include <vector> | ||
40 | |||
41 | #if defined(CVMFS_FUSE_MODULE) | ||
42 | #include "cvmfs.h" | ||
43 | #endif | ||
44 | #include "util/exception.h" | ||
45 | #include "util/logging.h" | ||
46 | #include "util/platform.h" | ||
47 | #include "util/posix.h" | ||
48 | #include "util/smalloc.h" | ||
49 | #include "util/string.h" | ||
50 | |||
51 | // Used for address offset calculation | ||
52 | #if defined(CVMFS_FUSE_MODULE) | ||
53 | extern loader::CvmfsExports *g_cvmfs_exports; | ||
54 | #endif | ||
55 | |||
56 | using namespace std; // NOLINT | ||
57 | |||
58 | Watchdog *Watchdog::instance_ = NULL; | ||
59 | |||
60 | int Watchdog::g_suppressed_signals[] = { SIGHUP, SIGINT, SIGQUIT, | ||
61 | SIGILL, SIGABRT, SIGBUS, | ||
62 | SIGFPE, SIGUSR1, SIGSEGV, | ||
63 | SIGUSR2, SIGTERM, SIGXCPU, | ||
64 | SIGXFSZ}; | ||
65 | |||
66 | int Watchdog::g_crash_signals[] = { SIGQUIT, SIGILL, SIGABRT, | ||
67 | SIGFPE, SIGSEGV, SIGBUS, | ||
68 | SIGPIPE, SIGXFSZ }; | ||
69 | |||
70 | 1 | Watchdog *Watchdog::Create(FnOnCrash on_crash) { | |
71 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
|
1 | assert(instance_ == NULL); |
72 |
1/2✓ Branch 2 taken 1 times.
✗ Branch 3 not taken.
|
1 | instance_ = new Watchdog(on_crash); |
73 | 1 | instance_->Fork(); | |
74 | 1 | return instance_; | |
75 | } | ||
76 | |||
77 | |||
78 | /** | ||
79 | * Uses an external shell and gdb to create a full stack trace of the dying | ||
80 | * process. The same shell is used to force-quit the client afterwards. | ||
81 | */ | ||
82 | ✗ | string Watchdog::GenerateStackTrace(pid_t pid) { | |
83 | int retval; | ||
84 | ✗ | string result = ""; | |
85 | |||
86 | // re-gain root permissions to allow for ptrace of the dying cvmfs2 process | ||
87 | ✗ | const bool retrievable = true; | |
88 | ✗ | if (!SwitchCredentials(0, getgid(), retrievable)) { | |
89 | ✗ | result += "failed to re-gain root permissions... still give it a try\n"; | |
90 | } | ||
91 | |||
92 | // run gdb and attach to the dying process | ||
93 | int fd_stdin; | ||
94 | int fd_stdout; | ||
95 | int fd_stderr; | ||
96 | ✗ | vector<string> argv; | |
97 | ✗ | argv.push_back("-p"); | |
98 | ✗ | argv.push_back(StringifyInt(pid)); | |
99 | ✗ | pid_t gdb_pid = 0; | |
100 | ✗ | const bool double_fork = false; | |
101 | ✗ | retval = ExecuteBinary(&fd_stdin, | |
102 | &fd_stdout, | ||
103 | &fd_stderr, | ||
104 | #ifdef __APPLE__ | ||
105 | "lldb", | ||
106 | #else | ||
107 | "gdb", | ||
108 | #endif | ||
109 | argv, | ||
110 | double_fork, | ||
111 | &gdb_pid); | ||
112 | ✗ | assert(retval); | |
113 | |||
114 | |||
115 | // Skip the gdb startup output | ||
116 | ✗ | ReadUntilGdbPrompt(fd_stdout); | |
117 | |||
118 | // Send stacktrace command to gdb | ||
119 | #ifdef __APPLE__ | ||
120 | const string gdb_cmd = "bt all\n" "quit\n"; | ||
121 | #else | ||
122 | ✗ | const string gdb_cmd = "thread apply all bt\n" "quit\n"; | |
123 | #endif | ||
124 | // The execve may have failed, which can't be detected in ExecuteBinary. | ||
125 | // Instead, writing to the pipe will fail. | ||
126 | ✗ | ssize_t nbytes = write(fd_stdin, gdb_cmd.data(), gdb_cmd.length()); | |
127 | ✗ | if ((nbytes < 0) || (static_cast<unsigned>(nbytes) != gdb_cmd.length())) { | |
128 | ✗ | result += "failed to start gdb/lldb (" + StringifyInt(nbytes) + " bytes " | |
129 | ✗ | "written, errno " + StringifyInt(errno) + ")\n"; | |
130 | ✗ | return result; | |
131 | } | ||
132 | |||
133 | // Read the stack trace from the stdout of our gdb process | ||
134 | #ifdef __APPLE__ | ||
135 | // lldb has one more prompt | ||
136 | result += ReadUntilGdbPrompt(fd_stdout); | ||
137 | #endif | ||
138 | ✗ | result += ReadUntilGdbPrompt(fd_stdout) + "\n\n"; | |
139 | |||
140 | // Check for output on stderr | ||
141 | ✗ | string result_err; | |
142 | ✗ | Block2Nonblock(fd_stderr); | |
143 | char cbuf; | ||
144 | ✗ | while (read(fd_stderr, &cbuf, 1) == 1) | |
145 | ✗ | result_err.push_back(cbuf); | |
146 | ✗ | if (!result_err.empty()) | |
147 | ✗ | result += "\nError output:\n" + result_err + "\n"; | |
148 | |||
149 | // Close the connection to the terminated gdb process | ||
150 | ✗ | close(fd_stderr); | |
151 | ✗ | close(fd_stdout); | |
152 | ✗ | close(fd_stdin); | |
153 | |||
154 | // Make sure gdb has terminated (wait for it for a short while) | ||
155 | ✗ | unsigned int timeout = 15; | |
156 | int statloc; | ||
157 | ✗ | while (timeout > 0 && waitpid(gdb_pid, &statloc, WNOHANG) != gdb_pid) { | |
158 | ✗ | --timeout; | |
159 | ✗ | SafeSleepMs(1000); | |
160 | } | ||
161 | |||
162 | // when the timeout expired, gdb probably hangs... we need to kill it | ||
163 | ✗ | if (timeout == 0) { | |
164 | ✗ | result += "gdb did not exit as expected. sending SIGKILL... "; | |
165 | ✗ | result += (kill(gdb_pid, SIGKILL) != 0) ? "failed\n" : "okay\n"; | |
166 | } | ||
167 | |||
168 | ✗ | return result; | |
169 | } | ||
170 | |||
171 | |||
172 | ✗ | pid_t Watchdog::GetPid() { | |
173 | ✗ | if (instance_ != NULL) { | |
174 | ✗ | return instance_->watchdog_pid_; | |
175 | } | ||
176 | ✗ | return getpid(); | |
177 | } | ||
178 | |||
179 | /** | ||
180 | * Log a string to syslog and into the crash dump file. | ||
181 | * We expect ideally nothing to be logged, so that file is created on demand. | ||
182 | */ | ||
183 | ✗ | void Watchdog::LogEmergency(string msg) { | |
184 | char ctime_buffer[32]; | ||
185 | |||
186 | ✗ | if (!crash_dump_path_.empty()) { | |
187 | ✗ | FILE *fp = fopen(crash_dump_path_.c_str(), "a"); | |
188 | ✗ | if (fp) { | |
189 | ✗ | time_t now = time(NULL); | |
190 | ✗ | msg += "\nTimestamp: " + string(ctime_r(&now, ctime_buffer)); | |
191 | ✗ | if (fwrite(&msg[0], 1, msg.length(), fp) != msg.length()) { | |
192 | msg += | ||
193 | ✗ | " (failed to report into crash dump file " + crash_dump_path_ + ")"; | |
194 | } else { | ||
195 | ✗ | msg += "\n Crash logged also on file: " + crash_dump_path_ + "\n"; | |
196 | } | ||
197 | ✗ | fclose(fp); | |
198 | } else { | ||
199 | ✗ | msg += " (failed to open crash dump file " + crash_dump_path_ + ")"; | |
200 | } | ||
201 | } | ||
202 | ✗ | LogCvmfs(kLogMonitor, kLogSyslogErr, "%s", msg.c_str()); | |
203 | } | ||
204 | |||
205 | /** | ||
206 | * Reads from the file descriptor until the specific gdb prompt is reached or | ||
207 | * the pipe gets broken. | ||
208 | * | ||
209 | * @param fd_pipe the file descriptor of the pipe to be read | ||
210 | * @return the data read from the pipe | ||
211 | */ | ||
212 | ✗ | string Watchdog::ReadUntilGdbPrompt(int fd_pipe) { | |
213 | #ifdef __APPLE__ | ||
214 | static const string gdb_prompt = "(lldb)"; | ||
215 | #else | ||
216 | ✗ | static const string gdb_prompt = "\n(gdb) "; | |
217 | #endif | ||
218 | |||
219 | ✗ | string result; | |
220 | char mini_buffer; | ||
221 | int chars_io; | ||
222 | ✗ | unsigned int ring_buffer_pos = 0; | |
223 | |||
224 | // read from stdout of gdb until gdb prompt occurs --> (gdb) | ||
225 | while (1) { | ||
226 | ✗ | chars_io = read(fd_pipe, &mini_buffer, 1); | |
227 | |||
228 | // in case something goes wrong... | ||
229 | ✗ | if (chars_io <= 0) break; | |
230 | |||
231 | ✗ | result += mini_buffer; | |
232 | |||
233 | // find the gdb_prompt in the stdout data | ||
234 | ✗ | if (mini_buffer == gdb_prompt[ring_buffer_pos]) { | |
235 | ✗ | ++ring_buffer_pos; | |
236 | ✗ | if (ring_buffer_pos == gdb_prompt.size()) { | |
237 | ✗ | break; | |
238 | } | ||
239 | } else { | ||
240 | ✗ | ring_buffer_pos = 0; | |
241 | } | ||
242 | } | ||
243 | |||
244 | ✗ | return result; | |
245 | } | ||
246 | |||
247 | |||
248 | /** | ||
249 | * Generates useful information from the backtrace log in the pipe. | ||
250 | */ | ||
251 | ✗ | string Watchdog::ReportStacktrace() { | |
252 | CrashData crash_data; | ||
253 | ✗ | if (!pipe_watchdog_->TryRead<CrashData>(&crash_data)) { | |
254 | ✗ | return "failed to read crash data (" + StringifyInt(errno) + ")"; | |
255 | } | ||
256 | |||
257 | ✗ | string debug = "--\n"; | |
258 | ✗ | debug += "Signal: " + StringifyInt(crash_data.signal); | |
259 | ✗ | debug += ", errno: " + StringifyInt(crash_data.sys_errno); | |
260 | ✗ | debug += ", version: " + string(CVMFS_VERSION); | |
261 | ✗ | debug += ", PID: " + StringifyInt(crash_data.pid) + "\n"; | |
262 | ✗ | debug += "Executable path: " + exe_path_ + "\n"; | |
263 | |||
264 | ✗ | debug += GenerateStackTrace(crash_data.pid); | |
265 | |||
266 | // Give the dying process the finishing stroke | ||
267 | ✗ | if (kill(crash_data.pid, SIGKILL) != 0) { | |
268 | ✗ | debug += "Failed to kill cvmfs client! ("; | |
269 | ✗ | switch (errno) { | |
270 | ✗ | case EINVAL: | |
271 | ✗ | debug += "invalid signal"; | |
272 | ✗ | break; | |
273 | ✗ | case EPERM: | |
274 | ✗ | debug += "permission denied"; | |
275 | ✗ | break; | |
276 | ✗ | case ESRCH: | |
277 | ✗ | debug += "no such process"; | |
278 | ✗ | break; | |
279 | ✗ | default: | |
280 | ✗ | debug += "unknown error " + StringifyInt(errno); | |
281 | } | ||
282 | ✗ | debug += ")\n\n"; | |
283 | } | ||
284 | |||
285 | ✗ | return debug; | |
286 | } | ||
287 | |||
288 | |||
289 | ✗ | void Watchdog::ReportSignalAndTerminate( | |
290 | int sig, siginfo_t *siginfo, void * /* context */) | ||
291 | { | ||
292 | ✗ | LogCvmfs(kLogMonitor, kLogSyslogErr, | |
293 | "watchdog: received unexpected signal %d from PID %d / UID %d", | ||
294 | sig, siginfo->si_pid, siginfo->si_uid); | ||
295 | ✗ | _exit(1); | |
296 | } | ||
297 | |||
298 | |||
299 | ✗ | void Watchdog::SendTrace(int sig, siginfo_t *siginfo, void *context) { | |
300 | ✗ | int send_errno = errno; | |
301 | ✗ | if (platform_spinlock_trylock(&Me()->lock_handler_) != 0) { | |
302 | // Concurrent call, wait for the first one to exit the process | ||
303 | ✗ | while (true) {} | |
304 | } | ||
305 | |||
306 | // Install the original handler of the raised signal as the handler for | ||
307 | // SIGQUIT (the watchdog process will raise SIGQUIT) | ||
308 | ✗ | (void) sigaction(SIGQUIT, &(Me()->old_signal_handlers_[sig]), NULL); | |
309 | |||
310 | // Inform the watchdog that CernVM-FS crashed | ||
311 | ✗ | if (!Me()->pipe_watchdog_->Write(ControlFlow::kProduceStacktrace)) { | |
312 | ✗ | _exit(1); | |
313 | } | ||
314 | |||
315 | // Send crash information to the watchdog | ||
316 | CrashData crash_data; | ||
317 | ✗ | crash_data.signal = sig; | |
318 | ✗ | crash_data.sys_errno = send_errno; | |
319 | ✗ | crash_data.pid = getpid(); | |
320 | ✗ | if (!Me()->pipe_watchdog_->Write<CrashData>(crash_data)) { | |
321 | ✗ | _exit(1); | |
322 | } | ||
323 | |||
324 | // Do not die before the stack trace was generated | ||
325 | // kill -SIGQUIT <pid> will finish this | ||
326 | ✗ | int counter = 0; | |
327 | while (true) { | ||
328 | ✗ | SafeSleepMs(100); | |
329 | // quit anyway after 30 seconds | ||
330 | ✗ | if (++counter == 300) { | |
331 | ✗ | LogCvmfs(kLogCvmfs, kLogSyslogErr, "stack trace generation failed"); | |
332 | // Last attempt to log something useful | ||
333 | #if defined(CVMFS_FUSE_MODULE) | ||
334 | ✗ | LogCvmfs(kLogCvmfs, kLogSyslogErr, "Signal %d, errno %d", | |
335 | sig, send_errno); | ||
336 | void *addr[kMaxBacktrace]; | ||
337 | // Note: this doesn't work due to the signal stack on OS X (it works on | ||
338 | // Linux). Since anyway lldb is supposed to produce the backtrace, we | ||
339 | // consider it more important to protect cvmfs against stack overflows. | ||
340 | ✗ | int num_addr = backtrace(addr, kMaxBacktrace); | |
341 | ✗ | char **symbols = backtrace_symbols(addr, num_addr); | |
342 | ✗ | string backtrace = "Backtrace (" + StringifyInt(num_addr) + | |
343 | ✗ | " symbols):\n"; | |
344 | ✗ | for (int i = 0; i < num_addr; ++i) | |
345 | ✗ | backtrace += string(symbols[i]) + "\n"; | |
346 | ✗ | LogCvmfs(kLogCvmfs, kLogSyslogErr, "%s", backtrace.c_str()); | |
347 | ✗ | LogCvmfs(kLogCvmfs, kLogSyslogErr, "address of g_cvmfs_exports: %p", | |
348 | &g_cvmfs_exports); | ||
349 | #endif | ||
350 | |||
351 | ✗ | _exit(1); | |
352 | } | ||
353 | } | ||
354 | |||
355 | _exit(1); | ||
356 | } | ||
357 | |||
358 | |||
359 | /** | ||
360 | * Sets the signal handlers of the current process according to the ones | ||
361 | * defined in the given SigactionMap. | ||
362 | * | ||
363 | * @param signal_handlers a map of SIGNAL -> struct sigaction | ||
364 | * @return a SigactionMap containing the old handlers | ||
365 | */ | ||
366 | 49 | Watchdog::SigactionMap Watchdog::SetSignalHandlers( | |
367 | const SigactionMap &signal_handlers) | ||
368 | { | ||
369 | 49 | SigactionMap old_signal_handlers; | |
370 | 49 | SigactionMap::const_iterator i = signal_handlers.begin(); | |
371 | 49 | SigactionMap::const_iterator iend = signal_handlers.end(); | |
372 |
2/2✓ Branch 1 taken 637 times.
✓ Branch 2 taken 49 times.
|
686 | for (; i != iend; ++i) { |
373 | struct sigaction old_signal_handler; | ||
374 |
1/2✗ Branch 3 not taken.
✓ Branch 4 taken 637 times.
|
637 | if (sigaction(i->first, &i->second, &old_signal_handler) != 0) { |
375 | ✗ | PANIC(NULL); | |
376 | } | ||
377 |
1/2✓ Branch 2 taken 637 times.
✗ Branch 3 not taken.
|
637 | old_signal_handlers[i->first] = old_signal_handler; |
378 | } | ||
379 | |||
380 | 98 | return old_signal_handlers; | |
381 | } | ||
382 | |||
383 | |||
384 | /** | ||
385 | * Fork the watchdog process and put it on hold until Spawn() is called. | ||
386 | */ | ||
387 | 1 | void Watchdog::Fork() { | |
388 |
1/2✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
|
1 | Pipe<kPipeWatchdogPid> pipe_pid; |
389 |
3/6✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 1 times.
✗ Branch 5 not taken.
✓ Branch 7 taken 1 times.
✗ Branch 8 not taken.
|
1 | pipe_watchdog_ = new Pipe<kPipeWatchdog>(); |
390 |
3/6✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 1 times.
✗ Branch 5 not taken.
✓ Branch 7 taken 1 times.
✗ Branch 8 not taken.
|
1 | pipe_listener_ = new Pipe<kPipeWatchdogSupervisor>(); |
391 | |||
392 | pid_t pid; | ||
393 | int statloc; | ||
394 |
1/3✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 1 times.
|
1 | switch (pid = fork()) { |
395 | ✗ | case -1: PANIC(NULL); | |
396 | ✗ | case 0: | |
397 | // Double fork to avoid zombie | ||
398 | ✗ | switch (fork()) { | |
399 | ✗ | case -1: _exit(1); | |
400 | ✗ | case 0: { | |
401 | ✗ | pipe_watchdog_->CloseWriteFd(); | |
402 | ✗ | Daemonize(); | |
403 | // send the watchdog PID to the supervisee | ||
404 | 49 | pid_t watchdog_pid = getpid(); | |
405 |
1/2✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
|
49 | pipe_pid.Write(watchdog_pid); |
406 |
1/2✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
|
49 | pipe_pid.CloseWriteFd(); |
407 | // Close all unused file descriptors | ||
408 | // close also usyslog, only get it back if necessary | ||
409 | // string usyslog_save = GetLogMicroSyslog(); | ||
410 |
1/3✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
49 | string debuglog_save = GetLogDebugFile(); |
411 |
2/4✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
✓ Branch 5 taken 49 times.
✗ Branch 6 not taken.
|
49 | SetLogDebugFile(""); |
412 |
1/2✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
|
49 | string usyslog_save = GetLogMicroSyslog(); |
413 |
2/4✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
✓ Branch 5 taken 49 times.
✗ Branch 6 not taken.
|
49 | SetLogMicroSyslog(""); |
414 | // Gracefully close the syslog before closing all fds. The next call | ||
415 | // to syslog will reopen it. | ||
416 |
1/2✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
|
49 | closelog(); |
417 | // Let's keep stdin, stdout, stderr open at /dev/null (daemonized) | ||
418 | // in order to prevent accidental outputs from messing with another | ||
419 | // file descriptor | ||
420 | 49 | std::set<int> preserve_fds; | |
421 |
1/2✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
|
49 | preserve_fds.insert(0); |
422 |
1/2✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
|
49 | preserve_fds.insert(1); |
423 |
1/2✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
|
49 | preserve_fds.insert(2); |
424 |
1/2✓ Branch 3 taken 49 times.
✗ Branch 4 not taken.
|
49 | preserve_fds.insert(pipe_watchdog_->GetReadFd()); |
425 |
1/2✓ Branch 3 taken 49 times.
✗ Branch 4 not taken.
|
49 | preserve_fds.insert(pipe_listener_->GetWriteFd()); |
426 |
1/2✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
|
49 | CloseAllFildes(preserve_fds); |
427 |
1/2✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
|
49 | SetLogMicroSyslog(usyslog_save); // no-op if usyslog not used |
428 |
1/2✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
|
49 | SetLogDebugFile(debuglog_save); // no-op if debug log not used |
429 | |||
430 |
2/4✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✓ Branch 4 taken 49 times.
|
49 | if (WaitForSupervisee()) |
431 | ✗ | Supervise(); | |
432 | |||
433 |
1/2✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
|
49 | pipe_watchdog_->CloseReadFd(); |
434 |
1/2✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
|
49 | pipe_listener_->CloseWriteFd(); |
435 | 49 | exit(0); | |
436 | } | ||
437 | ✗ | default: | |
438 | ✗ | _exit(0); | |
439 | } | ||
440 | 1 | default: | |
441 |
1/2✓ Branch 2 taken 1 times.
✗ Branch 3 not taken.
|
1 | pipe_watchdog_->CloseReadFd(); |
442 |
1/2✓ Branch 2 taken 1 times.
✗ Branch 3 not taken.
|
1 | pipe_listener_->CloseWriteFd(); |
443 |
1/2✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
|
1 | pipe_pid.CloseWriteFd(); |
444 |
2/4✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✓ Branch 4 taken 1 times.
|
1 | if (waitpid(pid, &statloc, 0) != pid) PANIC(NULL); |
445 |
2/4✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 1 times.
|
1 | if (!WIFEXITED(statloc) || WEXITSTATUS(statloc)) PANIC(NULL); |
446 | } | ||
447 | |||
448 | // retrieve the watchdog PID from the pipe | ||
449 |
1/2✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
|
1 | pipe_pid.Read(&watchdog_pid_); |
450 |
1/2✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
|
1 | pipe_pid.CloseReadFd(); |
451 | 1 | } | |
452 | |||
453 | |||
454 | 49 | bool Watchdog::WaitForSupervisee() { | |
455 | // We want broken pipes not to raise a signal but handle the error in the | ||
456 | // read/write code | ||
457 | 49 | platform_sighandler_t rv_sig = signal(SIGPIPE, SIG_IGN); | |
458 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 49 times.
|
49 | assert(rv_sig != SIG_ERR); |
459 | |||
460 | // The watchdog is not supposed to receive signals. If it does, report it. | ||
461 | struct sigaction sa; | ||
462 | 49 | memset(&sa, 0, sizeof(sa)); | |
463 | 49 | sa.sa_sigaction = ReportSignalAndTerminate; | |
464 | 49 | sa.sa_flags = SA_SIGINFO; | |
465 | 49 | sigfillset(&sa.sa_mask); | |
466 | |||
467 | 49 | SigactionMap signal_handlers; | |
468 |
2/2✓ Branch 0 taken 637 times.
✓ Branch 1 taken 49 times.
|
686 | for (size_t i = 0; i < sizeof(g_suppressed_signals)/sizeof(int); i++) { |
469 |
1/2✓ Branch 1 taken 637 times.
✗ Branch 2 not taken.
|
637 | signal_handlers[g_suppressed_signals[i]] = sa; |
470 | } | ||
471 |
1/2✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
|
49 | SetSignalHandlers(signal_handlers); |
472 | |||
473 | 49 | ControlFlow::Flags control_flow = ControlFlow::kUnknown; | |
474 | |||
475 |
3/4✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 48 times.
✓ Branch 5 taken 1 times.
|
49 | if (!pipe_watchdog_->TryRead(&control_flow)) { |
476 |
1/2✓ Branch 1 taken 48 times.
✗ Branch 2 not taken.
|
48 | LogCvmfs(kLogMonitor, kLogDebug, "supervisee canceled watchdog"); |
477 | 48 | return false; | |
478 | } | ||
479 | |||
480 |
1/3✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
|
1 | switch (control_flow) { |
481 | 1 | case ControlFlow::kQuit: | |
482 | 1 | return false; | |
483 | ✗ | case ControlFlow::kSupervise: | |
484 | ✗ | break; | |
485 | ✗ | default: | |
486 | ✗ | LogEmergency("Internal error: invalid control flow"); | |
487 | ✗ | return false; | |
488 | } | ||
489 | |||
490 | size_t size; | ||
491 | ✗ | pipe_watchdog_->Read(&size); | |
492 | ✗ | crash_dump_path_.resize(size); | |
493 | ✗ | if (size > 0) { | |
494 | ✗ | pipe_watchdog_->Read(&crash_dump_path_[0], size); | |
495 | |||
496 | ✗ | int retval = chdir(GetParentPath(crash_dump_path_).c_str()); | |
497 | ✗ | if (retval != 0) { | |
498 | ✗ | LogEmergency(std::string("Cannot change to crash dump directory: ") + | |
499 | ✗ | crash_dump_path_); | |
500 | ✗ | return false; | |
501 | } | ||
502 | ✗ | crash_dump_path_ = GetFileName(crash_dump_path_); | |
503 | } | ||
504 | ✗ | return true; | |
505 | 49 | } | |
506 | |||
507 | /** | ||
508 | * Set up the signal handling and kick off the supervision. | ||
509 | */ | ||
510 | ✗ | void Watchdog::Spawn(const std::string &crash_dump_path) { | |
511 | // lower restrictions for ptrace | ||
512 | ✗ | if (!platform_allow_ptrace(watchdog_pid_)) { | |
513 | ✗ | LogCvmfs(kLogMonitor, kLogSyslogWarn, | |
514 | "failed to allow ptrace() for watchdog (PID: %d). " | ||
515 | "Post crash stacktrace might not work", | ||
516 | watchdog_pid_); | ||
517 | } | ||
518 | |||
519 | // Extra stack for signal handlers | ||
520 | ✗ | int stack_size = kSignalHandlerStacksize; // 2 MB | |
521 | ✗ | sighandler_stack_.ss_sp = smalloc(stack_size); | |
522 | ✗ | sighandler_stack_.ss_size = stack_size; | |
523 | ✗ | sighandler_stack_.ss_flags = 0; | |
524 | ✗ | if (sigaltstack(&sighandler_stack_, NULL) != 0) | |
525 | ✗ | PANIC(NULL); | |
526 | |||
527 | // define our crash signal handler | ||
528 | struct sigaction sa; | ||
529 | ✗ | memset(&sa, 0, sizeof(sa)); | |
530 | ✗ | sa.sa_sigaction = SendTrace; | |
531 | ✗ | sa.sa_flags = SA_SIGINFO | SA_ONSTACK; | |
532 | ✗ | sigfillset(&sa.sa_mask); | |
533 | |||
534 | ✗ | SigactionMap signal_handlers; | |
535 | ✗ | for (size_t i = 0; i < sizeof(g_crash_signals)/sizeof(int); i++) { | |
536 | ✗ | signal_handlers[g_crash_signals[i]] = sa; | |
537 | } | ||
538 | ✗ | old_signal_handlers_ = SetSignalHandlers(signal_handlers); | |
539 | |||
540 | ✗ | pipe_terminate_ = new Pipe<kPipeThreadTerminator>(); | |
541 | int retval = | ||
542 | ✗ | pthread_create(&thread_listener_, NULL, MainWatchdogListener, this); | |
543 | ✗ | assert(retval == 0); | |
544 | |||
545 | ✗ | pipe_watchdog_->Write(ControlFlow::kSupervise); | |
546 | ✗ | size_t path_size = crash_dump_path.size(); | |
547 | ✗ | pipe_watchdog_->Write(path_size); | |
548 | ✗ | if (path_size > 0) { | |
549 | ✗ | pipe_watchdog_->Write(crash_dump_path.data(), path_size); | |
550 | } | ||
551 | |||
552 | ✗ | spawned_ = true; | |
553 | } | ||
554 | |||
555 | |||
556 | ✗ | void *Watchdog::MainWatchdogListener(void *data) { | |
557 | ✗ | Watchdog *watchdog = static_cast<Watchdog *>(data); | |
558 | ✗ | LogCvmfs(kLogMonitor, kLogDebug, "starting watchdog listener"); | |
559 | |||
560 | struct pollfd watch_fds[2]; | ||
561 | ✗ | watch_fds[0].fd = watchdog->pipe_listener_->GetReadFd(); | |
562 | ✗ | watch_fds[0].events = 0; // Only check for POLL[ERR,HUP,NVAL] in revents | |
563 | ✗ | watch_fds[0].revents = 0; | |
564 | ✗ | watch_fds[1].fd = watchdog->pipe_terminate_->GetReadFd(); | |
565 | ✗ | watch_fds[1].events = POLLIN | POLLPRI; | |
566 | ✗ | watch_fds[1].revents = 0; | |
567 | while (true) { | ||
568 | ✗ | int retval = poll(watch_fds, 2, -1); | |
569 | ✗ | if (retval < 0) { | |
570 | ✗ | continue; | |
571 | } | ||
572 | |||
573 | // Terminate I/O thread | ||
574 | ✗ | if (watch_fds[1].revents) | |
575 | ✗ | break; | |
576 | |||
577 | ✗ | if (watch_fds[0].revents) { | |
578 | ✗ | if ((watch_fds[0].revents & POLLERR) || | |
579 | ✗ | (watch_fds[0].revents & POLLHUP) || | |
580 | ✗ | (watch_fds[0].revents & POLLNVAL)) | |
581 | { | ||
582 | ✗ | LogCvmfs(kLogMonitor, kLogDebug | kLogSyslogErr, | |
583 | "watchdog disappeared, disabling stack trace reporting " | ||
584 | "(revents: %d / %d|%d|%d)", | ||
585 | ✗ | watch_fds[0].revents, POLLERR, POLLHUP, POLLNVAL); | |
586 | ✗ | watchdog->SetSignalHandlers(watchdog->old_signal_handlers_); | |
587 | ✗ | PANIC(kLogDebug | kLogSyslogErr, "watchdog disappeared, aborting"); | |
588 | } | ||
589 | ✗ | PANIC(NULL); | |
590 | } | ||
591 | } | ||
592 | |||
593 | ✗ | LogCvmfs(kLogMonitor, kLogDebug, "stopping watchdog listener"); | |
594 | ✗ | return NULL; | |
595 | } | ||
596 | |||
597 | |||
598 | ✗ | void Watchdog::Supervise() { | |
599 | ✗ | ControlFlow::Flags control_flow = ControlFlow::kUnknown; | |
600 | |||
601 | ✗ | if (!pipe_watchdog_->TryRead<ControlFlow::Flags>(&control_flow)) { | |
602 | ✗ | LogEmergency("watchdog: unexpected termination (" + | |
603 | ✗ | StringifyInt(control_flow) + ")"); | |
604 | ✗ | if (on_crash_) on_crash_(); | |
605 | } else { | ||
606 | ✗ | switch (control_flow) { | |
607 | ✗ | case ControlFlow::kProduceStacktrace: | |
608 | ✗ | LogEmergency(ReportStacktrace()); | |
609 | ✗ | if (on_crash_) on_crash_(); | |
610 | ✗ | break; | |
611 | |||
612 | ✗ | case ControlFlow::kQuit: | |
613 | ✗ | break; | |
614 | |||
615 | ✗ | default: | |
616 | ✗ | LogEmergency("watchdog: unexpected error"); | |
617 | ✗ | break; | |
618 | } | ||
619 | } | ||
620 | } | ||
621 | |||
622 | |||
623 | 1 | Watchdog::Watchdog(FnOnCrash on_crash) | |
624 | 1 | : spawned_(false) | |
625 |
1/2✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
|
1 | , exe_path_(string(platform_getexepath())) |
626 | 1 | , watchdog_pid_(0) | |
627 |
3/6✓ Branch 3 taken 1 times.
✗ Branch 4 not taken.
✓ Branch 6 taken 1 times.
✗ Branch 7 not taken.
✓ Branch 9 taken 1 times.
✗ Branch 10 not taken.
|
2 | , on_crash_(on_crash) |
628 | { | ||
629 | 1 | int retval = platform_spinlock_init(&lock_handler_, 0); | |
630 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
|
1 | assert(retval == 0); |
631 | 1 | memset(&sighandler_stack_, 0, sizeof(sighandler_stack_)); | |
632 | 1 | } | |
633 | |||
634 | |||
635 | 1 | Watchdog::~Watchdog() { | |
636 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
|
1 | if (spawned_) { |
637 | // Reset signal handlers | ||
638 | ✗ | signal(SIGQUIT, SIG_DFL); | |
639 | ✗ | signal(SIGILL, SIG_DFL); | |
640 | ✗ | signal(SIGABRT, SIG_DFL); | |
641 | ✗ | signal(SIGFPE, SIG_DFL); | |
642 | ✗ | signal(SIGSEGV, SIG_DFL); | |
643 | ✗ | signal(SIGBUS, SIG_DFL); | |
644 | ✗ | signal(SIGPIPE, SIG_DFL); | |
645 | ✗ | signal(SIGXFSZ, SIG_DFL); | |
646 | ✗ | free(sighandler_stack_.ss_sp); | |
647 | ✗ | sighandler_stack_.ss_size = 0; | |
648 | |||
649 | ✗ | pipe_terminate_->Write(ControlFlow::kQuit); | |
650 | ✗ | pthread_join(thread_listener_, NULL); | |
651 | ✗ | pipe_terminate_->Close(); | |
652 | } | ||
653 | |||
654 | 1 | pipe_watchdog_->Write(ControlFlow::kQuit); | |
655 | 1 | pipe_watchdog_->CloseWriteFd(); | |
656 | 1 | pipe_listener_->CloseReadFd(); | |
657 | |||
658 | 1 | platform_spinlock_destroy(&lock_handler_); | |
659 | 1 | LogCvmfs(kLogMonitor, kLogDebug, "monitor stopped"); | |
660 | 1 | instance_ = NULL; | |
661 | 1 | } | |
662 |