Directory: | cvmfs/ |
---|---|
File: | cvmfs/monitor.cc |
Date: | 2025-06-22 02:36:02 |
Exec | Total | Coverage | |
---|---|---|---|
Lines: | 85 | 320 | 26.6% |
Branches: | 60 | 500 | 12.0% |
Line | Branch | Exec | Source |
---|---|---|---|
1 | /** | ||
2 | * This file is part of the CernVM File System. | ||
3 | * | ||
4 | * This module forks a watchdog process that listens on | ||
5 | * a pipe and prints a stack trace into syslog when cvmfs | ||
6 | * fails. | ||
7 | * | ||
8 | * Also, it handles getting and setting the maximum number of file descriptors. | ||
9 | */ | ||
10 | |||
11 | |||
12 | #include "monitor.h" | ||
13 | |||
14 | #include <errno.h> | ||
15 | #include <execinfo.h> | ||
16 | #include <poll.h> | ||
17 | #include <pthread.h> | ||
18 | #include <signal.h> | ||
19 | #include <sys/resource.h> | ||
20 | #include <sys/types.h> | ||
21 | #ifdef __APPLE__ | ||
22 | #include <sys/ucontext.h> | ||
23 | #else | ||
24 | #include <ucontext.h> | ||
25 | #endif | ||
26 | #include <sys/uio.h> | ||
27 | #include <sys/wait.h> | ||
28 | #include <syslog.h> | ||
29 | #include <time.h> | ||
30 | #include <unistd.h> | ||
31 | |||
32 | #include <cassert> | ||
33 | #include <cstdio> | ||
34 | #include <cstdlib> | ||
35 | #include <cstring> | ||
36 | #include <map> | ||
37 | #include <set> | ||
38 | #include <string> | ||
39 | #include <vector> | ||
40 | |||
41 | #if defined(CVMFS_FUSE_MODULE) | ||
42 | #include "cvmfs.h" | ||
43 | #endif | ||
44 | #include "util/exception.h" | ||
45 | #include "util/logging.h" | ||
46 | #include "util/platform.h" | ||
47 | #include "util/posix.h" | ||
48 | #include "util/smalloc.h" | ||
49 | #include "util/string.h" | ||
50 | |||
51 | // Used for address offset calculation | ||
52 | #if defined(CVMFS_FUSE_MODULE) | ||
53 | extern loader::CvmfsExports *g_cvmfs_exports; | ||
54 | #endif | ||
55 | |||
56 | using namespace std; // NOLINT | ||
57 | |||
// Singleton instance; set by Create(), cleared by the destructor.
Watchdog *Watchdog::instance_ = NULL;

// Signals the watchdog process itself ignores (it only reports them);
// installed by WaitForSupervisee() via ReportSignalAndContinue.
int Watchdog::g_suppressed_signals[] = {
    SIGHUP, SIGINT, SIGQUIT, SIGILL, SIGABRT, SIGBUS, SIGFPE,
    SIGUSR1, SIGSEGV, SIGUSR2, SIGTERM, SIGXCPU, SIGXFSZ};

// Signals in the supervised client that trigger stack trace generation;
// handlers installed by Spawn() via SendTrace.
int Watchdog::g_crash_signals[] = {SIGQUIT, SIGILL, SIGABRT, SIGFPE,
                                   SIGSEGV, SIGBUS, SIGPIPE, SIGXFSZ};
66 | |||
/**
 * Creates the singleton watchdog instance and immediately forks the
 * watchdog process.  The forked watchdog idles until Spawn() is called.
 * Must be called at most once per process (asserts on a second call).
 */
Watchdog *Watchdog::Create(FnOnCrash on_crash) {
  assert(instance_ == NULL);
  instance_ = new Watchdog(on_crash);
  instance_->Fork();
  return instance_;
}
73 | |||
74 | |||
/**
 * Uses an external shell and gdb to create a full stack trace of the dying
 * process.  The same shell is used to force-quit the client afterwards.
 *
 * @param pid  PID of the dying cvmfs process to attach to
 * @return     human-readable stack trace report (including any errors that
 *             occurred while producing it)
 */
string Watchdog::GenerateStackTrace(pid_t pid) {
  int retval;
  string result = "";

  // re-gain root permissions to allow for ptrace of died cvmfs2 process
  const bool retrievable = true;
  if (!SwitchCredentials(0, getgid(), retrievable)) {
    result += "failed to re-gain root permissions... still give it a try\n";
  }

  // run gdb (lldb on macOS) and attach to the dying process
  int fd_stdin;
  int fd_stdout;
  int fd_stderr;
  vector<string> argv;
  argv.push_back("-p");
  argv.push_back(StringifyInt(pid));
  pid_t gdb_pid = 0;
  const bool double_fork = false;
  retval = ExecuteBinary(&fd_stdin,
                         &fd_stdout,
                         &fd_stderr,
#ifdef __APPLE__
                         "lldb",
#else
                         "gdb",
#endif
                         argv,
                         double_fork,
                         &gdb_pid);
  assert(retval);


  // Skip the gdb startup output
  ReadUntilGdbPrompt(fd_stdout);

  // Send stacktrace command to gdb
#ifdef __APPLE__
  const string gdb_cmd = "bt all\n"
                         "quit\n";
#else
  const string gdb_cmd = "thread apply all bt\n"
                         "quit\n";
#endif
  // The execve can have failed, which can't be detected in ExecuteBinary.
  // Instead, writing to the pipe will fail.
  const ssize_t nbytes = write(fd_stdin, gdb_cmd.data(), gdb_cmd.length());
  if ((nbytes < 0) || (static_cast<unsigned>(nbytes) != gdb_cmd.length())) {
    result += "failed to start gdb/lldb (" + StringifyInt(nbytes)
              + " bytes "
                "written, errno "
              + StringifyInt(errno) + ")\n";
    return result;
  }

  // Read the stack trace from the stdout of our gdb process
#ifdef __APPLE__
  // lldb has one more prompt
  result += ReadUntilGdbPrompt(fd_stdout);
#endif
  result += ReadUntilGdbPrompt(fd_stdout) + "\n\n";

  // Check for output on stderr (non-blocking drain so we never hang here)
  string result_err;
  Block2Nonblock(fd_stderr);
  char cbuf;
  while (read(fd_stderr, &cbuf, 1) == 1)
    result_err.push_back(cbuf);
  if (!result_err.empty())
    result += "\nError output:\n" + result_err + "\n";

  // Close the connection to the terminated gdb process
  close(fd_stderr);
  close(fd_stdout);
  close(fd_stdin);

  // Make sure gdb has terminated (wait for it for a short while)
  unsigned int timeout = 15;
  int statloc;
  while (timeout > 0 && waitpid(gdb_pid, &statloc, WNOHANG) != gdb_pid) {
    --timeout;
    SafeSleepMs(1000);
  }

  // when the timeout expired, gdb probably hangs... we need to kill it
  if (timeout == 0) {
    result += "gdb did not exit as expected. sending SIGKILL... ";
    result += (kill(gdb_pid, SIGKILL) != 0) ? "failed\n" : "okay\n";
  }

  return result;
}
171 | |||
172 | |||
173 | ✗ | pid_t Watchdog::GetPid() { | |
174 | ✗ | if (instance_ != NULL) { | |
175 | ✗ | return instance_->watchdog_pid_; | |
176 | } | ||
177 | ✗ | return getpid(); | |
178 | } | ||
179 | |||
/**
 * Log a string to syslog and into the crash dump file.
 * We expect ideally nothing to be logged, so that file is created on demand.
 *
 * @param msg  message to report; annotated with a timestamp and the crash
 *             dump file status before being sent to syslog
 */
void Watchdog::LogEmergency(string msg) {
  char ctime_buffer[32];  // ctime_r requires a buffer of at least 26 bytes

  if (!crash_dump_path_.empty()) {
    // Append to the crash dump file; failures fall through to syslog only
    FILE *fp = fopen(crash_dump_path_.c_str(), "a");
    if (fp) {
      const time_t now = time(NULL);
      msg += "\nTimestamp: " + string(ctime_r(&now, ctime_buffer));
      if (fwrite(&msg[0], 1, msg.length(), fp) != msg.length()) {
        msg += " (failed to report into crash dump file " + crash_dump_path_
               + ")";
      } else {
        msg += "\n Crash logged also on file: " + crash_dump_path_ + "\n";
      }
      fclose(fp);
    } else {
      msg += " (failed to open crash dump file " + crash_dump_path_ + ")";
    }
  }
  // Syslog always receives the (possibly annotated) message
  LogCvmfs(kLogMonitor, kLogSyslogErr, "%s", msg.c_str());
}
205 | |||
/**
 * Reads from the file descriptor until the specific gdb prompt is reached or
 * the pipe gets broken.
 *
 * @param fd_pipe the file descriptor of the pipe to be read
 * @return the data read from the pipe
 */
string Watchdog::ReadUntilGdbPrompt(int fd_pipe) {
#ifdef __APPLE__
  static const string gdb_prompt = "(lldb)";
#else
  static const string gdb_prompt = "\n(gdb) ";
#endif

  string result;
  char mini_buffer;
  int chars_io;
  // Position inside gdb_prompt of the partial match seen so far
  unsigned int ring_buffer_pos = 0;

  // read from stdout of gdb until gdb prompt occurs --> (gdb)
  while (1) {
    chars_io = read(fd_pipe, &mini_buffer, 1);

    // in case something goes wrong (EOF or read error)...
    if (chars_io <= 0)
      break;

    result += mini_buffer;

    // find the gdb_prompt in the stdout data: advance the match position on
    // a matching character, reset it otherwise
    if (mini_buffer == gdb_prompt[ring_buffer_pos]) {
      ++ring_buffer_pos;
      if (ring_buffer_pos == gdb_prompt.size()) {
        break;
      }
    } else {
      ring_buffer_pos = 0;
    }
  }

  return result;
}
248 | |||
249 | |||
/**
 * Generates useful information from the backtrace log in the pipe.
 * Reads the CrashData record sent by SendTrace(), produces the stack trace
 * of the dying process and finally kills it with SIGKILL.
 *
 * @return the assembled crash report text
 */
string Watchdog::ReportStacktrace() {
  CrashData crash_data;
  if (!pipe_watchdog_->TryRead<CrashData>(&crash_data)) {
    return "failed to read crash data (" + StringifyInt(errno) + ")";
  }

  string debug = "--\n";
  debug += "Signal: " + StringifyInt(crash_data.signal);
  debug += ", errno: " + StringifyInt(crash_data.sys_errno);
  debug += ", version: " + string(CVMFS_VERSION);
  debug += ", PID: " + StringifyInt(crash_data.pid) + "\n";
  debug += "Executable path: " + exe_path_ + "\n";

  debug += GenerateStackTrace(crash_data.pid);

  // Give the dying process the finishing stroke
  if (kill(crash_data.pid, SIGKILL) != 0) {
    debug += "Failed to kill cvmfs client! (";
    switch (errno) {
      case EINVAL:
        debug += "invalid signal";
        break;
      case EPERM:
        debug += "permission denied";
        break;
      case ESRCH:
        debug += "no such process";
        break;
      default:
        debug += "unknown error " + StringifyInt(errno);
    }
    debug += ")\n\n";
  }

  return debug;
}
289 | |||
290 | |||
/**
 * Signal handler installed in the watchdog process for signals it is not
 * supposed to receive: only reports the event to syslog, does not terminate.
 */
void Watchdog::ReportSignalAndContinue(int sig, siginfo_t *siginfo,
                                       void * /* context */) {
  LogCvmfs(kLogMonitor, kLogSyslogErr,
           "watchdog: received unexpected signal %d from PID %d / UID %d", sig,
           siginfo->si_pid, siginfo->si_uid);
}
297 | |||
298 | |||
/**
 * Crash signal handler of the supervised process.  Notifies the watchdog via
 * the pipe and then parks the process until the watchdog, having produced the
 * stack trace, kills it.  Runs in async-signal context on the alternate
 * signal stack; the statement order here is deliberate and load-bearing.
 */
void Watchdog::SendTrace(int sig, siginfo_t *siginfo, void *context) {
  // Preserve errno of the crashing code before any calls below clobber it
  const int send_errno = errno;
  if (platform_spinlock_trylock(&Me()->lock_handler_) != 0) {
    // Concurrent call, wait for the first one to exit the process
    while (true) {
    }
  }

  // Set the original signal handler for the raised signal in
  // SIGQUIT (watchdog process will raise SIGQUIT)
  (void)sigaction(SIGQUIT, &(Me()->old_signal_handlers_[sig]), NULL);

  // Inform the watchdog that CernVM-FS crashed
  if (!Me()->pipe_watchdog_->Write(ControlFlow::kProduceStacktrace)) {
    _exit(1);
  }

  // Send crash information to the watchdog
  CrashData crash_data;
  crash_data.signal = sig;
  crash_data.sys_errno = send_errno;
  crash_data.pid = getpid();
  if (!Me()->pipe_watchdog_->Write<CrashData>(crash_data)) {
    _exit(1);
  }

  // Do not die before the stack trace was generated
  // kill -SIGQUIT <pid> will finish this
  int counter = 0;
  while (true) {
    SafeSleepMs(100);
    // quit anyway after 30 seconds
    if (++counter == 300) {
      LogCvmfs(kLogCvmfs, kLogSyslogErr, "stack trace generation failed");
      // Last attempt to log something useful
#if defined(CVMFS_FUSE_MODULE)
      LogCvmfs(kLogCvmfs, kLogSyslogErr, "Signal %d, errno %d", sig,
               send_errno);
      void *addr[kMaxBacktrace];
      // Note: this doesn't work due to the signal stack on OS X (it works on
      // Linux). Since anyway lldb is supposed to produce the backtrace, we
      // consider it more important to protect cvmfs against stack overflows.
      const int num_addr = backtrace(addr, kMaxBacktrace);
      char **symbols = backtrace_symbols(addr, num_addr);
      string backtrace = "Backtrace (" + StringifyInt(num_addr)
                         + " symbols):\n";
      for (int i = 0; i < num_addr; ++i)
        backtrace += string(symbols[i]) + "\n";
      LogCvmfs(kLogCvmfs, kLogSyslogErr, "%s", backtrace.c_str());
      LogCvmfs(kLogCvmfs, kLogSyslogErr, "address of g_cvmfs_exports: %p",
               &g_cvmfs_exports);
#endif

      _exit(1);
    }
  }

  _exit(1);
}
358 | |||
359 | |||
360 | /** | ||
361 | * Sets the signal handlers of the current process according to the ones | ||
362 | * defined in the given SigactionMap. | ||
363 | * | ||
364 | * @param signal_handlers a map of SIGNAL -> struct sigaction | ||
365 | * @return a SigactionMap containing the old handlers | ||
366 | */ | ||
367 | 49 | Watchdog::SigactionMap Watchdog::SetSignalHandlers( | |
368 | const SigactionMap &signal_handlers) { | ||
369 | 49 | SigactionMap old_signal_handlers; | |
370 | 49 | SigactionMap::const_iterator i = signal_handlers.begin(); | |
371 | 49 | const SigactionMap::const_iterator iend = signal_handlers.end(); | |
372 |
2/2✓ Branch 1 taken 637 times.
✓ Branch 2 taken 49 times.
|
686 | for (; i != iend; ++i) { |
373 | struct sigaction old_signal_handler; | ||
374 |
1/2✗ Branch 3 not taken.
✓ Branch 4 taken 637 times.
|
637 | if (sigaction(i->first, &i->second, &old_signal_handler) != 0) { |
375 | ✗ | PANIC(NULL); | |
376 | } | ||
377 |
1/2✓ Branch 2 taken 637 times.
✗ Branch 3 not taken.
|
637 | old_signal_handlers[i->first] = old_signal_handler; |
378 | } | ||
379 | |||
380 | 98 | return old_signal_handlers; | |
381 | } | ||
382 | |||
383 | |||
/**
 * Fork the watchdog process and put it on hold until Spawn() is called.
 *
 * Uses a double fork so that the watchdog is reparented to init and no
 * zombie is left behind; the watchdog reports its own PID back through
 * pipe_pid.  The parent (supervisee) returns with watchdog_pid_ set.
 */
void Watchdog::Fork() {
  Pipe<kPipeWatchdogPid> pipe_pid;
  // supervisee -> watchdog control channel
  pipe_watchdog_ = new Pipe<kPipeWatchdog>();
  // watchdog -> supervisee liveness channel (only its closure is observed)
  pipe_listener_ = new Pipe<kPipeWatchdogSupervisor>();

  pid_t pid;
  int statloc;
  switch (pid = fork()) {
    case -1:
      PANIC(NULL);
    case 0:
      // Double fork to avoid zombie
      switch (fork()) {
        case -1:
          _exit(1);
        case 0: {
          // Grandchild: this becomes the watchdog process
          pipe_watchdog_->CloseWriteFd();
          Daemonize();
          // send the watchdog PID to the supervisee
          const pid_t watchdog_pid = getpid();
          pipe_pid.Write(watchdog_pid);
          pipe_pid.CloseWriteFd();
          // Close all unused file descriptors
          // close also usyslog, only get it back if necessary
          // string usyslog_save = GetLogMicroSyslog();
          const string debuglog_save = GetLogDebugFile();
          SetLogDebugFile("");
          const string usyslog_save = GetLogMicroSyslog();
          SetLogMicroSyslog("");
          // Gracefully close the syslog before closing all fds. The next call
          // to syslog will reopen it.
          closelog();
          // Let's keep stdin, stdout, stderr open at /dev/null (daemonized)
          // in order to prevent accidental outputs from messing with another
          // file descriptor
          std::set<int> preserve_fds;
          preserve_fds.insert(0);
          preserve_fds.insert(1);
          preserve_fds.insert(2);
          preserve_fds.insert(pipe_watchdog_->GetReadFd());
          preserve_fds.insert(pipe_listener_->GetWriteFd());
          CloseAllFildes(preserve_fds);
          SetLogMicroSyslog(usyslog_save);  // no-op if usyslog not used
          SetLogDebugFile(debuglog_save);   // no-op if debug log not used

          // Block until the supervisee either starts supervision or cancels
          if (WaitForSupervisee())
            Supervise();

          pipe_watchdog_->CloseReadFd();
          pipe_listener_->CloseWriteFd();
          exit(0);
        }
        default:
          // Intermediate child exits immediately (completes the double fork)
          _exit(0);
      }
    default:
      // Parent (supervisee): keep only its ends of the pipes
      pipe_watchdog_->CloseReadFd();
      pipe_listener_->CloseWriteFd();
      pipe_pid.CloseWriteFd();
      // Reap the intermediate child and verify it exited cleanly
      if (waitpid(pid, &statloc, 0) != pid)
        PANIC(NULL);
      if (!WIFEXITED(statloc) || WEXITSTATUS(statloc))
        PANIC(NULL);
  }

  // retrieve the watchdog PID from the pipe
  pipe_pid.Read(&watchdog_pid_);
  pipe_pid.CloseReadFd();
}
456 | |||
457 | |||
/**
 * Runs in the watchdog process: sets up its signal handling and blocks until
 * the supervisee sends a control flow command through the pipe.
 *
 * @return true if supervision should start (kSupervise received and the
 *         crash dump path, if any, was set up), false on cancel/quit/error
 */
bool Watchdog::WaitForSupervisee() {
  // We want broken pipes not to raise a signal but handle the error in the
  // read/write code
  platform_sighandler_t rv_sig = signal(SIGPIPE, SIG_IGN);
  assert(rv_sig != SIG_ERR);

  // The watchdog is not supposed to receive signals. If it does, report it.
  struct sigaction sa;
  memset(&sa, 0, sizeof(sa));
  sa.sa_sigaction = ReportSignalAndContinue;
  sa.sa_flags = SA_SIGINFO;
  sigfillset(&sa.sa_mask);

  SigactionMap signal_handlers;
  for (size_t i = 0; i < sizeof(g_suppressed_signals) / sizeof(int); i++) {
    signal_handlers[g_suppressed_signals[i]] = sa;
  }
  SetSignalHandlers(signal_handlers);

  ControlFlow::Flags control_flow = ControlFlow::kUnknown;

  // A failed read means the supervisee closed its pipe end (e.g. it exited
  // before calling Spawn())
  if (!pipe_watchdog_->TryRead(&control_flow)) {
    LogCvmfs(kLogMonitor, kLogDebug, "supervisee canceled watchdog");
    return false;
  }

  switch (control_flow) {
    case ControlFlow::kQuit:
      return false;
    case ControlFlow::kSupervise:
      break;
    default:
      LogEmergency("Internal error: invalid control flow");
      return false;
  }

  // Receive the crash dump path (length-prefixed); an empty path disables
  // the crash dump file
  size_t size;
  pipe_watchdog_->Read(&size);
  crash_dump_path_.resize(size);
  if (size > 0) {
    pipe_watchdog_->Read(&crash_dump_path_[0], size);

    // chdir to the dump directory so the (relative) file can be written even
    // after the supervisee's working directory is gone
    const int retval = chdir(GetParentPath(crash_dump_path_).c_str());
    if (retval != 0) {
      LogEmergency(std::string("Cannot change to crash dump directory: ")
                   + crash_dump_path_);
      return false;
    }
    crash_dump_path_ = GetFileName(crash_dump_path_);
  }
  return true;
}
510 | |||
/**
 * Set up the signal handling and kick off the supervision.
 *
 * Runs in the supervised process: installs the crash signal handlers on an
 * alternate stack, starts the listener thread that detects watchdog death,
 * and tells the forked watchdog to start supervising.
 *
 * @param crash_dump_path  file the watchdog appends crash reports to;
 *                         empty string disables the crash dump file
 */
void Watchdog::Spawn(const std::string &crash_dump_path) {
  // lower restrictions for ptrace
  if (!platform_allow_ptrace(watchdog_pid_)) {
    LogCvmfs(kLogMonitor, kLogSyslogWarn,
             "failed to allow ptrace() for watchdog (PID: %d). "
             "Post crash stacktrace might not work",
             watchdog_pid_);
  }

  // Extra stack for signal handlers (so SIGSEGV from a stack overflow can
  // still be handled)
  const int stack_size = kSignalHandlerStacksize;  // 2 MB
  sighandler_stack_.ss_sp = smalloc(stack_size);
  sighandler_stack_.ss_size = stack_size;
  sighandler_stack_.ss_flags = 0;
  if (sigaltstack(&sighandler_stack_, NULL) != 0)
    PANIC(NULL);

  // define our crash signal handler
  struct sigaction sa;
  memset(&sa, 0, sizeof(sa));
  sa.sa_sigaction = SendTrace;
  sa.sa_flags = SA_SIGINFO | SA_ONSTACK;
  sigfillset(&sa.sa_mask);

  SigactionMap signal_handlers;
  for (size_t i = 0; i < sizeof(g_crash_signals) / sizeof(int); i++) {
    signal_handlers[g_crash_signals[i]] = sa;
  }
  // Keep the old handlers so SendTrace/the destructor can restore them
  old_signal_handlers_ = SetSignalHandlers(signal_handlers);

  // Listener thread that notices if the watchdog disappears
  pipe_terminate_ = new Pipe<kPipeThreadTerminator>();
  const int retval =
      pthread_create(&thread_listener_, NULL, MainWatchdogListener, this);
  assert(retval == 0);

  // Tell the watchdog to start supervision and hand over the dump path
  pipe_watchdog_->Write(ControlFlow::kSupervise);
  const size_t path_size = crash_dump_path.size();
  pipe_watchdog_->Write(path_size);
  if (path_size > 0) {
    pipe_watchdog_->Write(crash_dump_path.data(), path_size);
  }

  spawned_ = true;
}
558 | |||
559 | |||
/**
 * Listener thread in the supervised process.  Polls the listener pipe for
 * error conditions (POLLERR/POLLHUP/POLLNVAL) that indicate the watchdog
 * died, and the terminate pipe for the shutdown request from the destructor.
 *
 * @param data  the Watchdog instance (passed via pthread_create)
 */
void *Watchdog::MainWatchdogListener(void *data) {
  Watchdog *watchdog = static_cast<Watchdog *>(data);
  LogCvmfs(kLogMonitor, kLogDebug, "starting watchdog listener");

  struct pollfd watch_fds[2];
  watch_fds[0].fd = watchdog->pipe_listener_->GetReadFd();
  watch_fds[0].events = 0;  // Only check for POLL[ERR,HUP,NVAL] in revents
  watch_fds[0].revents = 0;
  watch_fds[1].fd = watchdog->pipe_terminate_->GetReadFd();
  watch_fds[1].events = POLLIN | POLLPRI;
  watch_fds[1].revents = 0;
  while (true) {
    // Infinite timeout; retry on EINTR and other poll errors
    const int retval = poll(watch_fds, 2, -1);
    if (retval < 0) {
      continue;
    }

    // Terminate I/O thread
    if (watch_fds[1].revents)
      break;

    if (watch_fds[0].revents) {
      if ((watch_fds[0].revents & POLLERR) || (watch_fds[0].revents & POLLHUP)
          || (watch_fds[0].revents & POLLNVAL)) {
        // Watchdog end of the pipe closed: restore the original handlers so
        // crashes no longer hang waiting for a dead watchdog, then abort
        LogCvmfs(kLogMonitor, kLogDebug | kLogSyslogErr,
                 "watchdog disappeared, disabling stack trace reporting "
                 "(revents: %d / %d|%d|%d)",
                 watch_fds[0].revents, POLLERR, POLLHUP, POLLNVAL);
        watchdog->SetSignalHandlers(watchdog->old_signal_handlers_);
        PANIC(kLogDebug | kLogSyslogErr, "watchdog disappeared, aborting");
      }
      // Any other event on this pipe is unexpected
      PANIC(NULL);
    }
  }

  LogCvmfs(kLogMonitor, kLogDebug, "stopping watchdog listener");
  return NULL;
}
598 | |||
599 | |||
/**
 * Runs in the watchdog process: blocks on the control pipe until the
 * supervisee either crashes (kProduceStacktrace), quits cleanly (kQuit), or
 * dies without notice (pipe read fails).  Produces the crash report and
 * invokes the on-crash callback where appropriate.
 */
void Watchdog::Supervise() {
  ControlFlow::Flags control_flow = ControlFlow::kUnknown;

  if (!pipe_watchdog_->TryRead<ControlFlow::Flags>(&control_flow)) {
    // Pipe closed without a command: the supervisee died unexpectedly
    LogEmergency("watchdog: unexpected termination ("
                 + StringifyInt(control_flow) + ")");
    if (on_crash_)
      on_crash_();
  } else {
    switch (control_flow) {
      case ControlFlow::kProduceStacktrace:
        LogEmergency(ReportStacktrace());
        if (on_crash_)
          on_crash_();
        break;

      case ControlFlow::kQuit:
        // Clean shutdown requested by the supervisee
        break;

      default:
        LogEmergency("watchdog: unexpected error");
        break;
    }
  }
}
625 | |||
626 | |||
/**
 * Constructor: records the executable path (used in crash reports) and
 * initializes the spinlock guarding the crash signal handler.  The actual
 * watchdog process is created later by Fork() (via Create()).
 */
Watchdog::Watchdog(FnOnCrash on_crash)
    : spawned_(false)
    , exe_path_(string(platform_getexepath()))
    , watchdog_pid_(0)
    , on_crash_(on_crash) {
  const int retval = platform_spinlock_init(&lock_handler_, 0);
  assert(retval == 0);
  memset(&sighandler_stack_, 0, sizeof(sighandler_stack_));
}
636 | |||
637 | |||
/**
 * Destructor: restores default signal handlers (if Spawn() ran), stops the
 * listener thread, tells the watchdog process to quit, and clears the
 * singleton pointer.
 */
Watchdog::~Watchdog() {
  if (spawned_) {
    // Reset signal handlers
    signal(SIGQUIT, SIG_DFL);
    signal(SIGILL, SIG_DFL);
    signal(SIGABRT, SIG_DFL);
    signal(SIGFPE, SIG_DFL);
    signal(SIGSEGV, SIG_DFL);
    signal(SIGBUS, SIG_DFL);
    signal(SIGPIPE, SIG_DFL);
    signal(SIGXFSZ, SIG_DFL);
    free(sighandler_stack_.ss_sp);
    sighandler_stack_.ss_size = 0;

    // Shut down the listener thread before tearing down the pipes it polls
    pipe_terminate_->Write(ControlFlow::kQuit);
    pthread_join(thread_listener_, NULL);
    pipe_terminate_->Close();
  }

  // Ask the watchdog process to exit gracefully
  pipe_watchdog_->Write(ControlFlow::kQuit);
  pipe_watchdog_->CloseWriteFd();
  pipe_listener_->CloseReadFd();

  platform_spinlock_destroy(&lock_handler_);
  LogCvmfs(kLogMonitor, kLogDebug, "monitor stopped");
  instance_ = NULL;
}
665 |