GCC Code Coverage Report


Directory: cvmfs/
File: cvmfs/monitor.cc
Date: 2026-03-15 02:35:27
Exec Total Coverage
Lines: 90 357 25.2%
Branches: 67 553 12.1%

Line Branch Exec Source
1 /**
2 * This file is part of the CernVM File System.
3 *
4 * This module forks a watchdog process that listens on
5 * a pipe and prints a stack trace into syslog when cvmfs
6 * fails.
7 *
8 * Also, it handles getting and setting the maximum number of file descriptors.
9 */
10
11
12 #include "monitor.h"
13
14 #include <errno.h>
15 #include <execinfo.h>
16 #include <poll.h>
17 #include <pthread.h>
18 #include <signal.h>
19 #include <sys/resource.h>
20 #include <sys/types.h>
21 #ifdef __APPLE__
22 #include <sys/ucontext.h>
23 #else
24 #include <ucontext.h>
25 #endif
26 #include <sys/uio.h>
27 #include <sys/wait.h>
28 #include <syslog.h>
29 #include <time.h>
30 #include <unistd.h>
31
32 #include <cassert>
33 #include <cstdio>
34 #include <cstdlib>
35 #include <cstring>
36 #include <map>
37 #include <set>
38 #include <string>
39 #include <vector>
40
41 #if defined(CVMFS_FUSE_MODULE)
42 #include "cvmfs.h"
43 #endif
44 #include "util/capabilities.h"
45 #include "util/exception.h"
46 #include "util/logging.h"
47 #include "util/platform.h"
48 #include "util/posix.h"
49 #include "util/smalloc.h"
50 #include "util/string.h"
51
52 // Used for address offset calculation
53 #if defined(CVMFS_FUSE_MODULE)
54 extern loader::CvmfsExports *g_cvmfs_exports;
55 #endif
56
57 using namespace std; // NOLINT
58
59 Watchdog *Watchdog::instance_ = NULL;
60
61 int Watchdog::g_suppressed_signals[] = {
62 SIGHUP, SIGINT, SIGQUIT, SIGILL, SIGABRT, SIGBUS, SIGFPE,
63 SIGUSR1, SIGSEGV, SIGUSR2, SIGTERM, SIGXCPU, SIGXFSZ};
64
65 int Watchdog::g_crash_signals[] = {SIGQUIT, SIGILL, SIGABRT, SIGFPE,
66 SIGSEGV, SIGBUS, SIGPIPE, SIGXFSZ};
67
68 49 Watchdog *Watchdog::Create(FnOnExit on_exit, WatchdogState *saved_state) {
69
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 49 times.
49 assert(instance_ == NULL);
70
1/2
✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
49 instance_ = new Watchdog(on_exit);
71
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 49 times.
49 if (saved_state != NULL)
72 instance_->RestoreState(saved_state);
73 else
74 49 instance_->Fork();
75 49 return instance_;
76 }
77
78
79 /**
80 * Uses an external shell and gdb to create a full stack trace of the dying
81 * process. The same shell is used to force-quit the client afterwards.
82 */
83 string Watchdog::GenerateStackTrace(pid_t pid) {
84 int retval;
85 string result = "";
86
87 // Get capability to ptrace the dead main cvmfs2 process.
88 // This is often necessary because the main process can have its own
89 // elevated capability which would otherwise block ptrace.
90 if (!ObtainSysPtraceCapability()) {
91 result += "failed to gain ptrace capability... still give it a try\n";
92 }
93
94 // run gdb and attach to the dying process
95 int fd_stdin;
96 int fd_stdout;
97 int fd_stderr;
98 vector<string> argv;
99 argv.push_back("-p");
100 argv.push_back(StringifyInt(pid));
101 pid_t gdb_pid = 0;
102 const bool double_fork = false;
103 retval = ExecuteBinary(&fd_stdin,
104 &fd_stdout,
105 &fd_stderr,
106 #ifdef __APPLE__
107 "lldb",
108 #else
109 "gdb",
110 #endif
111 argv,
112 double_fork,
113 &gdb_pid);
114 assert(retval);
115
116
117 // Skip the gdb startup output
118 ReadUntilGdbPrompt(fd_stdout);
119
120 // Send stacktrace command to gdb
121 #ifdef __APPLE__
122 const string gdb_cmd = "bt all\n"
123 "quit\n";
124 #else
125 const string gdb_cmd = "thread apply all bt\n"
126 "quit\n";
127 #endif
128 // The execve can have failed, which can't be detected in ExecuteBinary.
129 // Instead, writing to the pipe will fail.
130 const ssize_t nbytes = write(fd_stdin, gdb_cmd.data(), gdb_cmd.length());
131 if ((nbytes < 0) || (static_cast<unsigned>(nbytes) != gdb_cmd.length())) {
132 result += "failed to start gdb/lldb (" + StringifyInt(nbytes)
133 + " bytes "
134 "written, errno "
135 + StringifyInt(errno) + ")\n";
136 return result;
137 }
138
139 // Read the stack trace from the stdout of our gdb process
140 #ifdef __APPLE__
141 // lldb has one more prompt
142 result += ReadUntilGdbPrompt(fd_stdout);
143 #endif
144 result += ReadUntilGdbPrompt(fd_stdout) + "\n\n";
145
146 // Check for output on stderr
147 string result_err;
148 Block2Nonblock(fd_stderr);
149 char cbuf;
150 while (read(fd_stderr, &cbuf, 1) == 1)
151 result_err.push_back(cbuf);
152 if (!result_err.empty())
153 result += "\nError output:\n" + result_err + "\n";
154
155 // Close the connection to the terminated gdb process
156 close(fd_stderr);
157 close(fd_stdout);
158 close(fd_stdin);
159
160 // Make sure gdb has terminated (wait for it for a short while)
161 unsigned int timeout = 15;
162 int statloc;
163 while (timeout > 0 && waitpid(gdb_pid, &statloc, WNOHANG) != gdb_pid) {
164 --timeout;
165 SafeSleepMs(1000);
166 }
167
168 // when the timeout expired, gdb probably hangs... we need to kill it
169 if (timeout == 0) {
170 result += "gdb did not exit as expected. sending SIGKILL... ";
171 result += (kill(gdb_pid, SIGKILL) != 0) ? "failed\n" : "okay\n";
172 }
173
174 return result;
175 }
176
177
178 pid_t Watchdog::GetPid() {
179 if (instance_ != NULL) {
180 return instance_->watchdog_pid_;
181 }
182 return getpid();
183 }
184
185 /**
186 * Log a string to syslog and into the crash dump file.
187 * We expect ideally nothing to be logged, so that file is created on demand.
188 */
189 void Watchdog::LogEmergency(string msg) {
190 char ctime_buffer[32];
191
192 if (!crash_dump_path_.empty()) {
193 FILE *fp = fopen(crash_dump_path_.c_str(), "a");
194 if (fp) {
195 const time_t now = time(NULL);
196 msg += "\nTimestamp: " + string(ctime_r(&now, ctime_buffer));
197 if (fwrite(&msg[0], 1, msg.length(), fp) != msg.length()) {
198 msg += " (failed to report into crash dump file " + crash_dump_path_
199 + ")";
200 } else {
201 msg += "\n Crash logged also on file: " + crash_dump_path_ + "\n";
202 }
203 fclose(fp);
204 } else {
205 msg += " (failed to open crash dump file " + crash_dump_path_ + ")";
206 }
207 }
208 LogCvmfs(kLogMonitor, kLogSyslogErr, "%s", msg.c_str());
209 }
210
211 /**
212 * Reads from the file descriptor until the specific gdb prompt is reached or
213 * the pipe gets broken.
214 *
215 * @param fd_pipe the file descriptor of the pipe to be read
216 * @return the data read from the pipe
217 */
218 string Watchdog::ReadUntilGdbPrompt(int fd_pipe) {
219 #ifdef __APPLE__
220 static const string gdb_prompt = "(lldb)";
221 #else
222 static const string gdb_prompt = "\n(gdb) ";
223 #endif
224
225 string result;
226 char mini_buffer;
227 int chars_io;
228 unsigned int ring_buffer_pos = 0;
229
230 // read from stdout of gdb until gdb prompt occurs --> (gdb)
231 while (1) {
232 chars_io = read(fd_pipe, &mini_buffer, 1);
233
234 // in case something goes wrong...
235 if (chars_io <= 0)
236 break;
237
238 result += mini_buffer;
239
240 // find the gdb_prompt in the stdout data
241 if (mini_buffer == gdb_prompt[ring_buffer_pos]) {
242 ++ring_buffer_pos;
243 if (ring_buffer_pos == gdb_prompt.size()) {
244 break;
245 }
246 } else {
247 ring_buffer_pos = 0;
248 }
249 }
250
251 return result;
252 }
253
254
255 /**
256 * Generates useful information from the backtrace log in the pipe.
257 */
258 string Watchdog::ReportStacktrace() {
259 CrashData crash_data;
260 if (!pipe_watchdog_->TryRead<CrashData>(&crash_data)) {
261 return "failed to read crash data (" + StringifyInt(errno) + ")";
262 }
263
264 string debug = "--\n";
265 debug += "Signal: " + StringifyInt(crash_data.signal);
266 debug += ", errno: " + StringifyInt(crash_data.sys_errno);
267 debug += ", version: " + string(CVMFS_VERSION);
268 debug += ", PID: " + StringifyInt(crash_data.pid) + "\n";
269 debug += "Executable path: " + exe_path_ + "\n";
270
271 debug += GenerateStackTrace(crash_data.pid);
272
273 // Give the dying process the finishing stroke
274 if (kill(crash_data.pid, SIGKILL) != 0) {
275 debug += "Failed to kill cvmfs client! (";
276 switch (errno) {
277 case EINVAL:
278 debug += "invalid signal";
279 break;
280 case EPERM:
281 debug += "permission denied";
282 break;
283 case ESRCH:
284 debug += "no such process";
285 break;
286 default:
287 debug += "unknown error " + StringifyInt(errno);
288 }
289 debug += ")\n\n";
290 }
291
292 return debug;
293 }
294
295
296 void Watchdog::ReportSignalAndContinue(int sig, siginfo_t *siginfo,
297 void * /* context */) {
298 LogCvmfs(kLogMonitor, kLogSyslogErr,
299 "watchdog: received unexpected signal %d from PID %d / UID %d", sig,
300 siginfo->si_pid, siginfo->si_uid);
301 }
302
303
304 void Watchdog::SendTrace(int sig, siginfo_t *siginfo, void *context) {
305 const int send_errno = errno;
306 if (platform_spinlock_trylock(&Me()->lock_handler_) != 0) {
307 // Concurrent call, wait for the first one to exit the process
308 while (true) {
309 }
310 }
311
312 // Install the original handler of the raised signal as the handler for
313 // SIGQUIT (watchdog process will raise SIGQUIT)
314 (void)sigaction(SIGQUIT, &(Me()->old_signal_handlers_[sig]), NULL);
315
316 // Inform the watchdog that CernVM-FS crashed
317 if (!Me()->pipe_watchdog_->Write(ControlFlow::kProduceStacktrace)) {
318 _exit(1);
319 }
320
321 // Send crash information to the watchdog
322 CrashData crash_data;
323 crash_data.signal = sig;
324 crash_data.sys_errno = send_errno;
325 crash_data.pid = getpid();
326 if (!Me()->pipe_watchdog_->Write<CrashData>(crash_data)) {
327 _exit(1);
328 }
329
330 // Do not die before the stack trace was generated
331 // kill -SIGQUIT <pid> will finish this
332 int counter = 0;
333 while (true) {
334 SafeSleepMs(100);
335 // quit anyway after 30 seconds
336 if (++counter == 300) {
337 LogCvmfs(kLogCvmfs, kLogSyslogErr, "stack trace generation failed");
338 // Last attempt to log something useful
339 #if defined(CVMFS_FUSE_MODULE)
340 LogCvmfs(kLogCvmfs, kLogSyslogErr, "Signal %d, errno %d", sig,
341 send_errno);
342 void *addr[kMaxBacktrace];
343 // Note: this doesn't work due to the signal stack on OS X (it works on
344 // Linux). Since anyway lldb is supposed to produce the backtrace, we
345 // consider it more important to protect cvmfs against stack overflows.
346 const int num_addr = backtrace(addr, kMaxBacktrace);
347 char **symbols = backtrace_symbols(addr, num_addr);
348 string backtrace = "Backtrace (" + StringifyInt(num_addr)
349 + " symbols):\n";
350 for (int i = 0; i < num_addr; ++i)
351 backtrace += string(symbols[i]) + "\n";
352 LogCvmfs(kLogCvmfs, kLogSyslogErr, "%s", backtrace.c_str());
353 LogCvmfs(kLogCvmfs, kLogSyslogErr, "address of g_cvmfs_exports: %p",
354 &g_cvmfs_exports);
355 #endif
356
357 _exit(1);
358 }
359 }
360
361 _exit(1);
362 }
363
364
365 /**
366 * Sets the signal handlers of the current process according to the ones
367 * defined in the given SigactionMap.
368 *
369 * @param signal_handlers a map of SIGNAL -> struct sigaction
370 * @return a SigactionMap containing the old handlers
371 */
372 49 Watchdog::SigactionMap Watchdog::SetSignalHandlers(
373 const SigactionMap &signal_handlers) {
374 49 SigactionMap old_signal_handlers;
375 49 SigactionMap::const_iterator i = signal_handlers.begin();
376 49 const SigactionMap::const_iterator iend = signal_handlers.end();
377
2/2
✓ Branch 1 taken 637 times.
✓ Branch 2 taken 49 times.
686 for (; i != iend; ++i) {
378 struct sigaction old_signal_handler;
379
1/2
✗ Branch 3 not taken.
✓ Branch 4 taken 637 times.
637 if (sigaction(i->first, &i->second, &old_signal_handler) != 0) {
380 PANIC(NULL);
381 }
382
1/2
✓ Branch 2 taken 637 times.
✗ Branch 3 not taken.
637 old_signal_handlers[i->first] = old_signal_handler;
383 }
384
385 98 return old_signal_handlers;
386 }
387
388
389 /**
390 * Fork the watchdog process and put it on hold until Spawn() is called.
391 */
392 49 void Watchdog::Fork() {
393
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 Pipe<kPipeWatchdogPid> pipe_pid;
394
3/6
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 49 times.
✗ Branch 5 not taken.
✓ Branch 7 taken 49 times.
✗ Branch 8 not taken.
49 pipe_watchdog_ = new Pipe<kPipeWatchdog>();
395
3/6
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 49 times.
✗ Branch 5 not taken.
✓ Branch 7 taken 49 times.
✗ Branch 8 not taken.
49 pipe_listener_ = new Pipe<kPipeWatchdogSupervisor>();
396
397 pid_t pid;
398 int statloc;
399
1/3
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 49 times.
49 switch (pid = fork()) {
400 case -1:
401 PANIC(NULL);
402 case 0:
403 // Double fork to avoid zombie
404 switch (fork()) {
405 case -1:
406 _exit(1);
407 case 0: {
408 pipe_watchdog_->CloseWriteFd();
409 Daemonize();
410
4/8
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 49 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 49 times.
✗ Branch 8 not taken.
✓ Branch 9 taken 49 times.
49 if ((geteuid() != 0) && SetuidCapabilityPermitted()) {
411 const std::vector<cap_value_t> nocaps;
412 if (on_exit_) {
413 // Reduce to minimum capabilities, which unfortunately is
414 // still quite powerful.
415 // CAP_SYS_ADMIN is needed to unmount, and CAP_SYS_PTRACE
416 // is needed in order to get a stack trace since one of
417 // the main process threads is privileged.
418 const std::vector<cap_value_t> reservecaps = {CAP_SYS_ADMIN, CAP_SYS_PTRACE};
419 const std::vector<cap_value_t> inheritcaps = {CAP_SYS_PTRACE};
420 assert(ClearPermittedCapabilities(reservecaps, inheritcaps));
421 } else {
422 // Only need to be able to do the stack trace, and the
423 // main process needs no extra capabilities, so we can
424 // drop all capabilities.
425 assert(ClearPermittedCapabilities(nocaps, nocaps));
426 }
427 }
428 // send the watchdog PID to the supervisee
429 49 const pid_t watchdog_pid = getpid();
430
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 pipe_pid.Write(watchdog_pid);
431
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 pipe_pid.CloseWriteFd();
432 // Close all unused file descriptors
433 // close also usyslog, only get it back if necessary
434 // string usyslog_save = GetLogMicroSyslog();
435
1/3
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
49 const string debuglog_save = GetLogDebugFile();
436
2/4
✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
✓ Branch 5 taken 49 times.
✗ Branch 6 not taken.
49 SetLogDebugFile("");
437
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 const string usyslog_save = GetLogMicroSyslog();
438
2/4
✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
✓ Branch 5 taken 49 times.
✗ Branch 6 not taken.
49 SetLogMicroSyslog("");
439 // Gracefully close the syslog before closing all fds. The next call
440 // to syslog will reopen it.
441
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 closelog();
442 // Let's keep stdin, stdout, stderr open at /dev/null (daemonized)
443 // in order to prevent accidental outputs from messing with another
444 // file descriptor
445 49 std::set<int> preserve_fds;
446
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 preserve_fds.insert(0);
447
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 preserve_fds.insert(1);
448
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 preserve_fds.insert(2);
449
1/2
✓ Branch 3 taken 49 times.
✗ Branch 4 not taken.
49 preserve_fds.insert(pipe_watchdog_->GetReadFd());
450
1/2
✓ Branch 3 taken 49 times.
✗ Branch 4 not taken.
49 preserve_fds.insert(pipe_listener_->GetWriteFd());
451
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 CloseAllFildes(preserve_fds);
452
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 SetLogMicroSyslog(usyslog_save); // no-op if usyslog not used
453
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 SetLogDebugFile(debuglog_save); // no-op if debug log not used
454
455
2/4
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✓ Branch 4 taken 49 times.
49 if (WaitForSupervisee())
456 Supervise();
457
458
1/2
✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
49 pipe_watchdog_->CloseReadFd();
459
1/2
✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
49 pipe_listener_->CloseWriteFd();
460 49 exit(0);
461 }
462 default:
463 _exit(0);
464 }
465 49 default:
466
1/2
✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
49 pipe_watchdog_->CloseReadFd();
467
1/2
✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
49 pipe_listener_->CloseWriteFd();
468
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 pipe_pid.CloseWriteFd();
469
2/4
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✓ Branch 4 taken 49 times.
49 if (waitpid(pid, &statloc, 0) != pid)
470 PANIC(NULL);
471
2/4
✓ Branch 0 taken 49 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 49 times.
49 if (!WIFEXITED(statloc) || WEXITSTATUS(statloc))
472 PANIC(NULL);
473 }
474
475 // retrieve the watchdog PID from the pipe
476
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 pipe_pid.Read(&watchdog_pid_);
477
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 pipe_pid.CloseReadFd();
478 49 }
479
480
481 49 bool Watchdog::WaitForSupervisee() {
482 // We want broken pipes not to raise a signal but handle the error in the
483 // read/write code
484 49 platform_sighandler_t rv_sig = signal(SIGPIPE, SIG_IGN);
485
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 49 times.
49 assert(rv_sig != SIG_ERR);
486
487 // The watchdog is not supposed to receive signals. If it does, report it.
488 struct sigaction sa;
489 49 memset(&sa, 0, sizeof(sa));
490 49 sa.sa_sigaction = ReportSignalAndContinue;
491 49 sa.sa_flags = SA_SIGINFO;
492 49 sigfillset(&sa.sa_mask);
493
494 49 SigactionMap signal_handlers;
495
2/2
✓ Branch 0 taken 637 times.
✓ Branch 1 taken 49 times.
686 for (size_t i = 0; i < sizeof(g_suppressed_signals) / sizeof(int); i++) {
496
1/2
✓ Branch 1 taken 637 times.
✗ Branch 2 not taken.
637 signal_handlers[g_suppressed_signals[i]] = sa;
497 }
498
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 SetSignalHandlers(signal_handlers);
499
500 49 ControlFlow::Flags control_flow = ControlFlow::kUnknown;
501
502
3/4
✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 48 times.
✓ Branch 5 taken 1 times.
49 if (!pipe_watchdog_->TryRead(&control_flow)) {
503
1/2
✓ Branch 1 taken 48 times.
✗ Branch 2 not taken.
48 LogCvmfs(kLogMonitor, kLogDebug, "supervisee canceled watchdog");
504 48 return false;
505 }
506
507
1/3
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
1 switch (control_flow) {
508 1 case ControlFlow::kQuit:
509 1 return false;
510 case ControlFlow::kSupervise:
511 break;
512 default:
513 LogEmergency("Internal error: invalid control flow");
514 return false;
515 }
516
517 size_t size;
518 pipe_watchdog_->Read(&size);
519 crash_dump_path_.resize(size);
520 if (size > 0) {
521 pipe_watchdog_->Read(&crash_dump_path_[0], size);
522
523 const int retval = chdir(GetParentPath(crash_dump_path_).c_str());
524 if (retval != 0) {
525 LogEmergency(std::string("Cannot change to crash dump directory: ")
526 + crash_dump_path_);
527 return false;
528 }
529 crash_dump_path_ = GetFileName(crash_dump_path_);
530 }
531 return true;
532 49 }
533
534
535 /**
536 * Set up the signal handling and kick off the supervision.
537 */
538 void Watchdog::Spawn(const std::string &crash_dump_path) {
539 // lower restrictions for ptrace
540 if (!platform_allow_ptrace(watchdog_pid_)) {
541 LogCvmfs(kLogMonitor, kLogSyslogWarn,
542 "failed to allow ptrace() for watchdog (PID: %d). "
543 "Post crash stacktrace might not work",
544 watchdog_pid_);
545 }
546
547 // Extra stack for signal handlers
548 const int stack_size = kSignalHandlerStacksize; // 2 MB
549 sighandler_stack_.ss_sp = smalloc(stack_size);
550 sighandler_stack_.ss_size = stack_size;
551 sighandler_stack_.ss_flags = 0;
552 if (sigaltstack(&sighandler_stack_, NULL) != 0)
553 PANIC(NULL);
554
555 // define our crash signal handler
556 struct sigaction sa;
557 memset(&sa, 0, sizeof(sa));
558 sa.sa_sigaction = SendTrace;
559 sa.sa_flags = SA_SIGINFO | SA_ONSTACK;
560 sigfillset(&sa.sa_mask);
561
562 SigactionMap signal_handlers;
563 for (size_t i = 0; i < sizeof(g_crash_signals) / sizeof(int); i++) {
564 signal_handlers[g_crash_signals[i]] = sa;
565 }
566 old_signal_handlers_ = SetSignalHandlers(signal_handlers);
567
568 pipe_terminate_ = new Pipe<kPipeThreadTerminator>();
569 const int retval = pthread_create(&thread_listener_, NULL,
570 MainWatchdogListener, this);
571 assert(retval == 0);
572
573 if (spawned_) {
574 // This happens after a reload, when the watchdog process is
575 // already running so we can exit here.
576 return;
577 }
578
579 pipe_watchdog_->Write(ControlFlow::kSupervise);
580 const size_t path_size = crash_dump_path.size();
581 pipe_watchdog_->Write(path_size);
582 if (path_size > 0) {
583 pipe_watchdog_->Write(crash_dump_path.data(), path_size);
584 }
585
586 spawned_ = true;
587 }
588
589
590 void *Watchdog::MainWatchdogListener(void *data) {
591 Watchdog *watchdog = static_cast<Watchdog *>(data);
592 LogCvmfs(kLogMonitor, kLogDebug, "starting watchdog listener");
593
594 if ((getuid() != 0) && SetuidCapabilityPermitted()) {
595 // Drop all capabilities, none are needed in the listener
596 const std::vector<cap_value_t> nocaps;
597 assert(ClearPermittedCapabilities(nocaps, nocaps));
598 }
599
600 struct pollfd watch_fds[2];
601 watch_fds[0].fd = watchdog->pipe_listener_->GetReadFd();
602 watch_fds[0].events = 0; // Only check for POLL[ERR,HUP,NVAL] in revents
603 watch_fds[0].revents = 0;
604 watch_fds[1].fd = watchdog->pipe_terminate_->GetReadFd();
605 watch_fds[1].events = POLLIN | POLLPRI;
606 watch_fds[1].revents = 0;
607 while (true) {
608 const int retval = poll(watch_fds, 2, -1);
609 if (retval < 0) {
610 continue;
611 }
612
613 // Terminate I/O thread
614 if (watch_fds[1].revents)
615 break;
616
617 if (watch_fds[0].revents) {
618 if ((watch_fds[0].revents & POLLERR) || (watch_fds[0].revents & POLLHUP)
619 || (watch_fds[0].revents & POLLNVAL)) {
620 LogCvmfs(kLogMonitor, kLogDebug | kLogSyslogErr,
621 "watchdog disappeared, disabling stack trace reporting "
622 "(revents: %d / %d|%d|%d)",
623 watch_fds[0].revents, POLLERR, POLLHUP, POLLNVAL);
624 watchdog->SetSignalHandlers(watchdog->old_signal_handlers_);
625 PANIC(kLogDebug | kLogSyslogErr, "watchdog disappeared, aborting");
626 }
627 PANIC(NULL);
628 }
629 }
630
631 LogCvmfs(kLogMonitor, kLogDebug, "stopping watchdog listener");
632 return NULL;
633 }
634
635
636 void Watchdog::Supervise() {
637 ControlFlow::Flags control_flow = ControlFlow::kUnknown;
638
639 if (!pipe_watchdog_->TryRead<ControlFlow::Flags>(&control_flow)) {
640 LogEmergency("watchdog: unexpected termination ("
641 + StringifyInt(control_flow) + ")");
642 if (on_exit_)
643 on_exit_(true /* crashed */);
644 } else {
645 switch (control_flow) {
646 case ControlFlow::kProduceStacktrace:
647 LogEmergency(ReportStacktrace());
648 if (on_exit_)
649 on_exit_(true /* crashed */);
650 break;
651
652 case ControlFlow::kQuitWithExit:
653 if (on_exit_)
654 on_exit_(false /* crashed */);
655 break;
656
657 case ControlFlow::kQuit:
658 break;
659
660 default:
661 LogEmergency("watchdog: unexpected error");
662 break;
663 }
664 }
665 }
666
667
668 /**
669 * Save the state of the watchdog listener thread before reload.
670 */
671 void Watchdog::SaveState(WatchdogState *saved_state) {
672 saved_state->spawned = spawned_;
673 saved_state->pid = watchdog_pid_;
674 if (spawned_) {
675 saved_state->watchdog_write_fd = pipe_watchdog_->GetWriteFd();
676 saved_state->listener_read_fd = pipe_listener_->GetReadFd();
677 }
678 }
679
680
681 /**
682 * Restore the state of the watchdog listener after a reload
683 */
684 void Watchdog::RestoreState(WatchdogState *saved_state) {
685 watchdog_pid_ = saved_state->pid;
686 if (!saved_state->spawned) {
687 return;
688 }
689 pipe_watchdog_ = new Pipe<kPipeWatchdog>(-1, saved_state->watchdog_write_fd);
690 pipe_listener_ = new Pipe<kPipeWatchdogSupervisor>(saved_state->listener_read_fd, -1);
691 spawned_ = true;
692 }
693
694
695 49 Watchdog::Watchdog(FnOnExit on_exit)
696 49 : spawned_(false)
697 49 , maintenance_mode_(false)
698
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 , exe_path_(string(platform_getexepath()))
699 49 , watchdog_pid_(0)
700
3/6
✓ Branch 3 taken 49 times.
✗ Branch 4 not taken.
✓ Branch 6 taken 49 times.
✗ Branch 7 not taken.
✓ Branch 9 taken 49 times.
✗ Branch 10 not taken.
98 , on_exit_(on_exit)
701 {
702 49 const int retval = platform_spinlock_init(&lock_handler_, 0);
703
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 49 times.
49 assert(retval == 0);
704 49 memset(&sighandler_stack_, 0, sizeof(sighandler_stack_));
705 49 }
706
707
708 1 Watchdog::~Watchdog() {
709
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (spawned_) {
710 // Reset signal handlers
711 signal(SIGQUIT, SIG_DFL);
712 signal(SIGILL, SIG_DFL);
713 signal(SIGABRT, SIG_DFL);
714 signal(SIGFPE, SIG_DFL);
715 signal(SIGSEGV, SIG_DFL);
716 signal(SIGBUS, SIG_DFL);
717 signal(SIGPIPE, SIG_DFL);
718 signal(SIGXFSZ, SIG_DFL);
719 free(sighandler_stack_.ss_sp);
720 sighandler_stack_.ss_size = 0;
721
722 // The watchdog listener thread exits on any message received
723 pipe_terminate_->Write(ControlFlow::kQuit);
724 pthread_join(thread_listener_, NULL);
725 pipe_terminate_->Close();
726 }
727
728
1/2
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
1 if (!maintenance_mode_) {
729 // Shutdown the watchdog except when doing a reload
730
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (on_exit_) {
731 pipe_watchdog_->Write(ControlFlow::kQuitWithExit);
732 } else {
733 1 pipe_watchdog_->Write(ControlFlow::kQuit);
734 }
735 1 pipe_watchdog_->CloseWriteFd();
736 1 pipe_listener_->CloseReadFd();
737 } else {
738 // Release the references to the watchdog pipes without closing them
739 pipe_watchdog_.Release();
740 pipe_listener_.Release();
741 }
742
743 1 platform_spinlock_destroy(&lock_handler_);
744 1 LogCvmfs(kLogMonitor, kLogDebug, "monitor stopped");
745 1 instance_ = NULL;
746 1 }
747