GCC Code Coverage Report


Directory: cvmfs/
File: cvmfs/monitor.cc
Date: 2026-04-05 02:35:23
Exec Total Coverage
Lines: 90 363 24.8%
Branches: 67 566 11.8%

Line Branch Exec Source
1 /**
2 * This file is part of the CernVM File System.
3 *
4 * This module forks a watchdog process that listens on
5 * a pipe and prints a stack trace into syslog, when cvmfs
6 * fails.
7 *
8 * Also, it handles getting and setting the maximum number of file descriptors.
9 */
10
11
12 #include "monitor.h"
13
14 #include <errno.h>
15 #include <execinfo.h>
16 #include <poll.h>
17 #include <pthread.h>
18 #include <signal.h>
19 #include <sys/resource.h>
20 #include <sys/types.h>
21 #ifdef __APPLE__
22 #include <sys/ucontext.h>
23 #else
24 #include <ucontext.h>
25 #endif
26 #include <sys/uio.h>
27 #include <sys/wait.h>
28 #include <syslog.h>
29 #include <time.h>
30 #include <unistd.h>
31
32 #include <cassert>
33 #include <cstdio>
34 #include <cstdlib>
35 #include <cstring>
36 #include <map>
37 #include <set>
38 #include <string>
39 #include <vector>
40
41 #if defined(CVMFS_FUSE_MODULE)
42 #include "cvmfs.h"
43 #endif
44 #include "util/capabilities.h"
45 #include "util/exception.h"
46 #include "util/logging.h"
47 #include "util/platform.h"
48 #include "util/posix.h"
49 #include "util/smalloc.h"
50 #include "util/string.h"
51
52 // Used for address offset calculation
53 #if defined(CVMFS_FUSE_MODULE)
54 extern loader::CvmfsExports *g_cvmfs_exports;
55 #endif
56
57 using namespace std; // NOLINT
58
59 Watchdog *Watchdog::instance_ = NULL;
60
61 int Watchdog::g_suppressed_signals[] = {
62 SIGHUP, SIGINT, SIGQUIT, SIGILL, SIGABRT, SIGBUS, SIGFPE,
63 SIGUSR1, SIGSEGV, SIGUSR2, SIGTERM, SIGXCPU, SIGXFSZ};
64
65 int Watchdog::g_crash_signals[] = {SIGQUIT, SIGILL, SIGABRT, SIGFPE,
66 SIGSEGV, SIGBUS, SIGPIPE, SIGXFSZ};
67
68 49 Watchdog *Watchdog::Create(FnOnExit on_exit,
69 bool needs_read_environ,
70 WatchdogState *saved_state) {
71
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 49 times.
49 assert(instance_ == NULL);
72
1/2
✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
49 instance_ = new Watchdog(on_exit);
73
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 49 times.
49 if (saved_state != NULL)
74 instance_->RestoreState(saved_state);
75 else
76 49 instance_->Fork(needs_read_environ);
77 49 return instance_;
78 }
79
80
81 /**
82 * Uses an external shell and gdb to create a full stack trace of the dying
83 * process. The same shell is used to force-quit the client afterwards.
84 */
85 string Watchdog::GenerateStackTrace(pid_t pid) {
86 int retval;
87 string result = "";
88
89 // Get capability to ptrace the dead main cvmfs2 process.
90 // This is often necessary because the main process can have its own
91 // elevated capability which would otherwise block ptrace.
92 if (!ObtainSysPtraceCapability()) {
93 result += "failed to gain ptrace capability... still give it a try\n";
94 }
95
96 // run gdb and attach to the dying process
97 int fd_stdin;
98 int fd_stdout;
99 int fd_stderr;
100 vector<string> argv;
101 argv.push_back("-p");
102 argv.push_back(StringifyInt(pid));
103 pid_t gdb_pid = 0;
104 const bool double_fork = false;
105 retval = ExecuteBinary(&fd_stdin,
106 &fd_stdout,
107 &fd_stderr,
108 #ifdef __APPLE__
109 "lldb",
110 #else
111 "gdb",
112 #endif
113 argv,
114 double_fork,
115 &gdb_pid);
116 assert(retval);
117
118
119 // Skip the gdb startup output
120 ReadUntilGdbPrompt(fd_stdout);
121
122 // Send stacktrace command to gdb
123 #ifdef __APPLE__
124 const string gdb_cmd = "bt all\n"
125 "quit\n";
126 #else
127 const string gdb_cmd = "thread apply all bt\n"
128 "quit\n";
129 #endif
130 // The execve can have failed, which can't be detected in ExecuteBinary.
131 // Instead, writing to the pipe will fail.
132 const ssize_t nbytes = write(fd_stdin, gdb_cmd.data(), gdb_cmd.length());
133 if ((nbytes < 0) || (static_cast<unsigned>(nbytes) != gdb_cmd.length())) {
134 result += "failed to start gdb/lldb (" + StringifyInt(nbytes)
135 + " bytes "
136 "written, errno "
137 + StringifyInt(errno) + ")\n";
138 return result;
139 }
140
141 // Read the stack trace from the stdout of our gdb process
142 #ifdef __APPLE__
143 // lldb has one more prompt
144 result += ReadUntilGdbPrompt(fd_stdout);
145 #endif
146 result += ReadUntilGdbPrompt(fd_stdout) + "\n\n";
147
148 // Check for output on stderr
149 string result_err;
150 Block2Nonblock(fd_stderr);
151 char cbuf;
152 while (read(fd_stderr, &cbuf, 1) == 1)
153 result_err.push_back(cbuf);
154 if (!result_err.empty())
155 result += "\nError output:\n" + result_err + "\n";
156
157 // Close the connection to the terminated gdb process
158 close(fd_stderr);
159 close(fd_stdout);
160 close(fd_stdin);
161
162 // Make sure gdb has terminated (wait for it for a short while)
163 unsigned int timeout = 15;
164 int statloc;
165 while (timeout > 0 && waitpid(gdb_pid, &statloc, WNOHANG) != gdb_pid) {
166 --timeout;
167 SafeSleepMs(1000);
168 }
169
170 // when the timeout expired, gdb probably hangs... we need to kill it
171 if (timeout == 0) {
172 result += "gdb did not exit as expected. sending SIGKILL... ";
173 result += (kill(gdb_pid, SIGKILL) != 0) ? "failed\n" : "okay\n";
174 }
175
176 return result;
177 }
178
179
180 pid_t Watchdog::GetPid() {
181 if (instance_ != NULL) {
182 return instance_->watchdog_pid_;
183 }
184 return getpid();
185 }
186
187 /**
188 * Log a string to syslog and into the crash dump file.
189 * We expect ideally nothing to be logged, so that file is created on demand.
190 */
191 void Watchdog::LogEmergency(string msg) {
192 char ctime_buffer[32];
193
194 if (!crash_dump_path_.empty()) {
195 FILE *fp = fopen(crash_dump_path_.c_str(), "a");
196 if (fp) {
197 const time_t now = time(NULL);
198 msg += "\nTimestamp: " + string(ctime_r(&now, ctime_buffer));
199 if (fwrite(&msg[0], 1, msg.length(), fp) != msg.length()) {
200 msg += " (failed to report into crash dump file " + crash_dump_path_
201 + ")";
202 } else {
203 msg += "\n Crash logged also on file: " + crash_dump_path_ + "\n";
204 }
205 fclose(fp);
206 } else {
207 msg += " (failed to open crash dump file " + crash_dump_path_ + ")";
208 }
209 }
210 LogCvmfs(kLogMonitor, kLogSyslogErr, "%s", msg.c_str());
211 }
212
213 /**
214 * Reads from the file descriptor until the specific gdb prompt is reached or
215 * the pipe gets broken.
216 *
217 * @param fd_pipe the file descriptor of the pipe to be read
218 * @return the data read from the pipe
219 */
220 string Watchdog::ReadUntilGdbPrompt(int fd_pipe) {
221 #ifdef __APPLE__
222 static const string gdb_prompt = "(lldb)";
223 #else
224 static const string gdb_prompt = "\n(gdb) ";
225 #endif
226
227 string result;
228 char mini_buffer;
229 int chars_io;
230 unsigned int ring_buffer_pos = 0;
231
232 // read from stdout of gdb until gdb prompt occurs --> (gdb)
233 while (1) {
234 chars_io = read(fd_pipe, &mini_buffer, 1);
235
236 // in case something goes wrong...
237 if (chars_io <= 0)
238 break;
239
240 result += mini_buffer;
241
242 // find the gdb_prompt in the stdout data
243 if (mini_buffer == gdb_prompt[ring_buffer_pos]) {
244 ++ring_buffer_pos;
245 if (ring_buffer_pos == gdb_prompt.size()) {
246 break;
247 }
248 } else {
249 ring_buffer_pos = 0;
250 }
251 }
252
253 return result;
254 }
255
256
257 /**
258 * Generates useful information from the backtrace log in the pipe.
259 */
260 string Watchdog::ReportStacktrace() {
261 CrashData crash_data;
262 if (!pipe_watchdog_->TryRead<CrashData>(&crash_data)) {
263 return "failed to read crash data (" + StringifyInt(errno) + ")";
264 }
265
266 string debug = "--\n";
267 debug += "Signal: " + StringifyInt(crash_data.signal);
268 debug += ", errno: " + StringifyInt(crash_data.sys_errno);
269 debug += ", version: " + string(CVMFS_VERSION);
270 debug += ", PID: " + StringifyInt(crash_data.pid) + "\n";
271 debug += "Executable path: " + exe_path_ + "\n";
272
273 debug += GenerateStackTrace(crash_data.pid);
274
275 // Give the dying process the finishing stroke
276 if (kill(crash_data.pid, SIGKILL) != 0) {
277 debug += "Failed to kill cvmfs client! (";
278 switch (errno) {
279 case EINVAL:
280 debug += "invalid signal";
281 break;
282 case EPERM:
283 debug += "permission denied";
284 break;
285 case ESRCH:
286 debug += "no such process";
287 break;
288 default:
289 debug += "unknown error " + StringifyInt(errno);
290 }
291 debug += ")\n\n";
292 }
293
294 return debug;
295 }
296
297
298 void Watchdog::ReportSignalAndContinue(int sig, siginfo_t *siginfo,
299 void * /* context */) {
300 LogCvmfs(kLogMonitor, kLogSyslogErr,
301 "watchdog: received unexpected signal %d from PID %d / UID %d", sig,
302 siginfo->si_pid, siginfo->si_uid);
303 }
304
305
306 void Watchdog::SendTrace(int sig, siginfo_t *siginfo, void *context) {
307 const int send_errno = errno;
308 if (platform_spinlock_trylock(&Me()->lock_handler_) != 0) {
309 // Concurrent call, wait for the first one to exit the process
310 while (true) {
311 }
312 }
313
314 // Set the original signal handler for the raised signal in
315 // SIGQUIT (watchdog process will raise SIGQUIT)
316 (void)sigaction(SIGQUIT, &(Me()->old_signal_handlers_[sig]), NULL);
317
318 // Inform the watchdog that CernVM-FS crashed
319 if (!Me()->pipe_watchdog_->Write(ControlFlow::kProduceStacktrace)) {
320 _exit(1);
321 }
322
323 // Send crash information to the watchdog
324 CrashData crash_data;
325 crash_data.signal = sig;
326 crash_data.sys_errno = send_errno;
327 crash_data.pid = getpid();
328 if (!Me()->pipe_watchdog_->Write<CrashData>(crash_data)) {
329 _exit(1);
330 }
331
332 // Do not die before the stack trace was generated
333 // kill -SIGQUIT <pid> will finish this
334 int counter = 0;
335 while (true) {
336 SafeSleepMs(100);
337 // quit anyway after 30 seconds
338 if (++counter == 300) {
339 LogCvmfs(kLogCvmfs, kLogSyslogErr, "stack trace generation failed");
340 // Last attempt to log something useful
341 #if defined(CVMFS_FUSE_MODULE)
342 LogCvmfs(kLogCvmfs, kLogSyslogErr, "Signal %d, errno %d", sig,
343 send_errno);
344 void *addr[kMaxBacktrace];
345 // Note: this doesn't work due to the signal stack on OS X (it works on
346 // Linux). Since anyway lldb is supposed to produce the backtrace, we
347 // consider it more important to protect cvmfs against stack overflows.
348 const int num_addr = backtrace(addr, kMaxBacktrace);
349 char **symbols = backtrace_symbols(addr, num_addr);
350 string backtrace = "Backtrace (" + StringifyInt(num_addr)
351 + " symbols):\n";
352 for (int i = 0; i < num_addr; ++i)
353 backtrace += string(symbols[i]) + "\n";
354 LogCvmfs(kLogCvmfs, kLogSyslogErr, "%s", backtrace.c_str());
355 LogCvmfs(kLogCvmfs, kLogSyslogErr, "address of g_cvmfs_exports: %p",
356 &g_cvmfs_exports);
357 #endif
358
359 _exit(1);
360 }
361 }
362
363 _exit(1);
364 }
365
366
367 /**
368 * Sets the signal handlers of the current process according to the ones
369 * defined in the given SigactionMap.
370 *
371 * @param signal_handlers a map of SIGNAL -> struct sigaction
372 * @return a SigactionMap containing the old handlers
373 */
374 49 Watchdog::SigactionMap Watchdog::SetSignalHandlers(
375 const SigactionMap &signal_handlers) {
376 49 SigactionMap old_signal_handlers;
377 49 SigactionMap::const_iterator i = signal_handlers.begin();
378 49 const SigactionMap::const_iterator iend = signal_handlers.end();
379
2/2
✓ Branch 1 taken 637 times.
✓ Branch 2 taken 49 times.
686 for (; i != iend; ++i) {
380 struct sigaction old_signal_handler;
381
1/2
✗ Branch 3 not taken.
✓ Branch 4 taken 637 times.
637 if (sigaction(i->first, &i->second, &old_signal_handler) != 0) {
382 PANIC(NULL);
383 }
384
1/2
✓ Branch 2 taken 637 times.
✗ Branch 3 not taken.
637 old_signal_handlers[i->first] = old_signal_handler;
385 }
386
387 98 return old_signal_handlers;
388 }
389
390
391 /**
392 * Fork the watchdog process and put it on hold until Spawn() is called.
393 */
394 49 void Watchdog::Fork(bool needs_read_environ) {
395
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 Pipe<kPipeWatchdogPid> pipe_pid;
396
3/6
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 49 times.
✗ Branch 5 not taken.
✓ Branch 7 taken 49 times.
✗ Branch 8 not taken.
49 pipe_watchdog_ = new Pipe<kPipeWatchdog>();
397
3/6
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 49 times.
✗ Branch 5 not taken.
✓ Branch 7 taken 49 times.
✗ Branch 8 not taken.
49 pipe_listener_ = new Pipe<kPipeWatchdogSupervisor>();
398
399 pid_t pid;
400 int statloc;
401
1/3
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 49 times.
49 switch (pid = fork()) {
402 case -1:
403 PANIC(NULL);
404 case 0:
405 // Double fork to avoid zombie
406 switch (fork()) {
407 case -1:
408 _exit(1);
409 case 0: {
410 pipe_watchdog_->CloseWriteFd();
411 Daemonize();
412
4/8
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 49 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 49 times.
✗ Branch 8 not taken.
✓ Branch 9 taken 49 times.
49 if ((geteuid() != 0) && SetuidCapabilityPermitted()) {
413 const std::vector<cap_value_t> nocaps;
414 if (on_exit_) {
415 // Reduce to minimum capabilities, which unfortunately is
416 // still quite powerful.
417 // CAP_SYS_ADMIN is needed to unmount, and CAP_SYS_PTRACE
418 // is needed when needs_read_environ is true because then
419 // the main Fuse process has elevated capabilities and
420 // ptrace (needed for collecting a stack trace) is not
421 // allowed on a process with more capabilities.
422 if (needs_read_environ) {
423 const std::vector<cap_value_t> reservecaps = {CAP_SYS_ADMIN, CAP_SYS_PTRACE};
424 const std::vector<cap_value_t> inheritcaps = {CAP_SYS_PTRACE};
425 assert(ClearPermittedCapabilities(reservecaps, inheritcaps));
426 } else {
427 const std::vector<cap_value_t> reservecaps = {CAP_SYS_ADMIN};
428 assert(ClearPermittedCapabilities(reservecaps, nocaps));
429 }
430 } else {
431 // Only need to be able to do the stack trace, and the
432 // main process needs no extra capabilities, so we can
433 // drop all capabilities.
434 assert(ClearPermittedCapabilities(nocaps, nocaps));
435 }
436 }
437 // send the watchdog PID to the supervisee
438 49 const pid_t watchdog_pid = getpid();
439
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 pipe_pid.Write(watchdog_pid);
440
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 pipe_pid.CloseWriteFd();
441 // Close all unused file descriptors
442 // close also usyslog, only get it back if necessary
443 // string usyslog_save = GetLogMicroSyslog();
444
1/3
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
49 const string debuglog_save = GetLogDebugFile();
445
2/4
✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
✓ Branch 5 taken 49 times.
✗ Branch 6 not taken.
49 SetLogDebugFile("");
446
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 const string usyslog_save = GetLogMicroSyslog();
447
2/4
✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
✓ Branch 5 taken 49 times.
✗ Branch 6 not taken.
49 SetLogMicroSyslog("");
448 // Gracefully close the syslog before closing all fds. The next call
449 // to syslog will reopen it.
450
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 closelog();
451 // Let's keep stdin, stdout, stderr open at /dev/null (daemonized)
452 // in order to prevent accidental outputs from messing with another
453 // file descriptor
454 49 std::set<int> preserve_fds;
455
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 preserve_fds.insert(0);
456
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 preserve_fds.insert(1);
457
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 preserve_fds.insert(2);
458
1/2
✓ Branch 3 taken 49 times.
✗ Branch 4 not taken.
49 preserve_fds.insert(pipe_watchdog_->GetReadFd());
459
1/2
✓ Branch 3 taken 49 times.
✗ Branch 4 not taken.
49 preserve_fds.insert(pipe_listener_->GetWriteFd());
460
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 CloseAllFildes(preserve_fds);
461
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 SetLogMicroSyslog(usyslog_save); // no-op if usyslog not used
462
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 SetLogDebugFile(debuglog_save); // no-op if debug log not used
463
464
2/4
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✓ Branch 4 taken 49 times.
49 if (WaitForSupervisee())
465 Supervise();
466
467
1/2
✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
49 pipe_watchdog_->CloseReadFd();
468
1/2
✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
49 pipe_listener_->CloseWriteFd();
469 49 exit(0);
470 }
471 default:
472 _exit(0);
473 }
474 49 default:
475
1/2
✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
49 pipe_watchdog_->CloseReadFd();
476
1/2
✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
49 pipe_listener_->CloseWriteFd();
477
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 pipe_pid.CloseWriteFd();
478
2/4
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✓ Branch 4 taken 49 times.
49 if (waitpid(pid, &statloc, 0) != pid)
479 PANIC(NULL);
480
2/4
✓ Branch 0 taken 49 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 49 times.
49 if (!WIFEXITED(statloc) || WEXITSTATUS(statloc))
481 PANIC(NULL);
482 }
483
484 // retrieve the watchdog PID from the pipe
485
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 pipe_pid.Read(&watchdog_pid_);
486
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 pipe_pid.CloseReadFd();
487 49 }
488
489
490 49 bool Watchdog::WaitForSupervisee() {
491 // We want broken pipes not to raise a signal but handle the error in the
492 // read/write code
493 49 platform_sighandler_t rv_sig = signal(SIGPIPE, SIG_IGN);
494
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 49 times.
49 assert(rv_sig != SIG_ERR);
495
496 // The watchdog is not supposed to receive signals. If it does, report it.
497 struct sigaction sa;
498 49 memset(&sa, 0, sizeof(sa));
499 49 sa.sa_sigaction = ReportSignalAndContinue;
500 49 sa.sa_flags = SA_SIGINFO;
501 49 sigfillset(&sa.sa_mask);
502
503 49 SigactionMap signal_handlers;
504
2/2
✓ Branch 0 taken 637 times.
✓ Branch 1 taken 49 times.
686 for (size_t i = 0; i < sizeof(g_suppressed_signals) / sizeof(int); i++) {
505
1/2
✓ Branch 1 taken 637 times.
✗ Branch 2 not taken.
637 signal_handlers[g_suppressed_signals[i]] = sa;
506 }
507
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 SetSignalHandlers(signal_handlers);
508
509 49 ControlFlow::Flags control_flow = ControlFlow::kUnknown;
510
511
3/4
✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 48 times.
✓ Branch 5 taken 1 times.
49 if (!pipe_watchdog_->TryRead(&control_flow)) {
512
1/2
✓ Branch 1 taken 48 times.
✗ Branch 2 not taken.
48 LogCvmfs(kLogMonitor, kLogDebug, "supervisee canceled watchdog");
513 48 return false;
514 }
515
516
1/4
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
1 switch (control_flow) {
517 1 case ControlFlow::kQuit:
518 1 return false;
519 case ControlFlow::kQuitWithExit:
520 if (on_exit_) on_exit_(false);
521 return false;
522 case ControlFlow::kSupervise:
523 break;
524 default:
525 LogEmergency("Internal error: invalid control flow");
526 return false;
527 }
528
529 size_t size;
530 pipe_watchdog_->Read(&size);
531 crash_dump_path_.resize(size);
532 if (size > 0) {
533 pipe_watchdog_->Read(&crash_dump_path_[0], size);
534
535 const int retval = chdir(GetParentPath(crash_dump_path_).c_str());
536 if (retval != 0) {
537 LogEmergency(std::string("Cannot change to crash dump directory: ")
538 + crash_dump_path_);
539 return false;
540 }
541 crash_dump_path_ = GetFileName(crash_dump_path_);
542 }
543 return true;
544 49 }
545
546
547 /**
548 * Set up the signal handling and kick off the supervision.
549 */
550 void Watchdog::Spawn(const std::string &crash_dump_path) {
551 // lower restrictions for ptrace
552 if (!platform_allow_ptrace(watchdog_pid_)) {
553 LogCvmfs(kLogMonitor, kLogSyslogWarn,
554 "failed to allow ptrace() for watchdog (PID: %d). "
555 "Post crash stacktrace might not work",
556 watchdog_pid_);
557 }
558
559 // Extra stack for signal handlers
560 const int stack_size = kSignalHandlerStacksize; // 2 MB
561 sighandler_stack_.ss_sp = smalloc(stack_size);
562 sighandler_stack_.ss_size = stack_size;
563 sighandler_stack_.ss_flags = 0;
564 if (sigaltstack(&sighandler_stack_, NULL) != 0)
565 PANIC(NULL);
566
567 // define our crash signal handler
568 struct sigaction sa;
569 memset(&sa, 0, sizeof(sa));
570 sa.sa_sigaction = SendTrace;
571 sa.sa_flags = SA_SIGINFO | SA_ONSTACK;
572 sigfillset(&sa.sa_mask);
573
574 SigactionMap signal_handlers;
575 for (size_t i = 0; i < sizeof(g_crash_signals) / sizeof(int); i++) {
576 signal_handlers[g_crash_signals[i]] = sa;
577 }
578 old_signal_handlers_ = SetSignalHandlers(signal_handlers);
579
580 pipe_terminate_ = new Pipe<kPipeThreadTerminator>();
581 const int retval = pthread_create(&thread_listener_, NULL,
582 MainWatchdogListener, this);
583 assert(retval == 0);
584
585 if (spawned_) {
586 // This happens after a reload, when the watchdog process is
587 // already running so we can exit here.
588 return;
589 }
590
591 pipe_watchdog_->Write(ControlFlow::kSupervise);
592 const size_t path_size = crash_dump_path.size();
593 pipe_watchdog_->Write(path_size);
594 if (path_size > 0) {
595 pipe_watchdog_->Write(crash_dump_path.data(), path_size);
596 }
597
598 spawned_ = true;
599 }
600
601
602 void *Watchdog::MainWatchdogListener(void *data) {
603 Watchdog *watchdog = static_cast<Watchdog *>(data);
604 LogCvmfs(kLogMonitor, kLogDebug, "starting watchdog listener");
605
606 if ((getuid() != 0) && SetuidCapabilityPermitted()) {
607 // Drop all capabilities, none are needed in the listener
608 const std::vector<cap_value_t> nocaps;
609 assert(ClearPermittedCapabilities(nocaps, nocaps));
610 }
611
612 struct pollfd watch_fds[2];
613 watch_fds[0].fd = watchdog->pipe_listener_->GetReadFd();
614 watch_fds[0].events = 0; // Only check for POLL[ERR,HUP,NVAL] in revents
615 watch_fds[0].revents = 0;
616 watch_fds[1].fd = watchdog->pipe_terminate_->GetReadFd();
617 watch_fds[1].events = POLLIN | POLLPRI;
618 watch_fds[1].revents = 0;
619 while (true) {
620 const int retval = poll(watch_fds, 2, -1);
621 if (retval < 0) {
622 continue;
623 }
624
625 // Terminate I/O thread
626 if (watch_fds[1].revents)
627 break;
628
629 if (watch_fds[0].revents) {
630 if ((watch_fds[0].revents & POLLERR) || (watch_fds[0].revents & POLLHUP)
631 || (watch_fds[0].revents & POLLNVAL)) {
632 LogCvmfs(kLogMonitor, kLogDebug | kLogSyslogErr,
633 "watchdog disappeared, disabling stack trace reporting "
634 "(revents: %d / %d|%d|%d)",
635 watch_fds[0].revents, POLLERR, POLLHUP, POLLNVAL);
636 watchdog->SetSignalHandlers(watchdog->old_signal_handlers_);
637 PANIC(kLogDebug | kLogSyslogErr, "watchdog disappeared, aborting");
638 }
639 PANIC(NULL);
640 }
641 }
642
643 LogCvmfs(kLogMonitor, kLogDebug, "stopping watchdog listener");
644 return NULL;
645 }
646
647
648 void Watchdog::Supervise() {
649 ControlFlow::Flags control_flow = ControlFlow::kUnknown;
650
651 if (!pipe_watchdog_->TryRead<ControlFlow::Flags>(&control_flow)) {
652 LogEmergency("watchdog: unexpected termination ("
653 + StringifyInt(control_flow) + ")");
654 if (on_exit_)
655 on_exit_(true /* crashed */);
656 } else {
657 switch (control_flow) {
658 case ControlFlow::kProduceStacktrace:
659 LogEmergency(ReportStacktrace());
660 if (on_exit_)
661 on_exit_(true /* crashed */);
662 break;
663
664 case ControlFlow::kQuitWithExit:
665 if (on_exit_)
666 on_exit_(false /* crashed */);
667 break;
668
669 case ControlFlow::kQuit:
670 break;
671
672 default:
673 LogEmergency("watchdog: unexpected error");
674 break;
675 }
676 }
677 }
678
679
680 /**
681 * Save the state of the watchdog listener thread before reload.
682 */
683 void Watchdog::SaveState(WatchdogState *saved_state) {
684 saved_state->spawned = spawned_;
685 saved_state->pid = watchdog_pid_;
686 if (spawned_) {
687 saved_state->watchdog_write_fd = pipe_watchdog_->GetWriteFd();
688 saved_state->listener_read_fd = pipe_listener_->GetReadFd();
689 }
690 }
691
692
693 /**
694 * Restore the state of the watchdog listener reload
695 */
696 void Watchdog::RestoreState(WatchdogState *saved_state) {
697 watchdog_pid_ = saved_state->pid;
698 if (!saved_state->spawned) {
699 return;
700 }
701 pipe_watchdog_ = new Pipe<kPipeWatchdog>(-1, saved_state->watchdog_write_fd);
702 pipe_listener_ = new Pipe<kPipeWatchdogSupervisor>(saved_state->listener_read_fd, -1);
703 spawned_ = true;
704 }
705
706
707 49 Watchdog::Watchdog(FnOnExit on_exit)
708 49 : spawned_(false)
709 49 , maintenance_mode_(false)
710
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 , exe_path_(string(platform_getexepath()))
711 49 , watchdog_pid_(0)
712
3/6
✓ Branch 3 taken 49 times.
✗ Branch 4 not taken.
✓ Branch 6 taken 49 times.
✗ Branch 7 not taken.
✓ Branch 9 taken 49 times.
✗ Branch 10 not taken.
98 , on_exit_(on_exit)
713 {
714 49 const int retval = platform_spinlock_init(&lock_handler_, 0);
715
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 49 times.
49 assert(retval == 0);
716 49 memset(&sighandler_stack_, 0, sizeof(sighandler_stack_));
717 49 }
718
719
720 1 Watchdog::~Watchdog() {
721
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (spawned_) {
722 // Reset signal handlers
723 signal(SIGQUIT, SIG_DFL);
724 signal(SIGILL, SIG_DFL);
725 signal(SIGABRT, SIG_DFL);
726 signal(SIGFPE, SIG_DFL);
727 signal(SIGSEGV, SIG_DFL);
728 signal(SIGBUS, SIG_DFL);
729 signal(SIGPIPE, SIG_DFL);
730 signal(SIGXFSZ, SIG_DFL);
731 free(sighandler_stack_.ss_sp);
732 sighandler_stack_.ss_size = 0;
733
734 // The watchdog listener thread exits on any message received
735 pipe_terminate_->Write(ControlFlow::kQuit);
736 pthread_join(thread_listener_, NULL);
737 pipe_terminate_->Close();
738 }
739
740
1/2
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
1 if (!maintenance_mode_) {
741 // Shutdown the watchdog except when doing a reload
742
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (on_exit_) {
743 pipe_watchdog_->Write(ControlFlow::kQuitWithExit);
744 } else {
745 1 pipe_watchdog_->Write(ControlFlow::kQuit);
746 }
747 1 pipe_watchdog_->CloseWriteFd();
748 1 pipe_listener_->CloseReadFd();
749 } else {
750 // Release the references to the watchdog pipes without closing them
751 pipe_watchdog_.Release();
752 pipe_listener_.Release();
753 }
754
755 1 platform_spinlock_destroy(&lock_handler_);
756 1 LogCvmfs(kLogMonitor, kLogDebug, "monitor stopped");
757 1 instance_ = NULL;
758 1 }
759