GCC Code Coverage Report


Directory: cvmfs/
File: cvmfs/monitor.cc
Date: 2026-04-26 02:35:59
Exec Total Coverage
Lines: 90 363 24.8%
Branches: 67 566 11.8%

Line Branch Exec Source
1 /**
2 * This file is part of the CernVM File System.
3 *
4 * This module forks a watchdog process that listens on
5 * a pipe and prints a stack trace into syslog when cvmfs
6 * fails.
7 *
8 * Also, it handles getting and setting the maximum number of file descriptors.
9 */
10
11
12 #include "monitor.h"
13
14 #include <errno.h>
15 #include <execinfo.h>
16 #include <poll.h>
17 #include <pthread.h>
18 #include <signal.h>
19 #include <sys/resource.h>
20 #include <sys/types.h>
21 #ifdef __APPLE__
22 #include <sys/ucontext.h>
23 #else
24 #include <ucontext.h>
25 #endif
26 #include <sys/uio.h>
27 #include <sys/wait.h>
28 #include <syslog.h>
29 #include <time.h>
30 #include <unistd.h>
31
32 #include <cassert>
33 #include <cstdio>
34 #include <cstdlib>
35 #include <cstring>
36 #include <map>
37 #include <set>
38 #include <string>
39 #include <vector>
40
41 #if defined(CVMFS_FUSE_MODULE)
42 #include "cvmfs.h"
43 #endif
44 #include "util/capabilities.h"
45 #include "util/exception.h"
46 #include "util/logging.h"
47 #include "util/posix.h"
48 #include "util/smalloc.h"
49 #include "util/string.h"
50
51 // Used for address offset calculation
52 #if defined(CVMFS_FUSE_MODULE)
53 extern loader::CvmfsExports *g_cvmfs_exports;
54 #endif
55
56 using namespace std; // NOLINT
57
58 Watchdog *Watchdog::instance_ = NULL;
59
60 int Watchdog::g_suppressed_signals[] = {
61 SIGHUP, SIGINT, SIGQUIT, SIGILL, SIGABRT, SIGBUS, SIGFPE,
62 SIGUSR1, SIGSEGV, SIGUSR2, SIGTERM, SIGXCPU, SIGXFSZ};
63
64 int Watchdog::g_crash_signals[] = {SIGQUIT, SIGILL, SIGABRT, SIGFPE,
65 SIGSEGV, SIGBUS, SIGPIPE, SIGXFSZ};
66
67 49 Watchdog *Watchdog::Create(FnOnExit on_exit,
68 bool needs_read_environ,
69 WatchdogState *saved_state) {
70
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 49 times.
49 assert(instance_ == NULL);
71
1/2
✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
49 instance_ = new Watchdog(on_exit);
72
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 49 times.
49 if (saved_state != NULL)
73 instance_->RestoreState(saved_state);
74 else
75 49 instance_->Fork(needs_read_environ);
76 49 return instance_;
77 }
78
79
80 /**
81 * Uses an external shell and gdb to create a full stack trace of the dying
82 * process. The same shell is used to force-quit the client afterwards.
83 */
84 string Watchdog::GenerateStackTrace(pid_t pid) {
85 int retval;
86 string result = "";
87
88 // Get capability to ptrace the dead main cvmfs2 process.
89 // This is often necessary because the main process can have its own
90 // elevated capability which would otherwise block ptrace.
91 if (!ObtainSysPtraceCapability()) {
92 result += "failed to gain ptrace capability... still give it a try\n";
93 }
94
95 // run gdb and attach to the dying process
96 int fd_stdin;
97 int fd_stdout;
98 int fd_stderr;
99 vector<string> argv;
100 argv.push_back("-p");
101 argv.push_back(StringifyInt(pid));
102 pid_t gdb_pid = 0;
103 const bool double_fork = false;
104 retval = ExecuteBinary(&fd_stdin,
105 &fd_stdout,
106 &fd_stderr,
107 #ifdef __APPLE__
108 "lldb",
109 #else
110 "gdb",
111 #endif
112 argv,
113 double_fork,
114 &gdb_pid);
115 assert(retval);
116
117
118 // Skip the gdb startup output
119 ReadUntilGdbPrompt(fd_stdout);
120
121 // Send stacktrace command to gdb
122 #ifdef __APPLE__
123 const string gdb_cmd = "bt all\n"
124 "quit\n";
125 #else
126 const string gdb_cmd = "thread apply all bt\n"
127 "quit\n";
128 #endif
129 // The execve can have failed, which can't be detected in ExecuteBinary.
130 // Instead, writing to the pipe will fail.
131 const ssize_t nbytes = write(fd_stdin, gdb_cmd.data(), gdb_cmd.length());
132 if ((nbytes < 0) || (static_cast<unsigned>(nbytes) != gdb_cmd.length())) {
133 result += "failed to start gdb/lldb (" + StringifyInt(nbytes)
134 + " bytes "
135 "written, errno "
136 + StringifyInt(errno) + ")\n";
137 return result;
138 }
139
140 // Read the stack trace from the stdout of our gdb process
141 #ifdef __APPLE__
142 // lldb has one more prompt
143 result += ReadUntilGdbPrompt(fd_stdout);
144 #endif
145 result += ReadUntilGdbPrompt(fd_stdout) + "\n\n";
146
147 // Check for output on stderr
148 string result_err;
149 Block2Nonblock(fd_stderr);
150 char cbuf;
151 while (read(fd_stderr, &cbuf, 1) == 1)
152 result_err.push_back(cbuf);
153 if (!result_err.empty())
154 result += "\nError output:\n" + result_err + "\n";
155
156 // Close the connection to the terminated gdb process
157 close(fd_stderr);
158 close(fd_stdout);
159 close(fd_stdin);
160
161 // Make sure gdb has terminated (wait for it for a short while)
162 unsigned int timeout = 15;
163 int statloc;
164 while (timeout > 0 && waitpid(gdb_pid, &statloc, WNOHANG) != gdb_pid) {
165 --timeout;
166 SafeSleepMs(1000);
167 }
168
169 // when the timeout expired, gdb probably hangs... we need to kill it
170 if (timeout == 0) {
171 result += "gdb did not exit as expected. sending SIGKILL... ";
172 result += (kill(gdb_pid, SIGKILL) != 0) ? "failed\n" : "okay\n";
173 }
174
175 return result;
176 }
177
178
179 pid_t Watchdog::GetPid() {
180 if (instance_ != NULL) {
181 return instance_->watchdog_pid_;
182 }
183 return getpid();
184 }
185
186 /**
187 * Log a string to syslog and into the crash dump file.
188 * We expect ideally nothing to be logged, so that file is created on demand.
189 */
190 void Watchdog::LogEmergency(string msg) {
191 char ctime_buffer[32];
192
193 if (!crash_dump_path_.empty()) {
194 FILE *fp = fopen(crash_dump_path_.c_str(), "a");
195 if (fp) {
196 const time_t now = time(NULL);
197 msg += "\nTimestamp: " + string(ctime_r(&now, ctime_buffer));
198 if (fwrite(&msg[0], 1, msg.length(), fp) != msg.length()) {
199 msg += " (failed to report into crash dump file " + crash_dump_path_
200 + ")";
201 } else {
202 msg += "\n Crash logged also on file: " + crash_dump_path_ + "\n";
203 }
204 fclose(fp);
205 } else {
206 msg += " (failed to open crash dump file " + crash_dump_path_ + ")";
207 }
208 }
209 LogCvmfs(kLogMonitor, kLogSyslogErr, "%s", msg.c_str());
210 }
211
212 /**
213 * Reads from the file descriptor until the specific gdb prompt is reached or
214 * the pipe gets broken.
215 *
216 * @param fd_pipe the file descriptor of the pipe to be read
217 * @return the data read from the pipe
218 */
219 string Watchdog::ReadUntilGdbPrompt(int fd_pipe) {
220 #ifdef __APPLE__
221 static const string gdb_prompt = "(lldb)";
222 #else
223 static const string gdb_prompt = "\n(gdb) ";
224 #endif
225
226 string result;
227 char mini_buffer;
228 int chars_io;
229 unsigned int ring_buffer_pos = 0;
230
231 // read from stdout of gdb until gdb prompt occurs --> (gdb)
232 while (1) {
233 chars_io = read(fd_pipe, &mini_buffer, 1);
234
235 // in case something goes wrong...
236 if (chars_io <= 0)
237 break;
238
239 result += mini_buffer;
240
241 // find the gdb_prompt in the stdout data
242 if (mini_buffer == gdb_prompt[ring_buffer_pos]) {
243 ++ring_buffer_pos;
244 if (ring_buffer_pos == gdb_prompt.size()) {
245 break;
246 }
247 } else {
248 ring_buffer_pos = 0;
249 }
250 }
251
252 return result;
253 }
254
255
256 /**
257 * Generates useful information from the backtrace log in the pipe.
258 */
259 string Watchdog::ReportStacktrace() {
260 CrashData crash_data;
261 if (!pipe_watchdog_->TryRead<CrashData>(&crash_data)) {
262 return "failed to read crash data (" + StringifyInt(errno) + ")";
263 }
264
265 string debug = "--\n";
266 debug += "Signal: " + StringifyInt(crash_data.signal);
267 debug += ", errno: " + StringifyInt(crash_data.sys_errno);
268 debug += ", version: " + string(CVMFS_VERSION);
269 debug += ", PID: " + StringifyInt(crash_data.pid) + "\n";
270 debug += "Executable path: " + exe_path_ + "\n";
271
272 debug += GenerateStackTrace(crash_data.pid);
273
274 // Give the dying process the finishing stroke
275 if (kill(crash_data.pid, SIGKILL) != 0) {
276 debug += "Failed to kill cvmfs client! (";
277 switch (errno) {
278 case EINVAL:
279 debug += "invalid signal";
280 break;
281 case EPERM:
282 debug += "permission denied";
283 break;
284 case ESRCH:
285 debug += "no such process";
286 break;
287 default:
288 debug += "unknown error " + StringifyInt(errno);
289 }
290 debug += ")\n\n";
291 }
292
293 return debug;
294 }
295
296
297 void Watchdog::ReportSignalAndContinue(int sig, siginfo_t *siginfo,
298 void * /* context */) {
299 LogCvmfs(kLogMonitor, kLogSyslogErr,
300 "watchdog: received unexpected signal %d from PID %d / UID %d", sig,
301 siginfo->si_pid, siginfo->si_uid);
302 }
303
304
305 void Watchdog::SendTrace(int sig, siginfo_t *siginfo, void *context) {
306 const int send_errno = errno;
307 if (platform_spinlock_trylock(&Me()->lock_handler_) != 0) {
308 // Concurrent call, wait for the first one to exit the process
309 while (true) {
310 }
311 }
312
313 // Set the original signal handler for the raised signal in
314 // SIGQUIT (watchdog process will raise SIGQUIT)
315 (void)sigaction(SIGQUIT, &(Me()->old_signal_handlers_[sig]), NULL);
316
317 // Inform the watchdog that CernVM-FS crashed
318 if (!Me()->pipe_watchdog_->Write(ControlFlow::kProduceStacktrace)) {
319 _exit(1);
320 }
321
322 // Send crash information to the watchdog
323 CrashData crash_data;
324 crash_data.signal = sig;
325 crash_data.sys_errno = send_errno;
326 crash_data.pid = getpid();
327 if (!Me()->pipe_watchdog_->Write<CrashData>(crash_data)) {
328 _exit(1);
329 }
330
331 // Do not die before the stack trace was generated
332 // kill -SIGQUIT <pid> will finish this
333 int counter = 0;
334 while (true) {
335 SafeSleepMs(100);
336 // quit anyway after 30 seconds
337 if (++counter == 300) {
338 LogCvmfs(kLogCvmfs, kLogSyslogErr, "stack trace generation failed");
339 // Last attempt to log something useful
340 #if defined(CVMFS_FUSE_MODULE)
341 LogCvmfs(kLogCvmfs, kLogSyslogErr, "Signal %d, errno %d", sig,
342 send_errno);
343 void *addr[kMaxBacktrace];
344 // Note: this doesn't work due to the signal stack on OS X (it works on
345 // Linux). Since anyway lldb is supposed to produce the backtrace, we
346 // consider it more important to protect cvmfs against stack overflows.
347 const int num_addr = backtrace(addr, kMaxBacktrace);
348 char **symbols = backtrace_symbols(addr, num_addr);
349 string backtrace = "Backtrace (" + StringifyInt(num_addr)
350 + " symbols):\n";
351 for (int i = 0; i < num_addr; ++i)
352 backtrace += string(symbols[i]) + "\n";
353 LogCvmfs(kLogCvmfs, kLogSyslogErr, "%s", backtrace.c_str());
354 LogCvmfs(kLogCvmfs, kLogSyslogErr, "address of g_cvmfs_exports: %p",
355 &g_cvmfs_exports);
356 #endif
357
358 _exit(1);
359 }
360 }
361
362 _exit(1);
363 }
364
365
366 /**
367 * Sets the signal handlers of the current process according to the ones
368 * defined in the given SigactionMap.
369 *
370 * @param signal_handlers a map of SIGNAL -> struct sigaction
371 * @return a SigactionMap containing the old handlers
372 */
373 49 Watchdog::SigactionMap Watchdog::SetSignalHandlers(
374 const SigactionMap &signal_handlers) {
375 49 SigactionMap old_signal_handlers;
376 49 SigactionMap::const_iterator i = signal_handlers.begin();
377 49 const SigactionMap::const_iterator iend = signal_handlers.end();
378
2/2
✓ Branch 1 taken 637 times.
✓ Branch 2 taken 49 times.
686 for (; i != iend; ++i) {
379 struct sigaction old_signal_handler;
380
1/2
✗ Branch 3 not taken.
✓ Branch 4 taken 637 times.
637 if (sigaction(i->first, &i->second, &old_signal_handler) != 0) {
381 PANIC(NULL);
382 }
383
1/2
✓ Branch 2 taken 637 times.
✗ Branch 3 not taken.
637 old_signal_handlers[i->first] = old_signal_handler;
384 }
385
386 98 return old_signal_handlers;
387 }
388
389
390 /**
391 * Fork the watchdog process and put it on hold until Spawn() is called.
392 */
393 49 void Watchdog::Fork(bool needs_read_environ) {
394
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 Pipe<kPipeWatchdogPid> pipe_pid;
395
3/6
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 49 times.
✗ Branch 5 not taken.
✓ Branch 7 taken 49 times.
✗ Branch 8 not taken.
49 pipe_watchdog_ = new Pipe<kPipeWatchdog>();
396
3/6
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 49 times.
✗ Branch 5 not taken.
✓ Branch 7 taken 49 times.
✗ Branch 8 not taken.
49 pipe_listener_ = new Pipe<kPipeWatchdogSupervisor>();
397
398 pid_t pid;
399 int statloc;
400
1/3
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 49 times.
49 switch (pid = fork()) {
401 case -1:
402 PANIC(NULL);
403 case 0:
404 // Double fork to avoid zombie
405 switch (fork()) {
406 case -1:
407 _exit(1);
408 case 0: {
409 pipe_watchdog_->CloseWriteFd();
410 Daemonize();
411
4/8
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 49 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 49 times.
✗ Branch 8 not taken.
✓ Branch 9 taken 49 times.
49 if ((geteuid() != 0) && SetuidCapabilityPermitted()) {
412 const std::vector<cap_value_t> nocaps;
413 if (on_exit_) {
414 // Reduce to minimum capabilities, which unfortunately is
415 // still quite powerful.
416 // CAP_SYS_ADMIN is needed to unmount, and CAP_SYS_PTRACE
417 // is needed when needs_read_environ is true because then
418 // the main Fuse process has elevated capabilities and
419 // ptrace (needed for collecting a stack trace) is not
420 // allowed on a process with more capabilities.
421 if (needs_read_environ) {
422 const std::vector<cap_value_t> reservecaps = {CAP_SYS_ADMIN, CAP_SYS_PTRACE};
423 const std::vector<cap_value_t> inheritcaps = {CAP_SYS_PTRACE};
424 assert(ClearPermittedCapabilities(reservecaps, inheritcaps));
425 } else {
426 const std::vector<cap_value_t> reservecaps = {CAP_SYS_ADMIN};
427 assert(ClearPermittedCapabilities(reservecaps, nocaps));
428 }
429 } else {
430 // Only need to be able to do the stack trace, and the
431 // main process needs no extra capabilities, so we can
432 // drop all capabilities.
433 assert(ClearPermittedCapabilities(nocaps, nocaps));
434 }
435 }
436 // send the watchdog PID to the supervisee
437 49 const pid_t watchdog_pid = getpid();
438
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 pipe_pid.Write(watchdog_pid);
439
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 pipe_pid.CloseWriteFd();
440 // Close all unused file descriptors
441 // close also usyslog, only get it back if necessary
442 // string usyslog_save = GetLogMicroSyslog();
443
1/3
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
49 const string debuglog_save = GetLogDebugFile();
444
2/4
✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
✓ Branch 5 taken 49 times.
✗ Branch 6 not taken.
49 SetLogDebugFile("");
445
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 const string usyslog_save = GetLogMicroSyslog();
446
2/4
✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
✓ Branch 5 taken 49 times.
✗ Branch 6 not taken.
49 SetLogMicroSyslog("");
447 // Gracefully close the syslog before closing all fds. The next call
448 // to syslog will reopen it.
449
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 closelog();
450 // Let's keep stdin, stdout, stderr open at /dev/null (daemonized)
451 // in order to prevent accidental outputs from messing with another
452 // file descriptor
453 49 std::set<int> preserve_fds;
454
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 preserve_fds.insert(0);
455
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 preserve_fds.insert(1);
456
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 preserve_fds.insert(2);
457
1/2
✓ Branch 3 taken 49 times.
✗ Branch 4 not taken.
49 preserve_fds.insert(pipe_watchdog_->GetReadFd());
458
1/2
✓ Branch 3 taken 49 times.
✗ Branch 4 not taken.
49 preserve_fds.insert(pipe_listener_->GetWriteFd());
459
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 CloseAllFildes(preserve_fds);
460
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 SetLogMicroSyslog(usyslog_save); // no-op if usyslog not used
461
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 SetLogDebugFile(debuglog_save); // no-op if debug log not used
462
463
2/4
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✓ Branch 4 taken 49 times.
49 if (WaitForSupervisee())
464 Supervise();
465
466
1/2
✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
49 pipe_watchdog_->CloseReadFd();
467
1/2
✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
49 pipe_listener_->CloseWriteFd();
468 49 exit(0);
469 }
470 default:
471 _exit(0);
472 }
473 49 default:
474
1/2
✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
49 pipe_watchdog_->CloseReadFd();
475
1/2
✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
49 pipe_listener_->CloseWriteFd();
476
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 pipe_pid.CloseWriteFd();
477
2/4
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✓ Branch 4 taken 49 times.
49 if (waitpid(pid, &statloc, 0) != pid)
478 PANIC(NULL);
479
2/4
✓ Branch 0 taken 49 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 49 times.
49 if (!WIFEXITED(statloc) || WEXITSTATUS(statloc))
480 PANIC(NULL);
481 }
482
483 // retrieve the watchdog PID from the pipe
484
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 pipe_pid.Read(&watchdog_pid_);
485
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 pipe_pid.CloseReadFd();
486 49 }
487
488
489 49 bool Watchdog::WaitForSupervisee() {
490 // We want broken pipes not to raise a signal but handle the error in the
491 // read/write code
492 49 platform_sighandler_t rv_sig = signal(SIGPIPE, SIG_IGN);
493
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 49 times.
49 assert(rv_sig != SIG_ERR);
494
495 // The watchdog is not supposed to receive signals. If it does, report it.
496 struct sigaction sa;
497 49 memset(&sa, 0, sizeof(sa));
498 49 sa.sa_sigaction = ReportSignalAndContinue;
499 49 sa.sa_flags = SA_SIGINFO;
500 49 sigfillset(&sa.sa_mask);
501
502 49 SigactionMap signal_handlers;
503
2/2
✓ Branch 0 taken 637 times.
✓ Branch 1 taken 49 times.
686 for (size_t i = 0; i < sizeof(g_suppressed_signals) / sizeof(int); i++) {
504
1/2
✓ Branch 1 taken 637 times.
✗ Branch 2 not taken.
637 signal_handlers[g_suppressed_signals[i]] = sa;
505 }
506
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 SetSignalHandlers(signal_handlers);
507
508 49 ControlFlow::Flags control_flow = ControlFlow::kUnknown;
509
510
3/4
✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 48 times.
✓ Branch 5 taken 1 times.
49 if (!pipe_watchdog_->TryRead(&control_flow)) {
511
1/2
✓ Branch 1 taken 48 times.
✗ Branch 2 not taken.
48 LogCvmfs(kLogMonitor, kLogDebug, "supervisee canceled watchdog");
512 48 return false;
513 }
514
515
1/4
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
1 switch (control_flow) {
516 1 case ControlFlow::kQuit:
517 1 return false;
518 case ControlFlow::kQuitWithExit:
519 if (on_exit_) on_exit_(false);
520 return false;
521 case ControlFlow::kSupervise:
522 break;
523 default:
524 LogEmergency("Internal error: invalid control flow");
525 return false;
526 }
527
528 size_t size;
529 pipe_watchdog_->Read(&size);
530 crash_dump_path_.resize(size);
531 if (size > 0) {
532 pipe_watchdog_->Read(&crash_dump_path_[0], size);
533
534 const int retval = chdir(GetParentPath(crash_dump_path_).c_str());
535 if (retval != 0) {
536 LogEmergency(std::string("Cannot change to crash dump directory: ")
537 + crash_dump_path_);
538 return false;
539 }
540 crash_dump_path_ = GetFileName(crash_dump_path_);
541 }
542 return true;
543 49 }
544
545
546 /**
547 * Set up the signal handling and kick off the supervision.
548 */
549 void Watchdog::Spawn(const std::string &crash_dump_path) {
550 // lower restrictions for ptrace
551 if (!platform_allow_ptrace(watchdog_pid_)) {
552 LogCvmfs(kLogMonitor, kLogSyslogWarn,
553 "failed to allow ptrace() for watchdog (PID: %d). "
554 "Post crash stacktrace might not work",
555 watchdog_pid_);
556 }
557
558 // Extra stack for signal handlers
559 const int stack_size = kSignalHandlerStacksize; // 2 MB
560 sighandler_stack_.ss_sp = smalloc(stack_size);
561 sighandler_stack_.ss_size = stack_size;
562 sighandler_stack_.ss_flags = 0;
563 if (sigaltstack(&sighandler_stack_, NULL) != 0)
564 PANIC(NULL);
565
566 // define our crash signal handler
567 struct sigaction sa;
568 memset(&sa, 0, sizeof(sa));
569 sa.sa_sigaction = SendTrace;
570 sa.sa_flags = SA_SIGINFO | SA_ONSTACK;
571 sigfillset(&sa.sa_mask);
572
573 SigactionMap signal_handlers;
574 for (size_t i = 0; i < sizeof(g_crash_signals) / sizeof(int); i++) {
575 signal_handlers[g_crash_signals[i]] = sa;
576 }
577 old_signal_handlers_ = SetSignalHandlers(signal_handlers);
578
579 pipe_terminate_ = new Pipe<kPipeThreadTerminator>();
580 const int retval = pthread_create(&thread_listener_, NULL,
581 MainWatchdogListener, this);
582 assert(retval == 0);
583
584 if (spawned_) {
585 // This happens after a reload, when the watchdog process is
586 // already running so we can exit here.
587 return;
588 }
589
590 pipe_watchdog_->Write(ControlFlow::kSupervise);
591 const size_t path_size = crash_dump_path.size();
592 pipe_watchdog_->Write(path_size);
593 if (path_size > 0) {
594 pipe_watchdog_->Write(crash_dump_path.data(), path_size);
595 }
596
597 spawned_ = true;
598 }
599
600
601 void *Watchdog::MainWatchdogListener(void *data) {
602 Watchdog *watchdog = static_cast<Watchdog *>(data);
603 LogCvmfs(kLogMonitor, kLogDebug, "starting watchdog listener");
604
605 if ((getuid() != 0) && SetuidCapabilityPermitted()) {
606 // Drop all capabilities, none are needed in the listener
607 const std::vector<cap_value_t> nocaps;
608 assert(ClearPermittedCapabilities(nocaps, nocaps));
609 }
610
611 struct pollfd watch_fds[2];
612 watch_fds[0].fd = watchdog->pipe_listener_->GetReadFd();
613 watch_fds[0].events = 0; // Only check for POLL[ERR,HUP,NVAL] in revents
614 watch_fds[0].revents = 0;
615 watch_fds[1].fd = watchdog->pipe_terminate_->GetReadFd();
616 watch_fds[1].events = POLLIN | POLLPRI;
617 watch_fds[1].revents = 0;
618 while (true) {
619 const int retval = poll(watch_fds, 2, -1);
620 if (retval < 0) {
621 continue;
622 }
623
624 // Terminate I/O thread
625 if (watch_fds[1].revents)
626 break;
627
628 if (watch_fds[0].revents) {
629 if ((watch_fds[0].revents & POLLERR) || (watch_fds[0].revents & POLLHUP)
630 || (watch_fds[0].revents & POLLNVAL)) {
631 LogCvmfs(kLogMonitor, kLogDebug | kLogSyslogErr,
632 "watchdog disappeared, disabling stack trace reporting "
633 "(revents: %d / %d|%d|%d)",
634 watch_fds[0].revents, POLLERR, POLLHUP, POLLNVAL);
635 watchdog->SetSignalHandlers(watchdog->old_signal_handlers_);
636 PANIC(kLogDebug | kLogSyslogErr, "watchdog disappeared, aborting");
637 }
638 PANIC(NULL);
639 }
640 }
641
642 LogCvmfs(kLogMonitor, kLogDebug, "stopping watchdog listener");
643 return NULL;
644 }
645
646
647 void Watchdog::Supervise() {
648 ControlFlow::Flags control_flow = ControlFlow::kUnknown;
649
650 if (!pipe_watchdog_->TryRead<ControlFlow::Flags>(&control_flow)) {
651 LogEmergency("watchdog: unexpected termination ("
652 + StringifyInt(control_flow) + ")");
653 if (on_exit_)
654 on_exit_(true /* crashed */);
655 } else {
656 switch (control_flow) {
657 case ControlFlow::kProduceStacktrace:
658 LogEmergency(ReportStacktrace());
659 if (on_exit_)
660 on_exit_(true /* crashed */);
661 break;
662
663 case ControlFlow::kQuitWithExit:
664 if (on_exit_)
665 on_exit_(false /* crashed */);
666 break;
667
668 case ControlFlow::kQuit:
669 break;
670
671 default:
672 LogEmergency("watchdog: unexpected error");
673 break;
674 }
675 }
676 }
677
678
679 /**
680 * Save the state of the watchdog listener thread before reload.
681 */
682 void Watchdog::SaveState(WatchdogState *saved_state) {
683 saved_state->spawned = spawned_;
684 saved_state->pid = watchdog_pid_;
685 if (spawned_) {
686 saved_state->watchdog_write_fd = pipe_watchdog_->GetWriteFd();
687 saved_state->listener_read_fd = pipe_listener_->GetReadFd();
688 }
689 }
690
691
692 /**
693 * Restore the state of the watchdog listener after a reload
694 */
695 void Watchdog::RestoreState(WatchdogState *saved_state) {
696 watchdog_pid_ = saved_state->pid;
697 if (!saved_state->spawned) {
698 return;
699 }
700 pipe_watchdog_ = new Pipe<kPipeWatchdog>(-1, saved_state->watchdog_write_fd);
701 pipe_listener_ = new Pipe<kPipeWatchdogSupervisor>(saved_state->listener_read_fd, -1);
702 spawned_ = true;
703 }
704
705
706 49 Watchdog::Watchdog(FnOnExit on_exit)
707 49 : spawned_(false)
708 49 , maintenance_mode_(false)
709
1/2
✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
49 , exe_path_(string(platform_getexepath()))
710 49 , watchdog_pid_(0)
711
3/6
✓ Branch 3 taken 49 times.
✗ Branch 4 not taken.
✓ Branch 6 taken 49 times.
✗ Branch 7 not taken.
✓ Branch 9 taken 49 times.
✗ Branch 10 not taken.
98 , on_exit_(on_exit)
712 {
713 49 const int retval = platform_spinlock_init(&lock_handler_, 0);
714
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 49 times.
49 assert(retval == 0);
715 49 memset(&sighandler_stack_, 0, sizeof(sighandler_stack_));
716 49 }
717
718
719 1 Watchdog::~Watchdog() {
720
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (spawned_) {
721 // Reset signal handlers
722 signal(SIGQUIT, SIG_DFL);
723 signal(SIGILL, SIG_DFL);
724 signal(SIGABRT, SIG_DFL);
725 signal(SIGFPE, SIG_DFL);
726 signal(SIGSEGV, SIG_DFL);
727 signal(SIGBUS, SIG_DFL);
728 signal(SIGPIPE, SIG_DFL);
729 signal(SIGXFSZ, SIG_DFL);
730 free(sighandler_stack_.ss_sp);
731 sighandler_stack_.ss_size = 0;
732
733 // The watchdog listener thread exits on any message received
734 pipe_terminate_->Write(ControlFlow::kQuit);
735 pthread_join(thread_listener_, NULL);
736 pipe_terminate_->Close();
737 }
738
739
1/2
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
1 if (!maintenance_mode_) {
740 // Shutdown the watchdog except when doing a reload
741
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (on_exit_) {
742 pipe_watchdog_->Write(ControlFlow::kQuitWithExit);
743 } else {
744 1 pipe_watchdog_->Write(ControlFlow::kQuit);
745 }
746 1 pipe_watchdog_->CloseWriteFd();
747 1 pipe_listener_->CloseReadFd();
748 } else {
749 // Release the references to the watchdog pipes without closing them
750 pipe_watchdog_.Release();
751 pipe_listener_.Release();
752 }
753
754 1 platform_spinlock_destroy(&lock_handler_);
755 1 LogCvmfs(kLogMonitor, kLogDebug, "monitor stopped");
756 1 instance_ = NULL;
757 1 }
758