| Directory: | cvmfs/ |
|---|---|
| File: | cvmfs/monitor.cc |
| Date: | 2025-11-09 02:35:23 |
| Exec | Total | Coverage | |
|---|---|---|---|
| Lines: | 85 | 320 | 26.6% |
| Branches: | 60 | 500 | 12.0% |
| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /** | ||
| 2 | * This file is part of the CernVM File System. | ||
| 3 | * | ||
| 4 | * This module forks a watchdog process that listens on | ||
| 5 | * a pipe and prints a stacktrace into syslog, when cvmfs | ||
| 6 | * fails. | ||
| 7 | * | ||
| 8 | * Also, it handles getting and setting the maximum number of file descriptors. | ||
| 9 | */ | ||
| 10 | |||
| 11 | |||
| 12 | #include "monitor.h" | ||
| 13 | |||
| 14 | #include <errno.h> | ||
| 15 | #include <execinfo.h> | ||
| 16 | #include <poll.h> | ||
| 17 | #include <pthread.h> | ||
| 18 | #include <signal.h> | ||
| 19 | #include <sys/resource.h> | ||
| 20 | #include <sys/types.h> | ||
| 21 | #ifdef __APPLE__ | ||
| 22 | #include <sys/ucontext.h> | ||
| 23 | #else | ||
| 24 | #include <ucontext.h> | ||
| 25 | #endif | ||
| 26 | #include <sys/uio.h> | ||
| 27 | #include <sys/wait.h> | ||
| 28 | #include <syslog.h> | ||
| 29 | #include <time.h> | ||
| 30 | #include <unistd.h> | ||
| 31 | |||
| 32 | #include <cassert> | ||
| 33 | #include <cstdio> | ||
| 34 | #include <cstdlib> | ||
| 35 | #include <cstring> | ||
| 36 | #include <map> | ||
| 37 | #include <set> | ||
| 38 | #include <string> | ||
| 39 | #include <vector> | ||
| 40 | |||
| 41 | #if defined(CVMFS_FUSE_MODULE) | ||
| 42 | #include "cvmfs.h" | ||
| 43 | #endif | ||
| 44 | #include "util/exception.h" | ||
| 45 | #include "util/logging.h" | ||
| 46 | #include "util/platform.h" | ||
| 47 | #include "util/posix.h" | ||
| 48 | #include "util/smalloc.h" | ||
| 49 | #include "util/string.h" | ||
| 50 | |||
| 51 | // Used for address offset calculation | ||
| 52 | #if defined(CVMFS_FUSE_MODULE) | ||
| 53 | extern loader::CvmfsExports *g_cvmfs_exports; | ||
| 54 | #endif | ||
| 55 | |||
| 56 | using namespace std; // NOLINT | ||
| 57 | |||
| 58 | Watchdog *Watchdog::instance_ = NULL; | ||
| 59 | |||
| 60 | int Watchdog::g_suppressed_signals[] = { | ||
| 61 | SIGHUP, SIGINT, SIGQUIT, SIGILL, SIGABRT, SIGBUS, SIGFPE, | ||
| 62 | SIGUSR1, SIGSEGV, SIGUSR2, SIGTERM, SIGXCPU, SIGXFSZ}; | ||
| 63 | |||
| 64 | int Watchdog::g_crash_signals[] = {SIGQUIT, SIGILL, SIGABRT, SIGFPE, | ||
| 65 | SIGSEGV, SIGBUS, SIGPIPE, SIGXFSZ}; | ||
| 66 | |||
| 67 | 49 | Watchdog *Watchdog::Create(FnOnCrash on_crash) { | |
| 68 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 49 times.
|
49 | assert(instance_ == NULL); |
| 69 |
1/2✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
|
49 | instance_ = new Watchdog(on_crash); |
| 70 | 49 | instance_->Fork(); | |
| 71 | 49 | return instance_; | |
| 72 | } | ||
| 73 | |||
| 74 | |||
| 75 | /** | ||
| 76 | * Uses an external shell and gdb to create a full stack trace of the dying | ||
| 77 | * process. The same shell is used to force-quit the client afterwards. | ||
| 78 | */ | ||
| 79 | ✗ | string Watchdog::GenerateStackTrace(pid_t pid) { | |
| 80 | int retval; | ||
| 81 | ✗ | string result = ""; | |
| 82 | |||
| 83 | // re-gain root permissions to allow for ptrace of the dead cvmfs2 process | ||
| 84 | ✗ | const bool retrievable = true; | |
| 85 | ✗ | if (!SwitchCredentials(0, getgid(), retrievable)) { | |
| 86 | ✗ | result += "failed to re-gain root permissions... still give it a try\n"; | |
| 87 | } | ||
| 88 | |||
| 89 | // run gdb and attach to the dying process | ||
| 90 | int fd_stdin; | ||
| 91 | int fd_stdout; | ||
| 92 | int fd_stderr; | ||
| 93 | ✗ | vector<string> argv; | |
| 94 | ✗ | argv.push_back("-p"); | |
| 95 | ✗ | argv.push_back(StringifyInt(pid)); | |
| 96 | ✗ | pid_t gdb_pid = 0; | |
| 97 | ✗ | const bool double_fork = false; | |
| 98 | ✗ | retval = ExecuteBinary(&fd_stdin, | |
| 99 | &fd_stdout, | ||
| 100 | &fd_stderr, | ||
| 101 | #ifdef __APPLE__ | ||
| 102 | "lldb", | ||
| 103 | #else | ||
| 104 | "gdb", | ||
| 105 | #endif | ||
| 106 | argv, | ||
| 107 | double_fork, | ||
| 108 | &gdb_pid); | ||
| 109 | ✗ | assert(retval); | |
| 110 | |||
| 111 | |||
| 112 | // Skip the gdb startup output | ||
| 113 | ✗ | ReadUntilGdbPrompt(fd_stdout); | |
| 114 | |||
| 115 | // Send stacktrace command to gdb | ||
| 116 | #ifdef __APPLE__ | ||
| 117 | const string gdb_cmd = "bt all\n" | ||
| 118 | "quit\n"; | ||
| 119 | #else | ||
| 120 | const string gdb_cmd = "thread apply all bt\n" | ||
| 121 | ✗ | "quit\n"; | |
| 122 | #endif | ||
| 123 | // The execve can have failed, which can't be detected in ExecuteBinary. | ||
| 124 | // Instead, writing to the pipe will fail. | ||
| 125 | ✗ | const ssize_t nbytes = write(fd_stdin, gdb_cmd.data(), gdb_cmd.length()); | |
| 126 | ✗ | if ((nbytes < 0) || (static_cast<unsigned>(nbytes) != gdb_cmd.length())) { | |
| 127 | ✗ | result += "failed to start gdb/lldb (" + StringifyInt(nbytes) | |
| 128 | ✗ | + " bytes " | |
| 129 | "written, errno " | ||
| 130 | ✗ | + StringifyInt(errno) + ")\n"; | |
| 131 | ✗ | return result; | |
| 132 | } | ||
| 133 | |||
| 134 | // Read the stack trace from the stdout of our gdb process | ||
| 135 | #ifdef __APPLE__ | ||
| 136 | // lldb has one more prompt | ||
| 137 | result += ReadUntilGdbPrompt(fd_stdout); | ||
| 138 | #endif | ||
| 139 | ✗ | result += ReadUntilGdbPrompt(fd_stdout) + "\n\n"; | |
| 140 | |||
| 141 | // Check for output on stderr | ||
| 142 | ✗ | string result_err; | |
| 143 | ✗ | Block2Nonblock(fd_stderr); | |
| 144 | char cbuf; | ||
| 145 | ✗ | while (read(fd_stderr, &cbuf, 1) == 1) | |
| 146 | ✗ | result_err.push_back(cbuf); | |
| 147 | ✗ | if (!result_err.empty()) | |
| 148 | ✗ | result += "\nError output:\n" + result_err + "\n"; | |
| 149 | |||
| 150 | // Close the connection to the terminated gdb process | ||
| 151 | ✗ | close(fd_stderr); | |
| 152 | ✗ | close(fd_stdout); | |
| 153 | ✗ | close(fd_stdin); | |
| 154 | |||
| 155 | // Make sure gdb has terminated (wait for it for a short while) | ||
| 156 | ✗ | unsigned int timeout = 15; | |
| 157 | int statloc; | ||
| 158 | ✗ | while (timeout > 0 && waitpid(gdb_pid, &statloc, WNOHANG) != gdb_pid) { | |
| 159 | ✗ | --timeout; | |
| 160 | ✗ | SafeSleepMs(1000); | |
| 161 | } | ||
| 162 | |||
| 163 | // when the timeout expired, gdb probably hangs... we need to kill it | ||
| 164 | ✗ | if (timeout == 0) { | |
| 165 | ✗ | result += "gdb did not exit as expected. sending SIGKILL... "; | |
| 166 | ✗ | result += (kill(gdb_pid, SIGKILL) != 0) ? "failed\n" : "okay\n"; | |
| 167 | } | ||
| 168 | |||
| 169 | ✗ | return result; | |
| 170 | } | ||
| 171 | |||
| 172 | |||
| 173 | ✗ | pid_t Watchdog::GetPid() { | |
| 174 | ✗ | if (instance_ != NULL) { | |
| 175 | ✗ | return instance_->watchdog_pid_; | |
| 176 | } | ||
| 177 | ✗ | return getpid(); | |
| 178 | } | ||
| 179 | |||
| 180 | /** | ||
| 181 | * Log a string to syslog and into the crash dump file. | ||
| 182 | * We expect ideally nothing to be logged, so that file is created on demand. | ||
| 183 | */ | ||
| 184 | ✗ | void Watchdog::LogEmergency(string msg) { | |
| 185 | char ctime_buffer[32]; | ||
| 186 | |||
| 187 | ✗ | if (!crash_dump_path_.empty()) { | |
| 188 | ✗ | FILE *fp = fopen(crash_dump_path_.c_str(), "a"); | |
| 189 | ✗ | if (fp) { | |
| 190 | ✗ | const time_t now = time(NULL); | |
| 191 | ✗ | msg += "\nTimestamp: " + string(ctime_r(&now, ctime_buffer)); | |
| 192 | ✗ | if (fwrite(&msg[0], 1, msg.length(), fp) != msg.length()) { | |
| 193 | ✗ | msg += " (failed to report into crash dump file " + crash_dump_path_ | |
| 194 | ✗ | + ")"; | |
| 195 | } else { | ||
| 196 | ✗ | msg += "\n Crash logged also on file: " + crash_dump_path_ + "\n"; | |
| 197 | } | ||
| 198 | ✗ | fclose(fp); | |
| 199 | } else { | ||
| 200 | ✗ | msg += " (failed to open crash dump file " + crash_dump_path_ + ")"; | |
| 201 | } | ||
| 202 | } | ||
| 203 | ✗ | LogCvmfs(kLogMonitor, kLogSyslogErr, "%s", msg.c_str()); | |
| 204 | } | ||
| 205 | |||
| 206 | /** | ||
| 207 | * Reads from the file descriptor until the specific gdb prompt is reached or | ||
| 208 | * the pipe gets broken. | ||
| 209 | * | ||
| 210 | * @param fd_pipe the file descriptor of the pipe to be read | ||
| 211 | * @return the data read from the pipe | ||
| 212 | */ | ||
| 213 | ✗ | string Watchdog::ReadUntilGdbPrompt(int fd_pipe) { | |
| 214 | #ifdef __APPLE__ | ||
| 215 | static const string gdb_prompt = "(lldb)"; | ||
| 216 | #else | ||
| 217 | ✗ | static const string gdb_prompt = "\n(gdb) "; | |
| 218 | #endif | ||
| 219 | |||
| 220 | ✗ | string result; | |
| 221 | char mini_buffer; | ||
| 222 | int chars_io; | ||
| 223 | ✗ | unsigned int ring_buffer_pos = 0; | |
| 224 | |||
| 225 | // read from stdout of gdb until gdb prompt occurs --> (gdb) | ||
| 226 | while (1) { | ||
| 227 | ✗ | chars_io = read(fd_pipe, &mini_buffer, 1); | |
| 228 | |||
| 229 | // in case something goes wrong... | ||
| 230 | ✗ | if (chars_io <= 0) | |
| 231 | ✗ | break; | |
| 232 | |||
| 233 | ✗ | result += mini_buffer; | |
| 234 | |||
| 235 | // find the gdb_prompt in the stdout data | ||
| 236 | ✗ | if (mini_buffer == gdb_prompt[ring_buffer_pos]) { | |
| 237 | ✗ | ++ring_buffer_pos; | |
| 238 | ✗ | if (ring_buffer_pos == gdb_prompt.size()) { | |
| 239 | ✗ | break; | |
| 240 | } | ||
| 241 | } else { | ||
| 242 | ✗ | ring_buffer_pos = 0; | |
| 243 | } | ||
| 244 | } | ||
| 245 | |||
| 246 | ✗ | return result; | |
| 247 | } | ||
| 248 | |||
| 249 | |||
| 250 | /** | ||
| 251 | * Generates useful information from the backtrace log in the pipe. | ||
| 252 | */ | ||
| 253 | ✗ | string Watchdog::ReportStacktrace() { | |
| 254 | CrashData crash_data; | ||
| 255 | ✗ | if (!pipe_watchdog_->TryRead<CrashData>(&crash_data)) { | |
| 256 | ✗ | return "failed to read crash data (" + StringifyInt(errno) + ")"; | |
| 257 | } | ||
| 258 | |||
| 259 | ✗ | string debug = "--\n"; | |
| 260 | ✗ | debug += "Signal: " + StringifyInt(crash_data.signal); | |
| 261 | ✗ | debug += ", errno: " + StringifyInt(crash_data.sys_errno); | |
| 262 | ✗ | debug += ", version: " + string(CVMFS_VERSION); | |
| 263 | ✗ | debug += ", PID: " + StringifyInt(crash_data.pid) + "\n"; | |
| 264 | ✗ | debug += "Executable path: " + exe_path_ + "\n"; | |
| 265 | |||
| 266 | ✗ | debug += GenerateStackTrace(crash_data.pid); | |
| 267 | |||
| 268 | // Give the dying process the finishing stroke | ||
| 269 | ✗ | if (kill(crash_data.pid, SIGKILL) != 0) { | |
| 270 | ✗ | debug += "Failed to kill cvmfs client! ("; | |
| 271 | ✗ | switch (errno) { | |
| 272 | ✗ | case EINVAL: | |
| 273 | ✗ | debug += "invalid signal"; | |
| 274 | ✗ | break; | |
| 275 | ✗ | case EPERM: | |
| 276 | ✗ | debug += "permission denied"; | |
| 277 | ✗ | break; | |
| 278 | ✗ | case ESRCH: | |
| 279 | ✗ | debug += "no such process"; | |
| 280 | ✗ | break; | |
| 281 | ✗ | default: | |
| 282 | ✗ | debug += "unknown error " + StringifyInt(errno); | |
| 283 | } | ||
| 284 | ✗ | debug += ")\n\n"; | |
| 285 | } | ||
| 286 | |||
| 287 | ✗ | return debug; | |
| 288 | } | ||
| 289 | |||
| 290 | |||
| 291 | ✗ | void Watchdog::ReportSignalAndContinue(int sig, siginfo_t *siginfo, | |
| 292 | void * /* context */) { | ||
| 293 | ✗ | LogCvmfs(kLogMonitor, kLogSyslogErr, | |
| 294 | "watchdog: received unexpected signal %d from PID %d / UID %d", sig, | ||
| 295 | siginfo->si_pid, siginfo->si_uid); | ||
| 296 | } | ||
| 297 | |||
| 298 | |||
| 299 | ✗ | void Watchdog::SendTrace(int sig, siginfo_t *siginfo, void *context) { | |
| 300 | ✗ | const int send_errno = errno; | |
| 301 | ✗ | if (platform_spinlock_trylock(&Me()->lock_handler_) != 0) { | |
| 302 | // Concurrent call, wait for the first one to exit the process | ||
| 303 | ✗ | while (true) { | |
| 304 | } | ||
| 305 | } | ||
| 306 | |||
| 307 | // Set the original signal handler for the raised signal in | ||
| 308 | // SIGQUIT (watchdog process will raise SIGQUIT) | ||
| 309 | ✗ | (void)sigaction(SIGQUIT, &(Me()->old_signal_handlers_[sig]), NULL); | |
| 310 | |||
| 311 | // Inform the watchdog that CernVM-FS crashed | ||
| 312 | ✗ | if (!Me()->pipe_watchdog_->Write(ControlFlow::kProduceStacktrace)) { | |
| 313 | ✗ | _exit(1); | |
| 314 | } | ||
| 315 | |||
| 316 | // Send crash information to the watchdog | ||
| 317 | CrashData crash_data; | ||
| 318 | ✗ | crash_data.signal = sig; | |
| 319 | ✗ | crash_data.sys_errno = send_errno; | |
| 320 | ✗ | crash_data.pid = getpid(); | |
| 321 | ✗ | if (!Me()->pipe_watchdog_->Write<CrashData>(crash_data)) { | |
| 322 | ✗ | _exit(1); | |
| 323 | } | ||
| 324 | |||
| 325 | // Do not die before the stack trace was generated | ||
| 326 | // kill -SIGQUIT <pid> will finish this | ||
| 327 | ✗ | int counter = 0; | |
| 328 | while (true) { | ||
| 329 | ✗ | SafeSleepMs(100); | |
| 330 | // quit anyway after 30 seconds | ||
| 331 | ✗ | if (++counter == 300) { | |
| 332 | ✗ | LogCvmfs(kLogCvmfs, kLogSyslogErr, "stack trace generation failed"); | |
| 333 | // Last attempt to log something useful | ||
| 334 | #if defined(CVMFS_FUSE_MODULE) | ||
| 335 | ✗ | LogCvmfs(kLogCvmfs, kLogSyslogErr, "Signal %d, errno %d", sig, | |
| 336 | send_errno); | ||
| 337 | void *addr[kMaxBacktrace]; | ||
| 338 | // Note: this doesn't work due to the signal stack on OS X (it works on | ||
| 339 | // Linux). Since anyway lldb is supposed to produce the backtrace, we | ||
| 340 | // consider it more important to protect cvmfs against stack overflows. | ||
| 341 | ✗ | const int num_addr = backtrace(addr, kMaxBacktrace); | |
| 342 | ✗ | char **symbols = backtrace_symbols(addr, num_addr); | |
| 343 | ✗ | string backtrace = "Backtrace (" + StringifyInt(num_addr) | |
| 344 | ✗ | + " symbols):\n"; | |
| 345 | ✗ | for (int i = 0; i < num_addr; ++i) | |
| 346 | ✗ | backtrace += string(symbols[i]) + "\n"; | |
| 347 | ✗ | LogCvmfs(kLogCvmfs, kLogSyslogErr, "%s", backtrace.c_str()); | |
| 348 | ✗ | LogCvmfs(kLogCvmfs, kLogSyslogErr, "address of g_cvmfs_exports: %p", | |
| 349 | &g_cvmfs_exports); | ||
| 350 | #endif | ||
| 351 | |||
| 352 | ✗ | _exit(1); | |
| 353 | } | ||
| 354 | } | ||
| 355 | |||
| 356 | _exit(1); | ||
| 357 | } | ||
| 358 | |||
| 359 | |||
| 360 | /** | ||
| 361 | * Sets the signal handlers of the current process according to the ones | ||
| 362 | * defined in the given SigactionMap. | ||
| 363 | * | ||
| 364 | * @param signal_handlers a map of SIGNAL -> struct sigaction | ||
| 365 | * @return a SigactionMap containing the old handlers | ||
| 366 | */ | ||
| 367 | 49 | Watchdog::SigactionMap Watchdog::SetSignalHandlers( | |
| 368 | const SigactionMap &signal_handlers) { | ||
| 369 | 49 | SigactionMap old_signal_handlers; | |
| 370 | 49 | SigactionMap::const_iterator i = signal_handlers.begin(); | |
| 371 | 49 | const SigactionMap::const_iterator iend = signal_handlers.end(); | |
| 372 |
2/2✓ Branch 1 taken 637 times.
✓ Branch 2 taken 49 times.
|
686 | for (; i != iend; ++i) { |
| 373 | struct sigaction old_signal_handler; | ||
| 374 |
1/2✗ Branch 3 not taken.
✓ Branch 4 taken 637 times.
|
637 | if (sigaction(i->first, &i->second, &old_signal_handler) != 0) { |
| 375 | ✗ | PANIC(NULL); | |
| 376 | } | ||
| 377 |
1/2✓ Branch 2 taken 637 times.
✗ Branch 3 not taken.
|
637 | old_signal_handlers[i->first] = old_signal_handler; |
| 378 | } | ||
| 379 | |||
| 380 | 98 | return old_signal_handlers; | |
| 381 | } | ||
| 382 | |||
| 383 | |||
| 384 | /** | ||
| 385 | * Fork the watchdog process and put it on hold until Spawn() is called. | ||
| 386 | */ | ||
| 387 | 49 | void Watchdog::Fork() { | |
| 388 |
1/2✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
|
49 | Pipe<kPipeWatchdogPid> pipe_pid; |
| 389 |
3/6✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 49 times.
✗ Branch 5 not taken.
✓ Branch 7 taken 49 times.
✗ Branch 8 not taken.
|
49 | pipe_watchdog_ = new Pipe<kPipeWatchdog>(); |
| 390 |
3/6✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 49 times.
✗ Branch 5 not taken.
✓ Branch 7 taken 49 times.
✗ Branch 8 not taken.
|
49 | pipe_listener_ = new Pipe<kPipeWatchdogSupervisor>(); |
| 391 | |||
| 392 | pid_t pid; | ||
| 393 | int statloc; | ||
| 394 |
1/3✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 49 times.
|
49 | switch (pid = fork()) { |
| 395 | ✗ | case -1: | |
| 396 | ✗ | PANIC(NULL); | |
| 397 | ✗ | case 0: | |
| 398 | // Double fork to avoid zombie | ||
| 399 | ✗ | switch (fork()) { | |
| 400 | ✗ | case -1: | |
| 401 | ✗ | _exit(1); | |
| 402 | ✗ | case 0: { | |
| 403 | ✗ | pipe_watchdog_->CloseWriteFd(); | |
| 404 | ✗ | Daemonize(); | |
| 405 | // send the watchdog PID to the supervisee | ||
| 406 | 49 | const pid_t watchdog_pid = getpid(); | |
| 407 |
1/2✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
|
49 | pipe_pid.Write(watchdog_pid); |
| 408 |
1/2✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
|
49 | pipe_pid.CloseWriteFd(); |
| 409 | // Close all unused file descriptors | ||
| 410 | // close also usyslog, only get it back if necessary | ||
| 411 | // string usyslog_save = GetLogMicroSyslog(); | ||
| 412 |
1/3✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
49 | const string debuglog_save = GetLogDebugFile(); |
| 413 |
2/4✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
✓ Branch 5 taken 49 times.
✗ Branch 6 not taken.
|
49 | SetLogDebugFile(""); |
| 414 |
1/2✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
|
49 | const string usyslog_save = GetLogMicroSyslog(); |
| 415 |
2/4✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
✓ Branch 5 taken 49 times.
✗ Branch 6 not taken.
|
49 | SetLogMicroSyslog(""); |
| 416 | // Gracefully close the syslog before closing all fds. The next call | ||
| 417 | // to syslog will reopen it. | ||
| 418 |
1/2✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
|
49 | closelog(); |
| 419 | // Let's keep stdin, stdout, stderr open at /dev/null (daemonized) | ||
| 420 | // in order to prevent accidental outputs from messing with another | ||
| 421 | // file descriptor | ||
| 422 | 49 | std::set<int> preserve_fds; | |
| 423 |
1/2✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
|
49 | preserve_fds.insert(0); |
| 424 |
1/2✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
|
49 | preserve_fds.insert(1); |
| 425 |
1/2✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
|
49 | preserve_fds.insert(2); |
| 426 |
1/2✓ Branch 3 taken 49 times.
✗ Branch 4 not taken.
|
49 | preserve_fds.insert(pipe_watchdog_->GetReadFd()); |
| 427 |
1/2✓ Branch 3 taken 49 times.
✗ Branch 4 not taken.
|
49 | preserve_fds.insert(pipe_listener_->GetWriteFd()); |
| 428 |
1/2✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
|
49 | CloseAllFildes(preserve_fds); |
| 429 |
1/2✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
|
49 | SetLogMicroSyslog(usyslog_save); // no-op if usyslog not used |
| 430 |
1/2✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
|
49 | SetLogDebugFile(debuglog_save); // no-op if debug log not used |
| 431 | |||
| 432 |
2/4✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✓ Branch 4 taken 49 times.
|
49 | if (WaitForSupervisee()) |
| 433 | ✗ | Supervise(); | |
| 434 | |||
| 435 |
1/2✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
|
49 | pipe_watchdog_->CloseReadFd(); |
| 436 |
1/2✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
|
49 | pipe_listener_->CloseWriteFd(); |
| 437 | 49 | exit(0); | |
| 438 | } | ||
| 439 | ✗ | default: | |
| 440 | ✗ | _exit(0); | |
| 441 | } | ||
| 442 | 49 | default: | |
| 443 |
1/2✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
|
49 | pipe_watchdog_->CloseReadFd(); |
| 444 |
1/2✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
|
49 | pipe_listener_->CloseWriteFd(); |
| 445 |
1/2✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
|
49 | pipe_pid.CloseWriteFd(); |
| 446 |
2/4✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✓ Branch 4 taken 49 times.
|
49 | if (waitpid(pid, &statloc, 0) != pid) |
| 447 | ✗ | PANIC(NULL); | |
| 448 |
2/4✓ Branch 0 taken 49 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 49 times.
|
49 | if (!WIFEXITED(statloc) || WEXITSTATUS(statloc)) |
| 449 | ✗ | PANIC(NULL); | |
| 450 | } | ||
| 451 | |||
| 452 | // retrieve the watchdog PID from the pipe | ||
| 453 |
1/2✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
|
49 | pipe_pid.Read(&watchdog_pid_); |
| 454 |
1/2✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
|
49 | pipe_pid.CloseReadFd(); |
| 455 | 49 | } | |
| 456 | |||
| 457 | |||
| 458 | 49 | bool Watchdog::WaitForSupervisee() { | |
| 459 | // We want broken pipes not to raise a signal but handle the error in the | ||
| 460 | // read/write code | ||
| 461 | 49 | platform_sighandler_t rv_sig = signal(SIGPIPE, SIG_IGN); | |
| 462 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 49 times.
|
49 | assert(rv_sig != SIG_ERR); |
| 463 | |||
| 464 | // The watchdog is not supposed to receive signals. If it does, report it. | ||
| 465 | struct sigaction sa; | ||
| 466 | 49 | memset(&sa, 0, sizeof(sa)); | |
| 467 | 49 | sa.sa_sigaction = ReportSignalAndContinue; | |
| 468 | 49 | sa.sa_flags = SA_SIGINFO; | |
| 469 | 49 | sigfillset(&sa.sa_mask); | |
| 470 | |||
| 471 | 49 | SigactionMap signal_handlers; | |
| 472 |
2/2✓ Branch 0 taken 637 times.
✓ Branch 1 taken 49 times.
|
686 | for (size_t i = 0; i < sizeof(g_suppressed_signals) / sizeof(int); i++) { |
| 473 |
1/2✓ Branch 1 taken 637 times.
✗ Branch 2 not taken.
|
637 | signal_handlers[g_suppressed_signals[i]] = sa; |
| 474 | } | ||
| 475 |
1/2✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
|
49 | SetSignalHandlers(signal_handlers); |
| 476 | |||
| 477 | 49 | ControlFlow::Flags control_flow = ControlFlow::kUnknown; | |
| 478 | |||
| 479 |
3/4✓ Branch 2 taken 49 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 48 times.
✓ Branch 5 taken 1 times.
|
49 | if (!pipe_watchdog_->TryRead(&control_flow)) { |
| 480 |
1/2✓ Branch 1 taken 48 times.
✗ Branch 2 not taken.
|
48 | LogCvmfs(kLogMonitor, kLogDebug, "supervisee canceled watchdog"); |
| 481 | 48 | return false; | |
| 482 | } | ||
| 483 | |||
| 484 |
1/3✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
|
1 | switch (control_flow) { |
| 485 | 1 | case ControlFlow::kQuit: | |
| 486 | 1 | return false; | |
| 487 | ✗ | case ControlFlow::kSupervise: | |
| 488 | ✗ | break; | |
| 489 | ✗ | default: | |
| 490 | ✗ | LogEmergency("Internal error: invalid control flow"); | |
| 491 | ✗ | return false; | |
| 492 | } | ||
| 493 | |||
| 494 | size_t size; | ||
| 495 | ✗ | pipe_watchdog_->Read(&size); | |
| 496 | ✗ | crash_dump_path_.resize(size); | |
| 497 | ✗ | if (size > 0) { | |
| 498 | ✗ | pipe_watchdog_->Read(&crash_dump_path_[0], size); | |
| 499 | |||
| 500 | ✗ | const int retval = chdir(GetParentPath(crash_dump_path_).c_str()); | |
| 501 | ✗ | if (retval != 0) { | |
| 502 | ✗ | LogEmergency(std::string("Cannot change to crash dump directory: ") | |
| 503 | ✗ | + crash_dump_path_); | |
| 504 | ✗ | return false; | |
| 505 | } | ||
| 506 | ✗ | crash_dump_path_ = GetFileName(crash_dump_path_); | |
| 507 | } | ||
| 508 | ✗ | return true; | |
| 509 | 49 | } | |
| 510 | |||
| 511 | /** | ||
| 512 | * Set up the signal handling and kick off the supervision. | ||
| 513 | */ | ||
| 514 | ✗ | void Watchdog::Spawn(const std::string &crash_dump_path) { | |
| 515 | // lower restrictions for ptrace | ||
| 516 | ✗ | if (!platform_allow_ptrace(watchdog_pid_)) { | |
| 517 | ✗ | LogCvmfs(kLogMonitor, kLogSyslogWarn, | |
| 518 | "failed to allow ptrace() for watchdog (PID: %d). " | ||
| 519 | "Post crash stacktrace might not work", | ||
| 520 | watchdog_pid_); | ||
| 521 | } | ||
| 522 | |||
| 523 | // Extra stack for signal handlers | ||
| 524 | ✗ | const int stack_size = kSignalHandlerStacksize; // 2 MB | |
| 525 | ✗ | sighandler_stack_.ss_sp = smalloc(stack_size); | |
| 526 | ✗ | sighandler_stack_.ss_size = stack_size; | |
| 527 | ✗ | sighandler_stack_.ss_flags = 0; | |
| 528 | ✗ | if (sigaltstack(&sighandler_stack_, NULL) != 0) | |
| 529 | ✗ | PANIC(NULL); | |
| 530 | |||
| 531 | // define our crash signal handler | ||
| 532 | struct sigaction sa; | ||
| 533 | ✗ | memset(&sa, 0, sizeof(sa)); | |
| 534 | ✗ | sa.sa_sigaction = SendTrace; | |
| 535 | ✗ | sa.sa_flags = SA_SIGINFO | SA_ONSTACK; | |
| 536 | ✗ | sigfillset(&sa.sa_mask); | |
| 537 | |||
| 538 | ✗ | SigactionMap signal_handlers; | |
| 539 | ✗ | for (size_t i = 0; i < sizeof(g_crash_signals) / sizeof(int); i++) { | |
| 540 | ✗ | signal_handlers[g_crash_signals[i]] = sa; | |
| 541 | } | ||
| 542 | ✗ | old_signal_handlers_ = SetSignalHandlers(signal_handlers); | |
| 543 | |||
| 544 | ✗ | pipe_terminate_ = new Pipe<kPipeThreadTerminator>(); | |
| 545 | ✗ | const int retval = pthread_create(&thread_listener_, NULL, | |
| 546 | MainWatchdogListener, this); | ||
| 547 | ✗ | assert(retval == 0); | |
| 548 | |||
| 549 | ✗ | pipe_watchdog_->Write(ControlFlow::kSupervise); | |
| 550 | ✗ | const size_t path_size = crash_dump_path.size(); | |
| 551 | ✗ | pipe_watchdog_->Write(path_size); | |
| 552 | ✗ | if (path_size > 0) { | |
| 553 | ✗ | pipe_watchdog_->Write(crash_dump_path.data(), path_size); | |
| 554 | } | ||
| 555 | |||
| 556 | ✗ | spawned_ = true; | |
| 557 | } | ||
| 558 | |||
| 559 | |||
| 560 | ✗ | void *Watchdog::MainWatchdogListener(void *data) { | |
| 561 | ✗ | Watchdog *watchdog = static_cast<Watchdog *>(data); | |
| 562 | ✗ | LogCvmfs(kLogMonitor, kLogDebug, "starting watchdog listener"); | |
| 563 | |||
| 564 | struct pollfd watch_fds[2]; | ||
| 565 | ✗ | watch_fds[0].fd = watchdog->pipe_listener_->GetReadFd(); | |
| 566 | ✗ | watch_fds[0].events = 0; // Only check for POLL[ERR,HUP,NVAL] in revents | |
| 567 | ✗ | watch_fds[0].revents = 0; | |
| 568 | ✗ | watch_fds[1].fd = watchdog->pipe_terminate_->GetReadFd(); | |
| 569 | ✗ | watch_fds[1].events = POLLIN | POLLPRI; | |
| 570 | ✗ | watch_fds[1].revents = 0; | |
| 571 | while (true) { | ||
| 572 | ✗ | const int retval = poll(watch_fds, 2, -1); | |
| 573 | ✗ | if (retval < 0) { | |
| 574 | ✗ | continue; | |
| 575 | } | ||
| 576 | |||
| 577 | // Terminate I/O thread | ||
| 578 | ✗ | if (watch_fds[1].revents) | |
| 579 | ✗ | break; | |
| 580 | |||
| 581 | ✗ | if (watch_fds[0].revents) { | |
| 582 | ✗ | if ((watch_fds[0].revents & POLLERR) || (watch_fds[0].revents & POLLHUP) | |
| 583 | ✗ | || (watch_fds[0].revents & POLLNVAL)) { | |
| 584 | ✗ | LogCvmfs(kLogMonitor, kLogDebug | kLogSyslogErr, | |
| 585 | "watchdog disappeared, disabling stack trace reporting " | ||
| 586 | "(revents: %d / %d|%d|%d)", | ||
| 587 | ✗ | watch_fds[0].revents, POLLERR, POLLHUP, POLLNVAL); | |
| 588 | ✗ | watchdog->SetSignalHandlers(watchdog->old_signal_handlers_); | |
| 589 | ✗ | PANIC(kLogDebug | kLogSyslogErr, "watchdog disappeared, aborting"); | |
| 590 | } | ||
| 591 | ✗ | PANIC(NULL); | |
| 592 | } | ||
| 593 | } | ||
| 594 | |||
| 595 | ✗ | LogCvmfs(kLogMonitor, kLogDebug, "stopping watchdog listener"); | |
| 596 | ✗ | return NULL; | |
| 597 | } | ||
| 598 | |||
| 599 | |||
| 600 | ✗ | void Watchdog::Supervise() { | |
| 601 | ✗ | ControlFlow::Flags control_flow = ControlFlow::kUnknown; | |
| 602 | |||
| 603 | ✗ | if (!pipe_watchdog_->TryRead<ControlFlow::Flags>(&control_flow)) { | |
| 604 | ✗ | LogEmergency("watchdog: unexpected termination (" | |
| 605 | ✗ | + StringifyInt(control_flow) + ")"); | |
| 606 | ✗ | if (on_crash_) | |
| 607 | ✗ | on_crash_(); | |
| 608 | } else { | ||
| 609 | ✗ | switch (control_flow) { | |
| 610 | ✗ | case ControlFlow::kProduceStacktrace: | |
| 611 | ✗ | LogEmergency(ReportStacktrace()); | |
| 612 | ✗ | if (on_crash_) | |
| 613 | ✗ | on_crash_(); | |
| 614 | ✗ | break; | |
| 615 | |||
| 616 | ✗ | case ControlFlow::kQuit: | |
| 617 | ✗ | break; | |
| 618 | |||
| 619 | ✗ | default: | |
| 620 | ✗ | LogEmergency("watchdog: unexpected error"); | |
| 621 | ✗ | break; | |
| 622 | } | ||
| 623 | } | ||
| 624 | } | ||
| 625 | |||
| 626 | |||
| 627 | 49 | Watchdog::Watchdog(FnOnCrash on_crash) | |
| 628 | 49 | : spawned_(false) | |
| 629 |
1/2✓ Branch 1 taken 49 times.
✗ Branch 2 not taken.
|
49 | , exe_path_(string(platform_getexepath())) |
| 630 | 49 | , watchdog_pid_(0) | |
| 631 |
3/6✓ Branch 3 taken 49 times.
✗ Branch 4 not taken.
✓ Branch 6 taken 49 times.
✗ Branch 7 not taken.
✓ Branch 9 taken 49 times.
✗ Branch 10 not taken.
|
98 | , on_crash_(on_crash) { |
| 632 | 49 | const int retval = platform_spinlock_init(&lock_handler_, 0); | |
| 633 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 49 times.
|
49 | assert(retval == 0); |
| 634 | 49 | memset(&sighandler_stack_, 0, sizeof(sighandler_stack_)); | |
| 635 | 49 | } | |
| 636 | |||
| 637 | |||
| 638 | 1 | Watchdog::~Watchdog() { | |
| 639 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
|
1 | if (spawned_) { |
| 640 | // Reset signal handlers | ||
| 641 | ✗ | signal(SIGQUIT, SIG_DFL); | |
| 642 | ✗ | signal(SIGILL, SIG_DFL); | |
| 643 | ✗ | signal(SIGABRT, SIG_DFL); | |
| 644 | ✗ | signal(SIGFPE, SIG_DFL); | |
| 645 | ✗ | signal(SIGSEGV, SIG_DFL); | |
| 646 | ✗ | signal(SIGBUS, SIG_DFL); | |
| 647 | ✗ | signal(SIGPIPE, SIG_DFL); | |
| 648 | ✗ | signal(SIGXFSZ, SIG_DFL); | |
| 649 | ✗ | free(sighandler_stack_.ss_sp); | |
| 650 | ✗ | sighandler_stack_.ss_size = 0; | |
| 651 | |||
| 652 | ✗ | pipe_terminate_->Write(ControlFlow::kQuit); | |
| 653 | ✗ | pthread_join(thread_listener_, NULL); | |
| 654 | ✗ | pipe_terminate_->Close(); | |
| 655 | } | ||
| 656 | |||
| 657 | 1 | pipe_watchdog_->Write(ControlFlow::kQuit); | |
| 658 | 1 | pipe_watchdog_->CloseWriteFd(); | |
| 659 | 1 | pipe_listener_->CloseReadFd(); | |
| 660 | |||
| 661 | 1 | platform_spinlock_destroy(&lock_handler_); | |
| 662 | 1 | LogCvmfs(kLogMonitor, kLogDebug, "monitor stopped"); | |
| 663 | 1 | instance_ = NULL; | |
| 664 | 1 | } | |
| 665 |