1 /*
2 * Copyright (C) 2013 Lars Marowsky-Bree <lmb@suse.com>
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License as published by the Free Software Foundation; either
7 * version 2 of the License, or (at your option) any later version.
8 *
9 * This software is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License along
15 * with this program; if not, write to the Free Software Foundation, Inc.,
16 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
17 */
18
19 #include <crm/common/util.h>
20 #include "sbd.h"
/* Number of bytes sbd_lock_pidfile() writes into the pid file: the pid
 * right-aligned in LOCKSTRLEN-1 columns followed by a newline. */
#define LOCKSTRLEN 11

/* Head of the singly linked list of servants; recruit_servant() appends. */
static struct servants_list_item *servants_leader = NULL;

/* Cleared by the -z option; consulted by inquisitor_child() when deciding
 * whether disk quorum alone is sufficient to tickle the watchdog. */
int disk_priority = 1;
/* Set from SBD_PACEMAKER and from the parity of repeated -P options. */
int check_pcmk = 1;
/* Set from SBD_PACEMAKER and from the parity of repeated -c options. */
int check_cluster = 1;
/* Becomes true once SBD_PACEMAKER was found in the environment. */
int has_check_pcmk_env = false;
/* Number of recruited disk servants (incremented in recruit_servant()). */
int disk_count = 0;
/* Number of all recruited servants, disks included. */
int servant_count = 0;
/* Option -t: once a servant has been started for longer than this many
 * seconds, inquisitor_child() resets its restart counter. */
int servant_restart_interval = 5;
/* Option -F: restarts allowed before a servant is blocked; 0 disables the
 * limit check in inquisitor_child(). */
int servant_restart_count = 1;
/* SBD_STARTMODE ("clean" -> 1, "always" -> 0) or option -S; handed to every
 * servant via assign_servant(). */
int start_mode = 0;
/* SBD_PIDFILE or option -p; locked/unlocked by inquisitor_child(). */
char* pidfile = NULL;
/* Timeout action flags "flush" / "noflush". */
bool do_flush = true;
/* Timeout action: 'b' for "reboot", 'c' for "crashdump", 'o' for "off". */
char timeout_sysrq_char = 'b';
/* SBD_MOVE_TO_ROOT_CGROUP: a true value sets both flags, "auto" sets only
 * move_to_root_cgroup. */
bool move_to_root_cgroup = true;
bool enforce_moving_to_root_cgroup = false;
/* NOTE(review): not modified in the visible part of this file. */
bool sync_resource_startup = false;

int parse_device_line(const char *line);
42
/*
 * Convert a complete decimal string into an int.
 *
 * value: NUL-terminated text to convert; may be NULL.
 *
 * Returns the parsed number when the entire string is a decimal integer
 * strictly between INT_MIN and INT_MAX.  Returns -1 when value is NULL,
 * contains no digits at all, has trailing characters, or is out of range.
 * Negative inputs are returned unchanged; the option parser in this file
 * treats every negative result as invalid.
 */
static int
sanitize_numeric_option_value(const char *value)
{
    char *end = NULL;
    long int result;

    if (value == NULL) {
        return -1;
    }

    errno = 0;
    result = strtol(value, &end, 10);

    /* end == value means strtol() consumed nothing (e.g. an empty string);
     * without this check such input would silently be read as 0. */
    if (errno != 0 || end == value || *end != '\0') {
        return -1;
    }
    if (result <= INT_MIN || result >= INT_MAX) {
        return -1;
    }

    return (int) result;
}
64
/*
 * Skip leading whitespace of an option value.
 *
 * value: NUL-terminated string or NULL.
 *
 * Returns a pointer into value at its first non-whitespace character, or
 * NULL when value is NULL, empty, or consists of whitespace only.  The
 * returned pointer aliases the input; nothing is allocated.
 */
static const char *
sanitize_option_value(const char *value)
{
    if (value == NULL) {
        return NULL;
    }

    /* <ctype.h> classification functions are only defined for values
     * representable as unsigned char (or EOF); a plain char holding a byte
     * >= 0x80 may be negative, hence the cast.  The terminating NUL is not
     * whitespace, so the scan cannot run past the end of the string. */
    while (isspace((unsigned char) *value)) {
        value++;
    }

    return (*value != '\0') ? value : NULL;
}
85
/*
 * Look up an environment variable and run it through
 * sanitize_option_value().
 *
 * Returns the sanitized value, or NULL when sanitize_option_value()
 * rejects it (which includes the variable being unset).
 */
static const char *
get_env_option(const char *option)
{
    return sanitize_option_value(getenv(option));
}
93
94 static int
95 recruit_servant(const char *devname, pid_t pid)
96 {
97 struct servants_list_item *s = servants_leader;
98 struct servants_list_item *newbie;
99
100 if (lookup_servant_by_dev(devname)) {
101 cl_log(LOG_DEBUG, "Servant %s already exists", devname);
102 return 0;
103 }
104
105 newbie = malloc(sizeof(*newbie));
106 if (newbie) {
107 memset(newbie, 0, sizeof(*newbie));
108 newbie->devname = strdup(devname);
109 newbie->pid = pid;
110 newbie->first_start = 1;
111 }
112 if (!newbie || !newbie->devname) {
113 fprintf(stderr, "heap allocation failed in recruit_servant.\n");
114 exit(1);
115 }
116
117 /* some sanity-check on our newbie */
118 if (sbd_is_disk(newbie)) {
119 cl_log(LOG_INFO, "Monitoring %s", devname);
120 disk_count++;
121 } else if (sbd_is_pcmk(newbie) || sbd_is_cluster(newbie)) {
122 /* alive just after pcmk and cluster servants have shown up */
123 newbie->outdated = 1;
124 } else {
125 /* toss our newbie */
126 cl_log(LOG_ERR, "Refusing to recruit unrecognized servant %s", devname);
127 free((void *) newbie->devname);
128 free(newbie);
129 return -1;
130 }
131
132 if (!s) {
133 servants_leader = newbie;
134 } else {
135 while (s->next)
136 s = s->next;
137 s->next = newbie;
138 }
139
140 servant_count++;
141
142 return 0;
143 }
144
145 int assign_servant(const char* devname, functionp_t functionp, int mode, const void* argp)
146 {
147 pid_t pid = 0;
148 int rc = 0;
149
150 pid = fork();
151 if (pid == 0) { /* child */
152 maximize_priority();
153 sbd_set_format_string(QB_LOG_SYSLOG, devname);
154 rc = (*functionp)(devname, mode, argp);
155 if (rc == -1)
156 exit(1);
157 else
158 exit(0);
159 } else if (pid != -1) { /* parent */
160 return pid;
161 } else {
162 cl_log(LOG_ERR,"Failed to fork servant");
163 exit(1);
164 }
165 }
166
167 struct servants_list_item *lookup_servant_by_dev(const char *devname)
168 {
169 struct servants_list_item *s;
170
171 for (s = servants_leader; s; s = s->next) {
172 if (strcasecmp(s->devname, devname) == 0)
173 break;
174 }
175 return s;
176 }
177
178 struct servants_list_item *lookup_servant_by_pid(pid_t pid)
179 {
180 struct servants_list_item *s;
181
182 for (s = servants_leader; s; s = s->next) {
183 if (s->pid == pid)
184 break;
185 }
186 return s;
187 }
188
189 int check_all_dead(void)
190 {
191 struct servants_list_item *s;
192 int r = 0;
193
194 for (s = servants_leader; s; s = s->next) {
195 if (s->pid != 0) {
196 r = sigqueue_zero(s->pid, 0);
197 if (r == -1 && errno == ESRCH)
198 continue;
199 return 0;
200 }
201 }
202 return 1;
203 }
204
205 void servant_start(struct servants_list_item *s)
206 {
207 int r = 0;
208
209 if (s->pid != 0) {
210 r = sigqueue_zero(s->pid, 0);
211 if ((r != -1 || errno != ESRCH))
212 return;
213 }
214 s->restarts++;
215 if (sbd_is_disk(s)) {
216 #if SUPPORT_SHARED_DISK
217 DBGLOG(LOG_INFO, "Starting servant for device %s", s->devname);
218 s->pid = assign_servant(s->devname, servant_md, start_mode, s);
219 #else
220 cl_log(LOG_ERR, "Shared disk functionality not supported");
221 return;
222 #endif
223 } else if(sbd_is_pcmk(s)) {
224 DBGLOG(LOG_INFO, "Starting Pacemaker servant");
225 s->pid = assign_servant(s->devname, servant_pcmk, start_mode, NULL);
226
227 } else if(sbd_is_cluster(s)) {
228 DBGLOG(LOG_INFO, "Starting Cluster servant");
229 s->pid = assign_servant(s->devname, servant_cluster, start_mode, NULL);
230
231 } else {
232 cl_log(LOG_ERR, "Unrecognized servant: %s", s->devname);
233 }
234
235 clock_gettime(CLOCK_MONOTONIC, &s->t_started);
236 return;
237 }
238
239 void servants_start(void)
240 {
241 struct servants_list_item *s;
242
243 for (s = servants_leader; s; s = s->next) {
244 s->restarts = 0;
245 servant_start(s);
246 }
247 }
248
249 void servants_kill(void)
250 {
251 struct servants_list_item *s;
252
253 for (s = servants_leader; s; s = s->next) {
254 if (s->pid != 0) {
255 sigqueue_zero(s->pid, SIGKILL);
256 }
257 }
258 }
259
260 static inline void cleanup_servant_by_pid(pid_t pid)
261 {
262 struct servants_list_item* s;
263
264 s = lookup_servant_by_pid(pid);
265 if (s) {
266 cl_log(LOG_WARNING, "Servant for %s (pid: %i) has terminated",
267 s->devname, s->pid);
268 s->pid = 0;
269 } else {
270 /* This most likely is a stray signal from somewhere, or
271 * a SIGCHLD for a process that has previously
272 * explicitly disconnected. */
273 DBGLOG(LOG_INFO, "cleanup_servant: Nothing known about pid %i",
274 pid);
275 }
276 }
277
278 int inquisitor_decouple(void)
279 {
280 pid_t ppid = getppid();
281
282 /* During start-up, we only arm the watchdog once we've got
283 * quorum at least once. */
284 if (watchdog_use) {
285 if (watchdog_init() < 0) {
286 return -1;
287 }
288 }
289
290 if (ppid > 1) {
291 sigqueue_zero(ppid, SIG_LIVENESS);
292 }
293 return 0;
294 }
295
/*
 * Decide whether pid belongs to a live process that holds our lock.
 *
 * pid: process id read from the lock file.
 *
 * Returns 0 when kill(pid, 0) reports ESRCH.  Without HAVE_PROC_PID any
 * other outcome of the probe returns 1.  With HAVE_PROC_PID the executable
 * behind /proc/<pid>/exe is compared with the executable of the calling
 * process: 1 when both links resolve to the same path, 0 when they differ
 * or either link cannot be read.
 */
static int sbd_lock_running(long pid)
{
    /* check if pid is running */
    if (kill(pid, 0) < 0 && errno == ESRCH) {
        return 0;
    }

#ifndef HAVE_PROC_PID
    /* no /proc to cross-check against */
    return 1;
#else
    char proc_path[PATH_MAX], exe_path[PATH_MAX], myexe_path[PATH_MAX];
    ssize_t len;

    /* check to make sure pid hasn't been reused by another process */
    snprintf(proc_path, sizeof(proc_path), "/proc/%ld/exe", pid);
    len = readlink(proc_path, exe_path, sizeof(exe_path) - 1);
    if (len < 0) {
        cl_perror("Could not read from %s", proc_path);
        return 0;
    }
    exe_path[len] = '\0';   /* readlink() does not terminate the buffer */

    snprintf(proc_path, sizeof(proc_path), "/proc/%ld/exe", (long) getpid());
    len = readlink(proc_path, myexe_path, sizeof(myexe_path) - 1);
    if (len < 0) {
        cl_perror("Could not read from %s", proc_path);
        return 0;
    }
    myexe_path[len] = '\0';

    return strcmp(exe_path, myexe_path) == 0;
#endif
}
336
337 static int
338 sbd_lock_pidfile(const char *filename)
339 {
340 char lf_name[256], tf_name[256], buf[LOCKSTRLEN+1];
341 int fd;
342 long pid, mypid;
343 int rc;
344 struct stat sbuf;
345
|
(1) Event cond_false: |
Condition "filename == NULL", taking false branch. |
346 if (filename == NULL) {
347 errno = EFAULT;
348 return -1;
|
(2) Event if_end: |
End of if statement. |
349 }
350
351 mypid = (unsigned long) getpid();
352 snprintf(lf_name, sizeof(lf_name), "%s",filename);
353 snprintf(tf_name, sizeof(tf_name), "%s.%lu",
354 filename, mypid);
355
|
(3) Event cond_true: |
Condition "(fd = open(lf_name, 0)) >= 0", taking true branch. |
356 if ((fd = open(lf_name, O_RDONLY)) >= 0) {
|
(4) Event cond_true: |
Condition "fstat(fd, &sbuf) >= 0", taking true branch. |
|
(5) Event cond_true: |
Condition "sbuf.st_size < 11", taking true branch. |
357 if (fstat(fd, &sbuf) >= 0 && sbuf.st_size < LOCKSTRLEN) {
358 sleep(1); /* if someone was about to create one,
359 * give'm a sec to do so
360 * Though if they follow our protocol,
361 * this won't happen. They should really
362 * put the pid in, then link, not the
363 * other way around.
364 */
365 }
|
(6) Event cond_true: |
Condition "read(fd, buf, 12UL /* sizeof (buf) */) < 1", taking true branch. |
366 if (read(fd, buf, sizeof(buf)) < 1) {
367 /* lockfile empty -> rm it and go on */;
|
(7) Event if_fallthrough: |
Falling through to end of if statement. |
368 } else {
369 if (sscanf(buf, "%ld", &pid) < 1) {
370 /* lockfile screwed up -> rm it and go on */
371 } else {
372 if (pid > 1 && (getpid() != pid)
373 && sbd_lock_running(pid)) {
374 /* is locked by existing process
375 * -> give up */
376 close(fd);
377 return -1;
378 } else {
379 /* stale lockfile -> rm it and go on */
380 }
381 }
|
(8) Event if_end: |
End of if statement. |
382 }
383 unlink(lf_name);
384 close(fd);
385 }
|
(9) Event cond_false: |
Condition "(fd = open(tf_name, 193 /* (0x40 | 1) | 0x80 */, 420)) < 0", taking false branch. |
386 if ((fd = open(tf_name, O_CREAT | O_WRONLY | O_EXCL, 0644)) < 0) {
387 /* Hmmh, why did we fail? Anyway, nothing we can do about it */
388 return -3;
|
(10) Event if_end: |
End of if statement. |
389 }
390
391 /* Slight overkill with the %*d format ;-) */
392 snprintf(buf, sizeof(buf), "%*lu\n", LOCKSTRLEN-1, mypid);
393
|
(11) Event cond_false: |
Condition "write(fd, buf, 11) != 11", taking false branch. |
394 if (write(fd, buf, LOCKSTRLEN) != LOCKSTRLEN) {
395 /* Again, nothing we can do about this */
396 rc = -3;
397 close(fd);
398 goto out;
|
(12) Event if_end: |
End of if statement. |
399 }
400 close(fd);
401
|
(13) Event switch: |
Switch case value "0". |
402 switch (link(tf_name, lf_name)) {
|
(14) Event switch_case: |
Reached case "0". |
403 case 0:
|
(15) Event fs_check_call: |
Calling function "stat" to perform check on "tf_name". |
|
(16) Event cond_true: |
Condition "stat(tf_name, &sbuf) < 0", taking true branch. |
| Also see events: |
[toctou] |
404 if (stat(tf_name, &sbuf) < 0) {
405 /* something weird happened */
406 rc = -3;
|
(17) Event break: |
Breaking from switch. |
407 break;
408 }
409 if (sbuf.st_nlink < 2) {
410 /* somehow, it didn't get through - NFS trouble? */
411 rc = -2;
412 break;
413 }
414 rc = 0;
415 break;
416 case EEXIST:
417 rc = -1;
418 break;
419 default:
420 rc = -3;
|
(18) Event switch_end: |
Reached end of switch. |
421 }
422 out:
|
(19) Event toctou: |
Calling function "unlink" that uses "tf_name" after a check function. This can cause a time-of-check, time-of-use race condition. |
| Also see events: |
[fs_check_call] |
423 unlink(tf_name);
424 return rc;
425 }
426
427
/*
 * Release a pid lock by removing its lock file.
 *
 * No check is made that the lock is still owned by the caller; if someone
 * else took over our lock file, that is considered their fault.
 *
 * filename: path of the lock file (copied into a 256-byte buffer).
 *
 * Returns 0 on success and a negative value on failure: -1 with errno set
 * to EFAULT for a NULL filename, otherwise the result of unlink().
 */

static int
sbd_unlock_pidfile(const char *filename)
{
    char lock_path[256];

    if (!filename) {
        errno = EFAULT;
        return -1;
    }

    snprintf(lock_path, sizeof(lock_path), "%s", filename);
    return unlink(lock_path);
}
450
451 int cluster_alive(bool all)
452 {
453 int alive = 1;
454 struct servants_list_item* s;
455
456 if(servant_count == disk_count) {
457 return 0;
458 }
459
460 for (s = servants_leader; s; s = s->next) {
461 if (sbd_is_cluster(s) || sbd_is_pcmk(s)) {
462 if(s->outdated) {
463 alive = 0;
464 } else if(all == false) {
465 return 1;
466 }
467 }
468 }
469
470 return alive;
471 }
472
473 int quorum_read(int good_servants)
474 {
475 if (disk_count > 2)
476 return (good_servants > disk_count/2);
477 else
478 return (good_servants > 0);
479 }
480
/*
 * Main loop of the inquisitor process; never returns.
 *
 * Takes the pid lock (if a pidfile is configured), blocks the signals it
 * handles, starts all servants and then loops: wait for a signal or for
 * timeout_loop seconds, update servant state, decide whether to tickle the
 * watchdog, detach from the parent once allowed, act on watchdog latency,
 * and restart servants that are not blocked.
 *
 * The process leaves through exit(): status 1 if the pid lock cannot be
 * taken, status 0 once an exit was requested (or start-up was aborted) and
 * check_all_dead() reports that all servants are gone.
 */
void inquisitor_child(void)
{
    int sig, pid;
    sigset_t procmask;
    siginfo_t sinfo;
    int status;
    struct timespec timeout;
    int exiting = 0;            /* exit requested; waiting for servants to die */
    int decoupled = 0;          /* inquisitor_decouple() has succeeded */
    int cluster_appeared = 0;   /* cluster servants were seen healthy */
    int pcmk_override = 0;      /* "surviving on pacemaker" already logged */
    int latency;
    struct timespec t_last_tickle, t_now;
    struct servants_list_item* s;

    if (debug_mode) {
        cl_log(LOG_ERR, "DEBUG MODE %d IS ACTIVE - DO NOT RUN IN PRODUCTION!", debug_mode);
    }

    set_proc_title("sbd: inquisitor");

    if (pidfile) {
        if (sbd_lock_pidfile(pidfile) < 0) {
            exit(1);
        }
    }

    /* All of these are blocked and picked up synchronously with
     * sigtimedwait() below.  SIG_RESTART and SIGUSR2 are part of the set
     * but have no branch of their own in the loop. */
    sigemptyset(&procmask);
    sigaddset(&procmask, SIGCHLD);
    sigaddset(&procmask, SIGTERM);
    sigaddset(&procmask, SIG_LIVENESS);
    sigaddset(&procmask, SIG_EXITREQ);
    sigaddset(&procmask, SIG_TEST);
    sigaddset(&procmask, SIG_PCMK_UNHEALTHY);
    sigaddset(&procmask, SIG_RESTART);
    sigaddset(&procmask, SIGUSR1);
    sigaddset(&procmask, SIGUSR2);
    sigprocmask(SIG_BLOCK, &procmask, NULL);

    servants_start();

    timeout.tv_sec = timeout_loop;
    timeout.tv_nsec = 0;
    clock_gettime(CLOCK_MONOTONIC, &t_last_tickle);

    while (1) {
        bool tickle = 0;
        bool can_detach = 0;
        int good_servants = 0;

        /* Wait for one of the blocked signals, at most timeout_loop
         * seconds; when none matches, the evaluation below still runs. */
        sig = sigtimedwait(&procmask, &sinfo, &timeout);

        clock_gettime(CLOCK_MONOTONIC, &t_now);

        if (sig == SIG_EXITREQ || sig == SIGTERM) {
            /* shutdown requested */
            servants_kill();
            watchdog_close(true);
            exiting = 1;
        } else if (sig == SIGCHLD) {
            /* reap every child that has terminated so far */
            while ((pid = waitpid(-1, &status, WNOHANG))) {
                if (pid == -1 && errno == ECHILD) {
                    break;
                } else {
                    s = lookup_servant_by_pid(pid);
                    if (sbd_is_disk(s)) {
                        /* a disk servant may request an action
                         * through its exit status */
                        if (WIFEXITED(status)) {
                            switch(WEXITSTATUS(status)) {
                                case EXIT_MD_SERVANT_IO_FAIL:
                                    DBGLOG(LOG_INFO, "Servant for %s requests to be disowned",
                                        s->devname);
                                    break;
                                case EXIT_MD_SERVANT_REQUEST_RESET:
                                    cl_log(LOG_WARNING, "%s requested a reset", s->devname);
                                    do_reset();
                                    break;
                                case EXIT_MD_SERVANT_REQUEST_SHUTOFF:
                                    cl_log(LOG_WARNING, "%s requested a shutoff", s->devname);
                                    do_off();
                                    break;
                                case EXIT_MD_SERVANT_REQUEST_CRASHDUMP:
                                    cl_log(LOG_WARNING, "%s requested a crashdump", s->devname);
                                    do_crashdump();
                                    break;
                                default:
                                    break;
                            }
                        }
                    } else if (sbd_is_pcmk(s)) {
                        if (WIFEXITED(status)) {
                            switch(WEXITSTATUS(status)) {
                                case EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN:
                                    DBGLOG(LOG_INFO, "PCMK-Servant has exited gracefully");
                                    /* revert to state prior to pacemaker-detection */
                                    s->restarts = 0;
                                    s->restart_blocked = 0;
                                    cluster_appeared = 0;
                                    s->outdated = 1;
                                    s->t_last.tv_sec = 0;
                                    break;
                                default:
                                    break;
                            }
                        }
                    }
                    cleanup_servant_by_pid(pid);
                }
            }
        } else if (sig == SIG_PCMK_UNHEALTHY) {
            s = lookup_servant_by_pid(sinfo.si_pid);
            if (sbd_is_cluster(s) || sbd_is_pcmk(s)) {
                if (s->outdated == 0) {
                    cl_log(LOG_WARNING, "%s health check: UNHEALTHY", s->devname);
                }
                /* Non-zero, so the servant is not skipped by the age
                 * check below - presumably old enough to make it count
                 * as outdated there (TODO confirm). */
                s->t_last.tv_sec = 1;
            } else {
                cl_log(LOG_WARNING, "Ignoring SIG_PCMK_UNHEALTHY from unknown source");
            }
        } else if (sig == SIG_LIVENESS) {
            /* a servant reports in: remember when */
            s = lookup_servant_by_pid(sinfo.si_pid);
            if (s) {
                s->first_start = 0;
                clock_gettime(CLOCK_MONOTONIC, &s->t_last);
            }

        } else if (sig == SIG_TEST) {
            /* nothing to do */
        } else if (sig == SIGUSR1) {
            /* restart all servants unless we are shutting down */
            if (exiting)
                continue;
            servants_start();
        }

        if (exiting) {
            /* leave only after every servant is gone */
            if (check_all_dead()) {
                if (pidfile) {
                    sbd_unlock_pidfile(pidfile);
                }
                exit(0);
            } else
                continue;
        }

        /* Refresh the outdated flag of every servant that has reported at
         * least once and count the healthy disk servants. */
        good_servants = 0;
        for (s = servants_leader; s; s = s->next) {
            int age = seconds_diff_timespec(&t_now, &(s->t_last));

            /* t_last.tv_sec == 0: no report recorded */
            if (!s->t_last.tv_sec)
                continue;

            if (age < timeout_io+timeout_loop) {
                if (sbd_is_disk(s)) {
                    good_servants++;
                }
                if (s->outdated) {
                    cl_log(LOG_NOTICE, "Servant %s is healthy (age: %d)", s->devname, age);
                }
                s->outdated = 0;

            } else if (!s->outdated) {
                if (!s->restart_blocked) {
                    cl_log(LOG_WARNING, "Servant %s is outdated (age: %d)", s->devname, age);
                }
                s->outdated = 1;
            }
        }

        /* Decide whether to tickle the watchdog and whether we may detach
         * from the parent. */
        if(disk_count == 0) {
            /* NO disks, everything is up to the cluster */

            if(cluster_alive(true)) {
                /* We LIVE! */
                if(cluster_appeared == false) {
                    cl_log(LOG_INFO, "Active cluster detected");
                }
                tickle = 1;
                can_detach = 1;
                cluster_appeared = 1;

            } else if(cluster_alive(false)) {
                if(!decoupled) {
                    /* On the way up, detach and arm the watchdog */
                    cl_log(LOG_INFO, "Partial cluster detected, detaching");
                }

                can_detach = 1;
                tickle = !cluster_appeared;

            } else if(!decoupled) {
                /* Stay alive until the cluster comes up */
                tickle = !cluster_appeared;
            }

        } else if(disk_priority == 1 || servant_count == disk_count) {
            /* disks decide (disk_priority set, or only disk servants) */
            if (quorum_read(good_servants)) {
                /* There are disks and we're connected to the majority of them */
                tickle = 1;
                can_detach = 1;
                pcmk_override = 0;

            } else if (servant_count > disk_count && cluster_alive(true)) {
                /* disk quorum lost, but all cluster servants are healthy */
                tickle = 1;

                if(!pcmk_override) {
                    cl_log(LOG_WARNING, "Majority of devices lost - surviving on pacemaker");
                    pcmk_override = 1; /* Only log this message once */
                }
            }

        } else if(cluster_alive(true) && quorum_read(good_servants)) {
            /* Both disk and cluster servants are healthy */
            tickle = 1;
            can_detach = 1;
            cluster_appeared = 1;

        } else if(quorum_read(good_servants)) {
            /* The cluster takes priority but only once
             * connected for the first time.
             *
             * Until then, we tickle based on disk quorum.
             */
            can_detach = 1;
            tickle = !cluster_appeared;
        }

        /* cl_log(LOG_DEBUG, "Tickle: q=%d, g=%d, p=%d, s=%d", */
        /* quorum_read(good_servants), good_servants, tickle, disk_count); */

        if(tickle) {
            watchdog_tickle();
            clock_gettime(CLOCK_MONOTONIC, &t_last_tickle);
        }

        if (!decoupled && can_detach) {
            /* We only do this at the point either the disk or
             * cluster servants become healthy
             */
            cl_log(LOG_DEBUG, "Decoupling");
            if (inquisitor_decouple() < 0) {
                /* decoupling failed: shut down */
                servants_kill();
                exiting = 1;
                continue;
            } else {
                decoupled = 1;
            }
        }

        /* Note that this can actually be negative, since we set
         * last_tickle after we set now. */
        latency = seconds_diff_timespec(&t_now, &t_last_tickle);
        if (timeout_watchdog && (latency > timeout_watchdog)) {
            if (!decoupled) {
                /* We're still being watched by our
                 * parent. We don't fence, but exit. */
                cl_log(LOG_ERR, "SBD: Not enough votes to proceed. Aborting start-up.");
                servants_kill();
                exiting = 1;
                continue;
            }
            if (debug_mode < 2) {
                /* At level 2 or above, we do nothing, but expect
                 * things to eventually return to
                 * normal. */
                do_timeout_action();
            } else {
                cl_log(LOG_ERR, "SBD: DEBUG MODE: Would have fenced due to timeout!");
            }
        }

        if (timeout_watchdog_warn && (latency > timeout_watchdog_warn)) {
            cl_log(LOG_WARNING,
                   "Latency: No liveness for %ds exceeds watchdog warning timeout of %ds (healthy servants: %d)",
                   latency, timeout_watchdog_warn, good_servants);

            if (debug_mode && watchdog_use) {
                /* In debug mode, trigger a reset before the watchdog can panic the machine */
                do_timeout_action();
            }
        }

        /* Restart bookkeeping: servant_start() itself returns early for a
         * servant whose process still exists. */
        for (s = servants_leader; s; s = s->next) {
            int age = seconds_diff_timespec(&t_now, &(s->t_started));

            /* started long enough ago: grant a fresh restart budget */
            if (age > servant_restart_interval) {
                s->restarts = 0;
                s->restart_blocked = 0;
            }

            /* budget used up (a count of 0 disables the limit) */
            if (servant_restart_count
                    && (s->restarts >= servant_restart_count)
                    && !s->restart_blocked) {
                if (servant_restart_count > 1) {
                    cl_log(LOG_WARNING, "Max retry count (%d) reached: not restarting servant for %s",
                        (int)servant_restart_count, s->devname);
                }
                s->restart_blocked = 1;
            }

            if (!s->restart_blocked) {
                servant_start(s);
            }
        }
    }
    /* not reached */
    exit(0);
}
785
786 int inquisitor(void)
787 {
788 int sig, pid, inquisitor_pid;
789 int status;
790 sigset_t procmask;
791 siginfo_t sinfo;
792
793 /* Where's the best place for sysrq init ?*/
794 sysrq_init();
795
796 sigemptyset(&procmask);
797 sigaddset(&procmask, SIGCHLD);
798 sigaddset(&procmask, SIG_LIVENESS);
799 sigprocmask(SIG_BLOCK, &procmask, NULL);
800
801 inquisitor_pid = make_daemon();
802 if (inquisitor_pid == 0) {
803 inquisitor_child();
804 }
805
806 /* We're the parent. Wait for a happy signal from our child
807 * before we proceed - we either get "SIG_LIVENESS" when the
808 * inquisitor has completed the first successful round, or
809 * ECHLD when it exits with an error. */
810
811 while (1) {
812 sig = sigwaitinfo(&procmask, &sinfo);
813 if (sig == SIGCHLD) {
814 while ((pid = waitpid(-1, &status, WNOHANG))) {
815 if (pid == -1 && errno == ECHILD) {
816 break;
817 }
818 /* We got here because the inquisitor
819 * did not succeed. */
820 return -1;
821 }
822 } else if (sig == SIG_LIVENESS) {
823 /* Inquisitor started up properly. */
824 return 0;
825 } else {
826 fprintf(stderr, "Nobody expected the spanish inquisition!\n");
827 continue;
828 }
829 }
830 /* not reached */
831 return -1;
832 }
833
834
835 int
836 parse_device_line(const char *line)
837 {
838 size_t lpc = 0;
839 size_t last = 0;
840 size_t max = 0;
841 int found = 0;
842 bool skip_space = true;
843 int space_run = 0;
844
845 if (!line) {
846 return 0;
847 }
848
849 max = strlen(line);
850
851 cl_log(LOG_DEBUG, "Processing %d bytes: [%s]", (int) max, line);
852
853 for (lpc = 0; lpc <= max; lpc++) {
854 if (isspace(line[lpc])) {
855 if (skip_space) {
856 last = lpc + 1;
857 } else {
858 space_run++;
859 }
860 continue;
861 }
862 skip_space = false;
863 if (line[lpc] == ';' || line[lpc] == 0) {
864 int rc = 0;
865 char *entry = calloc(1, 1 + lpc - last);
866
867 if (entry) {
868 rc = sscanf(line + last, "%[^;]", entry);
869 } else {
870 fprintf(stderr, "Heap allocation failed parsing device-line.\n");
871 exit(1);
872 }
873
874 if (rc != 1) {
875 cl_log(LOG_WARNING, "Could not parse: '%s'", line + last);
876 } else {
877 entry[strlen(entry)-space_run] = '\0';
878 cl_log(LOG_DEBUG, "Adding '%s'", entry);
879 if (recruit_servant(entry, 0) != 0) {
880 free(entry);
881 // sbd should refuse to start if any of the configured device names is invalid.
882 return -1;
883 }
884 found++;
885 }
886
887 free(entry);
888 skip_space = true;
889 last = lpc + 1;
890 }
891 space_run = 0;
892 }
893 return found;
894 }
895
896 #define SBD_SOURCE_FILES "sbd-cluster.c,sbd-common.c,sbd-inquisitor.c,sbd-md.c,sbd-pacemaker.c,sbd-watchdog.c,setproctitle.c"
897
898 static void
899 sbd_log_filter_ctl(const char *files, uint8_t priority)
900 {
901 if (files == NULL) {
902 files = SBD_SOURCE_FILES;
903 }
904
905 qb_log_filter_ctl(QB_LOG_SYSLOG, QB_LOG_FILTER_ADD, QB_LOG_FILTER_FILE, files, priority);
906 qb_log_filter_ctl(QB_LOG_STDERR, QB_LOG_FILTER_ADD, QB_LOG_FILTER_FILE, files, priority);
907 }
908
/*
 * Map the number of times a toggle option was given to its state.
 *
 * arg_count: how often the option appeared on the command line.
 *
 * Returns arg_count modulo 2: 1 for an odd count (enabled), 0 for an even
 * count (disabled).
 */
int
arg_enabled(int arg_count)
{
    const int parity = arg_count % 2;

    return parity;
}
914
915 int main(int argc, char **argv, char **envp)
916 {
917 int exit_status = 0;
918 int c;
919 int W_count = 0;
920 int c_count = 0;
921 int P_count = 0;
922 int qb_facility;
923 const char *value = NULL;
924 bool delay_start = false;
925 long delay = 0;
926 char *timeout_action = NULL;
927
928 if ((cmdname = strrchr(argv[0], '/')) == NULL) {
929 cmdname = argv[0];
930 } else {
931 ++cmdname;
932 }
933
934 watchdogdev = strdup("/dev/watchdog");
935 watchdogdev_is_default = true;
936 qb_facility = qb_log_facility2int("daemon");
937 qb_log_init(cmdname, qb_facility, LOG_WARNING);
938 sbd_set_format_string(QB_LOG_SYSLOG, "sbd");
939
940 qb_log_ctl(QB_LOG_SYSLOG, QB_LOG_CONF_ENABLED, QB_TRUE);
941 qb_log_ctl(QB_LOG_STDERR, QB_LOG_CONF_ENABLED, QB_FALSE);
942 sbd_log_filter_ctl(NULL, LOG_NOTICE);
943
944 sbd_get_uname();
945
946 value = get_env_option("SBD_PACEMAKER");
947 if(value) {
948 check_pcmk = crm_is_true(value);
949 check_cluster = crm_is_true(value);
950
951 has_check_pcmk_env = true;
952 }
953 cl_log(LOG_INFO, "SBD_PACEMAKER set to: %d (%s)", (int)check_pcmk, value?value:"default");
954
955 value = get_env_option("SBD_STARTMODE");
956 if(value == NULL) {
957 } else if(strcmp(value, "clean") == 0) {
958 start_mode = 1;
959 } else if(strcmp(value, "always") == 0) {
960 start_mode = 0;
961 }
962 cl_log(LOG_INFO, "Start mode set to: %d (%s)", (int)start_mode, value?value:"default");
963
964 value = get_env_option("SBD_WATCHDOG_DEV");
965 if(value) {
966 free(watchdogdev);
967 watchdogdev = strdup(value);
968 watchdogdev_is_default = false;
969 }
970
971 /* SBD_WATCHDOG has been dropped from sbd.sysconfig example.
972 * This is for backward compatibility. */
973 value = get_env_option("SBD_WATCHDOG");
974 if(value) {
975 watchdog_use = crm_is_true(value);
976 }
977
978 value = get_env_option("SBD_WATCHDOG_TIMEOUT");
979 if(value) {
980 timeout_watchdog = crm_get_msec(value) / 1000;
981 }
982
983 value = get_env_option("SBD_PIDFILE");
984 if(value) {
985 pidfile = strdup(value);
986 cl_log(LOG_INFO, "pidfile set to %s", pidfile);
987 }
988
989 value = get_env_option("SBD_DELAY_START");
990 if(value) {
991 if (crm_str_to_boolean(value, (int *) &delay_start) != 1) {
992 delay = crm_get_msec(value) / 1000;
993 if (delay > 0) {
994 delay_start = true;
995 }
996 }
997 }
998
999 value = get_env_option("SBD_TIMEOUT_ACTION");
1000 if(value) {
1001 timeout_action = strdup(value);
1002 }
1003
1004 value = get_env_option("SBD_MOVE_TO_ROOT_CGROUP");
1005 if(value) {
1006 move_to_root_cgroup = crm_is_true(value);
1007
1008 if (move_to_root_cgroup) {
1009 enforce_moving_to_root_cgroup = true;
1010 } else {
1011 if (strcmp(value, "auto") == 0) {
1012 move_to_root_cgroup = true;
1013 }
1014 }
1015 }
1016
1017 while ((c = getopt(argc, argv, "czC:DPRTWZhvw:d:n:p:1:2:3:4:5:t:I:F:S:s:r:")) != -1) {
1018 int sanitized_num_optarg = 0;
1019 /* Call it before checking optarg for NULL to make coverity happy */
1020 const char *sanitized_optarg = sanitize_option_value(optarg);
1021
1022 if (optarg && ((sanitized_optarg == NULL) ||
1023 (strchr("SsC12345tIF", c) &&
1024 (sanitized_num_optarg = sanitize_numeric_option_value(sanitized_optarg)) < 0))) {
1025 fprintf(stderr, "Invalid value \"%s\" for option -%c\n", optarg, c);
1026 exit_status = -2;
1027 goto out;
1028 }
1029
1030 switch (c) {
1031 case 'D':
1032 break;
1033 case 'Z':
1034 debug_mode++;
1035 cl_log(LOG_INFO, "Debug mode now at level %d", (int)debug_mode);
1036 break;
1037 case 'R':
1038 skip_rt = 1;
1039 cl_log(LOG_INFO, "Realtime mode deactivated.");
1040 break;
1041 case 'S':
1042 start_mode = sanitized_num_optarg;
1043 cl_log(LOG_INFO, "Start mode set to: %d", (int)start_mode);
1044 break;
1045 case 's':
1046 timeout_startup = sanitized_num_optarg;
1047 cl_log(LOG_INFO, "Start timeout set to: %d", (int)timeout_startup);
1048 break;
1049 case 'v':
1050 debug++;
1051 if(debug == 1) {
1052 sbd_log_filter_ctl(NULL, LOG_INFO);
1053 cl_log(LOG_INFO, "Verbose mode enabled.");
1054
1055 } else if(debug == 2) {
1056 sbd_log_filter_ctl(NULL, LOG_DEBUG);
1057 cl_log(LOG_INFO, "Debug mode enabled.");
1058
1059 } else if(debug == 3) {
1060 /* Go nuts, turn on pacemaker's logging too */
1061 sbd_log_filter_ctl("*", LOG_DEBUG);
1062 cl_log(LOG_INFO, "Debug library mode enabled.");
1063 }
1064 break;
1065 case 'T':
1066 watchdog_set_timeout = 0;
1067 cl_log(LOG_INFO, "Setting watchdog timeout disabled; using defaults.");
1068 break;
1069 case 'W':
1070 W_count++;
1071 break;
1072 case 'w':
1073 free(watchdogdev);
1074 watchdogdev = strdup(sanitized_optarg);
1075 watchdogdev_is_default = false;
1076 cl_log(LOG_NOTICE, "Using watchdog device '%s'", watchdogdev);
1077 break;
1078 case 'd':
1079 #if SUPPORT_SHARED_DISK
1080 if (recruit_servant(sanitized_optarg, 0) != 0) {
1081 fprintf(stderr, "Invalid device: %s\n", optarg);
1082 exit_status = -1;
1083 goto out;
1084 }
1085 #else
1086 fprintf(stderr, "Shared disk functionality not supported\n");
1087 exit_status = -2;
1088 goto out;
1089 #endif
1090 break;
1091 case 'c':
1092 c_count++;
1093 break;
1094 case 'P':
1095 P_count++;
1096 break;
1097 case 'z':
1098 disk_priority = 0;
1099 break;
1100 case 'n':
1101 local_uname = strdup(sanitized_optarg);
1102 cl_log(LOG_INFO, "Overriding local hostname to %s", local_uname);
1103 break;
1104 case 'p':
1105 pidfile = strdup(sanitized_optarg);
1106 cl_log(LOG_INFO, "pidfile set to %s", pidfile);
1107 break;
1108 case 'C':
1109 timeout_watchdog_crashdump = sanitized_num_optarg;
1110 cl_log(LOG_INFO, "Setting crashdump watchdog timeout to %d",
1111 timeout_watchdog_crashdump);
1112 break;
1113 case '1':
1114 timeout_watchdog = sanitized_num_optarg;
1115 break;
1116 case '2':
1117 timeout_allocate = sanitized_num_optarg;
1118 break;
1119 case '3':
1120 timeout_loop = sanitized_num_optarg;
1121 break;
1122 case '4':
1123 timeout_msgwait = sanitized_num_optarg;
1124 break;
1125 case '5':
1126 timeout_watchdog_warn = sanitized_num_optarg;
1127 do_calculate_timeout_watchdog_warn = false;
1128 cl_log(LOG_INFO, "Setting latency warning to %d",
1129 timeout_watchdog_warn);
1130 break;
1131 case 't':
1132 servant_restart_interval = sanitized_num_optarg;
1133 cl_log(LOG_INFO, "Setting servant restart interval to %d",
1134 (int)servant_restart_interval);
1135 break;
1136 case 'I':
1137 timeout_io = sanitized_num_optarg;
1138 cl_log(LOG_INFO, "Setting IO timeout to %d",
1139 (int)timeout_io);
1140 break;
1141 case 'F':
1142 servant_restart_count = sanitized_num_optarg;
1143 cl_log(LOG_INFO, "Servant restart count set to %d",
1144 (int)servant_restart_count);
1145 break;
1146 case 'r':
1147 if (timeout_action) {
1148 free(timeout_action);
1149 }
1150 timeout_action = strdup(sanitized_optarg);
1151 break;
1152 case 'h':
1153 usage();
1154 goto out;
1155 break;
1156 default:
1157 exit_status = -2;
1158 goto out;
1159 break;
1160 }
1161 }
1162
1163 if (disk_count == 0) {
1164 /* if we already have disks from commandline
1165 then it is probably undesirable to add those
1166 from environment (general rule cmdline has precedence)
1167 */
1168 value = get_env_option("SBD_DEVICE");
1169 if ((value) && strlen(value)) {
1170 #if SUPPORT_SHARED_DISK
1171 int devices = parse_device_line(value);
1172 if(devices < 1) {
1173 fprintf(stderr, "Invalid device line: %s\n", value);
1174 exit_status = -1;
1175 goto out;
1176 }
1177 #else
1178 fprintf(stderr, "Shared disk functionality not supported\n");
1179 exit_status = -2;
1180 goto out;
1181 #endif
1182 }
1183 }
1184
1185 if (watchdogdev == NULL || strcmp(watchdogdev, "/dev/null") == 0) {
1186 watchdog_use = 0;
1187
1188 } else if (W_count > 0) {
1189 watchdog_use = arg_enabled(W_count);
1190 }
1191
1192 if (watchdog_use) {
1193 cl_log(LOG_INFO, "Watchdog enabled.");
1194 } else {
1195 cl_log(LOG_INFO, "Watchdog disabled.");
1196 }
1197
1198 if (c_count > 0) {
1199 check_cluster = arg_enabled(c_count);
1200 }
1201
1202 if (P_count > 0) {
1203 int check_pcmk_arg = arg_enabled(P_count);
1204
1205 if (has_check_pcmk_env && check_pcmk_arg != check_pcmk) {
1206 cl_log(LOG_WARNING, "Pacemaker integration is %s: "
1207 "SBD_PACEMAKER=%s is overridden by %s option. "
1208 "It's recommended to only use SBD_PACEMAKER.",
1209 check_pcmk_arg? "enabled" : "disabled",
1210 check_pcmk? "yes" : "no",
1211 check_pcmk_arg? "-P" : "-PP");
1212 }
1213 check_pcmk = check_pcmk_arg;
1214 }
1215
1216 if ((disk_count > 0) && (strlen(local_uname) > SECTOR_NAME_MAX)) {
1217 fprintf(stderr, "Node name mustn't be longer than %d chars.\n",
1218 SECTOR_NAME_MAX);
1219 fprintf(stderr, "If uname is longer define a name to be used by sbd.\n");
1220 exit_status = -1;
1221 goto out;
1222 }
1223
1224 if (disk_count > 3) {
1225 fprintf(stderr, "You can specify up to 3 devices via the -d option.\n");
1226 exit_status = -1;
1227 goto out;
1228 }
1229
1230 /* There must at least be one command following the options: */
1231 if ((argc - optind) < 1) {
1232 fprintf(stderr, "Not enough arguments.\n");
1233 exit_status = -2;
1234 goto out;
1235 }
1236
1237 if (init_set_proc_title(argc, argv, envp) < 0) {
1238 fprintf(stderr, "Allocation of proc title failed.\n");
1239 exit_status = -1;
1240 goto out;
1241 }
1242
1243 if (timeout_action) {
1244 char *p[2];
1245 int i;
1246 char c;
1247 int nrflags = sscanf(timeout_action, "%m[a-z],%m[a-z]%c", &p[0], &p[1], &c);
1248 bool parse_error = (nrflags < 1) || (nrflags > 2);
1249
1250 for (i = 0; (i < nrflags) && (i < 2); i++) {
1251 if (!strcmp(p[i], "reboot")) {
1252 timeout_sysrq_char = 'b';
1253 } else if (!strcmp(p[i], "crashdump")) {
1254 timeout_sysrq_char = 'c';
1255 } else if (!strcmp(p[i], "off")) {
1256 timeout_sysrq_char = 'o';
1257 } else if (!strcmp(p[i], "flush")) {
1258 do_flush = true;
1259 } else if (!strcmp(p[i], "noflush")) {
1260 do_flush = false;
1261 } else {
1262 parse_error = true;
1263 }
1264 free(p[i]);
1265 }
1266 if (parse_error) {
1267 fprintf(stderr, "Failed to parse timeout-action \"%s\".\n",
1268 timeout_action);
1269 exit_status = -1;
1270 goto out;
1271 }
1272 }
1273
1274 if (strcmp(argv[optind], "watch") == 0) {
1275 value = get_env_option("SBD_SYNC_RESOURCE_STARTUP");
1276 sync_resource_startup =
1277 crm_is_true(value?value:SBD_SYNC_RESOURCE_STARTUP_DEFAULT);
1278
1279 #if !USE_PACEMAKERD_API
1280 if (sync_resource_startup) {
1281 fprintf(stderr, "Failed to sync resource-startup as "
1282 "SBD was built against pacemaker not supporting pacemakerd-API.\n");
1283 exit_status = -1;
1284 goto out;
1285 }
1286 #else
1287 if (check_pcmk && !sync_resource_startup) {
1288 cl_log(LOG_WARNING, "SBD built against pacemaker supporting "
1289 "pacemakerd-API. Should think about enabling "
1290 "SBD_SYNC_RESOURCE_STARTUP.");
1291
1292 } else if (!check_pcmk && sync_resource_startup) {
1293 fprintf(stderr, "Set SBD_PACEMAKER=yes to allow resource startup syncing. "
1294 "Otherwise explicitly set SBD_SYNC_RESOURCE_STARTUP=no if to intentionally disable.\n");
1295 exit_status = -1;
1296 goto out;
1297 }
1298 #endif
1299 }
1300
1301 #if SUPPORT_SHARED_DISK
1302 if (strcmp(argv[optind], "create") == 0) {
1303 exit_status = init_devices(servants_leader);
1304
1305 } else if (strcmp(argv[optind], "dump") == 0) {
1306 exit_status = dump_headers(servants_leader);
1307
1308 } else if (strcmp(argv[optind], "allocate") == 0) {
1309 exit_status = allocate_slots(argv[optind + 1], servants_leader);
1310
1311 } else if (strcmp(argv[optind], "list") == 0) {
1312 exit_status = list_slots(servants_leader);
1313
1314 } else if (strcmp(argv[optind], "message") == 0) {
1315 exit_status = messenger(argv[optind + 1], argv[optind + 2], servants_leader);
1316
1317 } else if (strcmp(argv[optind], "ping") == 0) {
1318 exit_status = ping_via_slots(argv[optind + 1], servants_leader);
1319
1320 } else
1321 #endif
1322 if (strcmp(argv[optind], "query-watchdog") == 0) {
1323 exit_status = watchdog_info();
1324 } else if (strcmp(argv[optind], "test-watchdog") == 0) {
1325 exit_status = watchdog_test();
1326 } else if (strcmp(argv[optind], "watch") == 0) {
1327 /* sleep $(sbd $SBD_DEVICE_ARGS dump | grep -m 1 msgwait | awk '{print $4}') 2>/dev/null */
1328
1329 const char *delay_source = delay ? "SBD_DELAY_START" : "";
1330
1331 #if SUPPORT_SHARED_DISK
1332 if(disk_count > 0) {
1333 /* If no devices are specified, its not an error to be unable to find one */
1334 open_any_device(servants_leader);
1335
1336 if (delay_start && delay <= 0) {
1337 delay = get_first_msgwait(servants_leader);
1338
1339 if (delay > 0) {
1340 delay_source = "msgwait";
1341 } else {
1342 cl_log(LOG_WARNING, "No 'msgwait' value from disk, using '2 * watchdog-timeout' for 'delay' starting");
1343 }
1344 }
1345 }
1346 #endif
1347 /* Re-calculate timeout_watchdog_warn based on any timeout_watchdog from:
1348 * SBD_WATCHDOG_TIMEOUT, -1 option or on-disk setting read with open_any_device() */
1349 if (do_calculate_timeout_watchdog_warn) {
1350 timeout_watchdog_warn = calculate_timeout_watchdog_warn(timeout_watchdog);
1351 }
1352
1353 if (delay_start) {
1354 /* diskless mode or disk read issues causing get_first_msgwait() to return a 0 for delay */
1355 if (delay <= 0) {
1356 delay = 2 * timeout_watchdog;
1357 delay_source = "watchdog-timeout * 2";
1358 }
1359
1360 cl_log(LOG_DEBUG, "Delay start (yes), (delay: %ld), (delay source: %s)", delay, delay_source);
1361
1362 sleep((unsigned long) delay);
1363
1364 } else {
1365 cl_log(LOG_DEBUG, "Delay start (no)");
1366 }
1367
1368 /* We only want this to have an effect during watch right now;
1369 * pinging and fencing would be too confused */
1370 cl_log(LOG_INFO, "Turning on pacemaker checks: %d", check_pcmk);
1371 if (check_pcmk) {
1372 recruit_servant("pcmk", 0);
1373 #if SUPPORT_PLUGIN
1374 check_cluster = 1;
1375 #endif
1376 }
1377
1378 cl_log(LOG_INFO, "Turning on cluster checks: %d", check_cluster);
1379 if (check_cluster) {
1380 recruit_servant("cluster", 0);
1381 }
1382
1383 cl_log(LOG_NOTICE, "%s flush + write \'%c\' to sysrq in case of timeout",
1384 do_flush?"Do":"Skip", timeout_sysrq_char);
1385 exit_status = inquisitor();
1386 } else {
1387 exit_status = -2;
1388 }
1389
1390 out:
1391 if (timeout_action) {
1392 free(timeout_action);
1393 }
1394 if (exit_status < 0) {
1395 if (exit_status == -2) {
1396 usage();
1397 } else {
1398 fprintf(stderr, "sbd failed; please check the logs.\n");
1399 }
1400 return (1);
1401 }
1402 return (0);
1403 }
1404