1    	/*
2    	 * Copyright (C) 2013 Lars Marowsky-Bree <lmb@suse.com>
3    	 *
4    	 * This program is free software; you can redistribute it and/or
5    	 * modify it under the terms of the GNU General Public
6    	 * License as published by the Free Software Foundation; either
7    	 * version 2 of the License, or (at your option) any later version.
8    	 *
9    	 * This software is distributed in the hope that it will be useful,
10   	 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11   	 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12   	 * General Public License for more details.
13   	 *
14   	 * You should have received a copy of the GNU General Public License along
15   	 * with this program; if not, write to the Free Software Foundation, Inc.,
16   	 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
17   	 */
18   	
19   	#include <crm/common/util.h>
20   	#include "sbd.h"
21   	#define	LOCKSTRLEN	11
22   	
23   	static struct servants_list_item *servants_leader = NULL; /* head of the singly linked servant list; recruit_servant() appends to its tail */
24   	
25   	int     disk_priority = 1; /* consulted by inquisitor_child() when both disk and cluster servants exist */
26   	int	check_pcmk = 1; /* overwritten from SBD_PACEMAKER in main() */
27   	int	check_cluster = 1; /* overwritten from SBD_PACEMAKER in main() */
28   	int	has_check_pcmk_env = false; /* set to true once SBD_PACEMAKER was found in the environment */
29   	int	disk_count	= 0; /* number of recruited servants for which sbd_is_disk() holds */
30   	int	servant_count	= 0; /* number of all recruited servants */
31   	int	servant_restart_interval = 5; /* age in seconds of the last start after which restart counters are reset */
32   	int	servant_restart_count = 1; /* restarts allowed before a servant is marked restart_blocked; 0 disables the limit */
33   	int	start_mode = 0; /* 1 for SBD_STARTMODE=clean, 0 for "always"; handed to every servant */
34   	char*	pidfile = NULL; /* strdup()ed copy of SBD_PIDFILE, or NULL when no pidfile is used */
35   	bool do_flush = true;
36   	char timeout_sysrq_char = 'b';
37   	bool move_to_root_cgroup = true; /* derived from SBD_MOVE_TO_ROOT_CGROUP in main() */
38   	bool enforce_moving_to_root_cgroup = false; /* set when SBD_MOVE_TO_ROOT_CGROUP is an explicit "true" value */
39   	bool sync_resource_startup = false;
40   	
41   	int parse_device_line(const char *line); /* forward declaration; defined further down in this file */
42   	
/*
 * Convert a decimal option string into an int.
 *
 * Returns the parsed number on success. Returns -1 when:
 *   - value is NULL,
 *   - value contains no digits at all (e.g. an empty string),
 *   - anything follows the number (trailing blanks included),
 *   - the number is out of range (INT_MIN and INT_MAX themselves are
 *     rejected as well, which also covers strtol() saturation on
 *     platforms where long and int have the same width).
 *
 * Note that -1 doubles as the error indicator, so a literal "-1" cannot be
 * told apart from a failure.
 */
static int
sanitize_numeric_option_value(const char *value)
{
    char *end = NULL;
    long int result = -1;

    if (value == NULL) {
        return -1;
    }

    errno = 0;
    result = strtol(value, &end, 10);

    /* strtol() yields 0 without consuming anything when there are no digits */
    if (end == value) {
        return -1;
    }

    if (errno != 0 || result <= INT_MIN || result >= INT_MAX) {
        return -1;
    }

    /* reject trailing garbage such as "12x" */
    if (*end != '\0') {
        return -1;
    }

    return (int) result;
}
64   	
/*
 * Skip leading whitespace of an option value.
 *
 * Returns a pointer into value at its first non-whitespace character, or
 * NULL when value is NULL, empty, or consists of whitespace only.
 * The returned pointer aliases the input; nothing is allocated.
 */
static const char *
sanitize_option_value(const char *value)
{
	const char *p = value;

	if (value == NULL) {
		return NULL;
	}

	/* <ctype.h> functions require an unsigned char value (or EOF) */
	while (*p != '\0' && isspace((unsigned char) *p)) {
		p++;
	}

	return (*p != '\0') ? p : NULL;
}
85   	
/*
 * Look up an environment variable and strip its leading whitespace.
 * Yields NULL when the variable is unset or holds nothing but whitespace.
 */
static const char *
get_env_option(const char *option)
{
	return sanitize_option_value(getenv(option));
}
93   	
94   	static int
95   	recruit_servant(const char *devname, pid_t pid)
96   	{
97   		struct servants_list_item *s = servants_leader;
98   		struct servants_list_item *newbie;
99   	
100  		if (lookup_servant_by_dev(devname)) {
101  		    cl_log(LOG_DEBUG, "Servant %s already exists", devname);
102  		    return 0;
103  		}
104  	
105  		newbie = malloc(sizeof(*newbie));
106  		if (newbie) {
107  		    memset(newbie, 0, sizeof(*newbie));
108  		    newbie->devname = strdup(devname);
109  		    newbie->pid = pid;
110  		    newbie->first_start = 1;
111  		}
112  		if (!newbie || !newbie->devname) {
113  		    fprintf(stderr, "heap allocation failed in recruit_servant.\n");
114  		    exit(1);
115  		}
116  	
117  		/* some sanity-check on our newbie */
118  		if (sbd_is_disk(newbie)) {
119  		    cl_log(LOG_INFO, "Monitoring %s", devname);
120  		    disk_count++;
121  		} else if (sbd_is_pcmk(newbie) || sbd_is_cluster(newbie)) {
122  		    /* alive just after pcmk and cluster servants have shown up */
123  		    newbie->outdated = 1;
124  		} else {
125  		    /* toss our newbie */
126  		    cl_log(LOG_ERR, "Refusing to recruit unrecognized servant %s", devname);
127  		    free((void *) newbie->devname);
128  		    free(newbie);
129  		    return -1;
130  		}
131  	
132  		if (!s) {
133  			servants_leader = newbie;
134  		} else {
135  			while (s->next)
136  				s = s->next;
137  			s->next = newbie;
138  		}
139  	
140  		servant_count++;
141  	
142  		return 0;
143  	}
144  	
145  	int assign_servant(const char* devname, functionp_t functionp, int mode, const void* argp)
146  	{
147  		pid_t pid = 0;
148  		int rc = 0;
149  	
150  		pid = fork();
151  		if (pid == 0) {		/* child */
152  			maximize_priority();
153  	                sbd_set_format_string(QB_LOG_SYSLOG, devname);
154  			rc = (*functionp)(devname, mode, argp);
155  			if (rc == -1)
156  				exit(1);
157  			else
158  				exit(0);
159  		} else if (pid != -1) {		/* parent */
160  			return pid;
161  		} else {
162  			cl_log(LOG_ERR,"Failed to fork servant");
163  			exit(1);
164  		}
165  	}
166  	
167  	struct servants_list_item *lookup_servant_by_dev(const char *devname)
168  	{
169  		struct servants_list_item *s;
170  	
171  		for (s = servants_leader; s; s = s->next) {
172  			if (strcasecmp(s->devname, devname) == 0)
173  				break;
174  		}
175  		return s;
176  	}
177  	
178  	struct servants_list_item *lookup_servant_by_pid(pid_t pid)
179  	{
180  		struct servants_list_item *s;
181  	
182  		for (s = servants_leader; s; s = s->next) {
183  			if (s->pid == pid)
184  				break;
185  		}
186  		return s;
187  	}
188  	
189  	int check_all_dead(void)
190  	{
191  		struct servants_list_item *s;
192  		int r = 0;
193  	
194  		for (s = servants_leader; s; s = s->next) {
195  			if (s->pid != 0) {
196  				r = sigqueue_zero(s->pid, 0);
197  				if (r == -1 && errno == ESRCH)
198  					continue;
199  				return 0;
200  			}
201  		}
202  		return 1;
203  	}
204  	
205  	void servant_start(struct servants_list_item *s)
206  	{
207  		int r = 0;
208  	
209  		if (s->pid != 0) {
210  			r = sigqueue_zero(s->pid, 0);
211  			if ((r != -1 || errno != ESRCH))
212  				return;
213  		}
214  		s->restarts++;
215  		if (sbd_is_disk(s)) {
216  	#if SUPPORT_SHARED_DISK
217  			DBGLOG(LOG_INFO, "Starting servant for device %s", s->devname);
218  			s->pid = assign_servant(s->devname, servant_md, start_mode, s);
219  	#else
220  	                cl_log(LOG_ERR, "Shared disk functionality not supported");
221  	                return;
222  	#endif
223  		} else if(sbd_is_pcmk(s)) {
224  			DBGLOG(LOG_INFO, "Starting Pacemaker servant");
225  			s->pid = assign_servant(s->devname, servant_pcmk, start_mode, NULL);
226  	
227  		} else if(sbd_is_cluster(s)) {
228  			DBGLOG(LOG_INFO, "Starting Cluster servant");
229  			s->pid = assign_servant(s->devname, servant_cluster, start_mode, NULL);
230  	
231  	        } else {
232  	            cl_log(LOG_ERR, "Unrecognized servant: %s", s->devname);
233  	        }        
234  	
235  		clock_gettime(CLOCK_MONOTONIC, &s->t_started);
236  		return;
237  	}
238  	
239  	void servants_start(void)
240  	{
241  		struct servants_list_item *s;
242  	
243  		for (s = servants_leader; s; s = s->next) {
244  			s->restarts = 0;
245  			servant_start(s);
246  		}
247  	}
248  	
249  	void servants_kill(void)
250  	{
251  		struct servants_list_item *s;
252  	
253  		for (s = servants_leader; s; s = s->next) {
254  			if (s->pid != 0) {
255  				sigqueue_zero(s->pid, SIGKILL);
256  			}
257  		}
258  	}
259  	
260  	static inline void cleanup_servant_by_pid(pid_t pid)
261  	{
262  		struct servants_list_item* s;
263  	
264  		s = lookup_servant_by_pid(pid);
265  		if (s) {
266  			cl_log(LOG_WARNING, "Servant for %s (pid: %i) has terminated",
267  					s->devname, s->pid);
268  			s->pid = 0;
269  		} else {
270  			/* This most likely is a stray signal from somewhere, or
271  			 * a SIGCHLD for a process that has previously
272  			 * explicitly disconnected. */
273  			DBGLOG(LOG_INFO, "cleanup_servant: Nothing known about pid %i",
274  					pid);
275  		}
276  	}
277  	
278  	int inquisitor_decouple(void)
279  	{
280  		pid_t ppid = getppid();
281  	
282  		/* During start-up, we only arm the watchdog once we've got
283  		 * quorum at least once. */
284  		if (watchdog_use) {
285  			if (watchdog_init() < 0) {
286  				return -1;
287  			}
288  		}
289  	
290  		if (ppid > 1) {
291  			sigqueue_zero(ppid, SIG_LIVENESS);
292  		}
293  		return 0;
294  	}
295  	
/*
 * Check whether the process that owns a pidfile lock is still running.
 *
 * pid is the process id read from the lockfile.
 *
 * Returns 0 when the process does not exist. Without HAVE_PROC_PID any
 * existing process is taken to be the lock holder (returns 1). With
 * HAVE_PROC_PID the executable behind /proc/<pid>/exe is compared to our
 * own: 1 is returned when both are the same binary, 0 when they differ
 * (the pid was reused) or when either link cannot be read.
 */
static int sbd_lock_running(long pid)
{
	/* check if pid is running */
	if (kill((pid_t) pid, 0) < 0 && errno == ESRCH) {
		return 0;
	}

#ifndef HAVE_PROC_PID
	return 1;
#else
	{
		char proc_path[PATH_MAX], exe_path[PATH_MAX], myexe_path[PATH_MAX];
		ssize_t len;

		/* check to make sure pid hasn't been reused by another process */
		snprintf(proc_path, sizeof(proc_path), "/proc/%ld/exe", pid);
		len = readlink(proc_path, exe_path, sizeof(exe_path) - 1);
		if (len < 0) {
			cl_perror("Could not read from %s", proc_path);
			return 0;
		}
		/* readlink() does not NUL-terminate */
		exe_path[len] = '\0';

		snprintf(proc_path, sizeof(proc_path), "/proc/%ld/exe", (long) getpid());
		len = readlink(proc_path, myexe_path, sizeof(myexe_path) - 1);
		if (len < 0) {
			cl_perror("Could not read from %s", proc_path);
			return 0;
		}
		myexe_path[len] = '\0';

		return (strcmp(exe_path, myexe_path) == 0) ? 1 : 0;
	}
#endif
}
336  	
337  	static int
338  	sbd_lock_pidfile(const char *filename)
339  	{
340  		char lf_name[256], tf_name[256], buf[LOCKSTRLEN+1];
341  		int fd;
342  		long	pid, mypid;
343  		int rc;
344  		struct stat sbuf;
345  	
(1) Event cond_false: Condition "filename == NULL", taking false branch.
346  		if (filename == NULL) {
347  			errno = EFAULT;
348  			return -1;
(2) Event if_end: End of if statement.
349  		}
350  	
351  		mypid = (unsigned long) getpid();
352  		snprintf(lf_name, sizeof(lf_name), "%s",filename);
353  		snprintf(tf_name, sizeof(tf_name), "%s.%lu",
354  			 filename, mypid);
355  	
(3) Event cond_true: Condition "(fd = open(lf_name, 0)) >= 0", taking true branch.
356  		if ((fd = open(lf_name, O_RDONLY)) >= 0) {
(4) Event cond_true: Condition "fstat(fd, &sbuf) >= 0", taking true branch.
(5) Event cond_true: Condition "sbuf.st_size < 11", taking true branch.
357  			if (fstat(fd, &sbuf) >= 0 && sbuf.st_size < LOCKSTRLEN) {
358  				sleep(1); /* if someone was about to create one,
359  				   	   * give'm a sec to do so
360  					   * Though if they follow our protocol,
361  					   * this won't happen.  They should really
362  					   * put the pid in, then link, not the
363  					   * other way around.
364  					   */
365  			}
(6) Event cond_true: Condition "read(fd, buf, 12UL /* sizeof (buf) */) < 1", taking true branch.
366  			if (read(fd, buf, sizeof(buf)) < 1) {
367  				/* lockfile empty -> rm it and go on */;
(7) Event if_fallthrough: Falling through to end of if statement.
368  			} else {
369  				if (sscanf(buf, "%ld", &pid) < 1) {
370  					/* lockfile screwed up -> rm it and go on */
371  				} else {
372  					if (pid > 1 && (getpid() != pid)
373  					&&	sbd_lock_running(pid)) {
374  						/* is locked by existing process
375  						 * -> give up */
376  						close(fd);
377  						return -1;
378  					} else {
379  						/* stale lockfile -> rm it and go on */
380  					}
381  				}
(8) Event if_end: End of if statement.
382  			}
383  			unlink(lf_name);
384  			close(fd);
385  		}
(9) Event cond_false: Condition "(fd = open(tf_name, 193 /* (0x40 | 1) | 0x80 */, 420)) < 0", taking false branch.
386  		if ((fd = open(tf_name, O_CREAT | O_WRONLY | O_EXCL, 0644)) < 0) {
387  			/* Hmmh, why did we fail? Anyway, nothing we can do about it */
388  			return -3;
(10) Event if_end: End of if statement.
389  		}
390  	
391  		/* Slight overkill with the %*d format ;-) */
392  		snprintf(buf, sizeof(buf), "%*lu\n", LOCKSTRLEN-1, mypid);
393  	
(11) Event cond_false: Condition "write(fd, buf, 11) != 11", taking false branch.
394  		if (write(fd, buf, LOCKSTRLEN) != LOCKSTRLEN) {
395  			/* Again, nothing we can do about this */
396  			rc = -3;
397  			close(fd);
398  			goto out;
(12) Event if_end: End of if statement.
399  		}
400  		close(fd);
401  	
(13) Event switch: Switch case value "0".
402  		switch (link(tf_name, lf_name)) {
(14) Event switch_case: Reached case "0".
403  		case 0:
(15) Event fs_check_call: Calling function "stat" to perform check on "tf_name".
(16) Event cond_true: Condition "stat(tf_name, &sbuf) < 0", taking true branch.
Also see events: [toctou]
404  			if (stat(tf_name, &sbuf) < 0) {
405  				/* something weird happened */
406  				rc = -3;
(17) Event break: Breaking from switch.
407  				break;
408  			}
409  			if (sbuf.st_nlink < 2) {
410  				/* somehow, it didn't get through - NFS trouble? */
411  				rc = -2;
412  				break;
413  			}
414  			rc = 0;
415  			break;
416  		case EEXIST:
417  			rc = -1;
418  			break;
419  		default:
420  			rc = -3;
(18) Event switch_end: Reached end of switch.
421  		}
422  	 out:
(19) Event toctou: Calling function "unlink" that uses "tf_name" after a check function. This can cause a time-of-check, time-of-use race condition.
Also see events: [fs_check_call]
423  		unlink(tf_name);
424  		return rc;
425  	}
426  	
427  	
/*
 * Release the pidfile lock by removing the lockfile.
 *
 * Whether the lockfile is (still) ours is deliberately not checked: if
 * someone else took over our lock, that is their problem.  -tho
 *
 * Returns 0 on success and a value < 0 if some failure occurred
 * (errno is EFAULT when filename is NULL).
 */

static int
sbd_unlock_pidfile(const char *filename)
{
	char lock_path[256];

	if (filename != NULL) {
		snprintf(lock_path, sizeof(lock_path), "%s", filename);
		return unlink(lock_path);
	}

	errno = EFAULT;
	return -1;
}
450  	
451  	int cluster_alive(bool all)
452  	{
453  	    int alive = 1;
454  	    struct servants_list_item* s;
455  	
456  	    if(servant_count == disk_count) {
457  	        return 0;
458  	    }
459  	
460  	    for (s = servants_leader; s; s = s->next) {
461  	        if (sbd_is_cluster(s) || sbd_is_pcmk(s)) {
462  	            if(s->outdated) {
463  	                alive = 0;
464  	            } else if(all == false) {
465  	                return 1;
466  	            }
467  	        }
468  	    }
469  	
470  	    return alive;
471  	}
472  	
473  	int quorum_read(int good_servants)
474  	{
475  		if (disk_count > 2) 
476  			return (good_servants > disk_count/2);
477  		else
478  			return (good_servants > 0);
479  	}
480  	
481  	void inquisitor_child(void)
482  	{
483  		int sig, pid;
484  		sigset_t procmask;
485  		siginfo_t sinfo;
486  		int status;
487  		struct timespec timeout;
488  		int exiting = 0;
489  		int decoupled = 0;
490  		int cluster_appeared = 0;
491  		int pcmk_override = 0;
492  		int latency;
493  		struct timespec t_last_tickle, t_now;
494  		struct servants_list_item* s;
495  	
496  		if (debug_mode) {
497  	            cl_log(LOG_ERR, "DEBUG MODE %d IS ACTIVE - DO NOT RUN IN PRODUCTION!", debug_mode);
498  		}
499  	
500  		set_proc_title("sbd: inquisitor");
501  	
502  		if (pidfile) {
503  			if (sbd_lock_pidfile(pidfile) < 0) {
504  				exit(1);
505  			}
506  		}
507  	
508  		sigemptyset(&procmask);
509  		sigaddset(&procmask, SIGCHLD);
510  		sigaddset(&procmask, SIGTERM);
511  		sigaddset(&procmask, SIG_LIVENESS);
512  		sigaddset(&procmask, SIG_EXITREQ);
513  		sigaddset(&procmask, SIG_TEST);
514  		sigaddset(&procmask, SIG_PCMK_UNHEALTHY);
515  		sigaddset(&procmask, SIG_RESTART);
516  		sigaddset(&procmask, SIGUSR1);
517  		sigaddset(&procmask, SIGUSR2);
518  		sigprocmask(SIG_BLOCK, &procmask, NULL);
519  	
520  		servants_start();
521  	
522  		timeout.tv_sec = timeout_loop;
523  		timeout.tv_nsec = 0;
524  		clock_gettime(CLOCK_MONOTONIC, &t_last_tickle);
525  	
526  		while (1) {
527  	                bool tickle = 0;
528  	                bool can_detach = 0;
529  			int good_servants = 0;
530  	
531  			sig = sigtimedwait(&procmask, &sinfo, &timeout);
532  	
533  			clock_gettime(CLOCK_MONOTONIC, &t_now);
534  	
535  			if (sig == SIG_EXITREQ || sig == SIGTERM) {
536  				servants_kill();
537  				watchdog_close(true);
538  				exiting = 1;
539  			} else if (sig == SIGCHLD) {
540  				while ((pid = waitpid(-1, &status, WNOHANG))) {
541  					if (pid == -1 && errno == ECHILD) {
542  						break;
543  					} else {
544  						s = lookup_servant_by_pid(pid);
545  						if (sbd_is_disk(s)) {
546  							if (WIFEXITED(status)) {
547  								switch(WEXITSTATUS(status)) {
548  									case EXIT_MD_SERVANT_IO_FAIL:
549  										DBGLOG(LOG_INFO, "Servant for %s requests to be disowned",
550  											s->devname);
551  										break;
552  									case EXIT_MD_SERVANT_REQUEST_RESET:
553  										cl_log(LOG_WARNING, "%s requested a reset", s->devname);
554  										do_reset();
555  										break;
556  									case EXIT_MD_SERVANT_REQUEST_SHUTOFF:
557  										cl_log(LOG_WARNING, "%s requested a shutoff", s->devname);
558  										do_off();
559  										break;
560  									case EXIT_MD_SERVANT_REQUEST_CRASHDUMP:
561  										cl_log(LOG_WARNING, "%s requested a crashdump", s->devname);
562  										do_crashdump();
563  										break;
564  									default:
565  										break;
566  								}
567  							}
568  						} else if (sbd_is_pcmk(s)) {
569  							if (WIFEXITED(status)) {
570  								switch(WEXITSTATUS(status)) {
571  									case EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN:
572  										DBGLOG(LOG_INFO, "PCMK-Servant has exited gracefully");
573  										/* revert to state prior to pacemaker-detection */
574  										s->restarts = 0;
575  										s->restart_blocked = 0;
576  										cluster_appeared = 0;
577  										s->outdated = 1;
578  										s->t_last.tv_sec = 0;
579  										break;
580  									default:
581  										break;
582  								}
583  							}
584  						}
585  						cleanup_servant_by_pid(pid);
586  					}
587  				}
588  			} else if (sig == SIG_PCMK_UNHEALTHY) {
589  				s = lookup_servant_by_pid(sinfo.si_pid);
590  				if (sbd_is_cluster(s) || sbd_is_pcmk(s)) {
591  	                if (s->outdated == 0) {
592  	                    cl_log(LOG_WARNING, "%s health check: UNHEALTHY", s->devname);
593  	                }
594  	                s->t_last.tv_sec = 1;
595  	            } else {
596  	                cl_log(LOG_WARNING, "Ignoring SIG_PCMK_UNHEALTHY from unknown source");
597  	            }
598  			} else if (sig == SIG_LIVENESS) {
599  				s = lookup_servant_by_pid(sinfo.si_pid);
600  				if (s) {
601  					s->first_start = 0;
602  					clock_gettime(CLOCK_MONOTONIC, &s->t_last);
603  				}
604  	
605  			} else if (sig == SIG_TEST) {
606  			} else if (sig == SIGUSR1) {
607  				if (exiting)
608  					continue;
609  				servants_start();
610  			}
611  	
612  			if (exiting) {
613  				if (check_all_dead()) {
614  					if (pidfile) {
615  						sbd_unlock_pidfile(pidfile);
616  					}
617  					exit(0);
618  				} else
619  					continue;
620  			}
621  	
622  			good_servants = 0;
623  			for (s = servants_leader; s; s = s->next) {
624  				int age = seconds_diff_timespec(&t_now, &(s->t_last));
625  	
626  				if (!s->t_last.tv_sec)
627  					continue;
628  	
629  				if (age < timeout_io+timeout_loop) {
630  					if (sbd_is_disk(s)) {
631  	                                    good_servants++;
632  					}
633  	                                if (s->outdated) {
634  	                                    cl_log(LOG_NOTICE, "Servant %s is healthy (age: %d)", s->devname, age);
635  					}
636  					s->outdated = 0;
637  	
638  				} else if (!s->outdated) {
639  	                                if (!s->restart_blocked) {
640  	                                    cl_log(LOG_WARNING, "Servant %s is outdated (age: %d)", s->devname, age);
641  					}
642  	                                s->outdated = 1;
643  				}
644  			}
645  	
646  	                if(disk_count == 0) {
647  	                    /* NO disks, everything is up to the cluster */
648  	                    
649  	                    if(cluster_alive(true)) {
650  	                        /* We LIVE! */
651  	                        if(cluster_appeared == false) {
652  	                            cl_log(LOG_INFO, "Active cluster detected");
653  	                        }
654  	                        tickle = 1;
655  	                        can_detach = 1;
656  	                        cluster_appeared = 1;
657  	
658  	                    } else if(cluster_alive(false)) {
659  	                        if(!decoupled) {
660  	                            /* On the way up, detach and arm the watchdog */
661  	                            cl_log(LOG_INFO, "Partial cluster detected, detaching");
662  	                        }
663  	
664  	                        can_detach = 1;
665  	                        tickle = !cluster_appeared;
666  	
667  	                    } else if(!decoupled) {
668  	                        /* Stay alive until the cluster comes up */
669  	                        tickle = !cluster_appeared;
670  	                    }
671  	
672  	                } else if(disk_priority == 1 || servant_count == disk_count) {
673  	                    if (quorum_read(good_servants)) {
674  	                        /* There are disks and we're connected to the majority of them */
675  	                        tickle = 1;
676  	                        can_detach = 1;
677  	                        pcmk_override = 0;
678  	
679  	                    } else if (servant_count > disk_count && cluster_alive(true)) {
680  	                        tickle = 1;
681  	                    
682  	                        if(!pcmk_override) {
683  	                            cl_log(LOG_WARNING, "Majority of devices lost - surviving on pacemaker");
684  	                            pcmk_override = 1; /* Only log this message once */
685  	                        }
686  	                    }
687  	
688  	                } else if(cluster_alive(true) && quorum_read(good_servants)) {
689  	                    /* Both disk and cluster servants are healthy */
690  	                    tickle = 1;
691  	                    can_detach = 1;
692  	                    cluster_appeared = 1;
693  	
694  	                } else if(quorum_read(good_servants)) {
695  	                    /* The cluster takes priority but only once
696  	                     * connected for the first time.
697  	                     *
698  	                     * Until then, we tickle based on disk quorum.
699  	                     */
700  	                    can_detach = 1;
701  	                    tickle = !cluster_appeared;
702  	                }
703  	
704  	                /* cl_log(LOG_DEBUG, "Tickle: q=%d, g=%d, p=%d, s=%d", */
705  	                /*        quorum_read(good_servants), good_servants, tickle, disk_count); */
706  	
707  	                if(tickle) {
708  	                    watchdog_tickle();
709  	                    clock_gettime(CLOCK_MONOTONIC, &t_last_tickle);
710  	                }
711  	
712  	                if (!decoupled && can_detach) {
713  	                    /* We only do this at the point either the disk or
714  	                     * cluster servants become healthy
715  	                     */
716  	                    cl_log(LOG_DEBUG, "Decoupling");
717  	                    if (inquisitor_decouple() < 0) {
718  	                        servants_kill();
719  	                        exiting = 1;
720  	                        continue;
721  	                    } else {
722  	                        decoupled = 1;
723  	                    }
724  	                }
725  	
726  			/* Note that this can actually be negative, since we set
727  			 * last_tickle after we set now. */
728  			latency = seconds_diff_timespec(&t_now, &t_last_tickle);
729  			if (timeout_watchdog && (latency > timeout_watchdog)) {
730  				if (!decoupled) {
731  					/* We're still being watched by our
732  					 * parent. We don't fence, but exit. */
733  					cl_log(LOG_ERR, "SBD: Not enough votes to proceed. Aborting start-up.");
734  					servants_kill();
735  					exiting = 1;
736  					continue;
737  				}
738  				if (debug_mode < 2) {
739  					/* At level 2 or above, we do nothing, but expect
740  					 * things to eventually return to
741  					 * normal. */
742  					do_timeout_action();
743  				} else {
744  					cl_log(LOG_ERR, "SBD: DEBUG MODE: Would have fenced due to timeout!");
745  				}
746  			}
747  	
748  			if (timeout_watchdog_warn && (latency > timeout_watchdog_warn)) {
749  				cl_log(LOG_WARNING,
750  				       "Latency: No liveness for %ds exceeds watchdog warning timeout of %ds (healthy servants: %d)",
751  				       latency, timeout_watchdog_warn, good_servants);
752  	
753  	                        if (debug_mode && watchdog_use) {
754  	                            /* In debug mode, trigger a reset before the watchdog can panic the machine */
755  	                            do_timeout_action();
756  	                        }
757  			}
758  	
759  			for (s = servants_leader; s; s = s->next) {
760  				int age = seconds_diff_timespec(&t_now, &(s->t_started));
761  	
762  				if (age > servant_restart_interval) {
763  					s->restarts = 0;
764  					s->restart_blocked = 0;
765  				}
766  	
767  				if (servant_restart_count
768  						&& (s->restarts >= servant_restart_count)
769  						&& !s->restart_blocked) {
770  					if (servant_restart_count > 1) {
771  						cl_log(LOG_WARNING, "Max retry count (%d) reached: not restarting servant for %s",
772  								(int)servant_restart_count, s->devname);
773  					}
774  					s->restart_blocked = 1;
775  				}
776  	
777  				if (!s->restart_blocked) {
778  					servant_start(s);
779  				}
780  			}
781  		}
782  		/* not reached */
783  		exit(0);
784  	}
785  	
786  	int inquisitor(void)
787  	{
788  		int sig, pid, inquisitor_pid;
789  		int status;
790  		sigset_t procmask;
791  		siginfo_t sinfo;
792  	
793  		/* Where's the best place for sysrq init ?*/
794  		sysrq_init();
795  	
796  		sigemptyset(&procmask);
797  		sigaddset(&procmask, SIGCHLD);
798  		sigaddset(&procmask, SIG_LIVENESS);
799  		sigprocmask(SIG_BLOCK, &procmask, NULL);
800  	
801  		inquisitor_pid = make_daemon();
802  		if (inquisitor_pid == 0) {
803  			inquisitor_child();
804  		} 
805  		
806  		/* We're the parent. Wait for a happy signal from our child
807  		 * before we proceed - we either get "SIG_LIVENESS" when the
808  		 * inquisitor has completed the first successful round, or
809  		 * ECHLD when it exits with an error. */
810  	
811  		while (1) {
812  			sig = sigwaitinfo(&procmask, &sinfo);
813  			if (sig == SIGCHLD) {
814  				while ((pid = waitpid(-1, &status, WNOHANG))) {
815  					if (pid == -1 && errno == ECHILD) {
816  						break;
817  					}
818  					/* We got here because the inquisitor
819  					 * did not succeed. */
820  					return -1;
821  				}
822  			} else if (sig == SIG_LIVENESS) {
823  				/* Inquisitor started up properly. */
824  				return 0;
825  			} else {
826  				fprintf(stderr, "Nobody expected the spanish inquisition!\n");
827  				continue;
828  			}
829  		}
830  		/* not reached */
831  		return -1;
832  	}
833  	
834  	
835  	int
836  	parse_device_line(const char *line)
837  	{
838  	    size_t lpc = 0;
839  	    size_t last = 0;
840  	    size_t max = 0;
841  	    int found = 0;
842  	    bool skip_space = true;
843  	    int space_run = 0;
844  	
845  	    if (!line) {
846  	        return 0;
847  	    }
848  	
849  	    max = strlen(line);
850  	
851  	    cl_log(LOG_DEBUG, "Processing %d bytes: [%s]", (int) max, line);
852  	
853  	    for (lpc = 0; lpc <= max; lpc++) {
854  	        if (isspace(line[lpc])) {
855  	            if (skip_space) {
856  	                last = lpc + 1;
857  	            } else {
858  	                space_run++;
859  	            }
860  	            continue;
861  	        }
862  	        skip_space = false;
863  	        if (line[lpc] == ';' || line[lpc] == 0) {
864  	            int rc = 0;
865  	            char *entry = calloc(1, 1 + lpc - last);
866  	
867  	            if (entry) {
868  	                rc = sscanf(line + last, "%[^;]", entry);
869  	            } else {
870  	                fprintf(stderr, "Heap allocation failed parsing device-line.\n");
871  	                exit(1);
872  	            }
873  	
874  	            if (rc != 1) {
875  	                cl_log(LOG_WARNING, "Could not parse: '%s'", line + last);
876  	            } else {
877  	                entry[strlen(entry)-space_run] = '\0';
878  	                cl_log(LOG_DEBUG, "Adding '%s'", entry);
879  	                if (recruit_servant(entry, 0) != 0) {
880  	                    free(entry);
881  	                    // sbd should refuse to start if any of the configured device names is invalid.
882  	                    return -1;
883  	                }
884  	                found++;
885  	            }
886  	
887  	            free(entry);
888  	            skip_space = true;
889  	            last = lpc + 1;
890  	        }
891  	        space_run = 0;
892  	    }
893  	    return found;
894  	}
895  	
896  	#define SBD_SOURCE_FILES "sbd-cluster.c,sbd-common.c,sbd-inquisitor.c,sbd-md.c,sbd-pacemaker.c,sbd-watchdog.c,setproctitle.c"
897  	
898  	static void
899  	sbd_log_filter_ctl(const char *files, uint8_t priority)
900  	{
901  		if (files == NULL) {
902  			files = SBD_SOURCE_FILES;
903  		}
904  	
905  		qb_log_filter_ctl(QB_LOG_SYSLOG, QB_LOG_FILTER_ADD, QB_LOG_FILTER_FILE, files, priority);
906  		qb_log_filter_ctl(QB_LOG_STDERR, QB_LOG_FILTER_ADD, QB_LOG_FILTER_FILE, files, priority);
907  	}
908  	
/*
 * Map an occurrence count to the remainder of its division by two:
 * non-zero for an odd count, 0 for an even one.
 */
int
arg_enabled(int arg_count)
{
    const int remainder = arg_count % 2;

    return remainder;
}
914  	
915  	int main(int argc, char **argv, char **envp)
916  	{
917  		int exit_status = 0;
918  		int c;
919  		int W_count = 0;
920  		int c_count = 0;
921  		int P_count = 0;
922  	        int qb_facility;
923  	        const char *value = NULL;
924  	        bool delay_start = false;
925  	        long delay = 0;
926  	        char *timeout_action = NULL;
927  	
928  		if ((cmdname = strrchr(argv[0], '/')) == NULL) {
929  			cmdname = argv[0];
930  		} else {
931  			++cmdname;
932  		}
933  	
934  	        watchdogdev = strdup("/dev/watchdog");
935  	        watchdogdev_is_default = true;
936  	        qb_facility = qb_log_facility2int("daemon");
937  	        qb_log_init(cmdname, qb_facility, LOG_WARNING);
938  	        sbd_set_format_string(QB_LOG_SYSLOG, "sbd");
939  	
940  	        qb_log_ctl(QB_LOG_SYSLOG, QB_LOG_CONF_ENABLED, QB_TRUE);
941  	        qb_log_ctl(QB_LOG_STDERR, QB_LOG_CONF_ENABLED, QB_FALSE);
942  	        sbd_log_filter_ctl(NULL, LOG_NOTICE);
943  	
944  		sbd_get_uname();
945  	
946  	        value = get_env_option("SBD_PACEMAKER");
947  	        if(value) {
948  	            check_pcmk = crm_is_true(value);
949  	            check_cluster = crm_is_true(value);
950  	
951  	            has_check_pcmk_env = true;
952  	        }
953  	        cl_log(LOG_INFO, "SBD_PACEMAKER set to: %d (%s)", (int)check_pcmk, value?value:"default");
954  	
955  	        value = get_env_option("SBD_STARTMODE");
956  	        if(value == NULL) {
957  	        } else if(strcmp(value, "clean") == 0) {
958  	            start_mode = 1;
959  	        } else if(strcmp(value, "always") == 0) {
960  	            start_mode = 0;
961  	        }
962  	        cl_log(LOG_INFO, "Start mode set to: %d (%s)", (int)start_mode, value?value:"default");
963  	
964  	        value = get_env_option("SBD_WATCHDOG_DEV");
965  	        if(value) {
966  	            free(watchdogdev);
967  	            watchdogdev = strdup(value);
968  	            watchdogdev_is_default = false;
969  	        }
970  	
971  	        /* SBD_WATCHDOG has been dropped from sbd.sysconfig example.
972  	         * This is for backward compatibility. */
973  	        value = get_env_option("SBD_WATCHDOG");
974  	        if(value) {
975  	            watchdog_use = crm_is_true(value);
976  	        }
977  	
978  	        value = get_env_option("SBD_WATCHDOG_TIMEOUT");
979  	        if(value) {
980  	            timeout_watchdog = crm_get_msec(value) / 1000;
981  	        }
982  	
983  	        value = get_env_option("SBD_PIDFILE");
984  	        if(value) {
985  	            pidfile = strdup(value);
986  	            cl_log(LOG_INFO, "pidfile set to %s", pidfile);
987  	        }
988  	
989  	        value = get_env_option("SBD_DELAY_START");
990  	        if(value) {
991  	            if (crm_str_to_boolean(value, (int *) &delay_start) != 1) {
992  	                delay = crm_get_msec(value) / 1000;
993  	                if (delay > 0) {
994  	                    delay_start = true;
995  	                }
996  	            }
997  	        }
998  	
999  	        value = get_env_option("SBD_TIMEOUT_ACTION");
1000 	        if(value) {
1001 	            timeout_action = strdup(value);
1002 	        }
1003 	
1004 	        value = get_env_option("SBD_MOVE_TO_ROOT_CGROUP");
1005 	        if(value) {
1006 	            move_to_root_cgroup = crm_is_true(value);
1007 	
1008 	            if (move_to_root_cgroup) {
1009 	               enforce_moving_to_root_cgroup = true;
1010 	            } else {
1011 	                if (strcmp(value, "auto") == 0) {
1012 	                    move_to_root_cgroup = true;
1013 	                }
1014 	            }
1015 	        }
1016 	
1017 		while ((c = getopt(argc, argv, "czC:DPRTWZhvw:d:n:p:1:2:3:4:5:t:I:F:S:s:r:")) != -1) {
1018 			int sanitized_num_optarg = 0;
1019 			/* Call it before checking optarg for NULL to make coverity happy */
1020 			const char *sanitized_optarg = sanitize_option_value(optarg);
1021 	
1022 			if (optarg && ((sanitized_optarg == NULL) ||
1023 					(strchr("SsC12345tIF", c) &&
1024 					(sanitized_num_optarg = sanitize_numeric_option_value(sanitized_optarg)) < 0))) {
1025 				fprintf(stderr, "Invalid value \"%s\" for option -%c\n", optarg, c);
1026 				exit_status = -2;
1027 				goto out;
1028 			}
1029 	
1030 			switch (c) {
1031 			case 'D':
1032 				break;
1033 			case 'Z':
1034 				debug_mode++;
1035 				cl_log(LOG_INFO, "Debug mode now at level %d", (int)debug_mode);
1036 				break;
1037 			case 'R':
1038 				skip_rt = 1;
1039 				cl_log(LOG_INFO, "Realtime mode deactivated.");
1040 				break;
1041 			case 'S':
1042 				start_mode = sanitized_num_optarg;
1043 				cl_log(LOG_INFO, "Start mode set to: %d", (int)start_mode);
1044 				break;
1045 			case 's':
1046 				timeout_startup = sanitized_num_optarg;
1047 				cl_log(LOG_INFO, "Start timeout set to: %d", (int)timeout_startup);
1048 				break;
1049 			case 'v':
1050 	                    debug++;
1051 	                    if(debug == 1) {
1052 	                        sbd_log_filter_ctl(NULL, LOG_INFO);
1053 	                        cl_log(LOG_INFO, "Verbose mode enabled.");
1054 	
1055 	                    } else if(debug == 2) {
1056 	                        sbd_log_filter_ctl(NULL, LOG_DEBUG);
1057 	                        cl_log(LOG_INFO, "Debug mode enabled.");
1058 	
1059 	                    } else if(debug == 3) {
1060 	                        /* Go nuts, turn on pacemaker's logging too */
1061 	                        sbd_log_filter_ctl("*", LOG_DEBUG);
1062 	                        cl_log(LOG_INFO, "Debug library mode enabled.");
1063 	                    }
1064 	                    break;
1065 			case 'T':
1066 				watchdog_set_timeout = 0;
1067 				cl_log(LOG_INFO, "Setting watchdog timeout disabled; using defaults.");
1068 				break;
1069 			case 'W':
1070 				W_count++;
1071 				break;
1072 			case 'w':
1073 	                        free(watchdogdev);
1074 	                        watchdogdev = strdup(sanitized_optarg);
1075 	                        watchdogdev_is_default = false;
1076 	                        cl_log(LOG_NOTICE, "Using watchdog device '%s'", watchdogdev);
1077 				break;
1078 			case 'd':
1079 	#if SUPPORT_SHARED_DISK
1080 				if (recruit_servant(sanitized_optarg, 0) != 0) {
1081 					fprintf(stderr, "Invalid device: %s\n", optarg);
1082 					exit_status = -1;
1083 					goto out;
1084 				}
1085 	#else
1086 	                        fprintf(stderr, "Shared disk functionality not supported\n");
1087 				exit_status = -2;
1088 				goto out;
1089 	#endif
1090 				break;
1091 			case 'c':
1092 				c_count++;
1093 				break;
1094 			case 'P':
1095 				P_count++;
1096 				break;
1097 			case 'z':
1098 				disk_priority = 0;
1099 				break;
1100 			case 'n':
1101 				local_uname = strdup(sanitized_optarg);
1102 				cl_log(LOG_INFO, "Overriding local hostname to %s", local_uname);
1103 				break;
1104 			case 'p':
1105 				pidfile = strdup(sanitized_optarg);
1106 				cl_log(LOG_INFO, "pidfile set to %s", pidfile);
1107 				break;
1108 			case 'C':
1109 				timeout_watchdog_crashdump = sanitized_num_optarg;
1110 				cl_log(LOG_INFO, "Setting crashdump watchdog timeout to %d",
1111 						timeout_watchdog_crashdump);
1112 				break;
1113 			case '1':
1114 				timeout_watchdog = sanitized_num_optarg;
1115 				break;
1116 			case '2':
1117 				timeout_allocate = sanitized_num_optarg;
1118 				break;
1119 			case '3':
1120 				timeout_loop = sanitized_num_optarg;
1121 				break;
1122 			case '4':
1123 				timeout_msgwait = sanitized_num_optarg;
1124 				break;
1125 			case '5':
1126 				timeout_watchdog_warn = sanitized_num_optarg;
1127 				do_calculate_timeout_watchdog_warn = false;
1128 				cl_log(LOG_INFO, "Setting latency warning to %d",
1129 						timeout_watchdog_warn);
1130 				break;
1131 			case 't':
1132 				servant_restart_interval = sanitized_num_optarg;
1133 				cl_log(LOG_INFO, "Setting servant restart interval to %d",
1134 						(int)servant_restart_interval);
1135 				break;
1136 			case 'I':
1137 				timeout_io = sanitized_num_optarg;
1138 				cl_log(LOG_INFO, "Setting IO timeout to %d",
1139 						(int)timeout_io);
1140 				break;
1141 			case 'F':
1142 				servant_restart_count = sanitized_num_optarg;
1143 				cl_log(LOG_INFO, "Servant restart count set to %d",
1144 						(int)servant_restart_count);
1145 				break;
1146 			case 'r':
1147 				if (timeout_action) {
1148 					free(timeout_action);
1149 				}
1150 				timeout_action = strdup(sanitized_optarg);
1151 				break;
1152 			case 'h':
1153 				usage();
1154 				goto out;
1155 				break;
1156 			default:
1157 				exit_status = -2;
1158 				goto out;
1159 				break;
1160 			}
1161 		}
1162 	
1163 	    if (disk_count == 0) {
1164 	        /* if we already have disks from commandline
1165 	           then it is probably undesirable to add those
1166 	           from environment (general rule cmdline has precedence)
1167 	         */
1168 	        value = get_env_option("SBD_DEVICE");
1169 	        if ((value) && strlen(value)) {
1170 	#if SUPPORT_SHARED_DISK
1171 	            int devices = parse_device_line(value);
1172 	            if(devices < 1) {
1173 	                fprintf(stderr, "Invalid device line: %s\n", value);
1174 	                exit_status = -1;
1175 	                goto out;
1176 	            }
1177 	#else
1178 	            fprintf(stderr, "Shared disk functionality not supported\n");
1179 	            exit_status = -2;
1180 	            goto out;
1181 	#endif
1182 	        }
1183 		}
1184 	
1185 		if (watchdogdev == NULL || strcmp(watchdogdev, "/dev/null") == 0) {
1186 	            watchdog_use = 0;
1187 	
1188 		} else if (W_count > 0) {
1189 	            watchdog_use = arg_enabled(W_count);
1190 	        }
1191 	
1192 		if (watchdog_use) {
1193 			cl_log(LOG_INFO, "Watchdog enabled.");
1194 		} else {
1195 			cl_log(LOG_INFO, "Watchdog disabled.");
1196 		}
1197 	
1198 		if (c_count > 0) {
1199 			check_cluster = arg_enabled(c_count);
1200 		}
1201 	
1202 		if (P_count > 0) {
1203 			int check_pcmk_arg = arg_enabled(P_count);
1204 	
1205 			if (has_check_pcmk_env && check_pcmk_arg != check_pcmk) {
1206 				cl_log(LOG_WARNING, "Pacemaker integration is %s: "
1207 						"SBD_PACEMAKER=%s is overridden by %s option. "
1208 						"It's recommended to only use SBD_PACEMAKER.",
1209 						check_pcmk_arg? "enabled" : "disabled",
1210 						check_pcmk? "yes" : "no",
1211 						check_pcmk_arg? "-P" : "-PP");
1212 			}
1213 			check_pcmk = check_pcmk_arg;
1214 		}
1215 	
1216 		if ((disk_count > 0) && (strlen(local_uname) > SECTOR_NAME_MAX)) {
1217 			fprintf(stderr, "Node name mustn't be longer than %d chars.\n",
1218 				SECTOR_NAME_MAX);
1219 			fprintf(stderr, "If uname is longer define a name to be used by sbd.\n");
1220 			exit_status = -1;
1221 			goto out;
1222 		}
1223 	
1224 		if (disk_count > 3) {
1225 			fprintf(stderr, "You can specify up to 3 devices via the -d option.\n");
1226 			exit_status = -1;
1227 			goto out;
1228 		}
1229 	
1230 		/* There must at least be one command following the options: */
1231 		if ((argc - optind) < 1) {
1232 			fprintf(stderr, "Not enough arguments.\n");
1233 			exit_status = -2;
1234 			goto out;
1235 		}
1236 	
1237 		if (init_set_proc_title(argc, argv, envp) < 0) {
1238 			fprintf(stderr, "Allocation of proc title failed.\n");
1239 			exit_status = -1;
1240 			goto out;
1241 		}
1242 	
1243 		if (timeout_action) {
1244 			char *p[2];
1245 			int i;
1246 			char c;
1247 			int nrflags = sscanf(timeout_action, "%m[a-z],%m[a-z]%c", &p[0], &p[1], &c);
1248 			bool parse_error = (nrflags < 1) || (nrflags > 2);
1249 	
1250 			for (i = 0; (i < nrflags) && (i < 2); i++) {
1251 				if (!strcmp(p[i], "reboot")) {
1252 					timeout_sysrq_char = 'b';
1253 				} else if (!strcmp(p[i], "crashdump")) {
1254 					timeout_sysrq_char = 'c';
1255 				} else if (!strcmp(p[i], "off")) {
1256 					timeout_sysrq_char = 'o';
1257 				} else if (!strcmp(p[i], "flush")) {
1258 					do_flush = true;
1259 				} else if (!strcmp(p[i], "noflush")) {
1260 					do_flush = false;
1261 				} else {
1262 					parse_error = true;
1263 				}
1264 				free(p[i]);
1265 			}
1266 			if (parse_error) {
1267 				fprintf(stderr, "Failed to parse timeout-action \"%s\".\n",
1268 					timeout_action);
1269 				exit_status = -1;
1270 				goto out;
1271 			}
1272 		}
1273 	
1274 	    if (strcmp(argv[optind], "watch") == 0) {
1275 	        value = get_env_option("SBD_SYNC_RESOURCE_STARTUP");
1276 	        sync_resource_startup =
1277 	            crm_is_true(value?value:SBD_SYNC_RESOURCE_STARTUP_DEFAULT);
1278 	
1279 	#if !USE_PACEMAKERD_API
1280 	        if (sync_resource_startup) {
1281 	            fprintf(stderr, "Failed to sync resource-startup as "
1282 	                "SBD was built against pacemaker not supporting pacemakerd-API.\n");
1283 	            exit_status = -1;
1284 	            goto out;
1285 	        }
1286 	#else
1287 	        if (check_pcmk && !sync_resource_startup) {
1288 	            cl_log(LOG_WARNING, "SBD built against pacemaker supporting "
1289 	                             "pacemakerd-API. Should think about enabling "
1290 	                             "SBD_SYNC_RESOURCE_STARTUP.");
1291 	
1292 	        } else if (!check_pcmk && sync_resource_startup) {
1293 	            fprintf(stderr, "Set SBD_PACEMAKER=yes to allow resource startup syncing. "
1294 	                    "Otherwise explicitly set SBD_SYNC_RESOURCE_STARTUP=no if to intentionally disable.\n");
1295 	            exit_status = -1;
1296 	            goto out;
1297 	        }
1298 	#endif
1299 	    }
1300 	
1301 	#if SUPPORT_SHARED_DISK
1302 		if (strcmp(argv[optind], "create") == 0) {
1303 			exit_status = init_devices(servants_leader);
1304 	
1305 	        } else if (strcmp(argv[optind], "dump") == 0) {
1306 			exit_status = dump_headers(servants_leader);
1307 	
1308 	        } else if (strcmp(argv[optind], "allocate") == 0) {
1309 	            exit_status = allocate_slots(argv[optind + 1], servants_leader);
1310 	
1311 	        } else if (strcmp(argv[optind], "list") == 0) {
1312 			exit_status = list_slots(servants_leader);
1313 	
1314 	        } else if (strcmp(argv[optind], "message") == 0) {
1315 	            exit_status = messenger(argv[optind + 1], argv[optind + 2], servants_leader);
1316 	
1317 	        } else if (strcmp(argv[optind], "ping") == 0) {
1318 	            exit_status = ping_via_slots(argv[optind + 1], servants_leader);
1319 	
1320 	        } else
1321 	#endif
1322 	        if (strcmp(argv[optind], "query-watchdog") == 0) {
1323 	            exit_status = watchdog_info();
1324 	        } else if (strcmp(argv[optind], "test-watchdog") == 0) {
1325 	            exit_status = watchdog_test();
1326 	        } else if (strcmp(argv[optind], "watch") == 0) {
1327 	            /* sleep $(sbd $SBD_DEVICE_ARGS dump | grep -m 1 msgwait | awk '{print $4}') 2>/dev/null */
1328 	
1329 	                const char *delay_source = delay ? "SBD_DELAY_START" : "";
1330 	
1331 	#if SUPPORT_SHARED_DISK
1332 	                if(disk_count > 0) {
1333 	                    /* If no devices are specified, its not an error to be unable to find one */
1334 	                    open_any_device(servants_leader);
1335 	
1336 	                    if (delay_start && delay <= 0) {
1337 	                        delay = get_first_msgwait(servants_leader);
1338 	
1339 	                        if (delay > 0) {
1340 	                            delay_source = "msgwait";
1341 	                        } else {
1342 	                            cl_log(LOG_WARNING, "No 'msgwait' value from disk, using '2 * watchdog-timeout' for 'delay' starting");
1343 	                        }
1344 	                    }
1345 	                }
1346 	#endif
1347 	                /* Re-calculate timeout_watchdog_warn based on any timeout_watchdog from:
1348 	                 * SBD_WATCHDOG_TIMEOUT, -1 option or on-disk setting read with open_any_device() */
1349 	                if (do_calculate_timeout_watchdog_warn) {
1350 	                    timeout_watchdog_warn = calculate_timeout_watchdog_warn(timeout_watchdog);
1351 	                }
1352 	
1353 	                if (delay_start) {
1354 	                    /* diskless mode or disk read issues causing get_first_msgwait() to return a 0 for delay */
1355 	                    if (delay <= 0) {
1356 	                        delay = 2 * timeout_watchdog;
1357 	                        delay_source = "watchdog-timeout * 2";
1358 	                    }
1359 	
1360 	                    cl_log(LOG_DEBUG, "Delay start (yes), (delay: %ld), (delay source: %s)", delay, delay_source);
1361 	
1362 	                    sleep((unsigned long) delay);
1363 	
1364 	                } else {
1365 	                    cl_log(LOG_DEBUG, "Delay start (no)");
1366 	                }
1367 	
1368 	                /* We only want this to have an effect during watch right now;
1369 	                 * pinging and fencing would be too confused */
1370 	                cl_log(LOG_INFO, "Turning on pacemaker checks: %d", check_pcmk);
1371 	                if (check_pcmk) {
1372 	                        recruit_servant("pcmk", 0);
1373 	#if SUPPORT_PLUGIN
1374 	                        check_cluster = 1;
1375 	#endif
1376 	                }
1377 	
1378 	                cl_log(LOG_INFO, "Turning on cluster checks: %d", check_cluster);
1379 	                if (check_cluster) {
1380 	                        recruit_servant("cluster", 0);
1381 	                }
1382 	
1383 	                cl_log(LOG_NOTICE, "%s flush + write \'%c\' to sysrq in case of timeout",
1384 	                       do_flush?"Do":"Skip", timeout_sysrq_char);
1385 	                exit_status = inquisitor();
1386 	        } else {
1387 	            exit_status = -2;
1388 	        }
1389 	        
1390 	  out:
1391 		if (timeout_action) {
1392 					free(timeout_action);
1393 		}
1394 		if (exit_status < 0) {
1395 			if (exit_status == -2) {
1396 				usage();
1397 			} else {
1398 				fprintf(stderr, "sbd failed; please check the logs.\n");
1399 			}
1400 			return (1);
1401 		}
1402 		return (0);
1403 	}
1404