1    	/*
2    	 * Copyright 2004-2022 the Pacemaker project contributors
3    	 *
4    	 * The version control history for this file may have further details.
5    	 *
6    	 * This source code is licensed under the GNU General Public License version 2
7    	 * or later (GPLv2+) WITHOUT ANY WARRANTY.
8    	 */
9    	
10   	#include <crm_internal.h>
11   	
12   	#include <time.h>
13   	#include <stdlib.h>
14   	
15   	#include <crm/crm.h>
16   	#include <crm/msg_xml.h>
17   	#include <pacemaker-controld.h>
18   	
//! FSA mainloop timer type
typedef struct fsa_timer_s {
    guint source_id;                        //!< Mainloop source ID while started, or 0 when stopped
    guint period_ms;                        //!< Timer period in milliseconds (0 means the timer will not be started)
    enum crmd_fsa_input fsa_input;          //!< FSA input to register if timer pops
    gboolean (*callback) (gpointer data);   //!< Function to call when timer pops
    bool log_error;                         //!< Whether timer popping is logged as an error
    int counter;                            //!< Count of non-error pops (used to detect election loops)
} fsa_timer_t;
28   	
//! Wait before retrying a failed CIB or executor connection
static fsa_timer_t *wait_timer = NULL;

//! Periodically re-run scheduler (for date_spec evaluation and as a failsafe)
static fsa_timer_t *recheck_timer = NULL;

//! Wait at start-up, or after an election, for DC to make contact
static fsa_timer_t *election_timer = NULL;

//! Delay start of new transition with expectation something else might happen
static fsa_timer_t *transition_timer = NULL;

//! Limit on the join integration phase (join-integration-timeout option)
static fsa_timer_t *integration_timer = NULL;

//! Limit on the join finalization phase (join-finalization-timeout option)
static fsa_timer_t *finalization_timer = NULL;

//! Wait for DC to stop all resources and give us the all-clear to shut down
// NOTE(review): unlike the other timers this one is not static -- confirm
// whether anything outside this file still needs external linkage
fsa_timer_t *shutdown_escalation_timer = NULL;

//! Cluster recheck interval in milliseconds (from configuration)
static guint recheck_interval_ms = 0;
52   	
53   	static const char *
54   	get_timer_desc(fsa_timer_t * timer)
55   	{
56   	    if (timer == election_timer) {
57   	        return "Election Trigger";
58   	
59   	    } else if (timer == shutdown_escalation_timer) {
60   	        return "Shutdown Escalation";
61   	
62   	    } else if (timer == integration_timer) {
63   	        return "Integration Timer";
64   	
65   	    } else if (timer == finalization_timer) {
66   	        return "Finalization Timer";
67   	
68   	    } else if (timer == transition_timer) {
69   	        return "New Transition Timer";
70   	
71   	    } else if (timer == wait_timer) {
72   	        return "Wait Timer";
73   	
74   	    } else if (timer == recheck_timer) {
75   	        return "Cluster Recheck Timer";
76   	
77   	    }
78   	    return "Unknown Timer";
79   	}
80   	
81   	/*!
82   	 * \internal
83   	 * \brief Stop an FSA timer
84   	 *
85   	 * \param[in,out] timer  Timer to stop
86   	 *
87   	 * \return true if the timer was running, or false otherwise
88   	 */
89   	static bool
90   	controld_stop_timer(fsa_timer_t *timer)
91   	{
92   	    CRM_CHECK(timer != NULL, return false);
93   	
94   	    if (timer->source_id != 0) {
95   	        crm_trace("Stopping %s (would inject %s if popped after %ums, src=%d)",
96   	                  get_timer_desc(timer), fsa_input2string(timer->fsa_input),
97   	                  timer->period_ms, timer->source_id);
98   	        g_source_remove(timer->source_id);
99   	        timer->source_id = 0;
100  	
101  	    } else {
102  	        crm_trace("%s already stopped (would inject %s if popped after %ums)",
103  	                  get_timer_desc(timer), fsa_input2string(timer->fsa_input),
104  	                  timer->period_ms);
105  	        return false;
106  	    }
107  	    return true;
108  	}
109  	
110  	/*!
111  	 * \internal
112  	 * \brief Start an FSA timer
113  	 *
114  	 * \param[in,out] timer  Timer to start
115  	 */
116  	static void
117  	controld_start_timer(fsa_timer_t *timer)
118  	{
119  	    if (timer->source_id == 0 && timer->period_ms > 0) {
120  	        timer->source_id = g_timeout_add(timer->period_ms, timer->callback, (void *)timer);
121  	        CRM_ASSERT(timer->source_id != 0);
122  	        crm_debug("Started %s (inject %s if pops after %ums, source=%d)",
123  	                  get_timer_desc(timer), fsa_input2string(timer->fsa_input),
124  	                  timer->period_ms, timer->source_id);
125  	    } else {
126  	        crm_debug("%s already running (inject %s if pops after %ums, source=%d)",
127  	                  get_timer_desc(timer), fsa_input2string(timer->fsa_input),
128  	                  timer->period_ms, timer->source_id);
129  	    }
130  	}
131  	
132  	/*	A_DC_TIMER_STOP, A_DC_TIMER_START,
133  	 *	A_FINALIZE_TIMER_STOP, A_FINALIZE_TIMER_START
134  	 *	A_INTEGRATE_TIMER_STOP, A_INTEGRATE_TIMER_START
135  	 */
136  	void
137  	do_timer_control(long long action,
138  	                 enum crmd_fsa_cause cause,
139  	                 enum crmd_fsa_state cur_state,
140  	                 enum crmd_fsa_input current_input, fsa_data_t * msg_data)
141  	{
142  	    gboolean timer_op_ok = TRUE;
143  	
144  	    if (action & A_DC_TIMER_STOP) {
145  	        timer_op_ok = controld_stop_timer(election_timer);
146  	
147  	    } else if (action & A_FINALIZE_TIMER_STOP) {
148  	        timer_op_ok = controld_stop_timer(finalization_timer);
149  	
150  	    } else if (action & A_INTEGRATE_TIMER_STOP) {
151  	        timer_op_ok = controld_stop_timer(integration_timer);
152  	    }
153  	
154  	    /* don't start a timer that wasn't already running */
155  	    if (action & A_DC_TIMER_START && timer_op_ok) {
156  	        controld_start_timer(election_timer);
157  	        if (AM_I_DC) {
158  	            /* there can be only one */
159  	            register_fsa_input(cause, I_ELECTION, NULL);
160  	        }
161  	
162  	    } else if (action & A_FINALIZE_TIMER_START) {
163  	        controld_start_timer(finalization_timer);
164  	
165  	    } else if (action & A_INTEGRATE_TIMER_START) {
166  	        controld_start_timer(integration_timer);
167  	    }
168  	}
169  	
170  	static gboolean
171  	crm_timer_popped(gpointer data)
172  	{
173  	    fsa_timer_t *timer = (fsa_timer_t *) data;
174  	
175  	    if (timer->log_error) {
176  	        crm_err("%s just popped in state %s! " CRM_XS " input=%s time=%ums",
177  	                get_timer_desc(timer),
178  	                fsa_state2string(controld_globals.fsa_state),
179  	                fsa_input2string(timer->fsa_input), timer->period_ms);
180  	    } else {
181  	        crm_info("%s just popped " CRM_XS " input=%s time=%ums",
182  	                 get_timer_desc(timer), fsa_input2string(timer->fsa_input),
183  	                 timer->period_ms);
184  	        timer->counter++;
185  	    }
186  	
187  	    if ((timer == election_timer) && (election_timer->counter > 5)) {
188  	        crm_notice("We appear to be in an election loop, something may be wrong");
189  	        crm_write_blackbox(0, NULL);
190  	        election_timer->counter = 0;
191  	    }
192  	
193  	    controld_stop_timer(timer);  // Make timer _not_ go off again
194  	
195  	    if (timer->fsa_input == I_INTEGRATED) {
196  	        crm_info("Welcomed: %d, Integrated: %d",
197  	                 crmd_join_phase_count(crm_join_welcomed),
198  	                 crmd_join_phase_count(crm_join_integrated));
199  	        if (crmd_join_phase_count(crm_join_welcomed) == 0) {
200  	            // If we don't even have ourselves, start again
201  	            register_fsa_error_adv(C_FSA_INTERNAL, I_ELECTION, NULL, NULL,
202  	                                   __func__);
203  	
204  	        } else {
205  	            register_fsa_input_before(C_TIMER_POPPED, timer->fsa_input, NULL);
206  	        }
207  	
208  	    } else if ((timer == recheck_timer)
209  	               && (controld_globals.fsa_state != S_IDLE)) {
210  	        crm_debug("Discarding %s event in state: %s",
211  	                  fsa_input2string(timer->fsa_input),
212  	                  fsa_state2string(controld_globals.fsa_state));
213  	
214  	    } else if ((timer == finalization_timer)
215  	               && (controld_globals.fsa_state != S_FINALIZE_JOIN)) {
216  	        crm_debug("Discarding %s event in state: %s",
217  	                  fsa_input2string(timer->fsa_input),
218  	                  fsa_state2string(controld_globals.fsa_state));
219  	
220  	    } else if (timer->fsa_input != I_NULL) {
221  	        register_fsa_input(C_TIMER_POPPED, timer->fsa_input, NULL);
222  	    }
223  	
224  	    controld_trigger_fsa();
225  	
226  	    return TRUE;
227  	}
228  	
229  	bool
230  	controld_init_fsa_timers(void)
231  	{
232  	    transition_timer = calloc(1, sizeof(fsa_timer_t));
233  	    if (transition_timer == NULL) {
234  	        return FALSE;
235  	    }
236  	
237  	    integration_timer = calloc(1, sizeof(fsa_timer_t));
238  	    if (integration_timer == NULL) {
239  	        return FALSE;
240  	    }
241  	
242  	    finalization_timer = calloc(1, sizeof(fsa_timer_t));
243  	    if (finalization_timer == NULL) {
244  	        return FALSE;
245  	    }
246  	
247  	    election_timer = calloc(1, sizeof(fsa_timer_t));
248  	    if (election_timer == NULL) {
249  	        return FALSE;
250  	    }
251  	
252  	    shutdown_escalation_timer = calloc(1, sizeof(fsa_timer_t));
253  	    if (shutdown_escalation_timer == NULL) {
254  	        return FALSE;
255  	    }
256  	
257  	    wait_timer = calloc(1, sizeof(fsa_timer_t));
258  	    if (wait_timer == NULL) {
259  	        return FALSE;
260  	    }
261  	
262  	    recheck_timer = calloc(1, sizeof(fsa_timer_t));
263  	    if (recheck_timer == NULL) {
264  	        return FALSE;
265  	    }
266  	
267  	    election_timer->source_id = 0;
268  	    election_timer->period_ms = 0;
269  	    election_timer->fsa_input = I_DC_TIMEOUT;
270  	    election_timer->callback = crm_timer_popped;
271  	    election_timer->log_error = FALSE;
272  	
273  	    transition_timer->source_id = 0;
274  	    transition_timer->period_ms = 0;
275  	    transition_timer->fsa_input = I_PE_CALC;
276  	    transition_timer->callback = crm_timer_popped;
277  	    transition_timer->log_error = FALSE;
278  	
279  	    integration_timer->source_id = 0;
280  	    integration_timer->period_ms = 0;
281  	    integration_timer->fsa_input = I_INTEGRATED;
282  	    integration_timer->callback = crm_timer_popped;
283  	    integration_timer->log_error = TRUE;
284  	
285  	    finalization_timer->source_id = 0;
286  	    finalization_timer->period_ms = 0;
287  	    finalization_timer->fsa_input = I_FINALIZED;
288  	    finalization_timer->callback = crm_timer_popped;
289  	    finalization_timer->log_error = FALSE;
290  	
291  	    /* We can't use I_FINALIZED here, because that creates a bug in the join
292  	     * process where a joining node can be stuck in S_PENDING while we think it
293  	     * is in S_NOT_DC. This created an infinite transition loop in which we
294  	     * continually send probes which the node NACKs because it's pending.
295  	     *
296  	     * If we have nodes where the cluster layer is active but the controller is
297  	     * not, we can avoid this causing an election/join loop, in the integration
298  	     * phase.
299  	     */
300  	    finalization_timer->fsa_input = I_ELECTION;
301  	
302  	    shutdown_escalation_timer->source_id = 0;
303  	    shutdown_escalation_timer->period_ms = 0;
304  	    shutdown_escalation_timer->fsa_input = I_STOP;
305  	    shutdown_escalation_timer->callback = crm_timer_popped;
306  	    shutdown_escalation_timer->log_error = TRUE;
307  	
308  	    wait_timer->source_id = 0;
309  	    wait_timer->period_ms = 2000;
310  	    wait_timer->fsa_input = I_NULL;
311  	    wait_timer->callback = crm_timer_popped;
312  	    wait_timer->log_error = FALSE;
313  	
314  	    recheck_timer->source_id = 0;
315  	    recheck_timer->period_ms = 0;
316  	    recheck_timer->fsa_input = I_PE_CALC;
317  	    recheck_timer->callback = crm_timer_popped;
318  	    recheck_timer->log_error = FALSE;
319  	
320  	    return TRUE;
321  	}
322  	
323  	/*!
324  	 * \internal
325  	 * \brief Configure timers based on the CIB
326  	 *
327  	 * \param[in,out] options  Name/value pairs for configured options
328  	 */
329  	void
330  	controld_configure_fsa_timers(GHashTable *options)
331  	{
332  	    const char *value = NULL;
333  	
334  	    // Election timer
335  	    value = g_hash_table_lookup(options, XML_CONFIG_ATTR_DC_DEADTIME);
336  	    election_timer->period_ms = crm_parse_interval_spec(value);
337  	
338  	    // Integration timer
339  	    value = g_hash_table_lookup(options, "join-integration-timeout");
340  	    integration_timer->period_ms = crm_parse_interval_spec(value);
341  	
342  	    // Finalization timer
343  	    value = g_hash_table_lookup(options, "join-finalization-timeout");
344  	    finalization_timer->period_ms = crm_parse_interval_spec(value);
345  	
346  	    // Shutdown escalation timer
347  	    value = g_hash_table_lookup(options, XML_CONFIG_ATTR_FORCE_QUIT);
348  	    shutdown_escalation_timer->period_ms = crm_parse_interval_spec(value);
349  	    crm_debug("Shutdown escalation occurs if DC has not responded to request "
350  	              "in %ums", shutdown_escalation_timer->period_ms);
351  	
352  	    // Transition timer
353  	    value = g_hash_table_lookup(options, "transition-delay");
354  	    transition_timer->period_ms = crm_parse_interval_spec(value);
355  	
356  	    // Recheck interval
357  	    value = g_hash_table_lookup(options, XML_CONFIG_ATTR_RECHECK);
358  	    recheck_interval_ms = crm_parse_interval_spec(value);
359  	    crm_debug("Re-run scheduler after %dms of inactivity", recheck_interval_ms);
360  	}
361  	
362  	void
363  	controld_free_fsa_timers(void)
364  	{
365  	    controld_stop_timer(transition_timer);
366  	    controld_stop_timer(integration_timer);
367  	    controld_stop_timer(finalization_timer);
368  	    controld_stop_timer(election_timer);
369  	    controld_stop_timer(shutdown_escalation_timer);
370  	    controld_stop_timer(wait_timer);
371  	    controld_stop_timer(recheck_timer);
372  	
373  	    free(transition_timer); transition_timer = NULL;
374  	    free(integration_timer); integration_timer = NULL;
375  	    free(finalization_timer); finalization_timer = NULL;
376  	    free(election_timer); election_timer = NULL;
377  	    free(shutdown_escalation_timer); shutdown_escalation_timer = NULL;
378  	    free(wait_timer); wait_timer = NULL;
379  	    free(recheck_timer); recheck_timer = NULL;
380  	}
381  	
382  	/*!
383  	 * \internal
384  	 * \brief Check whether the transition timer is started
385  	 * \return true if the transition timer is started, or false otherwise
386  	 */
387  	bool
388  	controld_is_started_transition_timer(void)
389  	{
390  	    return (transition_timer->period_ms > 0)
391  	           && (transition_timer->source_id != 0);
392  	}
393  	
394  	/*!
395  	 * \internal
396  	 * \brief Start the recheck timer
397  	 */
398  	void
399  	controld_start_recheck_timer(void)
400  	{
401  	    // Default to recheck interval configured in CIB (if any)
402  	    guint period_ms = recheck_interval_ms;
403  	
404  	    // If scheduler supplied a "recheck by" time, check whether that's sooner
405  	    if (controld_globals.transition_graph->recheck_by > 0) {
406  	        time_t diff_seconds = controld_globals.transition_graph->recheck_by
407  	                              - time(NULL);
408  	
409  	        if (diff_seconds < 1) {
410  	            // We're already past the desired time
411  	            period_ms = 500;
412  	        } else {
CID (unavailable; MK=16033bdc87f0ca68b9c8f070d4eb9763) (#1 of 1): Use of 32-bit time_t (Y2K38_SAFETY):
(1) Event store_truncates_time_t: A "time_t" value is stored in an integer with too few bits to accommodate it. The expression "diff_seconds" is cast to "guint".
413  	            period_ms = (guint) diff_seconds * 1000;
414  	        }
415  	
416  	        // Use "recheck by" only if it's sooner than interval from CIB
417  	        if (period_ms > recheck_interval_ms) {
418  	            period_ms = recheck_interval_ms;
419  	        }
420  	    }
421  	
422  	    if (period_ms > 0) {
423  	        recheck_timer->period_ms = period_ms;
424  	        controld_start_timer(recheck_timer);
425  	    }
426  	}
427  	
/*!
 * \internal
 * \brief Start the wait timer
 *
 * Thin wrapper that lets code outside this file start the file-local wait
 * timer via controld_start_timer().
 */
void
controld_start_wait_timer(void)
{
    controld_start_timer(wait_timer);
}
437  	
/*!
 * \internal
 * \brief Stop the recheck timer
 *
 * Thin wrapper around controld_stop_timer() for the file-local recheck timer.
 *
 * \return true if the recheck timer was running, or false otherwise
 */
bool
controld_stop_recheck_timer(void)
{
    return controld_stop_timer(recheck_timer);
}
449  	
/*!
 * \internal
 * \brief Get the transition timer's configured period
 *
 * \return The transition timer's period in milliseconds
 */
guint
controld_get_period_transition_timer(void)
{
    return transition_timer->period_ms;
}
459  	
/*!
 * \internal
 * \brief Reset the election timer's counter to 0
 *
 * The counter is what the timer callback checks to detect an apparent
 * election loop, so resetting it restarts that detection from scratch.
 */
void
controld_reset_counter_election_timer(void)
{
    election_timer->counter = 0;
}
469  	
/*!
 * \internal
 * \brief Stop the transition timer
 *
 * Thin wrapper around controld_stop_timer() for the file-local transition
 * timer.
 *
 * \return true if the transition timer was running, or false otherwise
 */
bool
controld_stop_transition_timer(void)
{
    return controld_stop_timer(transition_timer);
}
481  	
/*!
 * \internal
 * \brief Start the transition timer
 *
 * Thin wrapper around controld_start_timer() for the file-local transition
 * timer.
 */
void
controld_start_transition_timer(void)
{
    controld_start_timer(transition_timer);
}
491  	
492  	/*!
493  	 * \internal
494  	 * \brief Start the countdown sequence for a shutdown
495  	 *
496  	 * \param[in] default_period_ms  Period to use if the shutdown escalation
497  	 *                               timer's period is 0
498  	 */
499  	void
500  	controld_shutdown_start_countdown(guint default_period_ms)
501  	{
502  	    if (shutdown_escalation_timer->period_ms == 0) {
503  	        shutdown_escalation_timer->period_ms = default_period_ms;
504  	    }
505  	
506  	    crm_notice("Initiating controller shutdown sequence " CRM_XS " limit=%ums",
507  	               shutdown_escalation_timer->period_ms);
508  	    controld_start_timer(shutdown_escalation_timer);
509  	}
510