1 /*
2 * Copyright 2004-2022 the Pacemaker project contributors
3 *
4 * The version control history for this file may have further details.
5 *
6 * This source code is licensed under the GNU General Public License version 2
7 * or later (GPLv2+) WITHOUT ANY WARRANTY.
8 */
9
10 #include <crm_internal.h>
11
12 #include <time.h>
13 #include <stdlib.h>
14
15 #include <crm/crm.h>
16 #include <crm/msg_xml.h>
17 #include <pacemaker-controld.h>
18
//! FSA mainloop timer type
typedef struct fsa_timer_s {
    guint source_id;                        //!< Mainloop source ID of the running timer (0 if not running)
    guint period_ms;                        //!< Timer period in milliseconds (a timer with period 0 is never started)
    enum crmd_fsa_input fsa_input;          //!< Input to register if timer pops
    gboolean (*callback) (gpointer data);   //!< What to do if timer pops
    bool log_error;                         //!< Timer popping indicates error (logged at error level)
    int counter;                            //!< Pop count for detecting loops (incremented only when log_error is false)
} fsa_timer_t;
28
//! Wait before retrying a failed cib or executor connection
static fsa_timer_t *wait_timer = NULL;

//! Periodically re-run scheduler (for date_spec evaluation and as a failsafe)
static fsa_timer_t *recheck_timer = NULL;

//! Wait at start-up, or after an election, for DC to make contact
static fsa_timer_t *election_timer = NULL;

//! Delay start of new transition with expectation something else might happen
static fsa_timer_t *transition_timer = NULL;

//! join-integration-timeout
static fsa_timer_t *integration_timer = NULL;

//! join-finalization-timeout
static fsa_timer_t *finalization_timer = NULL;

//! Wait for DC to stop all resources and give us the all-clear to shut down
// NOTE(review): unlike every other timer here, this one is not static, so it
// has external linkage -- confirm whether any other file still refers to it
fsa_timer_t *shutdown_escalation_timer = NULL;

//! Cluster recheck interval in milliseconds (from configuration)
static guint recheck_interval_ms = 0;
52
53 static const char *
54 get_timer_desc(fsa_timer_t * timer)
55 {
56 if (timer == election_timer) {
57 return "Election Trigger";
58
59 } else if (timer == shutdown_escalation_timer) {
60 return "Shutdown Escalation";
61
62 } else if (timer == integration_timer) {
63 return "Integration Timer";
64
65 } else if (timer == finalization_timer) {
66 return "Finalization Timer";
67
68 } else if (timer == transition_timer) {
69 return "New Transition Timer";
70
71 } else if (timer == wait_timer) {
72 return "Wait Timer";
73
74 } else if (timer == recheck_timer) {
75 return "Cluster Recheck Timer";
76
77 }
78 return "Unknown Timer";
79 }
80
81 /*!
82 * \internal
83 * \brief Stop an FSA timer
84 *
85 * \param[in,out] timer Timer to stop
86 *
87 * \return true if the timer was running, or false otherwise
88 */
89 static bool
90 controld_stop_timer(fsa_timer_t *timer)
91 {
92 CRM_CHECK(timer != NULL, return false);
93
94 if (timer->source_id != 0) {
95 crm_trace("Stopping %s (would inject %s if popped after %ums, src=%d)",
96 get_timer_desc(timer), fsa_input2string(timer->fsa_input),
97 timer->period_ms, timer->source_id);
98 g_source_remove(timer->source_id);
99 timer->source_id = 0;
100
101 } else {
102 crm_trace("%s already stopped (would inject %s if popped after %ums)",
103 get_timer_desc(timer), fsa_input2string(timer->fsa_input),
104 timer->period_ms);
105 return false;
106 }
107 return true;
108 }
109
110 /*!
111 * \internal
112 * \brief Start an FSA timer
113 *
114 * \param[in,out] timer Timer to start
115 */
116 static void
117 controld_start_timer(fsa_timer_t *timer)
118 {
119 if (timer->source_id == 0 && timer->period_ms > 0) {
120 timer->source_id = g_timeout_add(timer->period_ms, timer->callback, (void *)timer);
121 CRM_ASSERT(timer->source_id != 0);
122 crm_debug("Started %s (inject %s if pops after %ums, source=%d)",
123 get_timer_desc(timer), fsa_input2string(timer->fsa_input),
124 timer->period_ms, timer->source_id);
125 } else {
126 crm_debug("%s already running (inject %s if pops after %ums, source=%d)",
127 get_timer_desc(timer), fsa_input2string(timer->fsa_input),
128 timer->period_ms, timer->source_id);
129 }
130 }
131
132 /* A_DC_TIMER_STOP, A_DC_TIMER_START,
133 * A_FINALIZE_TIMER_STOP, A_FINALIZE_TIMER_START
134 * A_INTEGRATE_TIMER_STOP, A_INTEGRATE_TIMER_START
135 */
136 void
137 do_timer_control(long long action,
138 enum crmd_fsa_cause cause,
139 enum crmd_fsa_state cur_state,
140 enum crmd_fsa_input current_input, fsa_data_t * msg_data)
141 {
142 gboolean timer_op_ok = TRUE;
143
144 if (action & A_DC_TIMER_STOP) {
145 timer_op_ok = controld_stop_timer(election_timer);
146
147 } else if (action & A_FINALIZE_TIMER_STOP) {
148 timer_op_ok = controld_stop_timer(finalization_timer);
149
150 } else if (action & A_INTEGRATE_TIMER_STOP) {
151 timer_op_ok = controld_stop_timer(integration_timer);
152 }
153
154 /* don't start a timer that wasn't already running */
155 if (action & A_DC_TIMER_START && timer_op_ok) {
156 controld_start_timer(election_timer);
157 if (AM_I_DC) {
158 /* there can be only one */
159 register_fsa_input(cause, I_ELECTION, NULL);
160 }
161
162 } else if (action & A_FINALIZE_TIMER_START) {
163 controld_start_timer(finalization_timer);
164
165 } else if (action & A_INTEGRATE_TIMER_START) {
166 controld_start_timer(integration_timer);
167 }
168 }
169
170 static gboolean
171 crm_timer_popped(gpointer data)
172 {
173 fsa_timer_t *timer = (fsa_timer_t *) data;
174
175 if (timer->log_error) {
176 crm_err("%s just popped in state %s! " CRM_XS " input=%s time=%ums",
177 get_timer_desc(timer),
178 fsa_state2string(controld_globals.fsa_state),
179 fsa_input2string(timer->fsa_input), timer->period_ms);
180 } else {
181 crm_info("%s just popped " CRM_XS " input=%s time=%ums",
182 get_timer_desc(timer), fsa_input2string(timer->fsa_input),
183 timer->period_ms);
184 timer->counter++;
185 }
186
187 if ((timer == election_timer) && (election_timer->counter > 5)) {
188 crm_notice("We appear to be in an election loop, something may be wrong");
189 crm_write_blackbox(0, NULL);
190 election_timer->counter = 0;
191 }
192
193 controld_stop_timer(timer); // Make timer _not_ go off again
194
195 if (timer->fsa_input == I_INTEGRATED) {
196 crm_info("Welcomed: %d, Integrated: %d",
197 crmd_join_phase_count(crm_join_welcomed),
198 crmd_join_phase_count(crm_join_integrated));
199 if (crmd_join_phase_count(crm_join_welcomed) == 0) {
200 // If we don't even have ourselves, start again
201 register_fsa_error_adv(C_FSA_INTERNAL, I_ELECTION, NULL, NULL,
202 __func__);
203
204 } else {
205 register_fsa_input_before(C_TIMER_POPPED, timer->fsa_input, NULL);
206 }
207
208 } else if ((timer == recheck_timer)
209 && (controld_globals.fsa_state != S_IDLE)) {
210 crm_debug("Discarding %s event in state: %s",
211 fsa_input2string(timer->fsa_input),
212 fsa_state2string(controld_globals.fsa_state));
213
214 } else if ((timer == finalization_timer)
215 && (controld_globals.fsa_state != S_FINALIZE_JOIN)) {
216 crm_debug("Discarding %s event in state: %s",
217 fsa_input2string(timer->fsa_input),
218 fsa_state2string(controld_globals.fsa_state));
219
220 } else if (timer->fsa_input != I_NULL) {
221 register_fsa_input(C_TIMER_POPPED, timer->fsa_input, NULL);
222 }
223
224 controld_trigger_fsa();
225
226 return TRUE;
227 }
228
229 bool
230 controld_init_fsa_timers(void)
231 {
232 transition_timer = calloc(1, sizeof(fsa_timer_t));
233 if (transition_timer == NULL) {
234 return FALSE;
235 }
236
237 integration_timer = calloc(1, sizeof(fsa_timer_t));
238 if (integration_timer == NULL) {
239 return FALSE;
240 }
241
242 finalization_timer = calloc(1, sizeof(fsa_timer_t));
243 if (finalization_timer == NULL) {
244 return FALSE;
245 }
246
247 election_timer = calloc(1, sizeof(fsa_timer_t));
248 if (election_timer == NULL) {
249 return FALSE;
250 }
251
252 shutdown_escalation_timer = calloc(1, sizeof(fsa_timer_t));
253 if (shutdown_escalation_timer == NULL) {
254 return FALSE;
255 }
256
257 wait_timer = calloc(1, sizeof(fsa_timer_t));
258 if (wait_timer == NULL) {
259 return FALSE;
260 }
261
262 recheck_timer = calloc(1, sizeof(fsa_timer_t));
263 if (recheck_timer == NULL) {
264 return FALSE;
265 }
266
267 election_timer->source_id = 0;
268 election_timer->period_ms = 0;
269 election_timer->fsa_input = I_DC_TIMEOUT;
270 election_timer->callback = crm_timer_popped;
271 election_timer->log_error = FALSE;
272
273 transition_timer->source_id = 0;
274 transition_timer->period_ms = 0;
275 transition_timer->fsa_input = I_PE_CALC;
276 transition_timer->callback = crm_timer_popped;
277 transition_timer->log_error = FALSE;
278
279 integration_timer->source_id = 0;
280 integration_timer->period_ms = 0;
281 integration_timer->fsa_input = I_INTEGRATED;
282 integration_timer->callback = crm_timer_popped;
283 integration_timer->log_error = TRUE;
284
285 finalization_timer->source_id = 0;
286 finalization_timer->period_ms = 0;
287 finalization_timer->fsa_input = I_FINALIZED;
288 finalization_timer->callback = crm_timer_popped;
289 finalization_timer->log_error = FALSE;
290
291 /* We can't use I_FINALIZED here, because that creates a bug in the join
292 * process where a joining node can be stuck in S_PENDING while we think it
293 * is in S_NOT_DC. This created an infinite transition loop in which we
294 * continually send probes which the node NACKs because it's pending.
295 *
296 * If we have nodes where the cluster layer is active but the controller is
297 * not, we can avoid this causing an election/join loop, in the integration
298 * phase.
299 */
300 finalization_timer->fsa_input = I_ELECTION;
301
302 shutdown_escalation_timer->source_id = 0;
303 shutdown_escalation_timer->period_ms = 0;
304 shutdown_escalation_timer->fsa_input = I_STOP;
305 shutdown_escalation_timer->callback = crm_timer_popped;
306 shutdown_escalation_timer->log_error = TRUE;
307
308 wait_timer->source_id = 0;
309 wait_timer->period_ms = 2000;
310 wait_timer->fsa_input = I_NULL;
311 wait_timer->callback = crm_timer_popped;
312 wait_timer->log_error = FALSE;
313
314 recheck_timer->source_id = 0;
315 recheck_timer->period_ms = 0;
316 recheck_timer->fsa_input = I_PE_CALC;
317 recheck_timer->callback = crm_timer_popped;
318 recheck_timer->log_error = FALSE;
319
320 return TRUE;
321 }
322
323 /*!
324 * \internal
325 * \brief Configure timers based on the CIB
326 *
327 * \param[in,out] options Name/value pairs for configured options
328 */
329 void
330 controld_configure_fsa_timers(GHashTable *options)
331 {
332 const char *value = NULL;
333
334 // Election timer
335 value = g_hash_table_lookup(options, XML_CONFIG_ATTR_DC_DEADTIME);
336 election_timer->period_ms = crm_parse_interval_spec(value);
337
338 // Integration timer
339 value = g_hash_table_lookup(options, "join-integration-timeout");
340 integration_timer->period_ms = crm_parse_interval_spec(value);
341
342 // Finalization timer
343 value = g_hash_table_lookup(options, "join-finalization-timeout");
344 finalization_timer->period_ms = crm_parse_interval_spec(value);
345
346 // Shutdown escalation timer
347 value = g_hash_table_lookup(options, XML_CONFIG_ATTR_FORCE_QUIT);
348 shutdown_escalation_timer->period_ms = crm_parse_interval_spec(value);
349 crm_debug("Shutdown escalation occurs if DC has not responded to request "
350 "in %ums", shutdown_escalation_timer->period_ms);
351
352 // Transition timer
353 value = g_hash_table_lookup(options, "transition-delay");
354 transition_timer->period_ms = crm_parse_interval_spec(value);
355
356 // Recheck interval
357 value = g_hash_table_lookup(options, XML_CONFIG_ATTR_RECHECK);
358 recheck_interval_ms = crm_parse_interval_spec(value);
359 crm_debug("Re-run scheduler after %dms of inactivity", recheck_interval_ms);
360 }
361
362 void
363 controld_free_fsa_timers(void)
364 {
365 controld_stop_timer(transition_timer);
366 controld_stop_timer(integration_timer);
367 controld_stop_timer(finalization_timer);
368 controld_stop_timer(election_timer);
369 controld_stop_timer(shutdown_escalation_timer);
370 controld_stop_timer(wait_timer);
371 controld_stop_timer(recheck_timer);
372
373 free(transition_timer); transition_timer = NULL;
374 free(integration_timer); integration_timer = NULL;
375 free(finalization_timer); finalization_timer = NULL;
376 free(election_timer); election_timer = NULL;
377 free(shutdown_escalation_timer); shutdown_escalation_timer = NULL;
378 free(wait_timer); wait_timer = NULL;
379 free(recheck_timer); recheck_timer = NULL;
380 }
381
382 /*!
383 * \internal
384 * \brief Check whether the transition timer is started
385 * \return true if the transition timer is started, or false otherwise
386 */
387 bool
388 controld_is_started_transition_timer(void)
389 {
390 return (transition_timer->period_ms > 0)
391 && (transition_timer->source_id != 0);
392 }
393
394 /*!
395 * \internal
396 * \brief Start the recheck timer
397 */
398 void
399 controld_start_recheck_timer(void)
400 {
401 // Default to recheck interval configured in CIB (if any)
402 guint period_ms = recheck_interval_ms;
403
404 // If scheduler supplied a "recheck by" time, check whether that's sooner
405 if (controld_globals.transition_graph->recheck_by > 0) {
406 time_t diff_seconds = controld_globals.transition_graph->recheck_by
407 - time(NULL);
408
409 if (diff_seconds < 1) {
410 // We're already past the desired time
411 period_ms = 500;
412 } else {
|
CID (unavailable; MK=16033bdc87f0ca68b9c8f070d4eb9763) (#1 of 1): Use of 32-bit time_t (Y2K38_SAFETY): |
|
(1) Event store_truncates_time_t: |
A "time_t" value is stored in an integer with too few bits to accommodate it. The expression "diff_seconds" is cast to "guint". |
413 period_ms = (guint) diff_seconds * 1000;
414 }
415
416 // Use "recheck by" only if it's sooner than interval from CIB
417 if (period_ms > recheck_interval_ms) {
418 period_ms = recheck_interval_ms;
419 }
420 }
421
422 if (period_ms > 0) {
423 recheck_timer->period_ms = period_ms;
424 controld_start_timer(recheck_timer);
425 }
426 }
427
428 /*!
429 * \internal
430 * \brief Start the wait timer
431 */
432 void
433 controld_start_wait_timer(void)
434 {
435 controld_start_timer(wait_timer);
436 }
437
438 /*!
439 * \internal
440 * \brief Stop the recheck timer
441 *
442 * \return true if the recheck timer was running, or false otherwise
443 */
444 bool
445 controld_stop_recheck_timer(void)
446 {
447 return controld_stop_timer(recheck_timer);
448 }
449
450 /*!
451 * \brief Get the transition timer's configured period
452 * \return The transition_timer's period
453 */
454 guint
455 controld_get_period_transition_timer(void)
456 {
457 return transition_timer->period_ms;
458 }
459
460 /*!
461 * \internal
462 * \brief Reset the election timer's counter to 0
463 */
464 void
465 controld_reset_counter_election_timer(void)
466 {
467 election_timer->counter = 0;
468 }
469
470 /*!
471 * \internal
472 * \brief Stop the transition timer
473 *
474 * \return true if the transition timer was running, or false otherwise
475 */
476 bool
477 controld_stop_transition_timer(void)
478 {
479 return controld_stop_timer(transition_timer);
480 }
481
482 /*!
483 * \internal
484 * \brief Start the transition timer
485 */
486 void
487 controld_start_transition_timer(void)
488 {
489 controld_start_timer(transition_timer);
490 }
491
492 /*!
493 * \internal
494 * \brief Start the countdown sequence for a shutdown
495 *
496 * \param[in] default_period_ms Period to use if the shutdown escalation
497 * timer's period is 0
498 */
499 void
500 controld_shutdown_start_countdown(guint default_period_ms)
501 {
502 if (shutdown_escalation_timer->period_ms == 0) {
503 shutdown_escalation_timer->period_ms = default_period_ms;
504 }
505
506 crm_notice("Initiating controller shutdown sequence " CRM_XS " limit=%ums",
507 shutdown_escalation_timer->period_ms);
508 controld_start_timer(shutdown_escalation_timer);
509 }
510