1 /*
2 * Copyright 2004-2026 the Pacemaker project contributors
3 *
4 * The version control history for this file may have further details.
5 *
6 * This source code is licensed under the GNU General Public License version 2
7 * or later (GPLv2+) WITHOUT ANY WARRANTY.
8 */
9
10 #include <crm_internal.h>
11
12 #include <time.h>
13 #include <stdbool.h>
14 #include <stdlib.h>
15 #include <glib.h>
16
17 #include <crm/crm.h>
18 #include <crm/common/xml.h>
19 #include <pacemaker-controld.h>
20
//! FSA mainloop timer type
typedef struct {
    guint source_id;                        //!< Timer source ID (0 if stopped)
    guint period_ms;                        //!< Timer period in milliseconds
    enum crmd_fsa_input fsa_input;          //!< Input to register if timer pops
    gboolean (*callback) (gpointer data);   //!< What to do if timer pops
    bool log_error;                         //!< Timer popping indicates error
    int counter;                            //!< For detecting loops
} fsa_timer_t;
30
//! Wait before retrying a failed CIB or executor connection
static fsa_timer_t *wait_timer = NULL;

//! Periodically re-run scheduler (for date_spec evaluation and as a failsafe)
static fsa_timer_t *recheck_timer = NULL;

//! Wait at start-up, or after an election, for DC to make contact
static fsa_timer_t *election_timer = NULL;

//! Delay start of new transition with expectation something else might happen
static fsa_timer_t *transition_timer = NULL;

//! Join integration timer (period is \c PCMK_OPT_JOIN_INTEGRATION_TIMEOUT)
static fsa_timer_t *integration_timer = NULL;

//! Join finalization timer (period is \c PCMK_OPT_JOIN_FINALIZATION_TIMEOUT)
static fsa_timer_t *finalization_timer = NULL;

/*! Wait for DC to stop all resources and give us the all-clear to shut down
 *
 * NOTE(review): This is the only timer without \c static, although its type is
 * private to this file. It could presumably be made static -- confirm that no
 * other file declares it before changing the linkage.
 */
fsa_timer_t *shutdown_escalation_timer = NULL;

//! Cluster recheck interval in milliseconds (from configuration)
static guint recheck_interval_ms = 0;
54
55 static const char *
56 get_timer_desc(fsa_timer_t * timer)
57 {
58 if (timer == election_timer) {
59 return "Election Trigger";
60
61 } else if (timer == shutdown_escalation_timer) {
62 return "Shutdown Escalation";
63
64 } else if (timer == integration_timer) {
65 return "Integration Timer";
66
67 } else if (timer == finalization_timer) {
68 return "Finalization Timer";
69
70 } else if (timer == transition_timer) {
71 return "New Transition Timer";
72
73 } else if (timer == wait_timer) {
74 return "Wait Timer";
75
76 } else if (timer == recheck_timer) {
77 return "Cluster Recheck Timer";
78
79 }
80 return "Unknown Timer";
81 }
82
83 /*!
84 * \internal
85 * \brief Stop an FSA timer
86 *
87 * \param[in,out] timer Timer to stop
88 *
89 * \return true if the timer was running, or false otherwise
90 */
91 static bool
92 controld_stop_timer(fsa_timer_t *timer)
93 {
94 CRM_CHECK(timer != NULL, return false);
95
96 if (timer->source_id != 0) {
97 pcmk__trace("Stopping %s (would inject %s if popped after %ums, "
98 "src=%d)",
99 get_timer_desc(timer), fsa_input2string(timer->fsa_input),
100 timer->period_ms, timer->source_id);
101 g_source_remove(timer->source_id);
102 timer->source_id = 0;
103 return true;
104 }
105
106 pcmk__trace("%s already stopped (would inject %s if popped after %ums)",
107 get_timer_desc(timer), fsa_input2string(timer->fsa_input),
108 timer->period_ms);
109 return false;
110 }
111
112 /*!
113 * \internal
114 * \brief Start an FSA timer
115 *
116 * \param[in,out] timer Timer to start
117 */
118 static void
119 controld_start_timer(fsa_timer_t *timer)
120 {
121 if (timer->source_id == 0 && timer->period_ms > 0) {
122 timer->source_id = pcmk__create_timer(timer->period_ms, timer->callback, timer);
123 pcmk__assert(timer->source_id != 0);
124 pcmk__debug("Started %s (inject %s if pops after %ums, source=%d)",
125 get_timer_desc(timer), fsa_input2string(timer->fsa_input),
126 timer->period_ms, timer->source_id);
127 } else {
128 pcmk__debug("%s already running (inject %s if pops after %ums, "
129 "source=%d)",
130 get_timer_desc(timer), fsa_input2string(timer->fsa_input),
131 timer->period_ms, timer->source_id);
132 }
133 }
134
/*!
 * \internal
 * \brief Stop and/or start the election, finalization, or integration timer
 *
 * Handles A_DC_TIMER_STOP, A_DC_TIMER_START,
 * A_FINALIZE_TIMER_STOP, A_FINALIZE_TIMER_START,
 * A_INTEGRATE_TIMER_STOP, and A_INTEGRATE_TIMER_START.
 *
 * The stop actions form one else-if chain and the start actions another, so at
 * most one timer is stopped and at most one timer is started per call.
 *
 * \param[in] action         Bit flags for the actions to perform
 * \param[in] cause          Cause to use if an election input is registered
 * \param[in] cur_state      Ignored
 * \param[in] current_input  Ignored
 * \param[in] msg_data       Ignored
 */
void
do_timer_control(long long action, enum crmd_fsa_cause cause,
                 enum crmd_fsa_state cur_state,
                 enum crmd_fsa_input current_input, fsa_data_t *msg_data)
{
    /* @FIXME It doesn't appear to make sense that we set timer_op_ok based on
     * stopping the finalization and integration timers. We check it only if
     * A_DC_TIMER_START is set.
     *
     * This behavior goes back to 7637ade9 in 2004 and looks like a bug. We
     * probably should do one of the following:
     * - Check timer_op_ok for finalization and integration timer starts.
     * - Don't set timer_op_ok for finalization and integration timer stops.
     *   This would prevent those results from affecting whether we start the DC
     *   timer.
     *
     * Related to the above, there should probably be some sort of check to
     * ensure that this function is not stopping one timer and starting a
     * different timer, unless that is expected behavior. Or we could have
     * separate handler functions for each timer. Otherwise, we could encounter
     * a situation where:
     * - We want to stop Timer A and start Timer B.
     * - Timer A is not running, so timer_op_ok gets set to false.
     * - We skip starting Timer B because Timer A was not running.
     *
     * This situation doesn't seem right. (Currently, "Timer B" could only be
     * the DC timer, since the other timer starts don't check timer_op_ok.)
     */
    bool timer_op_ok = true;

    // Stop phase: timer_op_ok records whether the stopped timer was running
    if (pcmk__is_set(action, A_DC_TIMER_STOP)) {
        timer_op_ok = controld_stop_timer(election_timer);

    } else if (pcmk__is_set(action, A_FINALIZE_TIMER_STOP)) {
        timer_op_ok = controld_stop_timer(finalization_timer);

    } else if (pcmk__is_set(action, A_INTEGRATE_TIMER_STOP)) {
        timer_op_ok = controld_stop_timer(integration_timer);
    }

    /* Start phase. Don't start a timer that wasn't already running. Note that
     * if A_DC_TIMER_START is set but timer_op_ok is false, control falls
     * through to the finalization/integration checks below.
     */
    if (pcmk__is_set(action, A_DC_TIMER_START) && timer_op_ok) {
        controld_start_timer(election_timer);
        if (AM_I_DC) {
            // Trigger an election to ensure there is only one DC
            controld_fsa_append(cause, I_ELECTION, NULL);
        }

    } else if (pcmk__is_set(action, A_FINALIZE_TIMER_START)) {
        controld_start_timer(finalization_timer);

    } else if (pcmk__is_set(action, A_INTEGRATE_TIMER_START)) {
        controld_start_timer(integration_timer);
    }
}
194
/*!
 * \internal
 * \brief Mainloop callback for when an FSA timer pops
 *
 * Log the event, stop the timer, register the timer's FSA input (unless it
 * should be discarded in the current FSA state), and trigger the FSA.
 *
 * \param[in,out] data  Timer that popped (an \c fsa_timer_t)
 *
 * \return TRUE (always)
 */
static gboolean
crm_timer_popped(gpointer data)
{
    fsa_timer_t *timer = (fsa_timer_t *) data;

    if (timer->log_error) {
        pcmk__err("%s just popped in state %s! " QB_XS " input=%s time=%ums",
                  get_timer_desc(timer),
                  fsa_state2string(controld_globals.fsa_state),
                  fsa_input2string(timer->fsa_input), timer->period_ms);
    } else {
        pcmk__info("%s just popped " QB_XS " input=%s time=%ums",
                   get_timer_desc(timer), fsa_input2string(timer->fsa_input),
                   timer->period_ms);

        // Pops are counted only for timers whose popping is not an error
        timer->counter++;
    }

    // More than 5 election timer pops since the last counter reset
    if ((timer == election_timer) && (election_timer->counter > 5)) {
        pcmk__notice("We appear to be in an election loop, something may be "
                     "wrong");
        crm_write_blackbox(0, NULL);
        election_timer->counter = 0;
    }

    controld_stop_timer(timer); // Make timer _not_ go off again

    if (timer->fsa_input == I_INTEGRATED) {
        pcmk__info("Welcomed: %d, Integrated: %d",
                   crmd_join_phase_count(controld_join_welcomed),
                   crmd_join_phase_count(controld_join_integrated));
        if (crmd_join_phase_count(controld_join_welcomed) == 0) {
            // If we don't even have ourselves, start again
            register_fsa_error(I_ELECTION, NULL);

        } else {
            // Unlike the other inputs below, this one is prepended
            controld_fsa_prepend(C_TIMER_POPPED, timer->fsa_input, NULL);
        }

    } else if ((timer == recheck_timer)
               && (controld_globals.fsa_state != S_IDLE)) {
        // A recheck is registered only while the FSA is in S_IDLE
        pcmk__debug("Discarding %s event in state: %s",
                    fsa_input2string(timer->fsa_input),
                    fsa_state2string(controld_globals.fsa_state));

    } else if ((timer == finalization_timer)
               && (controld_globals.fsa_state != S_FINALIZE_JOIN)) {
        // A finalization timeout is registered only in S_FINALIZE_JOIN
        pcmk__debug("Discarding %s event in state: %s",
                    fsa_input2string(timer->fsa_input),
                    fsa_state2string(controld_globals.fsa_state));

    } else if (timer->fsa_input != I_NULL) {
        controld_fsa_append(C_TIMER_POPPED, timer->fsa_input, NULL);
    }

    controld_trigger_fsa();

    /* NOTE(review): The source was already removed by controld_stop_timer()
     * above, so returning TRUE presumably does not cause the timer to repeat
     * -- confirm against GLib's dispatch behavior.
     */
    return TRUE;
}
253
254 bool
255 controld_init_fsa_timers(void)
256 {
257 transition_timer = pcmk__assert_alloc(1, sizeof(fsa_timer_t));
258 integration_timer = pcmk__assert_alloc(1, sizeof(fsa_timer_t));
259 finalization_timer = pcmk__assert_alloc(1, sizeof(fsa_timer_t));
260 election_timer = pcmk__assert_alloc(1, sizeof(fsa_timer_t));
261 shutdown_escalation_timer = pcmk__assert_alloc(1, sizeof(fsa_timer_t));
262 wait_timer = pcmk__assert_alloc(1, sizeof(fsa_timer_t));
263 recheck_timer = pcmk__assert_alloc(1, sizeof(fsa_timer_t));
264
265 election_timer->source_id = 0;
266 election_timer->period_ms = 0;
267 election_timer->fsa_input = I_DC_TIMEOUT;
268 election_timer->callback = crm_timer_popped;
269 election_timer->log_error = FALSE;
270
271 transition_timer->source_id = 0;
272 transition_timer->period_ms = 0;
273 transition_timer->fsa_input = I_PE_CALC;
274 transition_timer->callback = crm_timer_popped;
275 transition_timer->log_error = FALSE;
276
277 integration_timer->source_id = 0;
278 integration_timer->period_ms = 0;
279 integration_timer->fsa_input = I_INTEGRATED;
280 integration_timer->callback = crm_timer_popped;
281 integration_timer->log_error = TRUE;
282
283 finalization_timer->source_id = 0;
284 finalization_timer->period_ms = 0;
285 finalization_timer->fsa_input = I_FINALIZED;
286 finalization_timer->callback = crm_timer_popped;
287 finalization_timer->log_error = FALSE;
288
289 /* We can't use I_FINALIZED here, because that creates a bug in the join
290 * process where a joining node can be stuck in S_PENDING while we think it
291 * is in S_NOT_DC. This created an infinite transition loop in which we
292 * continually send probes which the node NACKs because it's pending.
293 *
294 * If we have nodes where the cluster layer is active but the controller is
295 * not, we can avoid this causing an election/join loop, in the integration
296 * phase.
297 */
298 finalization_timer->fsa_input = I_ELECTION;
299
300 shutdown_escalation_timer->source_id = 0;
301 shutdown_escalation_timer->period_ms = 0;
302 shutdown_escalation_timer->fsa_input = I_STOP;
303 shutdown_escalation_timer->callback = crm_timer_popped;
304 shutdown_escalation_timer->log_error = TRUE;
305
306 wait_timer->source_id = 0;
307 wait_timer->period_ms = 2000;
308 wait_timer->fsa_input = I_NULL;
309 wait_timer->callback = crm_timer_popped;
310 wait_timer->log_error = FALSE;
311
312 recheck_timer->source_id = 0;
313 recheck_timer->period_ms = 0;
314 recheck_timer->fsa_input = I_PE_CALC;
315 recheck_timer->callback = crm_timer_popped;
316 recheck_timer->log_error = FALSE;
317
318 return TRUE;
319 }
320
321 /*!
322 * \internal
323 * \brief Configure timers based on the CIB
324 *
325 * \param[in,out] options Name/value pairs for configured options
326 */
327 void
328 controld_configure_fsa_timers(GHashTable *options)
329 {
330 const char *value = NULL;
331
332 // Election timer
333 value = g_hash_table_lookup(options, PCMK_OPT_DC_DEADTIME);
334 pcmk_parse_interval_spec(value, &(election_timer->period_ms));
335
336 // Integration timer
337 value = g_hash_table_lookup(options, PCMK_OPT_JOIN_INTEGRATION_TIMEOUT);
338 pcmk_parse_interval_spec(value, &(integration_timer->period_ms));
339
340 // Finalization timer
341 value = g_hash_table_lookup(options, PCMK_OPT_JOIN_FINALIZATION_TIMEOUT);
342 pcmk_parse_interval_spec(value, &(finalization_timer->period_ms));
343
344 // Shutdown escalation timer
345 value = g_hash_table_lookup(options, PCMK_OPT_SHUTDOWN_ESCALATION);
346 pcmk_parse_interval_spec(value, &(shutdown_escalation_timer->period_ms));
347 pcmk__debug("Shutdown escalation occurs if DC has not responded to request "
348 "in %ums",
349 shutdown_escalation_timer->period_ms);
350
351 // Transition timer
352 value = g_hash_table_lookup(options, PCMK_OPT_TRANSITION_DELAY);
353 pcmk_parse_interval_spec(value, &(transition_timer->period_ms));
354
355 // Recheck interval
356 value = g_hash_table_lookup(options, PCMK_OPT_CLUSTER_RECHECK_INTERVAL);
357 pcmk_parse_interval_spec(value, &recheck_interval_ms);
358 pcmk__debug("Re-run scheduler after %dms of inactivity",
359 recheck_interval_ms);
360 }
361
362 void
363 controld_free_fsa_timers(void)
364 {
365 controld_stop_timer(transition_timer);
366 controld_stop_timer(integration_timer);
367 controld_stop_timer(finalization_timer);
368 controld_stop_timer(election_timer);
369 controld_stop_timer(shutdown_escalation_timer);
370 controld_stop_timer(wait_timer);
371 controld_stop_timer(recheck_timer);
372
|
(1) Event path: |
Condition "_p", taking true branch. |
373 g_clear_pointer(&transition_timer, free);
|
(2) Event path: |
Condition "_p", taking true branch. |
374 g_clear_pointer(&integration_timer, free);
|
(3) Event path: |
Condition "_p", taking true branch. |
375 g_clear_pointer(&finalization_timer, free);
|
CID (unavailable; MK=63c39a7531598f31242ed8cf9dac9cee) (#4 of 7): Inconsistent C union access (INCONSISTENT_UNION_ACCESS): |
|
(4) Event assign_union_field: |
The union field "in" of "_pp" is written. |
|
(5) Event inconsistent_union_field_access: |
In "_pp.out", the union field used: "out" is inconsistent with the field most recently stored: "in". |
376 g_clear_pointer(&election_timer, free);
377 g_clear_pointer(&shutdown_escalation_timer, free);
378 g_clear_pointer(&wait_timer, free);
379 g_clear_pointer(&recheck_timer, free);
380 }
381
382 /*!
383 * \internal
384 * \brief Check whether the transition timer is started
385 * \return true if the transition timer is started, or false otherwise
386 */
387 bool
388 controld_is_started_transition_timer(void)
389 {
390 return (transition_timer->period_ms > 0)
391 && (transition_timer->source_id != 0);
392 }
393
394 /*!
395 * \internal
396 * \brief Start the recheck timer
397 */
398 void
399 controld_start_recheck_timer(void)
400 {
401 // Default to recheck interval configured in CIB (if any)
402 guint period_ms = recheck_interval_ms;
403
404 // If scheduler supplied a "recheck by" time, check whether that's sooner
405 if (controld_globals.transition_graph->recheck_by > 0) {
406 time_t diff_seconds = controld_globals.transition_graph->recheck_by
407 - time(NULL);
408
409 if (diff_seconds < 1) {
410 // We're already past the desired time
411 period_ms = 500;
412 } else {
413 period_ms = (guint) QB_MIN(G_MAXUINT, diff_seconds * 1000LL);
414 }
415
416 // Use "recheck by" only if it's sooner than interval from CIB
417 if (period_ms > recheck_interval_ms) {
418 period_ms = recheck_interval_ms;
419 }
420 }
421
422 if (period_ms > 0) {
423 recheck_timer->period_ms = period_ms;
424 controld_start_timer(recheck_timer);
425 }
426 }
427
428 /*!
429 * \internal
430 * \brief Start the wait timer
431 */
432 void
433 controld_start_wait_timer(void)
434 {
435 controld_start_timer(wait_timer);
436 }
437
438 /*!
439 * \internal
440 * \brief Stop the recheck timer
441 *
442 * \return true if the recheck timer was running, or false otherwise
443 */
444 bool
445 controld_stop_recheck_timer(void)
446 {
447 return controld_stop_timer(recheck_timer);
448 }
449
450 /*!
451 * \brief Get the transition timer's configured period
452 * \return The transition_timer's period
453 */
454 guint
455 controld_get_period_transition_timer(void)
456 {
457 return transition_timer->period_ms;
458 }
459
460 /*!
461 * \internal
462 * \brief Reset the election timer's counter to 0
463 */
464 void
465 controld_reset_counter_election_timer(void)
466 {
467 election_timer->counter = 0;
468 }
469
470 /*!
471 * \internal
472 * \brief Stop the transition timer
473 *
474 * \return true if the transition timer was running, or false otherwise
475 */
476 bool
477 controld_stop_transition_timer(void)
478 {
479 return controld_stop_timer(transition_timer);
480 }
481
482 /*!
483 * \internal
484 * \brief Start the transition timer
485 */
486 void
487 controld_start_transition_timer(void)
488 {
489 controld_start_timer(transition_timer);
490 }
491
492 /*!
493 * \internal
494 * \brief Start the countdown sequence for a shutdown
495 *
496 * \param[in] default_period_ms Period to use if the shutdown escalation
497 * timer's period is 0
498 */
499 void
500 controld_shutdown_start_countdown(guint default_period_ms)
501 {
502 if (shutdown_escalation_timer->period_ms == 0) {
503 shutdown_escalation_timer->period_ms = default_period_ms;
504 }
505
506 pcmk__notice("Initiating controller shutdown sequence " QB_XS " limit=%ums",
507 shutdown_escalation_timer->period_ms);
508 controld_start_timer(shutdown_escalation_timer);
509 }
510