1 /*
2 * Copyright 2004-2026 the Pacemaker project contributors
3 *
4 * The version control history for this file may have further details.
5 *
6 * This source code is licensed under the GNU General Public License version 2
7 * or later (GPLv2+) WITHOUT ANY WARRANTY.
8 */
9
10 #include <crm_internal.h>
11
12 #include <stdbool.h>
13 #include <sys/param.h>
14
15 #include <libxml/xpath.h> // xmlXPathObject, etc.
16
17 #include <crm/crm.h>
18 #include <crm/cib.h>
19 #include <crm/common/xml.h>
20
21 #include <pacemaker-controld.h>
22
23 /*!
24 * \internal
25 * \brief Action numbers of outside events processed in current update diff
26 *
27 * This table is to be used as a set. It should be empty when the transitioner
28 * begins processing a CIB update diff. It ensures that if there are multiple
29 * events (for example, "_last_0" and "_last_failure_0") for the same action,
30 * only one of them updates the failcount. Events that originate outside the
31 * cluster can't be confirmed, since they're not in the transition graph.
32 */
33 static GHashTable *outside_events = NULL;
34
35 /*!
36 * \internal
37 * \brief Empty the hash table containing action numbers of outside events
38 */
39 void
40 controld_remove_all_outside_events(void)
41 {
42 if (outside_events != NULL) {
43 g_hash_table_remove_all(outside_events);
44 }
45 }
46
47 /*!
48 * \internal
49 * \brief Destroy the hash table containing action numbers of outside events
50 */
51 void
52 controld_destroy_outside_events_table(void)
53 {
|
CID (unavailable; MK=166cbbfd1f8baca63599dd382aec0c2c) (#1 of 1): Inconsistent C union access (INCONSISTENT_UNION_ACCESS): |
|
(1) Event assign_union_field: |
The union field "in" of "_pp" is written. |
|
(2) Event inconsistent_union_field_access: |
In "_pp.out", the union field used: "out" is inconsistent with the field most recently stored: "in". |
54 g_clear_pointer(&outside_events, g_hash_table_destroy);
55 }
56
57 /*!
58 * \internal
59 * \brief Add an outside event's action number to a set
60 *
61 * \return Standard Pacemaker return code. Specifically, \p pcmk_rc_ok if the
62 * event was not already in the set, or \p pcmk_rc_already otherwise.
63 */
64 static int
65 record_outside_event(gint action_num)
66 {
67 if (outside_events == NULL) {
68 outside_events = g_hash_table_new(NULL, NULL);
69 }
70
71 if (g_hash_table_add(outside_events, GINT_TO_POINTER(action_num))) {
72 return pcmk_rc_ok;
73 }
74 return pcmk_rc_already;
75 }
76
77 gboolean
78 fail_incompletable_actions(pcmk__graph_t *graph, const char *down_node)
79 {
80 const char *target_uuid = NULL;
81 const char *router = NULL;
82 const char *router_uuid = NULL;
83 xmlNode *last_action = NULL;
84
85 GList *gIter = NULL;
86 GList *gIter2 = NULL;
87
88 if (graph == NULL || graph->complete) {
89 return FALSE;
90 }
91
92 gIter = graph->synapses;
93 for (; gIter != NULL; gIter = gIter->next) {
94 pcmk__graph_synapse_t *synapse = (pcmk__graph_synapse_t *) gIter->data;
95
96 if (pcmk__any_flags_set(synapse->flags,
97 pcmk__synapse_confirmed|pcmk__synapse_failed)) {
98 /* We've already been here */
99 continue;
100 }
101
102 gIter2 = synapse->actions;
103 for (; gIter2 != NULL; gIter2 = gIter2->next) {
104 pcmk__graph_action_t *action = (pcmk__graph_action_t *) gIter2->data;
105
106 if ((action->type == pcmk__pseudo_graph_action)
107 || pcmk__is_set(action->flags, pcmk__graph_action_confirmed)) {
108 continue;
109 } else if (action->type == pcmk__cluster_graph_action) {
110 const char *task = pcmk__xe_get(action->xml, PCMK_XA_OPERATION);
111
112 if (pcmk__str_eq(task, PCMK_ACTION_STONITH, pcmk__str_casei)) {
113 continue;
114 }
115 }
116
117 target_uuid = pcmk__xe_get(action->xml, PCMK__META_ON_NODE_UUID);
118 router = pcmk__xe_get(action->xml, PCMK__XA_ROUTER_NODE);
119 if (router) {
120 const pcmk__node_status_t *node =
121 pcmk__get_node(0, router, NULL,
122 pcmk__node_search_cluster_member);
123
124 if (node != NULL) {
125 router_uuid = node->xml_id;
126 }
127 }
128
129 if (pcmk__str_eq(target_uuid, down_node, pcmk__str_casei) || pcmk__str_eq(router_uuid, down_node, pcmk__str_casei)) {
130 pcmk__set_graph_action_flags(action, pcmk__graph_action_failed);
131 pcmk__set_synapse_flags(synapse, pcmk__synapse_failed);
132 last_action = action->xml;
133 stop_te_timer(action);
134 pcmk__update_graph(graph, action);
135
136 if (pcmk__is_set(synapse->flags, pcmk__synapse_executed)) {
137 pcmk__notice("Action %d (%s) was pending on %s (offline)",
138 action->id,
139 pcmk__xe_get(action->xml,
140 PCMK__XA_OPERATION_KEY),
141 down_node);
142 } else {
143 pcmk__info("Action %d (%s) is scheduled for %s (offline)",
144 action->id,
145 pcmk__xe_get(action->xml, PCMK__XA_OPERATION_KEY),
146 down_node);
147 }
148 }
149 }
150 }
151
152 if (last_action != NULL) {
153 pcmk__info("Node %s shutdown resulted in un-runnable actions",
154 down_node);
155 abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart,
156 "Node failure", last_action);
157 return TRUE;
158 }
159
160 return FALSE;
161 }
162
163 /*!
164 * \internal
165 * \brief Update failure-related node attributes if warranted
166 *
167 * \param[in] event XML describing operation that (maybe) failed
168 * \param[in] event_node_uuid Node that event occurred on
169 * \param[in] rc Actual operation return code
170 * \param[in] target_rc Expected operation return code
171 * \param[in] do_update If TRUE, do update regardless of operation type
172 * \param[in] ignore_failures If TRUE, update last failure but not fail count
173 *
174 * \return TRUE if this was not a direct nack, success or lrm status refresh
175 */
176 static gboolean
177 update_failcount(const xmlNode *event, const char *event_node_uuid, int rc,
178 int target_rc, gboolean do_update, gboolean ignore_failures)
179 {
180 guint interval_ms = 0;
181
182 char *task = NULL;
183 char *rsc_id = NULL;
184
185 const char *value = NULL;
186 const char *id = pcmk__xe_get(event, PCMK__XA_OPERATION_KEY);
187 const char *on_uname = pcmk__node_name_from_uuid(event_node_uuid);
188 const char *origin = pcmk__xe_get(event, PCMK_XA_CRM_DEBUG_ORIGIN);
189
190 // Nothing needs to be done for success or status refresh
191 if (rc == target_rc) {
192 return FALSE;
193 } else if (pcmk__str_eq(origin, "build_active_RAs", pcmk__str_casei)) {
194 pcmk__debug("No update for %s (rc=%d) on %s: Old failure from lrm "
195 "status refresh",
196 id, rc, on_uname);
197 return FALSE;
198 }
199
200 /* Sanity check */
201 CRM_CHECK(on_uname != NULL, return TRUE);
202 CRM_CHECK(parse_op_key(id, &rsc_id, &task, &interval_ms),
203 pcmk__err("Couldn't parse: %s", pcmk__xe_id(event)); goto bail);
204
205 /* Decide whether update is necessary and what value to use */
206 if ((interval_ms > 0)
207 || pcmk__str_eq(task, PCMK_ACTION_PROMOTE, pcmk__str_none)
208 || pcmk__str_eq(task, PCMK_ACTION_DEMOTE, pcmk__str_none)) {
209 do_update = TRUE;
210
211 } else if (pcmk__str_eq(task, PCMK_ACTION_START, pcmk__str_none)) {
212 do_update = TRUE;
213 value = pcmk__s(controld_globals.transition_graph->failed_start_offset,
214 PCMK_VALUE_INFINITY);
215
216 } else if (pcmk__str_eq(task, PCMK_ACTION_STOP, pcmk__str_none)) {
217 do_update = TRUE;
218 value = pcmk__s(controld_globals.transition_graph->failed_stop_offset,
219 PCMK_VALUE_INFINITY);
220 }
221
222 if (do_update) {
223 pcmk__attrd_query_pair_t *fail_pair = NULL;
224 pcmk__attrd_query_pair_t *last_pair = NULL;
225 char *fail_name = NULL;
226 char *last_name = NULL;
227 GList *attrs = NULL;
228
229 uint32_t opts = pcmk__node_attr_none;
230
231 char *now = pcmk__ttoa(time(NULL));
232
233 // Fail count will be either incremented or set to infinity
234 if (!pcmk_str_is_infinity(value)) {
235 value = PCMK_XA_VALUE "++";
236 }
237
238 if (g_hash_table_lookup(pcmk__remote_peer_cache, event_node_uuid)) {
239 opts |= pcmk__node_attr_remote;
240 }
241
242 pcmk__info("Updating %s for %s on %s after failed %s: rc=%d "
243 "(update=%s, time=%s)",
244 (ignore_failures? "last failure" : "failcount"),
245 rsc_id, on_uname, task, rc, value, now);
246
247 /* Update the fail count, if we're not ignoring failures */
248 if (!ignore_failures) {
249 fail_pair = pcmk__assert_alloc(1, sizeof(pcmk__attrd_query_pair_t));
250
251 fail_name = pcmk__failcount_name(rsc_id, task, interval_ms);
252 fail_pair->name = fail_name;
253 fail_pair->value = value;
254 fail_pair->node = on_uname;
255
256 attrs = g_list_prepend(attrs, fail_pair);
257 }
258
259 /* Update the last failure time (even if we're ignoring failures,
260 * so that failure can still be detected and shown, e.g. by crm_mon)
261 */
262 last_pair = pcmk__assert_alloc(1, sizeof(pcmk__attrd_query_pair_t));
263
264 last_name = pcmk__lastfailure_name(rsc_id, task, interval_ms);
265 last_pair->name = last_name;
266 last_pair->value = now;
267 last_pair->node = on_uname;
268
269 attrs = g_list_prepend(attrs, last_pair);
270
271 update_attrd_list(attrs, opts);
272
273 free(fail_name);
274 free(fail_pair);
275
276 free(last_name);
277 free(last_pair);
278 g_list_free(attrs);
279
280 free(now);
281 }
282
283 bail:
284 free(rsc_id);
285 free(task);
286 return TRUE;
287 }
288
289 pcmk__graph_action_t *
290 controld_get_action(int id)
291 {
292 for (GList *item = controld_globals.transition_graph->synapses;
293 item != NULL; item = item->next) {
294 pcmk__graph_synapse_t *synapse = (pcmk__graph_synapse_t *) item->data;
295
296 for (GList *item2 = synapse->actions; item2; item2 = item2->next) {
297 pcmk__graph_action_t *action = (pcmk__graph_action_t *) item2->data;
298
299 if (action->id == id) {
300 return action;
301 }
302 }
303 }
304 return NULL;
305 }
306
307 pcmk__graph_action_t *
308 get_cancel_action(const char *id, const char *node)
309 {
310 GList *gIter = NULL;
311 GList *gIter2 = NULL;
312
313 gIter = controld_globals.transition_graph->synapses;
314 for (; gIter != NULL; gIter = gIter->next) {
315 pcmk__graph_synapse_t *synapse = (pcmk__graph_synapse_t *) gIter->data;
316
317 gIter2 = synapse->actions;
318 for (; gIter2 != NULL; gIter2 = gIter2->next) {
319 const char *task = NULL;
320 const char *target = NULL;
321 pcmk__graph_action_t *action = (pcmk__graph_action_t *) gIter2->data;
322
323 task = pcmk__xe_get(action->xml, PCMK_XA_OPERATION);
324 if (!pcmk__str_eq(PCMK_ACTION_CANCEL, task, pcmk__str_casei)) {
325 continue;
326 }
327
328 task = pcmk__xe_get(action->xml, PCMK__XA_OPERATION_KEY);
329 if (!pcmk__str_eq(task, id, pcmk__str_casei)) {
330 continue;
331 }
332
333 target = pcmk__xe_get(action->xml, PCMK__META_ON_NODE_UUID);
334 if (node && !pcmk__str_eq(target, node, pcmk__str_casei)) {
335 pcmk__trace("Wrong node %s for %s on %s", target, id, node);
336 continue;
337 }
338
339 pcmk__trace("Found %s on %s", id, node);
340 return action;
341 }
342 }
343
344 return NULL;
345 }
346
347 bool
348 confirm_cancel_action(const char *id, const char *node_id)
349 {
350 const char *op_key = NULL;
351 const char *node_name = NULL;
352 pcmk__graph_action_t *cancel = get_cancel_action(id, node_id);
353
354 if (cancel == NULL) {
355 return FALSE;
356 }
357 op_key = pcmk__xe_get(cancel->xml, PCMK__XA_OPERATION_KEY);
358 node_name = pcmk__xe_get(cancel->xml, PCMK__META_ON_NODE);
359
360 stop_te_timer(cancel);
361 te_action_confirmed(cancel, controld_globals.transition_graph);
362
363 pcmk__info("Cancellation of %s on %s confirmed (action %d)", op_key,
364 node_name, cancel->id);
365 return TRUE;
366 }
367
368 /* downed nodes are listed like: <downed> <node id="UUID1" /> ... </downed> */
369 #define XPATH_DOWNED "//" PCMK__XE_DOWNED \
370 "/" PCMK_XE_NODE "[@" PCMK_XA_ID "='%s']"
371
372 /*!
373 * \brief Find a transition event that would have made a specified node down
374 *
375 * \param[in] target UUID of node to match
376 *
377 * \return Matching event if found, NULL otherwise
378 */
379 pcmk__graph_action_t *
380 match_down_event(const char *target)
381 {
382 pcmk__graph_action_t *match = NULL;
383 xmlXPathObject *xpath_ret = NULL;
384 GList *gIter, *gIter2;
385
386 char *xpath = pcmk__assert_asprintf(XPATH_DOWNED, target);
387
388 for (gIter = controld_globals.transition_graph->synapses;
389 gIter != NULL && match == NULL;
390 gIter = gIter->next) {
391
392 for (gIter2 = ((pcmk__graph_synapse_t * ) gIter->data)->actions;
393 gIter2 != NULL && match == NULL;
394 gIter2 = gIter2->next) {
395
396 match = (pcmk__graph_action_t *) gIter2->data;
397 if (pcmk__is_set(match->flags, pcmk__graph_action_executed)) {
398 xpath_ret = pcmk__xpath_search(match->xml->doc, xpath);
399 if (pcmk__xpath_num_results(xpath_ret) == 0) {
400 match = NULL;
401 }
402 xmlXPathFreeObject(xpath_ret);
403 } else {
404 // Only actions that were actually started can match
405 match = NULL;
406 }
407 }
408 }
409
410 free(xpath);
411
412 if (match != NULL) {
413 pcmk__debug("Shutdown action %d (%s) found for node %s", match->id,
414 pcmk__xe_get(match->xml, PCMK__XA_OPERATION_KEY), target);
415 } else {
416 pcmk__debug("No reason to expect node %s to be down", target);
417 }
418 return match;
419 }
420
421 void
422 process_graph_event(xmlNode *event, const char *event_node)
423 {
424 int rc = -1; // Actual result
425 int target_rc = -1; // Expected result
426 int status = -1; // Executor status
427 int callid = -1; // Executor call ID
428 int transition_num = -1; // Transition number
429 int action_num = -1; // Action number within transition
430 char *update_te_uuid = NULL;
431 bool ignore_failures = FALSE;
432 const char *id = NULL;
433 const char *desc = NULL;
434 const char *magic = NULL;
435 const char *uname = NULL;
436
437 pcmk__assert(event != NULL);
438
439 /*
440 <lrm_rsc_op id="rsc_east-05_last_0" operation_key="rsc_east-05_monitor_0" operation="monitor" crm-debug-origin="do_update_resource" crm_feature_set="3.0.6" transition-key="9:2:7:be2e97d9-05e2-439d-863e-48f7aecab2aa" transition-magic="0:7;9:2:7:be2e97d9-05e2-439d-863e-48f7aecab2aa" call-id="17" rc-code="7" op-status="0" interval="0" last-rc-change="1355361636" exec-time="128" queue-time="0" op-digest="c81f5f40b1c9e859c992e800b1aa6972"/>
441 */
442
443 magic = pcmk__xe_get(event, PCMK__XA_TRANSITION_KEY);
444 if (magic == NULL) {
445 /* non-change */
446 return;
447 }
448
449 pcmk__xe_get_int(event, PCMK__XA_OP_STATUS, &status);
450 if (status == PCMK_EXEC_PENDING) {
451 return;
452 }
453
454 id = pcmk__xe_get(event, PCMK__XA_OPERATION_KEY);
455 pcmk__xe_get_int(event, PCMK__XA_RC_CODE, &rc);
456 pcmk__xe_get_int(event, PCMK__XA_CALL_ID, &callid);
457
458 rc = pcmk__effective_rc(rc);
459
460 if (decode_transition_key(magic, &update_te_uuid, &transition_num,
461 &action_num, &target_rc) == FALSE) {
462 // decode_transition_key() already logged the bad key
463 pcmk__err("Can't process action %s result: Incompatible versions? "
464 QB_XS " call-id=%d",
465 id, callid);
466 abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart,
467 "Bad event", event);
468 return;
469 }
470
471 if (transition_num == -1) {
472 // E.g. crm_resource --fail
473 if (record_outside_event(action_num) != pcmk_rc_ok) {
474 pcmk__debug("Outside event with transition key '%s' has already "
475 "been processed",
476 magic);
477 goto bail;
478 }
479 desc = "initiated outside of the cluster";
480 abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart,
481 "Unexpected event", event);
482
483 } else if ((action_num < 0)
484 || !pcmk__str_eq(update_te_uuid, controld_globals.te_uuid,
485 pcmk__str_none)) {
486 desc = "initiated by a different DC";
487 abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart,
488 "Foreign event", event);
489
490 } else if ((controld_globals.transition_graph->id != transition_num)
491 || controld_globals.transition_graph->complete) {
492
493 // Action is not from currently active transition
494
495 guint interval_ms = 0;
496
497 if (parse_op_key(id, NULL, NULL, &interval_ms)
498 && (interval_ms != 0)) {
499 /* Recurring actions have the transition number they were first
500 * scheduled in.
501 */
502
503 if (status == PCMK_EXEC_CANCELLED) {
504 confirm_cancel_action(id, get_node_id(event));
505 goto bail;
506 }
507
508 desc = "arrived after initial scheduling";
509 abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart,
510 "Change in recurring result", event);
511
512 } else if (controld_globals.transition_graph->id != transition_num) {
513 desc = "arrived really late";
514 abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart,
515 "Old event", event);
516 } else {
517 desc = "arrived late";
518 abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart,
519 "Inactive graph", event);
520 }
521
522 } else {
523 // Event is result of an action from currently active transition
524 pcmk__graph_action_t *action = controld_get_action(action_num);
525
526 if (action == NULL) {
527 // Should never happen
528 desc = "unknown";
529 abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart,
530 "Unknown event", event);
531
532 } else if (pcmk__is_set(action->flags, pcmk__graph_action_confirmed)) {
533 /* Nothing further needs to be done if the action has already been
534 * confirmed. This can happen e.g. when processing both an
535 * "xxx_last_0" or "xxx_last_failure_0" record as well as the main
536 * history record, which would otherwise result in incorrectly
537 * bumping the fail count twice.
538 */
539 pcmk__log_xml_debug(event, "Event already confirmed:");
540 goto bail;
541
542 } else {
543 /* An action result needs to be confirmed.
544 * (This is the only case where desc == NULL.)
545 */
546
547 if (pcmk__str_eq(crm_meta_value(action->params, PCMK_META_ON_FAIL),
548 PCMK_VALUE_IGNORE, pcmk__str_casei)) {
549 ignore_failures = TRUE;
550
551 } else if (rc != target_rc) {
552 pcmk__set_graph_action_flags(action, pcmk__graph_action_failed);
553 }
554
555 stop_te_timer(action);
556 te_action_confirmed(action, controld_globals.transition_graph);
557
558 if (pcmk__is_set(action->flags, pcmk__graph_action_failed)) {
559 abort_transition(action->synapse->priority + 1,
560 pcmk__graph_restart, "Event failed", event);
561 }
562 }
563 }
564
565 if (id == NULL) {
566 id = "unknown action";
567 }
568 uname = pcmk__xe_get(event, PCMK__META_ON_NODE);
569 if (uname == NULL) {
570 uname = "unknown node";
571 }
572
573 if (status == PCMK_EXEC_INVALID) {
574 // We couldn't attempt the action
575 pcmk__info("Transition %d action %d (%s on %s): %s", transition_num,
576 action_num, id, uname, pcmk_exec_status_str(status));
577
578 } else if (desc && update_failcount(event, event_node, rc, target_rc,
579 (transition_num == -1), FALSE)) {
580 pcmk__notice("Transition %d action %d (%s on %s): expected '%s' but "
581 "got '%s' "
582 QB_XS " target-rc=%d rc=%d call-id=%d event='%s'",
583 transition_num, action_num, id, uname,
584 crm_exit_str(target_rc), crm_exit_str(rc),
585 target_rc, rc, callid, desc);
586
587 } else if (desc) {
588 pcmk__info("Transition %d action %d (%s on %s): %s "
589 QB_XS " rc=%d target-rc=%d call-id=%d",
590 transition_num, action_num, id, uname,
591 desc, rc, target_rc, callid);
592
593 } else if (rc == target_rc) {
594 pcmk__info("Transition %d action %d (%s on %s) confirmed: %s "
595 QB_XS " rc=%d call-id=%d",
596 transition_num, action_num, id, uname, crm_exit_str(rc),
597 rc, callid);
598
599 } else {
600 update_failcount(event, event_node, rc, target_rc,
601 (transition_num == -1), ignore_failures);
602 pcmk__notice("Transition %d action %d (%s on %s): expected '%s' but "
603 "got '%s' " QB_XS " target-rc=%d rc=%d call-id=%d",
604 transition_num, action_num, id, uname,
605 crm_exit_str(target_rc), crm_exit_str(rc),
606 target_rc, rc, callid);
607 }
608
609 bail:
610 free(update_te_uuid);
611 }
612