1    	/*
2    	 * Copyright 2004-2026 the Pacemaker project contributors
3    	 *
4    	 * The version control history for this file may have further details.
5    	 *
6    	 * This source code is licensed under the GNU General Public License version 2
7    	 * or later (GPLv2+) WITHOUT ANY WARRANTY.
8    	 */
9    	
10   	#include <crm_internal.h>
11   	
12   	#include <stdbool.h>
13   	
14   	#include <crm/crm.h>
15   	#include <crm/common/xml.h>
16   	#include <crm/stonith-ng.h>
17   	#include <crm/fencing/internal.h>
18   	
19   	#include <pacemaker-controld.h>
20   	
21   	static void fencing_history_synced(stonith_t *st, stonith_event_t *st_event);
22   	
23   	#define DEFAULT_FENCING_MAX_ATTEMPTS 10
24   	
25   	static bool fence_reaction_panic = false;
26   	static unsigned long int fencing_max_attempts = DEFAULT_FENCING_MAX_ATTEMPTS;
27   	
28   	/*
29   	 * Fencing failure counting
30   	 *
31   	 * We don't want to get stuck in a permanent fencing loop. Keep track of the
32   	 * number of fencing failures for each target node, and the most we'll restart a
33   	 * transition for.
34   	 */
35   	static GHashTable *fencing_fail_counts = NULL;
36   	
37   	/*!
38   	 * \internal
39   	 * \brief Update max fencing attempts before giving up
40   	 *
41   	 * \param[in] value  New max fencing attempts
42   	 */
43   	static void
44   	update_fencing_max_attempts(const char *value)
45   	{
46   	    int score = 0;
47   	    int rc = pcmk_parse_score(value, &score, DEFAULT_FENCING_MAX_ATTEMPTS);
48   	
49   	    // The option validator ensures invalid values shouldn't be possible
50   	    CRM_CHECK((rc == pcmk_rc_ok) && (score > 0), return);
51   	
52   	    if (fencing_max_attempts != score) {
53   	        pcmk__debug("Maximum fencing attempts per transition is now %d "
54   	                    "(was %lu)", score, fencing_max_attempts);
55   	    }
56   	    fencing_max_attempts = score;
57   	}
58   	
59   	/*!
60   	 * \internal
61   	 * \brief Configure reaction to notification of local node being fenced
62   	 *
63   	 * \param[in] reaction_s  Reaction type
64   	 */
65   	static void
66   	set_fence_reaction(const char *reaction_s)
67   	{
68   	    if (pcmk__str_eq(reaction_s, "panic", pcmk__str_casei)) {
69   	        fence_reaction_panic = true;
70   	
71   	    } else {
72   	        if (!pcmk__str_eq(reaction_s, PCMK_VALUE_STOP, pcmk__str_casei)) {
73   	            pcmk__warn("Invalid value '%s' for " PCMK_OPT_FENCING_REACTION
74   	                       ", using 'stop'",
75   	                       reaction_s);
76   	        }
77   	        fence_reaction_panic = false;
78   	    }
79   	}
80   	
81   	/*!
82   	 * \internal
83   	 * \brief Configure fencing options based on the CIB
84   	 *
85   	 * \param[in,out] options  Name/value pairs for configured options
86   	 */
87   	void
88   	controld_configure_fencing(GHashTable *options)
89   	{
90   	    const char *value = NULL;
91   	
92   	    value = g_hash_table_lookup(options, PCMK_OPT_FENCING_REACTION);
93   	    set_fence_reaction(value);
94   	
95   	    value = g_hash_table_lookup(options, PCMK_OPT_FENCING_MAX_ATTEMPTS);
96   	    update_fencing_max_attempts(value);
97   	}
98   	
/*!
 * \internal
 * \brief Check whether a fencing target has failed too many times
 *
 * \param[in] target  Name of node to check, or \c NULL to check whether any
 *                    node has reached the failure limit
 *
 * \return \c true if \p target (or any node, when \p target is \c NULL) has
 *         at least \c fencing_max_attempts recorded failures, else \c false
 */
static bool
too_many_fencing_failures(const char *target)
{
    GHashTableIter iter;
    gpointer value = NULL;

    // No table means no failures have been recorded yet
    if (fencing_fail_counts == NULL) {
        return false;
    }

    if (target == NULL) {
        /* Scan every entry; target is reused as the iteration key so that the
         * warning below can name the offending node
         */
        g_hash_table_iter_init(&iter, fencing_fail_counts);
        while (g_hash_table_iter_next(&iter, (gpointer *) &target, &value)) {
            if (GPOINTER_TO_INT(value) >= fencing_max_attempts) {
                goto too_many;
            }
        }

    } else if (g_hash_table_lookup_extended(fencing_fail_counts, target, NULL,
                                            &value)
               && (GPOINTER_TO_INT(value) >= fencing_max_attempts)) {
        goto too_many;
    }
    return false;

too_many:
    pcmk__warn("Too many failures (%d) to fence %s, giving up",
               GPOINTER_TO_INT(value), target);
    return true;
}
129  	
130  	/*!
131  	 * \internal
132  	 * \brief Reset the count of failed fencing operations for a node
133  	 *
134  	 * \param[in] target  Name of node whose count to reset, or \c NULL to reset all
135  	 */
136  	void
137  	controld_reset_fencing_fail_count(const char *target)
138  	{
139  	    if (fencing_fail_counts == NULL) {
140  	        return;
141  	    }
142  	
143  	    if (target != NULL) {
144  	        g_hash_table_remove(fencing_fail_counts, target);
145  	
146  	    } else {
147  	        g_hash_table_remove_all(fencing_fail_counts);
148  	    }
149  	}
150  	
/*!
 * \internal
 * \brief Record one more fencing failure for a node
 *
 * Creates the fail-count table on first use. The table owns its duplicated
 * key strings (freed via \c free) but stores counts as packed integers.
 *
 * \param[in] target  Name of node whose failure count to increment
 */
static void
increment_fencing_fail_count(const char *target)
{
    gpointer key = NULL;
    gpointer value = NULL;

    if (fencing_fail_counts == NULL) {
        fencing_fail_counts = pcmk__strikey_table(free, NULL);
    }

    if (g_hash_table_lookup_extended(fencing_fail_counts, target, &key,
                                     &value)) {
        gpointer new_value = GINT_TO_POINTER(GPOINTER_TO_INT(value) + 1);

        // Increment value in the table without freeing key
        g_hash_table_steal(fencing_fail_counts, key);
        g_hash_table_insert(fencing_fail_counts, key, new_value);

    } else {
        // First failure for this target; table takes ownership of the copy
        g_hash_table_insert(fencing_fail_counts, pcmk__str_copy(target),
                            GINT_TO_POINTER(1));
    }
}
174  	
175  	/* end fencing fail count functions */
176  	
177  	
178  	static void
179  	cib_fencing_updated(xmlNode *msg, int call_id, int rc, xmlNode *output,
180  	                    void *user_data)
181  	{
182  	    if (rc < pcmk_ok) {
183  	        pcmk__err("Fencing update %d for %s: failed - %s (%d)",
184  	                  call_id, (char *)user_data, pcmk_strerror(rc), rc);
185  	        pcmk__log_xml_warn(msg, "Failed update");
186  	        abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_shutdown,
187  	                         "CIB update failed", NULL);
188  	
189  	    } else {
190  	        pcmk__info("Fencing update %d for %s: complete", call_id,
191  	                   (const char *) user_data);
192  	    }
193  	}
194  	
/*!
 * \internal
 * \brief Update a fencing target's node state in the CIB
 *
 * Marks the peer as down, pushes a node state update to the CIB, deletes the
 * node's resource history, and asks the attribute manager to drop its
 * transient attributes.
 *
 * \param[in] target         Node that was successfully fenced
 * \param[in] target_xml_id  CIB XML ID of target
 */
static void
update_node_state_after_fencing(const char *target, const char *target_xml_id)
{
    int rc = pcmk_ok;
    pcmk__node_status_t *peer = NULL;
    xmlNode *node_state = NULL;

    /* We (usually) rely on the membership layer to do
     * controld_node_update_cluster, and the peer status callback to do
     * controld_node_update_peer, because the node might have already rejoined
     * before we get the fencing result here.
     */
    uint32_t flags = controld_node_update_join|controld_node_update_expected;

    CRM_CHECK((target != NULL) && (target_xml_id != NULL), return);

    // Ensure target is cached
    peer = pcmk__get_node(0, target, target_xml_id, pcmk__node_search_any);
    CRM_CHECK(peer != NULL, return);

    if (peer->state == NULL) {
        /* Usually, we rely on the membership layer to update the cluster state
         * in the CIB. However, if the node has never been seen, do it here, so
         * the node is not considered unclean.
         */
        flags |= controld_node_update_cluster;
    }

    if (peer->xml_id == NULL) {
        pcmk__info("Recording XML ID '%s' for node '%s'", target_xml_id,
                   target);
        peer->xml_id = pcmk__str_copy(target_xml_id);
    }

    crmd_peer_down(peer, TRUE);

    // Build the <node_state> update, keyed by the target's XML ID
    node_state = create_node_state_update(peer, flags, NULL, __func__);
    pcmk__xe_set(node_state, PCMK_XA_ID, target_xml_id);

    if (pcmk__is_set(peer->flags, pcmk__node_status_remote)) {
        // Record fencing time for Pacemaker Remote nodes
        char *now_s = pcmk__ttoa(time(NULL));

        pcmk__xe_set(node_state, PCMK__XA_NODE_FENCED, now_s);
        free(now_s);
    }

    /* rc here is the CIB call ID (or error), passed to the callback below for
     * logging
     */
    rc = controld_globals.cib_conn->cmds->modify(controld_globals.cib_conn,
                                                 PCMK_XE_STATUS, node_state,
                                                 cib_can_create);
    pcmk__xml_free(node_state);

    pcmk__debug("Updating node state for %s after fencing (call %d)", target,
                rc);
    fsa_register_cib_callback(rc, pcmk__str_copy(target), cib_fencing_updated);

    // Delete node's resource history from CIB
    controld_delete_node_history(peer->name, false, cib_none);

    // Ask attribute manager to delete node's transient attributes
    // @TODO: This is the only call to controld_purge_node_attrs that doesn't
    //        want to also purge the node from the caches.  Why?
    controld_purge_node_attrs(peer->name, false);
}
265  	
266  	/*!
267  	 * \internal
268  	 * \brief Abort transition due to fencing failure
269  	 *
270  	 * \param[in] abort_action  Whether to restart or stop transition
271  	 * \param[in] target        Don't restart if this node has too many failures
272  	 *                          (\c NULL to check if any node has too many failures)
273  	 * \param[in] reason        Log this fencing action XML as abort reason (can be
274  	 *                          \c NULL)
275  	 */
276  	static void
277  	abort_for_fencing_failure(enum pcmk__graph_next abort_action,
278  	                          const char *target, const xmlNode *reason)
279  	{
280  	    /* If fencing repeatedly fails, we eventually give up on starting a new
281  	     * transition for that reason.
282  	     */
283  	    if ((abort_action != pcmk__graph_wait)
284  	        && too_many_fencing_failures(target)) {
285  	
286  	        abort_action = pcmk__graph_wait;
287  	    }
288  	    abort_transition(PCMK_SCORE_INFINITY, abort_action, "Stonith failed",
289  	                     reason);
290  	}
291  	
292  	
293  	/*
294  	 * Fencing cleanup list
295  	 *
296  	 * If the DC is fenced, proper notifications might not go out. The fencing
297  	 * cleanup list allows the cluster to (re-)send notifications once a new DC is
298  	 * elected.
299  	 */
300  	
301  	static GList *fencing_cleanup_list = NULL;
302  	
303  	/*!
304  	 * \internal
305  	 * \brief Add a node to the fencing cleanup list
306  	 *
307  	 * \param[in] target  Name of node to add
308  	 */
309  	static void
310  	add_fencing_cleanup(const char *target)
311  	{
312  	    fencing_cleanup_list = g_list_append(fencing_cleanup_list,
313  	                                         pcmk__str_copy(target));
314  	}
315  	
/*!
 * \internal
 * \brief Remove a node from the fencing cleanup list
 *
 * \param[in] target  Name of node to remove (all matching entries are removed
 *                    and their stored names freed)
 */
void
controld_remove_fencing_cleanup(const char *target)
{
    GList *iter = fencing_cleanup_list;

    while (iter != NULL) {
        GList *tmp = iter;
        char *iter_name = tmp->data;

        // Advance first, since the current link may be deleted below
        iter = iter->next;
        if (pcmk__str_eq(target, iter_name, pcmk__str_casei)) {
            pcmk__trace("Removing %s from the cleanup list", iter_name);
            fencing_cleanup_list = g_list_delete_link(fencing_cleanup_list,
                                                      tmp);
            free(iter_name);
        }
    }
}
340  	
341  	/*!
342  	 * \internal
343  	 * \brief Purge all entries from the fencing cleanup list
344  	 */
345  	void
346  	controld_purge_fencing_cleanup(void)
347  	{
348  	    for (GList *iter = fencing_cleanup_list; iter != NULL; iter = iter->next) {
349  	        char *target = iter->data;
350  	
351  	        pcmk__info("Purging %s from fencing cleanup list", target);
352  	        free(target);
353  	    }
354  	
355  	    g_clear_pointer(&fencing_cleanup_list, g_list_free);
356  	}
357  	
358  	/*!
359  	 * \internal
360  	 * \brief Send fencing updates for all entries in cleanup list, then purge it
361  	 */
362  	void
363  	controld_execute_fencing_cleanup(void)
364  	{
365  	    for (GList *iter = fencing_cleanup_list; iter != NULL; iter = iter->next) {
366  	        char *target = iter->data;
367  	        pcmk__node_status_t *target_node =
368  	            pcmk__get_node(0, target, NULL, pcmk__node_search_cluster_member);
369  	        const char *uuid = pcmk__cluster_get_xml_id(target_node);
370  	
371  	        pcmk__notice("Marking %s, target of a previous fencing action, as "
372  	                     "clean", target);
373  	        update_node_state_after_fencing(target, uuid);
374  	        free(target);
375  	    }
376  	
377  	    g_clear_pointer(&fencing_cleanup_list, g_list_free);
378  	}
379  	
380  	/* end fencing cleanup list functions */
381  	
382  	
383  	/* Fencer API client
384  	 *
385  	 * Functions that need to interact directly with the fencer via its API
386  	 */
387  	
388  	static stonith_t *fencer_api = NULL;
389  	static mainloop_timer_t *controld_fencer_connect_timer = NULL;
390  	static char *te_client_id = NULL;
391  	
/*!
 * \internal
 * \brief Mark unconfirmed fencing actions in a transition graph as failed
 *
 * \param[in,out] graph  Transition graph to scan (can be \c NULL)
 *
 * \return \c true if any fencing action was failed (in which case the
 *         transition has been aborted), otherwise \c false
 */
static bool
fail_incompletable_fencing(pcmk__graph_t *graph)
{
    GList *lpc = NULL;
    const char *task = NULL;
    xmlNode *last_action = NULL;

    if (graph == NULL) {
        return false;
    }

    for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) {
        GList *lpc2 = NULL;
        pcmk__graph_synapse_t *synapse = (pcmk__graph_synapse_t *) lpc->data;

        // Completed synapses need no cleanup
        if (pcmk__is_set(synapse->flags, pcmk__synapse_confirmed)) {
            continue;
        }

        for (lpc2 = synapse->actions; lpc2 != NULL; lpc2 = lpc2->next) {
            pcmk__graph_action_t *action = (pcmk__graph_action_t *) lpc2->data;

            // Only pending cluster actions are of interest
            if ((action->type != pcmk__cluster_graph_action)
                || pcmk__is_set(action->flags, pcmk__graph_action_confirmed)) {
                continue;
            }

            task = pcmk__xe_get(action->xml, PCMK_XA_OPERATION);
            if (pcmk__str_eq(task, PCMK_ACTION_STONITH, pcmk__str_casei)) {
                // Fencing can't complete without a fencer connection
                pcmk__set_graph_action_flags(action, pcmk__graph_action_failed);
                last_action = action->xml;
                pcmk__update_graph(graph, action);
                pcmk__notice("Failing action %d (%s): fencer terminated",
                             action->id, pcmk__xe_id(action->xml));
            }
        }
    }

    if (last_action != NULL) {
        pcmk__warn("Fencing failure resulted in unrunnable actions");
        abort_for_fencing_failure(pcmk__graph_restart, NULL, last_action);
        return true;
    }

    return false;
}
438  	
439  	static void
440  	destroy_fencer_connection(stonith_t *st, stonith_event_t *e)
441  	{
442  	    controld_cleanup_fencing_history_sync(st, false);
443  	
444  	    if (pcmk__is_set(controld_globals.fsa_input_register, R_ST_REQUIRED)) {
445  	        pcmk__err("Lost fencer connection (will attempt to reconnect)");
446  	        if (!mainloop_timer_running(controld_fencer_connect_timer)) {
447  	            mainloop_timer_start(controld_fencer_connect_timer);
448  	        }
449  	    } else {
450  	        pcmk__info("Disconnected from fencer");
451  	    }
452  	
453  	    if (fencer_api != NULL) {
454  	        /* the client API won't properly reconnect notifications
455  	         * if they are still in the table - so remove them
456  	         */
457  	        if (fencer_api->state != stonith_disconnected) {
458  	            fencer_api->cmds->disconnect(st);
459  	        }
460  	        fencer_api->cmds->remove_notification(fencer_api, NULL);
461  	    }
462  	
463  	    if (AM_I_DC) {
464  	        fail_incompletable_fencing(controld_globals.transition_graph);
465  	        trigger_graph();
466  	    }
467  	}
468  	
/*!
 * \internal
 * \brief Handle an event notification from the fencing API
 *
 * Logs the result, reacts to being fenced ourselves, maintains per-target
 * fail counts on non-DC nodes, and (on success) updates peer state and the
 * CIB as appropriate.
 *
 * \param[in] st     Fencing API connection (ignored)
 * \param[in] event  Fencing API event notification
 */
static void
handle_fence_notification(stonith_t *st, stonith_event_t *event)
{
    bool succeeded = true;
    const char *executioner = "the cluster";
    const char *client = "a client";
    const char *reason = NULL;
    int exec_status;

    // Lazily build our own client ID to recognize self-initiated fencing
    if (te_client_id == NULL) {
        te_client_id = pcmk__assert_asprintf("%s.%lu", crm_system_name,
                                             (unsigned long) getpid());
    }

    if (event == NULL) {
        pcmk__err("Notify data not found");
        return;
    }

    if (event->executioner != NULL) {
        executioner = event->executioner;
    }
    if (event->client_origin != NULL) {
        client = event->client_origin;
    }

    /* Normalize status: any nonzero exit status counts as a failure, even if
     * the execution status claims completion
     */
    exec_status = stonith__event_execution_status(event);
    if ((stonith__event_exit_status(event) != CRM_EX_OK)
        || (exec_status != PCMK_EXEC_DONE)) {
        succeeded = false;
        if (exec_status == PCMK_EXEC_DONE) {
            exec_status = PCMK_EXEC_ERROR;
        }
    }
    reason = stonith__event_exit_reason(event);

    crmd_alert_fencing_op(event);

    if (pcmk__str_eq(PCMK_ACTION_ON, event->action, pcmk__str_none)) {
        // Unfencing doesn't need special handling, just a log message
        if (succeeded) {
            pcmk__notice("%s was unfenced by %s at the request of %s@%s",
                         event->target, executioner, client, event->origin);
        } else {
            pcmk__err("Unfencing of %s by %s failed (%s%s%s) with exit status "
                      "%d",
                      event->target, executioner,
                      pcmk_exec_status_str(exec_status),
                      ((reason == NULL)? "" : ": "),
                      pcmk__s(reason, ""), stonith__event_exit_status(event));
        }
        return;
    }

    if (succeeded && controld_is_local_node(event->target)) {
        /* We were notified of our own fencing. Most likely, either fencing was
         * misconfigured, or fabric fencing that doesn't cut cluster
         * communication is in use.
         *
         * Either way, shutting down the local host is a good idea, to require
         * administrator intervention. Also, other nodes would otherwise likely
         * set our status to lost because of the fencing callback and discard
         * our subsequent election votes as "not part of our cluster".
         */
        pcmk__crit("We were allegedly just fenced by %s for %s!", executioner,
                   event->origin); // Dumps blackbox if enabled
        if (fence_reaction_panic) {
            pcmk__panic("Notified of own fencing");
        } else {
            crm_exit(CRM_EX_FATAL);
        }
        return; // Should never get here
    }

    /* Update the count of fencing failures for this target, in case we become
     * DC later. The current DC has already updated its fail count in
     * fencing_cb().
     */
    if (!AM_I_DC) {
        if (succeeded) {
            controld_reset_fencing_fail_count(event->target);
        } else {
            increment_fencing_fail_count(event->target);
        }
    }

    pcmk__notice("Peer %s was%s terminated (%s) by %s on behalf of %s@%s: "
                 "%s%s%s%s " QB_XS " event=%s",
                 event->target, (succeeded? "" : " not"), event->action,
                 executioner, client, event->origin,
                 (succeeded? "OK" : pcmk_exec_status_str(exec_status)),
                 ((reason != NULL)? " (" : ""), pcmk__s(reason, ""),
                 ((reason != NULL)? ")" : ""), event->id);

    if (succeeded) {
        const uint32_t flags = pcmk__node_search_any
                               |pcmk__node_search_cluster_cib;

        pcmk__node_status_t *peer = pcmk__search_node_caches(0, event->target,
                                                             NULL, flags);
        const char *uuid = NULL;

        // Nothing more to do for nodes we know nothing about
        if (peer == NULL) {
            return;
        }

        uuid = pcmk__cluster_get_xml_id(peer);

        if (AM_I_DC) {
            /* The DC always sends updates */
            update_node_state_after_fencing(event->target, uuid);

            /* @TODO Ideally, at this point, we'd check whether the fenced node
             * hosted any guest nodes, and call remote_node_down() for them.
             * Unfortunately, the controller doesn't have a simple, reliable way
             * to map hosts to guests. It might be possible to track this in the
             * peer cache via refresh_remote_nodes(). For now, we rely on the
             * scheduler creating fence pseudo-events for the guests.
             */

            if (!pcmk__str_eq(client, te_client_id, pcmk__str_casei)) {
                /* Abort the current transition if it wasn't the cluster that
                 * initiated fencing.
                 */
                pcmk__info("External fencing operation from %s fenced %s",
                           client, event->target);
                abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart,
                                 "External Fencing Operation", NULL);
            }

        } else if (pcmk__str_eq(controld_globals.dc_name, event->target,
                                pcmk__str_null_matches|pcmk__str_casei)
                   && !pcmk__is_set(peer->flags, pcmk__node_status_remote)) {
            // Assume the target was our DC if we don't currently have one

            if (controld_globals.dc_name != NULL) {
                pcmk__notice("Fencing target %s was our DC", event->target);
            } else {
                pcmk__notice("Fencing target %s may have been our DC",
                             event->target);
            }

            /* Given the CIB resyncing that occurs around elections,
             * have one node update the CIB now and, if the new DC is different,
             * have them do so too after the election
             */
            if (controld_is_local_node(event->executioner)) {
                update_node_state_after_fencing(event->target, uuid);
            }
            add_fencing_cleanup(event->target);
        }

        /* If the target is a remote node, and we host its connection,
         * immediately fail all monitors so it can be recovered quickly.
         * The connection won't necessarily drop when a remote node is fenced,
         * so the failure might not otherwise be detected until the next poke.
         */
        if (pcmk__is_set(peer->flags, pcmk__node_status_remote)) {
            remote_ra_fail(event->target);
        }

        crmd_peer_down(peer, TRUE);
     }
}
640  	
/*!
 * \brief Connect to fencer
 *
 * \param[in] user_data  If NULL, retry failures now, otherwise retry in mainloop timer
 *
 * \return G_SOURCE_REMOVE on success, G_SOURCE_CONTINUE to retry
 * \note If user_data is NULL, this will wait 2s between attempts, for up to
 *       30 attempts, meaning the controller could be blocked as long as 58s.
 */
gboolean
controld_timer_fencer_connect(gpointer user_data)
{
    int rc = pcmk_ok;

    // Lazily allocate the API object on first call
    if (fencer_api == NULL) {
        fencer_api = stonith__api_new();
        if (fencer_api == NULL) {
            pcmk__err("Could not connect to fencer: API memory allocation "
                      "failed");
            return G_SOURCE_REMOVE;
        }
    }

    if (fencer_api->state != stonith_disconnected) {
        pcmk__trace("Already connected to fencer, no need to retry");
        return G_SOURCE_REMOVE;
    }

    if (user_data == NULL) {
        // Blocking (retry failures now until successful)
        rc = stonith__api_connect_retry(fencer_api, crm_system_name, 30);
        if (rc != pcmk_rc_ok) {
            pcmk__err("Could not connect to fencer in 30 attempts: %s "
                      QB_XS " rc=%d", pcmk_rc_str(rc), rc);
        }
    } else {
        // Non-blocking (retry failures later in main loop)
        rc = fencer_api->cmds->connect(fencer_api, crm_system_name, NULL);

        // Ensure the retry timer exists before we might need to start it
        if (controld_fencer_connect_timer == NULL) {
            controld_fencer_connect_timer =
                mainloop_timer_add("controld_fencer_connect", 1000,
                                   TRUE, controld_timer_fencer_connect,
                                   GINT_TO_POINTER(TRUE));
        }

        if (rc != pcmk_ok) {
            // Keep retrying only while the connection is still required
            if (pcmk__is_set(controld_globals.fsa_input_register,
                             R_ST_REQUIRED)) {
                pcmk__notice("Fencer connection failed (will retry): %s "
                             QB_XS " rc=%d",
                             pcmk_strerror(rc), rc);

                if (!mainloop_timer_running(controld_fencer_connect_timer)) {
                    mainloop_timer_start(controld_fencer_connect_timer);
                }

                return G_SOURCE_CONTINUE;
            } else {
                pcmk__info("Fencer connection failed (ignoring because no "
                           "longer required): %s " QB_XS " rc=%d",
                           pcmk_strerror(rc), rc);
            }
            return G_SOURCE_REMOVE;
        }
    }

    if (rc == pcmk_ok) {
        // Connected: (re-)register for notifications and trigger history sync
        stonith_api_operations_t *cmds = fencer_api->cmds;

        cmds->register_notification(fencer_api,
                                    PCMK__VALUE_ST_NOTIFY_DISCONNECT,
                                    destroy_fencer_connection);
        cmds->register_notification(fencer_api, PCMK__VALUE_ST_NOTIFY_FENCE,
                                    handle_fence_notification);
        cmds->register_notification(fencer_api,
                                    PCMK__VALUE_ST_NOTIFY_HISTORY_SYNCED,
                                    fencing_history_synced);
        controld_trigger_fencing_history_sync(true);
        pcmk__notice("Fencer successfully connected");
    }

    return G_SOURCE_REMOVE;
}
725  	
726  	void
727  	controld_disconnect_fencer(bool destroy)
728  	{
(1) Event path: Condition "fencer_api != NULL", taking true branch.
729  	    if (fencer_api != NULL) {
730  	        // Prevent fencer connection from coming up again
731  	        controld_clear_fsa_input_flags(R_ST_REQUIRED);
732  	
(2) Event path: Condition "fencer_api->state != stonith_disconnected", taking true branch.
733  	        if (fencer_api->state != stonith_disconnected) {
734  	            fencer_api->cmds->disconnect(fencer_api);
735  	        }
736  	        fencer_api->cmds->remove_notification(fencer_api, NULL);
737  	    }
(3) Event path: Condition "destroy", taking true branch.
738  	    if (destroy) {
(4) Event path: Condition "fencer_api != NULL", taking true branch.
739  	        if (fencer_api != NULL) {
CID (unavailable; MK=754c9290c5bc560bb158788589b9b74a) (#1 of 3): Inconsistent C union access (INCONSISTENT_UNION_ACCESS):
(5) Event assign_union_field: The union field "in" of "_pp" is written.
(6) Event inconsistent_union_field_access: In "_pp.out", the union field used: "out" is inconsistent with the field most recently stored: "in".
740  	            g_clear_pointer(&fencer_api, fencer_api->cmds->free);
741  	        }
742  	
743  	        g_clear_pointer(&controld_fencer_connect_timer, mainloop_timer_del);
744  	        g_clear_pointer(&te_client_id, free);
745  	    }
746  	}
747  	
748  	static gboolean
749  	sync_fencing_history(gpointer user_data)
750  	{
751  	    if ((fencer_api != NULL) && (fencer_api->state != stonith_disconnected)) {
752  	        stonith_history_t *history = NULL;
753  	
754  	        controld_cleanup_fencing_history_sync(fencer_api, false);
755  	        fencer_api->cmds->history(fencer_api, st_opt_sync_call|st_opt_broadcast,
756  	                                  NULL, &history, 5);
757  	        stonith__history_free(history);
758  	        return TRUE;
759  	    } else {
760  	        pcmk__info("Skipping triggering fencing history sync because fencer is "
761  	                   "disconnected");
762  	        return FALSE;
763  	    }
764  	}
765  	
766  	static void
767  	fencing_cb(stonith_t *stonith, stonith_callback_data_t *data)
768  	{
769  	    char *uuid = NULL;
770  	    int transition_id = -1;
771  	    int action_id = -1;
772  	    pcmk__graph_action_t *action = NULL;
773  	    const char *target = NULL;
774  	
775  	    if ((data == NULL) || (data->userdata == NULL)) {
776  	        pcmk__err("Ignoring fence operation %d result: No transition key given "
777  	                  "(bug?)",
778  	                  ((data == NULL)? -1 : data->call_id));
779  	        return;
780  	    }
781  	
782  	    if (!AM_I_DC) {
783  	        const char *reason = stonith__exit_reason(data);
784  	
785  	        if (reason == NULL) {
786  	           reason = pcmk_exec_status_str(stonith__execution_status(data));
787  	        }
788  	        pcmk__notice("Result of fence operation %d: %d (%s) " QB_XS " key=%s",
789  	                     data->call_id, stonith__exit_status(data), reason,
790  	                     (const char *) data->userdata);
791  	        return;
792  	    }
793  	
794  	    CRM_CHECK(decode_transition_key(data->userdata, &uuid, &transition_id,
795  	                                    &action_id, NULL),
796  	              goto bail);
797  	
798  	    if (controld_globals.transition_graph->complete || (action_id < 0)
799  	        || !pcmk__str_eq(uuid, controld_globals.te_uuid, pcmk__str_none)
800  	        || (controld_globals.transition_graph->id != transition_id)) {
801  	
802  	        pcmk__info("Ignoring fence operation %d result: Not from current "
803  	                   "transition " QB_XS " complete=%s action=%d uuid=%s (vs %s) "
804  	                   "transition=%d (vs %d)",
805  	                   data->call_id,
806  	                   pcmk__btoa(controld_globals.transition_graph->complete),
807  	                   action_id, uuid, controld_globals.te_uuid, transition_id,
808  	                   controld_globals.transition_graph->id);
809  	        goto bail;
810  	    }
811  	
812  	    action = controld_get_action(action_id);
813  	    if (action == NULL) {
814  	        pcmk__err("Ignoring fence operation %d result: Action %d not found in "
815  	                  "transition graph (bug?) " QB_XS " uuid=%s transition=%d",
816  	                  data->call_id, action_id, uuid, transition_id);
817  	        goto bail;
818  	    }
819  	
820  	    target = pcmk__xe_get(action->xml, PCMK__META_ON_NODE);
821  	    if (target == NULL) {
822  	        pcmk__err("Ignoring fence operation %d result: No target given (bug?)",
823  	                  data->call_id);
824  	        goto bail;
825  	    }
826  	
827  	    stop_te_timer(action);
828  	    if (stonith__exit_status(data) == CRM_EX_OK) {
829  	        const char *uuid = pcmk__xe_get(action->xml, PCMK__META_ON_NODE_UUID);
830  	        const char *op = crm_meta_value(action->params,
831  	                                        PCMK__META_STONITH_ACTION);
832  	
833  	        pcmk__info("Fence operation %d for %s succeeded", data->call_id,
834  	                  target);
835  	        if (!(pcmk__is_set(action->flags, pcmk__graph_action_confirmed))) {
836  	            te_action_confirmed(action, NULL);
837  	            if (pcmk__str_eq(PCMK_ACTION_ON, op, pcmk__str_casei)) {
838  	                const char *value = NULL;
839  	                char *now = pcmk__ttoa(time(NULL));
840  	                bool is_remote_node = false;
841  	
842  	                /* This check is not 100% reliable, since this node is not
843  	                 * guaranteed to have the remote node cached. However, it
844  	                 * doesn't have to be reliable, since the attribute manager can
845  	                 * learn a node's "remoteness" by other means sooner or later.
846  	                 * This allows it to learn more quickly if this node does have
847  	                 * the information.
848  	                 */
849  	                if (g_hash_table_lookup(pcmk__remote_peer_cache,
850  	                                        uuid) != NULL) {
851  	                    is_remote_node = true;
852  	                }
853  	
854  	                update_attrd(target, CRM_ATTR_UNFENCED, now, is_remote_node);
855  	                free(now);
856  	
857  	                value = crm_meta_value(action->params, PCMK__META_DIGESTS_ALL);
858  	                update_attrd(target, CRM_ATTR_DIGESTS_ALL, value,
859  	                             is_remote_node);
860  	
861  	                value = crm_meta_value(action->params,
862  	                                       PCMK__META_DIGESTS_SECURE);
863  	                update_attrd(target, CRM_ATTR_DIGESTS_SECURE, value,
864  	                             is_remote_node);
865  	
866  	            } else if (!pcmk__is_set(action->flags,
867  	                                     pcmk__graph_action_sent_update)) {
868  	                update_node_state_after_fencing(target, uuid);
869  	                pcmk__set_graph_action_flags(action,
870  	                                             pcmk__graph_action_sent_update);
871  	            }
872  	        }
873  	        controld_reset_fencing_fail_count(target);
874  	
875  	    } else {
876  	        enum pcmk__graph_next abort_action = pcmk__graph_restart;
877  	        int status = stonith__execution_status(data);
878  	        const char *reason = stonith__exit_reason(data);
879  	
880  	        if (reason == NULL) {
881  	            if (status == PCMK_EXEC_DONE) {
882  	                reason = "Agent returned error";
883  	            } else {
884  	                reason = pcmk_exec_status_str(status);
885  	            }
886  	        }
887  	        pcmk__set_graph_action_flags(action, pcmk__graph_action_failed);
888  	
889  	        /* If no fence devices were available, there's no use in immediately
890  	         * checking again, so don't start a new transition in that case.
891  	         */
892  	        if (status == PCMK_EXEC_NO_FENCE_DEVICE) {
893  	            pcmk__warn("Fence operation %d for %s failed: %s (aborting "
894  	                       "transition and giving up for now)",
895  	                       data->call_id, target, reason);
896  	            abort_action = pcmk__graph_wait;
897  	        } else {
898  	            pcmk__notice("Fence operation %d for %s failed: %s (aborting "
899  	                         "transition)",
900  	                         data->call_id, target, reason);
901  	        }
902  	
903  	        /* Increment the fail count now, so abort_for_fencing_failure() can
904  	         * check it. Non-DC nodes will increment it in
905  	         * handle_fence_notification().
906  	         */
907  	        increment_fencing_fail_count(target);
908  	        abort_for_fencing_failure(abort_action, target, NULL);
909  	    }
910  	
911  	    pcmk__update_graph(controld_globals.transition_graph, action);
912  	    trigger_graph();
913  	
914  	  bail:
915  	    free(data->userdata);
916  	    free(uuid);
917  	}
918  	
919  	static int
920  	fence_with_delay(const char *target, const char *type, int delay)
921  	{
922  	    uint32_t options = st_opt_none; // Group of enum stonith_call_options
923  	    int timeout_sec =
924  	        pcmk__timeout_ms2s(controld_globals.transition_graph->fencing_timeout);
925  	
926  	    if (crmd_join_phase_count(controld_join_confirmed) == 1) {
927  	        stonith__set_call_options(options, target, st_opt_allow_self_fencing);
928  	    }
929  	    return fencer_api->cmds->fence_with_delay(fencer_api, options, target, type,
930  	                                              timeout_sec, 0, delay);
931  	}
932  	
933  	/*!
934  	 * \internal
935  	 * \brief Execute a fencing action from a transition graph
936  	 *
937  	 * \param[in] graph   Transition graph being executed (ignored)
938  	 * \param[in] action  Fencing action to execute
939  	 *
940  	 * \return Standard Pacemaker return code
941  	 */
942  	int
943  	controld_execute_fence_action(pcmk__graph_t *graph,
944  	                              pcmk__graph_action_t *action)
945  	{
946  	    int rc = 0;
947  	    const char *id = pcmk__xe_id(action->xml);
948  	    const char *uuid = pcmk__xe_get(action->xml, PCMK__META_ON_NODE_UUID);
949  	    const char *target = pcmk__xe_get(action->xml, PCMK__META_ON_NODE);
950  	    const char *type = crm_meta_value(action->params,
951  	                                      PCMK__META_STONITH_ACTION);
952  	    char *transition_key = NULL;
953  	    const char *priority_delay = NULL;
954  	    int delay_i = 0;
955  	    gboolean invalid_action = FALSE;
956  	    int timeout_sec =
957  	        pcmk__timeout_ms2s(controld_globals.transition_graph->fencing_timeout);
958  	
959  	    CRM_CHECK(id != NULL, invalid_action = TRUE);
960  	    CRM_CHECK(uuid != NULL, invalid_action = TRUE);
961  	    CRM_CHECK(type != NULL, invalid_action = TRUE);
962  	    CRM_CHECK(target != NULL, invalid_action = TRUE);
963  	
964  	    if (invalid_action) {
965  	        pcmk__log_xml_warn(action->xml, "BadAction");
966  	        return EPROTO;
967  	    }
968  	
969  	    priority_delay = crm_meta_value(action->params,
970  	                                    PCMK_OPT_PRIORITY_FENCING_DELAY);
971  	
972  	    pcmk__notice("Requesting fencing (%s) targeting node %s "
973  	                 QB_XS " action=%s timeout=%i%s%s",
974  	                 type, target, id, timeout_sec,
975  	                 ((priority_delay != NULL)? " priority_delay=" : ""),
976  	                 pcmk__s(priority_delay, ""));
977  	
978  	    /* Passing NULL means block until we can connect... */
979  	    controld_timer_fencer_connect(NULL);
980  	
981  	    pcmk__scan_min_int(priority_delay, &delay_i, 0);
982  	    rc = fence_with_delay(target, type, delay_i);
983  	    transition_key = pcmk__transition_key(controld_globals.transition_graph->id,
984  	                                          action->id, 0,
985  	                                          controld_globals.te_uuid),
986  	    fencer_api->cmds->register_callback(fencer_api, rc,
987  	                                        (timeout_sec
988  	                                         + (delay_i > 0 ? delay_i : 0)),
989  	                                        st_opt_timeout_updates, transition_key,
990  	                                        "fencing_cb", fencing_cb);
991  	    return pcmk_rc_ok;
992  	}
993  	
994  	void
995  	controld_validate_fencing_watchdog_timeout(const char *value)
996  	{
997  	    const char *our_nodename = controld_globals.cluster->priv->node_name;
998  	
999  	    // Validate only if the timeout will be used
1000 	    if ((fencer_api != NULL) && (fencer_api->state != stonith_disconnected)
1001 	        && stonith__watchdog_fencing_enabled_for_node_api(fencer_api,
1002 	                                                          our_nodename)) {
1003 	
1004 	        pcmk__valid_fencing_watchdog_timeout(value);
1005 	    }
1006 	}
1007 	
1008 	/* end fencer API client functions */
1009 	
1010 	
1011 	/*
1012 	 * Fencing history synchronization
1013 	 *
1014 	 * Each node's fencer keeps track of a cluster-wide fencing history. When a node
1015 	 * joins or leaves, we need to synchronize the history across all nodes.
1016 	 */
1017 	
// Deferred trigger that initiates a fencing history sync
static crm_trigger_t *fencing_history_sync_trigger = NULL;

// 5-second timer used to batch sync requests (see mainloop_timer_add() calls)
static mainloop_timer_t *fencing_history_sync_timer_short = NULL;

// 30-second fallback timer in case the DC never triggers a sync
static mainloop_timer_t *fencing_history_sync_timer_long = NULL;
1021 	
1022 	void
1023 	controld_cleanup_fencing_history_sync(stonith_t *st, bool free_timers)
1024 	{
1025 	    if (free_timers) {
1026 	        g_clear_pointer(&fencing_history_sync_timer_short, mainloop_timer_del);
1027 	        g_clear_pointer(&fencing_history_sync_timer_long, mainloop_timer_del);
1028 	
1029 	    } else {
1030 	        mainloop_timer_stop(fencing_history_sync_timer_short);
1031 	        mainloop_timer_stop(fencing_history_sync_timer_long);
1032 	    }
1033 	
1034 	    if (st) {
1035 	        st->cmds->remove_notification(st, PCMK__VALUE_ST_NOTIFY_HISTORY_SYNCED);
1036 	    }
1037 	}
1038 	
/*!
 * \internal
 * \brief Handle a fencer notification that fencing history has been synced
 *
 * Stops the short and long history sync timers (without freeing them) and
 * deregisters this notification handler, since the sync is complete.
 *
 * \param[in,out] st        Fencer API connection
 * \param[in]     st_event  Notification event (unused)
 */
static void
fencing_history_synced(stonith_t *st, stonith_event_t *st_event)
{
    controld_cleanup_fencing_history_sync(st, false);
    pcmk__debug("Fencing history synced - cancel all timers");
}
1045 	
/*!
 * \internal
 * \brief Timer callback that activates the fencing history sync trigger
 *
 * \param[in] user_data  Ignored
 *
 * \return FALSE (do not repeat the timer)
 */
static gboolean
fencing_history_sync_set_trigger(gpointer user_data)
{
    mainloop_set_trigger(fencing_history_sync_trigger);
    return FALSE;
}
1052 	
1053 	void
1054 	controld_trigger_fencing_history_sync(bool long_timeout)
1055 	{
1056 	    /* trigger a sync in 5s to give more nodes the
1057 	     * chance to show up so that we don't create
1058 	     * unnecessary fencing-history-sync traffic
1059 	     *
1060 	     * the long timeout of 30s is there as a fallback
1061 	     * so that after a successful connection to fenced
1062 	     * we will wait for 30s for the DC to trigger a
1063 	     * history-sync
1064 	     * if this doesn't happen we trigger a sync locally
1065 	     * (e.g. fenced segfaults and is restarted by pacemakerd)
1066 	     */
1067 	
1068 	    /* as we are finally checking the fencer connection
1069 	     * in sync_fencing_history() we should be fine
1070 	     * leaving fencing_history_sync_timer_short,
1071 	     * fencing_history_sync_timer_long, and fencing_history_sync_trigger
1072 	     * around
1073 	     */
1074 	    if (fencing_history_sync_trigger == NULL) {
1075 	        fencing_history_sync_trigger =
1076 	            mainloop_add_trigger(G_PRIORITY_LOW, sync_fencing_history, NULL);
1077 	    }
1078 	
1079 	    if (long_timeout) {
1080 	        if (fencing_history_sync_timer_long == NULL) {
1081 	            fencing_history_sync_timer_long =
1082 	                mainloop_timer_add("history_sync_long", 30000,
1083 	                                   FALSE, fencing_history_sync_set_trigger,
1084 	                                   NULL);
1085 	        }
1086 	        pcmk__info("Fence history will be synchronized cluster-wide within 30 "
1087 	                   "seconds");
1088 	        mainloop_timer_start(fencing_history_sync_timer_long);
1089 	
1090 	    } else {
1091 	        if (fencing_history_sync_timer_short == NULL) {
1092 	            fencing_history_sync_timer_short =
1093 	                mainloop_timer_add("history_sync_short", 5000,
1094 	                                   FALSE, fencing_history_sync_set_trigger,
1095 	                                   NULL);
1096 	        }
1097 	        pcmk__info("Fence history will be synchronized cluster-wide within 5 "
1098 	                   "seconds");
1099 	        mainloop_timer_start(fencing_history_sync_timer_short);
1100 	    }
1101 	
1102 	}
1103 	
1104 	/* end fencing history synchronization functions */
1105