1    	/*
2    	 * Copyright 2004-2023 the Pacemaker project contributors
3    	 *
4    	 * The version control history for this file may have further details.
5    	 *
6    	 * This source code is licensed under the GNU General Public License version 2
7    	 * or later (GPLv2+) WITHOUT ANY WARRANTY.
8    	 */
9    	
10   	#include <crm_internal.h>
11   	#include <crm/crm.h>
12   	#include <crm/msg_xml.h>
13   	#include <crm/common/xml.h>
14   	#include <crm/stonith-ng.h>
15   	#include <crm/fencing/internal.h>
16   	
17   	#include <pacemaker-controld.h>
18   	
/* Forward declaration: controld_timer_fencer_connect() registers this handler
 * for T_STONITH_NOTIFY_HISTORY_SYNCED notifications before its definition
 * appears in this file.
 */
static void
tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event);
21   	
22   	/*
23   	 * stonith failure counting
24   	 *
25   	 * We don't want to get stuck in a permanent fencing loop. Keep track of the
26   	 * number of fencing failures for each target node, and the most we'll restart a
27   	 * transition for.
28   	 */
29   	
// Per-target failure record, stored as a value in the stonith_failures table
struct st_fail_rec {
    int count;  // Fencing failures recorded for the target since its last reset
};

// If true, panic (rather than exit) when notified that the local node was fenced
static bool fence_reaction_panic = false;

// Failure count at which too_many_st_failures() reports that we should give up
static unsigned long int stonith_max_attempts = 10;

// Maps target node name to its struct st_fail_rec; created on first failure
static GHashTable *stonith_failures = NULL;
37   	
38   	/*!
39   	 * \internal
40   	 * \brief Update max fencing attempts before giving up
41   	 *
42   	 * \param[in] value  New max fencing attempts
43   	 */
44   	static void
45   	update_stonith_max_attempts(const char *value)
46   	{
47   	    stonith_max_attempts = char2score(value);
48   	    if (stonith_max_attempts < 1UL) {
49   	        stonith_max_attempts = 10UL;
50   	    }
51   	}
52   	
53   	/*!
54   	 * \internal
55   	 * \brief Configure reaction to notification of local node being fenced
56   	 *
57   	 * \param[in] reaction_s  Reaction type
58   	 */
59   	static void
60   	set_fence_reaction(const char *reaction_s)
61   	{
62   	    if (pcmk__str_eq(reaction_s, "panic", pcmk__str_casei)) {
63   	        fence_reaction_panic = true;
64   	
65   	    } else {
66   	        if (!pcmk__str_eq(reaction_s, "stop", pcmk__str_casei)) {
67   	            crm_warn("Invalid value '%s' for %s, using 'stop'",
68   	                     reaction_s, XML_CONFIG_ATTR_FENCE_REACTION);
69   	        }
70   	        fence_reaction_panic = false;
71   	    }
72   	}
73   	
74   	/*!
75   	 * \internal
76   	 * \brief Configure fencing options based on the CIB
77   	 *
78   	 * \param[in,out] options  Name/value pairs for configured options
79   	 */
80   	void
81   	controld_configure_fencing(GHashTable *options)
82   	{
83   	    const char *value = NULL;
84   	
85   	    value = g_hash_table_lookup(options, XML_CONFIG_ATTR_FENCE_REACTION);
86   	    set_fence_reaction(value);
87   	
88   	    value = g_hash_table_lookup(options, "stonith-max-attempts");
89   	    update_stonith_max_attempts(value);
90   	}
91   	
92   	static gboolean
93   	too_many_st_failures(const char *target)
94   	{
95   	    GHashTableIter iter;
96   	    const char *key = NULL;
97   	    struct st_fail_rec *value = NULL;
98   	
99   	    if (stonith_failures == NULL) {
100  	        return FALSE;
101  	    }
102  	
103  	    if (target == NULL) {
104  	        g_hash_table_iter_init(&iter, stonith_failures);
105  	        while (g_hash_table_iter_next(&iter, (gpointer *) &key,
106  	               (gpointer *) &value)) {
107  	
108  	            if (value->count >= stonith_max_attempts) {
109  	                target = (const char*)key;
110  	                goto too_many;
111  	            }
112  	        }
113  	    } else {
114  	        value = g_hash_table_lookup(stonith_failures, target);
115  	        if ((value != NULL) && (value->count >= stonith_max_attempts)) {
116  	            goto too_many;
117  	        }
118  	    }
119  	    return FALSE;
120  	
121  	too_many:
122  	    crm_warn("Too many failures (%d) to fence %s, giving up",
123  	             value->count, target);
124  	    return TRUE;
125  	}
126  	
127  	/*!
128  	 * \internal
129  	 * \brief Reset a stonith fail count
130  	 *
131  	 * \param[in] target  Name of node to reset, or NULL for all
132  	 */
133  	void
134  	st_fail_count_reset(const char *target)
135  	{
136  	    if (stonith_failures == NULL) {
137  	        return;
138  	    }
139  	
140  	    if (target) {
141  	        struct st_fail_rec *rec = NULL;
142  	
143  	        rec = g_hash_table_lookup(stonith_failures, target);
144  	        if (rec) {
145  	            rec->count = 0;
146  	        }
147  	    } else {
148  	        GHashTableIter iter;
149  	        const char *key = NULL;
150  	        struct st_fail_rec *rec = NULL;
151  	
152  	        g_hash_table_iter_init(&iter, stonith_failures);
153  	        while (g_hash_table_iter_next(&iter, (gpointer *) &key,
154  	                                      (gpointer *) &rec)) {
155  	            rec->count = 0;
156  	        }
157  	    }
158  	}
159  	
160  	static void
161  	st_fail_count_increment(const char *target)
162  	{
163  	    struct st_fail_rec *rec = NULL;
164  	
165  	    if (stonith_failures == NULL) {
166  	        stonith_failures = pcmk__strkey_table(free, free);
167  	    }
168  	
169  	    rec = g_hash_table_lookup(stonith_failures, target);
170  	    if (rec) {
171  	        rec->count++;
172  	    } else {
173  	        rec = malloc(sizeof(struct st_fail_rec));
174  	        if(rec == NULL) {
175  	            return;
176  	        }
177  	
178  	        rec->count = 1;
179  	        g_hash_table_insert(stonith_failures, strdup(target), rec);
180  	    }
181  	}
182  	
183  	/* end stonith fail count functions */
184  	
185  	
186  	static void
187  	cib_fencing_updated(xmlNode *msg, int call_id, int rc, xmlNode *output,
188  	                    void *user_data)
189  	{
190  	    if (rc < pcmk_ok) {
191  	        crm_err("Fencing update %d for %s: failed - %s (%d)",
192  	                call_id, (char *)user_data, pcmk_strerror(rc), rc);
193  	        crm_log_xml_warn(msg, "Failed update");
194  	        abort_transition(INFINITY, pcmk__graph_shutdown, "CIB update failed",
195  	                         NULL);
196  	
197  	    } else {
198  	        crm_info("Fencing update %d for %s: complete", call_id, (char *)user_data);
199  	    }
200  	}
201  	
/*!
 * \internal
 * \brief Update the peer cache and CIB node state for a fenced node
 *
 * Marks the peer as down, sends a node state update to the CIB status section
 * (registering cib_fencing_updated() as the callback for the result), and then
 * requests deletion of the node's state via controld_delete_node_state().
 *
 * \param[in] action  Not referenced by this function
 * \param[in] target  Name of the fenced node (must not be NULL)
 * \param[in] uuid    ID of the fenced node (must not be NULL)
 */
static void
send_stonith_update(pcmk__graph_action_t *action, const char *target,
                    const char *uuid)
{
    int rc = pcmk_ok;
    crm_node_t *peer = NULL;

    /* We (usually) rely on the membership layer to do node_update_cluster,
     * and the peer status callback to do node_update_peer, because the node
     * might have already rejoined before we get the stonith result here.
     */
    int flags = node_update_join | node_update_expected;

    /* zero out the node-status & remove all LRM status info */
    xmlNode *node_state = NULL;

    CRM_CHECK(target != NULL, return);
    CRM_CHECK(uuid != NULL, return);

    /* Make sure the membership and join caches are accurate.
     * Try getting any existing node cache entry also by node uuid in case it
     * doesn't have an uname yet.
     */
    peer = pcmk__get_peer_full(0, target, uuid, CRM_GET_PEER_ANY);

    CRM_CHECK(peer != NULL, return);

    if (peer->state == NULL) {
        /* Usually, we rely on the membership layer to update the cluster state
         * in the CIB. However, if the node has never been seen, do it here, so
         * the node is not considered unclean.
         */
        flags |= node_update_cluster;
    }

    if (peer->uuid == NULL) {
        crm_info("Recording uuid '%s' for node '%s'", uuid, target);
        peer->uuid = strdup(uuid);
    }

    crmd_peer_down(peer, TRUE);

    /* Generate a node state update for the CIB */
    node_state = create_node_state_update(peer, flags, NULL, __func__);

    /* we have to mark whether or not remote nodes have already been fenced */
    if (peer->flags & crm_remote_node) {
        char *now_s = pcmk__ttoa(time(NULL));

        crm_xml_add(node_state, XML_NODE_IS_FENCED, now_s);
        free(now_s);
    }

    /* Force our known ID */
    crm_xml_add(node_state, XML_ATTR_ID, uuid);

    // The return value is used below as the call ID for the result callback
    rc = controld_globals.cib_conn->cmds->modify(controld_globals.cib_conn,
                                                 XML_CIB_TAG_STATUS, node_state,
                                                 cib_scope_local
                                                 |cib_can_create);

    /* Delay processing the trigger until the update completes */
    crm_debug("Sending fencing update %d for %s", rc, target);

    // The callback receives its own copy of the target name as user data
    fsa_register_cib_callback(rc, strdup(target), cib_fencing_updated);

    // Make sure it sticks
    /* controld_globals.cib_conn->cmds->bump_epoch(controld_globals.cib_conn,
     *                                             cib_scope_local);
     */

    controld_delete_node_state(peer->uname, controld_section_all,
                               cib_scope_local);
    free_xml(node_state);
    return;
}
277  	
278  	/*!
279  	 * \internal
280  	 * \brief Abort transition due to stonith failure
281  	 *
282  	 * \param[in] abort_action  Whether to restart or stop transition
283  	 * \param[in] target  Don't restart if this (NULL for any) has too many failures
284  	 * \param[in] reason  Log this stonith action XML as abort reason (or NULL)
285  	 */
286  	static void
287  	abort_for_stonith_failure(enum pcmk__graph_next abort_action,
288  	                          const char *target, const xmlNode *reason)
289  	{
290  	    /* If stonith repeatedly fails, we eventually give up on starting a new
291  	     * transition for that reason.
292  	     */
293  	    if ((abort_action != pcmk__graph_wait) && too_many_st_failures(target)) {
294  	        abort_action = pcmk__graph_wait;
295  	    }
296  	    abort_transition(INFINITY, abort_action, "Stonith failed", reason);
297  	}
298  	
299  	
300  	/*
301  	 * stonith cleanup list
302  	 *
303  	 * If the DC is shot, proper notifications might not go out.
304  	 * The stonith cleanup list allows the cluster to (re-)send
305  	 * notifications once a new DC is elected.
306  	 */
307  	
// Each element's data is a heap-allocated node name (see add_stonith_cleanup())
static GList *stonith_cleanup_list = NULL;
309  	
310  	/*!
311  	 * \internal
312  	 * \brief Add a node to the stonith cleanup list
313  	 *
314  	 * \param[in] target  Name of node to add
315  	 */
316  	void
317  	add_stonith_cleanup(const char *target) {
318  	    stonith_cleanup_list = g_list_append(stonith_cleanup_list, strdup(target));
319  	}
320  	
321  	/*!
322  	 * \internal
323  	 * \brief Remove a node from the stonith cleanup list
324  	 *
325  	 * \param[in] Name of node to remove
326  	 */
327  	void
328  	remove_stonith_cleanup(const char *target)
329  	{
330  	    GList *iter = stonith_cleanup_list;
331  	
332  	    while (iter != NULL) {
333  	        GList *tmp = iter;
334  	        char *iter_name = tmp->data;
335  	
336  	        iter = iter->next;
337  	        if (pcmk__str_eq(target, iter_name, pcmk__str_casei)) {
338  	            crm_trace("Removing %s from the cleanup list", iter_name);
339  	            stonith_cleanup_list = g_list_delete_link(stonith_cleanup_list, tmp);
340  	            free(iter_name);
341  	        }
342  	    }
343  	}
344  	
345  	/*!
346  	 * \internal
347  	 * \brief Purge all entries from the stonith cleanup list
348  	 */
349  	void
350  	purge_stonith_cleanup(void)
351  	{
352  	    if (stonith_cleanup_list) {
353  	        GList *iter = NULL;
354  	
(51) Event example_checked: Example 5: "iter->next" has its value checked in "iter != NULL".
Also see events: [null_field][alias_transfer][dereference][example_checked][example_checked][example_checked][example_checked]
355  	        for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) {
356  	            char *target = iter->data;
357  	
358  	            crm_info("Purging %s from stonith cleanup list", target);
359  	            free(target);
360  	        }
361  	        g_list_free(stonith_cleanup_list);
362  	        stonith_cleanup_list = NULL;
363  	    }
364  	}
365  	
366  	/*!
367  	 * \internal
368  	 * \brief Send stonith updates for all entries in cleanup list, then purge it
369  	 */
370  	void
371  	execute_stonith_cleanup(void)
372  	{
373  	    GList *iter;
374  	
375  	    for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) {
376  	        char *target = iter->data;
377  	        crm_node_t *target_node = crm_get_peer(0, target);
378  	        const char *uuid = crm_peer_uuid(target_node);
379  	
380  	        crm_notice("Marking %s, target of a previous stonith action, as clean", target);
381  	        send_stonith_update(NULL, target, uuid);
382  	        free(target);
383  	    }
384  	    g_list_free(stonith_cleanup_list);
385  	    stonith_cleanup_list = NULL;
386  	}
387  	
388  	/* end stonith cleanup list functions */
389  	
390  	
391  	/* stonith API client
392  	 *
393  	 * Functions that need to interact directly with the fencer via its API
394  	 */
395  	
// Fencer API connection, created on demand by controld_timer_fencer_connect()
static stonith_t *stonith_api = NULL;

// Main loop timer whose callback is controld_timer_fencer_connect()
static mainloop_timer_t *controld_fencer_connect_timer = NULL;

// "<crm_system_name>.<pid>" string that fence event client names are compared to
static char *te_client_id = NULL;
399  	
400  	static gboolean
401  	fail_incompletable_stonith(pcmk__graph_t *graph)
402  	{
403  	    GList *lpc = NULL;
404  	    const char *task = NULL;
405  	    xmlNode *last_action = NULL;
406  	
407  	    if (graph == NULL) {
408  	        return FALSE;
409  	    }
410  	
411  	    for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) {
412  	        GList *lpc2 = NULL;
413  	        pcmk__graph_synapse_t *synapse = (pcmk__graph_synapse_t *) lpc->data;
414  	
415  	        if (pcmk_is_set(synapse->flags, pcmk__synapse_confirmed)) {
416  	            continue;
417  	        }
418  	
419  	        for (lpc2 = synapse->actions; lpc2 != NULL; lpc2 = lpc2->next) {
420  	            pcmk__graph_action_t *action = (pcmk__graph_action_t *) lpc2->data;
421  	
422  	            if ((action->type != pcmk__cluster_graph_action)
423  	                || pcmk_is_set(action->flags, pcmk__graph_action_confirmed)) {
424  	                continue;
425  	            }
426  	
427  	            task = crm_element_value(action->xml, XML_LRM_ATTR_TASK);
428  	            if (pcmk__str_eq(task, PCMK_ACTION_STONITH, pcmk__str_casei)) {
429  	                pcmk__set_graph_action_flags(action, pcmk__graph_action_failed);
430  	                last_action = action->xml;
431  	                pcmk__update_graph(graph, action);
432  	                crm_notice("Failing action %d (%s): fencer terminated",
433  	                           action->id, ID(action->xml));
434  	            }
435  	        }
436  	    }
437  	
438  	    if (last_action != NULL) {
439  	        crm_warn("Fencer failure resulted in unrunnable actions");
440  	        abort_for_stonith_failure(pcmk__graph_restart, NULL, last_action);
441  	        return TRUE;
442  	    }
443  	
444  	    return FALSE;
445  	}
446  	
447  	static void
448  	tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e)
449  	{
450  	    te_cleanup_stonith_history_sync(st, FALSE);
451  	
452  	    if (pcmk_is_set(controld_globals.fsa_input_register, R_ST_REQUIRED)) {
453  	        crm_err("Lost fencer connection (will attempt to reconnect)");
454  	        if (!mainloop_timer_running(controld_fencer_connect_timer)) {
455  	            mainloop_timer_start(controld_fencer_connect_timer);
456  	        }
457  	    } else {
458  	        crm_info("Disconnected from fencer");
459  	    }
460  	
461  	    if (stonith_api) {
462  	        /* the client API won't properly reconnect notifications
463  	         * if they are still in the table - so remove them
464  	         */
465  	        if (stonith_api->state != stonith_disconnected) {
466  	            stonith_api->cmds->disconnect(st);
467  	        }
468  	        stonith_api->cmds->remove_notification(stonith_api, NULL);
469  	    }
470  	
471  	    if (AM_I_DC) {
472  	        fail_incompletable_stonith(controld_globals.transition_graph);
473  	        trigger_graph();
474  	    }
475  	}
476  	
477  	/*!
478  	 * \internal
479  	 * \brief Handle an event notification from the fencing API
480  	 *
481  	 * \param[in] st     Fencing API connection (ignored)
482  	 * \param[in] event  Fencing API event notification
483  	 */
484  	static void
485  	handle_fence_notification(stonith_t *st, stonith_event_t *event)
486  	{
487  	    bool succeeded = true;
488  	    const char *executioner = "the cluster";
489  	    const char *client = "a client";
490  	    const char *reason = NULL;
491  	    int exec_status;
492  	
493  	    if (te_client_id == NULL) {
494  	        te_client_id = crm_strdup_printf("%s.%lu", crm_system_name,
495  	                                         (unsigned long) getpid());
496  	    }
497  	
498  	    if (event == NULL) {
499  	        crm_err("Notify data not found");
500  	        return;
501  	    }
502  	
503  	    if (event->executioner != NULL) {
504  	        executioner = event->executioner;
505  	    }
506  	    if (event->client_origin != NULL) {
507  	        client = event->client_origin;
508  	    }
509  	
510  	    exec_status = stonith__event_execution_status(event);
511  	    if ((stonith__event_exit_status(event) != CRM_EX_OK)
512  	        || (exec_status != PCMK_EXEC_DONE)) {
513  	        succeeded = false;
514  	        if (exec_status == PCMK_EXEC_DONE) {
515  	            exec_status = PCMK_EXEC_ERROR;
516  	        }
517  	    }
518  	    reason = stonith__event_exit_reason(event);
519  	
520  	    crmd_alert_fencing_op(event);
521  	
522  	    if (pcmk__str_eq(PCMK_ACTION_ON, event->action, pcmk__str_none)) {
523  	        // Unfencing doesn't need special handling, just a log message
524  	        if (succeeded) {
525  	            crm_notice("%s was unfenced by %s at the request of %s@%s",
526  	                       event->target, executioner, client, event->origin);
527  	        } else {
528  	            crm_err("Unfencing of %s by %s failed (%s%s%s) with exit status %d",
529  	                    event->target, executioner,
530  	                    pcmk_exec_status_str(exec_status),
531  	                    ((reason == NULL)? "" : ": "),
532  	                    ((reason == NULL)? "" : reason),
533  	                    stonith__event_exit_status(event));
534  	        }
535  	        return;
536  	    }
537  	
538  	    if (succeeded
539  	        && pcmk__str_eq(event->target, controld_globals.our_nodename,
540  	                        pcmk__str_casei)) {
541  	        /* We were notified of our own fencing. Most likely, either fencing was
542  	         * misconfigured, or fabric fencing that doesn't cut cluster
543  	         * communication is in use.
544  	         *
545  	         * Either way, shutting down the local host is a good idea, to require
546  	         * administrator intervention. Also, other nodes would otherwise likely
547  	         * set our status to lost because of the fencing callback and discard
548  	         * our subsequent election votes as "not part of our cluster".
549  	         */
550  	        crm_crit("We were allegedly just fenced by %s for %s!",
551  	                 executioner, event->origin); // Dumps blackbox if enabled
552  	        if (fence_reaction_panic) {
553  	            pcmk__panic(__func__);
554  	        } else {
555  	            crm_exit(CRM_EX_FATAL);
556  	        }
557  	        return; // Should never get here
558  	    }
559  	
560  	    /* Update the count of fencing failures for this target, in case we become
561  	     * DC later. The current DC has already updated its fail count in
562  	     * tengine_stonith_callback().
563  	     */
564  	    if (!AM_I_DC) {
565  	        if (succeeded) {
566  	            st_fail_count_reset(event->target);
567  	        } else {
568  	            st_fail_count_increment(event->target);
569  	        }
570  	    }
571  	
572  	    crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s@%s: "
573  	               "%s%s%s%s " CRM_XS " event=%s",
574  	               event->target, (succeeded? "" : " not"),
575  	               event->action, executioner, client, event->origin,
576  	               (succeeded? "OK" : pcmk_exec_status_str(exec_status)),
577  	               ((reason == NULL)? "" : " ("),
578  	               ((reason == NULL)? "" : reason),
579  	               ((reason == NULL)? "" : ")"),
580  	               event->id);
581  	
582  	    if (succeeded) {
583  	        crm_node_t *peer = pcmk__search_known_node_cache(0, event->target,
584  	                                                         CRM_GET_PEER_ANY);
585  	        const char *uuid = NULL;
586  	
587  	        if (peer == NULL) {
588  	            return;
589  	        }
590  	
591  	        uuid = crm_peer_uuid(peer);
592  	
593  	        if (AM_I_DC) {
594  	            /* The DC always sends updates */
595  	            send_stonith_update(NULL, event->target, uuid);
596  	
597  	            /* @TODO Ideally, at this point, we'd check whether the fenced node
598  	             * hosted any guest nodes, and call remote_node_down() for them.
599  	             * Unfortunately, the controller doesn't have a simple, reliable way
600  	             * to map hosts to guests. It might be possible to track this in the
601  	             * peer cache via crm_remote_peer_cache_refresh(). For now, we rely
602  	             * on the scheduler creating fence pseudo-events for the guests.
603  	             */
604  	
605  	            if (!pcmk__str_eq(client, te_client_id, pcmk__str_casei)) {
606  	                /* Abort the current transition if it wasn't the cluster that
607  	                 * initiated fencing.
608  	                 */
609  	                crm_info("External fencing operation from %s fenced %s",
610  	                         client, event->target);
611  	                abort_transition(INFINITY, pcmk__graph_restart,
612  	                                 "External Fencing Operation", NULL);
613  	            }
614  	
615  	        } else if (pcmk__str_eq(controld_globals.dc_name, event->target,
616  	                                pcmk__str_null_matches|pcmk__str_casei)
617  	                   && !pcmk_is_set(peer->flags, crm_remote_node)) {
618  	            // Assume the target was our DC if we don't currently have one
619  	
620  	            if (controld_globals.dc_name != NULL) {
621  	                crm_notice("Fencing target %s was our DC", event->target);
622  	            } else {
623  	                crm_notice("Fencing target %s may have been our DC",
624  	                           event->target);
625  	            }
626  	
627  	            /* Given the CIB resyncing that occurs around elections,
628  	             * have one node update the CIB now and, if the new DC is different,
629  	             * have them do so too after the election
630  	             */
631  	            if (pcmk__str_eq(event->executioner, controld_globals.our_nodename,
632  	                             pcmk__str_casei)) {
633  	                send_stonith_update(NULL, event->target, uuid);
634  	            }
635  	            add_stonith_cleanup(event->target);
636  	        }
637  	
638  	        /* If the target is a remote node, and we host its connection,
639  	         * immediately fail all monitors so it can be recovered quickly.
640  	         * The connection won't necessarily drop when a remote node is fenced,
641  	         * so the failure might not otherwise be detected until the next poke.
642  	         */
643  	        if (pcmk_is_set(peer->flags, crm_remote_node)) {
644  	            remote_ra_fail(event->target);
645  	        }
646  	
647  	        crmd_peer_down(peer, TRUE);
648  	     }
649  	}
650  	
651  	/*!
652  	 * \brief Connect to fencer
653  	 *
654  	 * \param[in] user_data  If NULL, retry failures now, otherwise retry in mainloop timer
655  	 *
656  	 * \return G_SOURCE_REMOVE on success, G_SOURCE_CONTINUE to retry
657  	 * \note If user_data is NULL, this will wait 2s between attempts, for up to
658  	 *       30 attempts, meaning the controller could be blocked as long as 58s.
659  	 */
660  	gboolean
661  	controld_timer_fencer_connect(gpointer user_data)
662  	{
663  	    int rc = pcmk_ok;
664  	
665  	    if (stonith_api == NULL) {
666  	        stonith_api = stonith_api_new();
667  	        if (stonith_api == NULL) {
668  	            crm_err("Could not connect to fencer: API memory allocation failed");
669  	            return G_SOURCE_REMOVE;
670  	        }
671  	    }
672  	
673  	    if (stonith_api->state != stonith_disconnected) {
674  	        crm_trace("Already connected to fencer, no need to retry");
675  	        return G_SOURCE_REMOVE;
676  	    }
677  	
678  	    if (user_data == NULL) {
679  	        // Blocking (retry failures now until successful)
680  	        rc = stonith_api_connect_retry(stonith_api, crm_system_name, 30);
681  	        if (rc != pcmk_ok) {
682  	            crm_err("Could not connect to fencer in 30 attempts: %s "
683  	                    CRM_XS " rc=%d", pcmk_strerror(rc), rc);
684  	        }
685  	    } else {
686  	        // Non-blocking (retry failures later in main loop)
687  	        rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL);
688  	
689  	        if (controld_fencer_connect_timer == NULL) {
690  	            controld_fencer_connect_timer =
691  	                mainloop_timer_add("controld_fencer_connect", 1000,
692  	                                   TRUE, controld_timer_fencer_connect,
693  	                                   GINT_TO_POINTER(TRUE));
694  	        }
695  	
696  	        if (rc != pcmk_ok) {
697  	            if (pcmk_is_set(controld_globals.fsa_input_register,
698  	                            R_ST_REQUIRED)) {
699  	                crm_notice("Fencer connection failed (will retry): %s "
700  	                           CRM_XS " rc=%d", pcmk_strerror(rc), rc);
701  	
702  	                if (!mainloop_timer_running(controld_fencer_connect_timer)) {
703  	                    mainloop_timer_start(controld_fencer_connect_timer);
704  	                }
705  	
706  	                return G_SOURCE_CONTINUE;
707  	            } else {
708  	                crm_info("Fencer connection failed (ignoring because no longer required): %s "
709  	                         CRM_XS " rc=%d", pcmk_strerror(rc), rc);
710  	            }
711  	            return G_SOURCE_REMOVE;
712  	        }
713  	    }
714  	
715  	    if (rc == pcmk_ok) {
716  	        stonith_api->cmds->register_notification(stonith_api,
717  	                                                 T_STONITH_NOTIFY_DISCONNECT,
718  	                                                 tengine_stonith_connection_destroy);
719  	        stonith_api->cmds->register_notification(stonith_api,
720  	                                                 T_STONITH_NOTIFY_FENCE,
721  	                                                 handle_fence_notification);
722  	        stonith_api->cmds->register_notification(stonith_api,
723  	                                                 T_STONITH_NOTIFY_HISTORY_SYNCED,
724  	                                                 tengine_stonith_history_synced);
725  	        te_trigger_stonith_history_sync(TRUE);
726  	        crm_notice("Fencer successfully connected");
727  	    }
728  	
729  	    return G_SOURCE_REMOVE;
730  	}
731  	
732  	void
733  	controld_disconnect_fencer(bool destroy)
734  	{
735  	    if (stonith_api) {
736  	        // Prevent fencer connection from coming up again
737  	        controld_clear_fsa_input_flags(R_ST_REQUIRED);
738  	
739  	        if (stonith_api->state != stonith_disconnected) {
740  	            stonith_api->cmds->disconnect(stonith_api);
741  	        }
742  	        stonith_api->cmds->remove_notification(stonith_api, NULL);
743  	    }
744  	    if (destroy) {
745  	        if (stonith_api) {
746  	            stonith_api->cmds->free(stonith_api);
747  	            stonith_api = NULL;
748  	        }
749  	        if (controld_fencer_connect_timer) {
750  	            mainloop_timer_del(controld_fencer_connect_timer);
751  	            controld_fencer_connect_timer = NULL;
752  	        }
753  	        if (te_client_id) {
754  	            free(te_client_id);
755  	            te_client_id = NULL;
756  	        }
757  	    }
758  	}
759  	
760  	static gboolean
761  	do_stonith_history_sync(gpointer user_data)
762  	{
763  	    if (stonith_api && (stonith_api->state != stonith_disconnected)) {
764  	        stonith_history_t *history = NULL;
765  	
766  	        te_cleanup_stonith_history_sync(stonith_api, FALSE);
767  	        stonith_api->cmds->history(stonith_api,
768  	                                   st_opt_sync_call | st_opt_broadcast,
769  	                                   NULL, &history, 5);
770  	        stonith_history_free(history);
771  	        return TRUE;
772  	    } else {
773  	        crm_info("Skip triggering stonith history-sync as stonith is disconnected");
774  	        return FALSE;
775  	    }
776  	}
777  	
778  	static void
779  	tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data)
780  	{
781  	    char *uuid = NULL;
782  	    int stonith_id = -1;
783  	    int transition_id = -1;
784  	    pcmk__graph_action_t *action = NULL;
785  	    const char *target = NULL;
786  	
787  	    if ((data == NULL) || (data->userdata == NULL)) {
788  	        crm_err("Ignoring fence operation %d result: "
789  	                "No transition key given (bug?)",
790  	                ((data == NULL)? -1 : data->call_id));
791  	        return;
792  	    }
793  	
794  	    if (!AM_I_DC) {
795  	        const char *reason = stonith__exit_reason(data);
796  	
797  	        if (reason == NULL) {
798  	           reason = pcmk_exec_status_str(stonith__execution_status(data));
799  	        }
800  	        crm_notice("Result of fence operation %d: %d (%s) " CRM_XS " key=%s",
801  	                   data->call_id, stonith__exit_status(data), reason,
802  	                   (const char *) data->userdata);
803  	        return;
804  	    }
805  	
806  	    CRM_CHECK(decode_transition_key(data->userdata, &uuid, &transition_id,
807  	                                    &stonith_id, NULL),
808  	              goto bail);
809  	
810  	    if (controld_globals.transition_graph->complete || (stonith_id < 0)
811  	        || !pcmk__str_eq(uuid, controld_globals.te_uuid, pcmk__str_none)
812  	        || (controld_globals.transition_graph->id != transition_id)) {
813  	        crm_info("Ignoring fence operation %d result: "
814  	                 "Not from current transition " CRM_XS
815  	                 " complete=%s action=%d uuid=%s (vs %s) transition=%d (vs %d)",
816  	                 data->call_id,
817  	                 pcmk__btoa(controld_globals.transition_graph->complete),
818  	                 stonith_id, uuid, controld_globals.te_uuid, transition_id,
819  	                 controld_globals.transition_graph->id);
820  	        goto bail;
821  	    }
822  	
823  	    action = controld_get_action(stonith_id);
824  	    if (action == NULL) {
825  	        crm_err("Ignoring fence operation %d result: "
826  	                "Action %d not found in transition graph (bug?) "
827  	                CRM_XS " uuid=%s transition=%d",
828  	                data->call_id, stonith_id, uuid, transition_id);
829  	        goto bail;
830  	    }
831  	
832  	    target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
833  	    if (target == NULL) {
834  	        crm_err("Ignoring fence operation %d result: No target given (bug?)",
835  	                data->call_id);
836  	        goto bail;
837  	    }
838  	
839  	    stop_te_timer(action);
840  	    if (stonith__exit_status(data) == CRM_EX_OK) {
841  	        const char *uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
842  	        const char *op = crm_meta_value(action->params, "stonith_action");
843  	
844  	        crm_info("Fence operation %d for %s succeeded", data->call_id, target);
845  	        if (!(pcmk_is_set(action->flags, pcmk__graph_action_confirmed))) {
846  	            te_action_confirmed(action, NULL);
847  	            if (pcmk__str_eq(PCMK_ACTION_ON, op, pcmk__str_casei)) {
848  	                const char *value = NULL;
849  	                char *now = pcmk__ttoa(time(NULL));
850  	                gboolean is_remote_node = FALSE;
851  	
852  	                /* This check is not 100% reliable, since this node is not
853  	                 * guaranteed to have the remote node cached. However, it
854  	                 * doesn't have to be reliable, since the attribute manager can
855  	                 * learn a node's "remoteness" by other means sooner or later.
856  	                 * This allows it to learn more quickly if this node does have
857  	                 * the information.
858  	                 */
859  	                if (g_hash_table_lookup(crm_remote_peer_cache, uuid) != NULL) {
860  	                    is_remote_node = TRUE;
861  	                }
862  	
863  	                update_attrd(target, CRM_ATTR_UNFENCED, now, NULL,
864  	                             is_remote_node);
865  	                free(now);
866  	
867  	                value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_ALL);
868  	                update_attrd(target, CRM_ATTR_DIGESTS_ALL, value, NULL,
869  	                             is_remote_node);
870  	
871  	                value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_SECURE);
872  	                update_attrd(target, CRM_ATTR_DIGESTS_SECURE, value, NULL,
873  	                             is_remote_node);
874  	
875  	            } else if (!(pcmk_is_set(action->flags, pcmk__graph_action_sent_update))) {
876  	                send_stonith_update(action, target, uuid);
877  	                pcmk__set_graph_action_flags(action,
878  	                                             pcmk__graph_action_sent_update);
879  	            }
880  	        }
881  	        st_fail_count_reset(target);
882  	
883  	    } else {
884  	        enum pcmk__graph_next abort_action = pcmk__graph_restart;
885  	        int status = stonith__execution_status(data);
886  	        const char *reason = stonith__exit_reason(data);
887  	
888  	        if (reason == NULL) {
889  	            if (status == PCMK_EXEC_DONE) {
890  	                reason = "Agent returned error";
891  	            } else {
892  	                reason = pcmk_exec_status_str(status);
893  	            }
894  	        }
895  	        pcmk__set_graph_action_flags(action, pcmk__graph_action_failed);
896  	
897  	        /* If no fence devices were available, there's no use in immediately
898  	         * checking again, so don't start a new transition in that case.
899  	         */
900  	        if (status == PCMK_EXEC_NO_FENCE_DEVICE) {
901  	            crm_warn("Fence operation %d for %s failed: %s "
902  	                     "(aborting transition and giving up for now)",
903  	                     data->call_id, target, reason);
904  	            abort_action = pcmk__graph_wait;
905  	        } else {
906  	            crm_notice("Fence operation %d for %s failed: %s "
907  	                       "(aborting transition)", data->call_id, target, reason);
908  	        }
909  	
910  	        /* Increment the fail count now, so abort_for_stonith_failure() can
911  	         * check it. Non-DC nodes will increment it in
912  	         * handle_fence_notification().
913  	         */
914  	        st_fail_count_increment(target);
915  	        abort_for_stonith_failure(abort_action, target, NULL);
916  	    }
917  	
918  	    pcmk__update_graph(controld_globals.transition_graph, action);
919  	    trigger_graph();
920  	
921  	  bail:
922  	    free(data->userdata);
923  	    free(uuid);
924  	    return;
925  	}
926  	
927  	static int
928  	fence_with_delay(const char *target, const char *type, int delay)
929  	{
930  	    uint32_t options = st_opt_none; // Group of enum stonith_call_options
931  	    int timeout_sec = (int) (controld_globals.transition_graph->stonith_timeout
932  	                             / 1000);
933  	
934  	    if (crmd_join_phase_count(crm_join_confirmed) == 1) {
935  	        stonith__set_call_options(options, target, st_opt_allow_suicide);
936  	    }
937  	    return stonith_api->cmds->fence_with_delay(stonith_api, options, target,
938  	                                               type, timeout_sec, 0, delay);
939  	}
940  	
941  	/*!
942  	 * \internal
943  	 * \brief Execute a fencing action from a transition graph
944  	 *
945  	 * \param[in] graph   Transition graph being executed (ignored)
946  	 * \param[in] action  Fencing action to execute
947  	 *
948  	 * \return Standard Pacemaker return code
949  	 */
950  	int
951  	controld_execute_fence_action(pcmk__graph_t *graph,
952  	                              pcmk__graph_action_t *action)
953  	{
954  	    int rc = 0;
955  	    const char *id = ID(action->xml);
956  	    const char *uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
957  	    const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
958  	    const char *type = crm_meta_value(action->params, "stonith_action");
959  	    char *transition_key = NULL;
960  	    const char *priority_delay = NULL;
961  	    int delay_i = 0;
962  	    gboolean invalid_action = FALSE;
963  	    int stonith_timeout = (int) (controld_globals.transition_graph->stonith_timeout
964  	                                 / 1000);
965  	
966  	    CRM_CHECK(id != NULL, invalid_action = TRUE);
967  	    CRM_CHECK(uuid != NULL, invalid_action = TRUE);
968  	    CRM_CHECK(type != NULL, invalid_action = TRUE);
969  	    CRM_CHECK(target != NULL, invalid_action = TRUE);
970  	
971  	    if (invalid_action) {
972  	        crm_log_xml_warn(action->xml, "BadAction");
973  	        return EPROTO;
974  	    }
975  	
976  	    priority_delay = crm_meta_value(action->params, XML_CONFIG_ATTR_PRIORITY_FENCING_DELAY);
977  	
978  	    crm_notice("Requesting fencing (%s) targeting node %s "
979  	               CRM_XS " action=%s timeout=%i%s%s",
980  	               type, target, id, stonith_timeout,
981  	               priority_delay ? " priority_delay=" : "",
982  	               priority_delay ? priority_delay : "");
983  	
984  	    /* Passing NULL means block until we can connect... */
985  	    controld_timer_fencer_connect(NULL);
986  	
987  	    pcmk__scan_min_int(priority_delay, &delay_i, 0);
988  	    rc = fence_with_delay(target, type, delay_i);
989  	    transition_key = pcmk__transition_key(controld_globals.transition_graph->id,
990  	                                          action->id, 0,
991  	                                          controld_globals.te_uuid),
992  	    stonith_api->cmds->register_callback(stonith_api, rc,
993  	                                         (stonith_timeout
994  	                                          + (delay_i > 0 ? delay_i : 0)),
995  	                                         st_opt_timeout_updates, transition_key,
996  	                                         "tengine_stonith_callback",
997  	                                         tengine_stonith_callback);
998  	    return pcmk_rc_ok;
999  	}
1000 	
1001 	bool
1002 	controld_verify_stonith_watchdog_timeout(const char *value)
1003 	{
1004 	    long st_timeout = value? crm_get_msec(value) : 0;
1005 	    const char *our_nodename = controld_globals.our_nodename;
1006 	    gboolean rv = TRUE;
1007 	
1008 	    if (st_timeout == 0
1009 	        || (stonith_api && (stonith_api->state != stonith_disconnected) &&
1010 	            stonith__watchdog_fencing_enabled_for_node_api(stonith_api,
1011 	                                                           our_nodename))) {
1012 	        rv = pcmk__valid_sbd_timeout(value);
1013 	    }
1014 	    return rv;
1015 	}
1016 	
1017 	/* end stonith API client functions */
1018 	
1019 	
1020 	/*
1021 	 * stonith history synchronization
1022 	 *
1023 	 * Each node's fencer keeps track of a cluster-wide fencing history. When a node
1024 	 * joins or leaves, we need to synchronize the history across all nodes.
1025 	 */
1026 	
/* Main loop trigger that runs do_stonith_history_sync; created on first use by
 * te_trigger_stonith_history_sync() and set by the two timers below
 */
static crm_trigger_t *stonith_history_sync_trigger = NULL;
/* "history_sync_short" timer (interval argument 5000, logged as 5 seconds);
 * created on first use, NULL until then and after being freed
 */
static mainloop_timer_t *stonith_history_sync_timer_short = NULL;
/* "history_sync_long" timer (interval argument 30000, logged as 30 seconds);
 * created on first use, NULL until then and after being freed
 */
static mainloop_timer_t *stonith_history_sync_timer_long = NULL;
1030 	
1031 	void
1032 	te_cleanup_stonith_history_sync(stonith_t *st, bool free_timers)
1033 	{
1034 	    if (free_timers) {
1035 	        mainloop_timer_del(stonith_history_sync_timer_short);
1036 	        stonith_history_sync_timer_short = NULL;
1037 	        mainloop_timer_del(stonith_history_sync_timer_long);
1038 	        stonith_history_sync_timer_long = NULL;
1039 	    } else {
1040 	        mainloop_timer_stop(stonith_history_sync_timer_short);
1041 	        mainloop_timer_stop(stonith_history_sync_timer_long);
1042 	    }
1043 	
1044 	    if (st) {
1045 	        st->cmds->remove_notification(st, T_STONITH_NOTIFY_HISTORY_SYNCED);
1046 	    }
1047 	}
1048 	
1049 	static void
1050 	tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event)
1051 	{
1052 	    te_cleanup_stonith_history_sync(st, FALSE);
1053 	    crm_debug("Fence-history synced - cancel all timers");
1054 	}
1055 	
1056 	static gboolean
1057 	stonith_history_sync_set_trigger(gpointer user_data)
1058 	{
1059 	    mainloop_set_trigger(stonith_history_sync_trigger);
1060 	    return FALSE;
1061 	}
1062 	
1063 	void
1064 	te_trigger_stonith_history_sync(bool long_timeout)
1065 	{
1066 	    /* trigger a sync in 5s to give more nodes the
1067 	     * chance to show up so that we don't create
1068 	     * unnecessary stonith-history-sync traffic
1069 	     *
1070 	     * the long timeout of 30s is there as a fallback
1071 	     * so that after a successful connection to fenced
1072 	     * we will wait for 30s for the DC to trigger a
1073 	     * history-sync
1074 	     * if this doesn't happen we trigger a sync locally
1075 	     * (e.g. fenced segfaults and is restarted by pacemakerd)
1076 	     */
1077 	
1078 	    /* as we are finally checking the stonith-connection
1079 	     * in do_stonith_history_sync we should be fine
1080 	     * leaving stonith_history_sync_time & stonith_history_sync_trigger
1081 	     * around
1082 	     */
1083 	    if (stonith_history_sync_trigger == NULL) {
1084 	        stonith_history_sync_trigger =
1085 	            mainloop_add_trigger(G_PRIORITY_LOW,
1086 	                                 do_stonith_history_sync, NULL);
1087 	    }
1088 	
1089 	    if (long_timeout) {
1090 	        if(stonith_history_sync_timer_long == NULL) {
1091 	            stonith_history_sync_timer_long =
1092 	                mainloop_timer_add("history_sync_long", 30000,
1093 	                                   FALSE, stonith_history_sync_set_trigger,
1094 	                                   NULL);
1095 	        }
1096 	        crm_info("Fence history will be synchronized cluster-wide within 30 seconds");
1097 	        mainloop_timer_start(stonith_history_sync_timer_long);
1098 	    } else {
1099 	        if(stonith_history_sync_timer_short == NULL) {
1100 	            stonith_history_sync_timer_short =
1101 	                mainloop_timer_add("history_sync_short", 5000,
1102 	                                   FALSE, stonith_history_sync_set_trigger,
1103 	                                   NULL);
1104 	        }
1105 	        crm_info("Fence history will be synchronized cluster-wide within 5 seconds");
1106 	        mainloop_timer_start(stonith_history_sync_timer_short);
1107 	    }
1108 	
1109 	}
1110 	
1111 	/* end stonith history synchronization functions */
1112