1    	/*
2    	 * Copyright 2013-2026 the Pacemaker project contributors
3    	 *
4    	 * The version control history for this file may have further details.
5    	 *
6    	 * This source code is licensed under the GNU General Public License version 2
7    	 * or later (GPLv2+) WITHOUT ANY WARRANTY.
8    	 */
9    	
#include <crm_internal.h>

#include <limits.h>                    // INT_MAX
#include <stdbool.h>
#include <stdint.h>                    // UINT32_C

#include <crm/crm.h>
#include <crm/common/xml.h>
#include <crm/lrmd.h>
#include <crm/lrmd_internal.h>
#include <crm/services.h>

#include <libxml/xpath.h>               // xmlXPathObject, etc.

#include <pacemaker-controld.h>
24   	
25   	#define REMOTE_LRMD_RA "remote"
26   	
27   	/* The max start timeout before cmd retry */
28   	#define MAX_START_TIMEOUT_MS 10000
29   	
/* Set bits in a remote command's status field, logging the change at trace
 * level (flags_to_set is stringified for the log message)
 */
#define cmd_set_flags(cmd, flags_to_set) do { \
    (cmd)->status = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, \
                                       "Remote command", (cmd)->rsc_id, (cmd)->status, \
                                       (flags_to_set), #flags_to_set); \
        } while (0)

/* Clear bits in a remote command's status field, logging the change at trace
 * level (flags_to_clear is stringified for the log message)
 */
#define cmd_clear_flags(cmd, flags_to_clear) do { \
    (cmd)->status = pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE, \
                                         "Remote command", (cmd)->rsc_id, (cmd)->status, \
                                         (flags_to_clear), #flags_to_clear); \
        } while (0)
41   	
// Bit flags kept in a remote command's status field
enum remote_cmd_status {
    // Initial success for this (recurring) command has already been reported
    cmd_reported_success    = (UINT32_C(1) << 0),
    // Command has been cancelled
    cmd_cancel              = (UINT32_C(1) << 1),
};
46   	
/* Set bits in a remote connection's remote_ra_data_t status, logging the
 * change at trace level. The lrm_state argument is copied to a local variable
 * so it is evaluated only once.
 */
#define lrm_remote_set_flags(lrm_state, flags_to_set) do { \
    lrm_state_t *lrm = (lrm_state); \
    remote_ra_data_t *ra = lrm->remote_ra_data; \
    ra->status = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, "Remote", \
                                    lrm->node_name, ra->status, \
                                    (flags_to_set), #flags_to_set); \
        } while (0)

/* Clear bits in a remote connection's remote_ra_data_t status, logging the
 * change at trace level. The lrm_state argument is copied to a local variable
 * so it is evaluated only once.
 */
#define lrm_remote_clear_flags(lrm_state, flags_to_clear) do { \
    lrm_state_t *lrm = (lrm_state); \
    remote_ra_data_t *ra = lrm->remote_ra_data; \
    ra->status = pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE, "Remote", \
                                      lrm->node_name, ra->status, \
                                      (flags_to_clear), #flags_to_clear); \
        } while (0)
62   	
// Bit flags kept in a remote connection's remote_ra_data_t status
enum remote_status {
    // A takeover of this connection by another node is anticipated
    expect_takeover     = (UINT32_C(1) << 0),
    // Another client has taken over this connection
    takeover_complete   = (UINT32_C(1) << 1),
    // The remote connection is up
    remote_active       = (UINT32_C(1) << 2),
    /* Maintenance mode is difficult to determine from the controller's context,
     * so we have it signalled back with the transition from the scheduler.
     */
    remote_in_maint     = (UINT32_C(1) << 3),
    /* Similar for whether we are controlling a guest node or remote node.
     * Fortunately there is a meta-attribute in the transition already and
     * as the situation doesn't change over time we can use the
     * resource start for noting down the information for later use when
     * the attributes aren't at hand.
     */
    controlling_guest   = (UINT32_C(1) << 4),
};
79   	
80   	static int handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms);
81   	static void handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd);
82   	static GList *fail_all_monitor_cmds(GList * list);
83   	
84   	static void
85   	free_cmd(gpointer user_data)
86   	{
87   	    remote_ra_cmd_t *cmd = user_data;
88   	
89   	    if (!cmd) {
90   	        return;
91   	    }
92   	    if (cmd->delay_id) {
93   	        g_source_remove(cmd->delay_id);
94   	    }
95   	    if (cmd->interval_id) {
96   	        g_source_remove(cmd->interval_id);
97   	    }
98   	    if (cmd->monitor_timeout_id) {
99   	        g_source_remove(cmd->monitor_timeout_id);
100  	    }
101  	    if (cmd->takeover_timeout_id) {
102  	        g_source_remove(cmd->takeover_timeout_id);
103  	    }
104  	    free(cmd->owner);
105  	    free(cmd->rsc_id);
106  	    free(cmd->action);
107  	    free(cmd->userdata);
108  	    pcmk__reset_result(&(cmd->result));
109  	    lrmd_key_value_freeall(cmd->params);
110  	    free(cmd);
111  	}
112  	
/*!
 * \internal
 * \brief Generate a unique, positive call ID for a remote RA operation
 *
 * \return Next call ID (always greater than 0)
 *
 * \note Not thread-safe, but the controller is single-threaded.
 */
static int
generate_callid(void)
{
    static int remote_ra_callid = 0;

    /* Wrap explicitly *before* incrementing: incrementing a signed int past
     * INT_MAX is undefined behavior, so the previous "increment, then reset
     * if non-positive" approach could not be relied upon.
     */
    if (remote_ra_callid >= INT_MAX) {
        remote_ra_callid = 0;
    }
    remote_ra_callid++;

    return remote_ra_callid;
}
125  	
126  	static gboolean
127  	recurring_helper(gpointer data)
128  	{
129  	    remote_ra_cmd_t *cmd = data;
130  	    lrm_state_t *connection_rsc = NULL;
131  	
132  	    cmd->interval_id = 0;
133  	    connection_rsc = controld_get_executor_state(cmd->rsc_id, false);
134  	    if (connection_rsc && connection_rsc->remote_ra_data) {
135  	        remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
136  	
137  	        ra_data->recurring_cmds = g_list_remove(ra_data->recurring_cmds, cmd);
138  	
139  	        ra_data->cmds = g_list_append(ra_data->cmds, cmd);
140  	        mainloop_set_trigger(ra_data->work);
141  	    }
142  	    return FALSE;
143  	}
144  	
145  	static gboolean
146  	start_delay_helper(gpointer data)
147  	{
148  	    remote_ra_cmd_t *cmd = data;
149  	    lrm_state_t *connection_rsc = NULL;
150  	
151  	    cmd->delay_id = 0;
152  	    connection_rsc = controld_get_executor_state(cmd->rsc_id, false);
153  	    if (connection_rsc && connection_rsc->remote_ra_data) {
154  	        remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
155  	
156  	        mainloop_set_trigger(ra_data->work);
157  	    }
158  	    return FALSE;
159  	}
160  	
161  	static bool
162  	should_purge_attributes(pcmk__node_status_t *node)
163  	{
164  	    pcmk__node_status_t *conn_node = NULL;
165  	    lrm_state_t *connection_rsc = NULL;
166  	
167  	    if ((node->conn_host == NULL) || (node->name == NULL)) {
168  	        return true;
169  	    }
170  	
171  	    /* Get the node that was hosting the remote connection resource from the
172  	     * peer cache.  That's the one we really care about here.
173  	     */
174  	    conn_node = pcmk__get_node(0, node->conn_host, NULL,
175  	                               pcmk__node_search_cluster_member);
176  	    if (conn_node == NULL) {
177  	        return true;
178  	    }
179  	
180  	    /* Check the uptime of connection_rsc.  If it hasn't been running long
181  	     * enough, set purge=true.  "Long enough" means it started running earlier
182  	     * than the timestamp when we noticed it went away in the first place.
183  	     */
184  	    connection_rsc = controld_get_executor_state(node->name, false);
185  	
186  	    if (connection_rsc != NULL) {
187  	        time_t uptime = lrmd__uptime(connection_rsc->conn);
188  	        time_t now = time(NULL);
189  	
190  	        /* Add 20s of fuzziness to give corosync a while to notice the remote
191  	         * host is gone.  On various error conditions (failure to get uptime,
192  	         * peer_lost isn't set) we default to purging.
193  	         */
194  	        if (uptime > 0 &&
195  	            conn_node->peer_lost > 0 &&
196  	            uptime + 20 >= now - conn_node->peer_lost) {
197  	            return false;
198  	        }
199  	    }
200  	
201  	    return true;
202  	}
203  	
204  	static void
205  	purge_remote_node_attrs(int call_opt, pcmk__node_status_t *node)
206  	{
207  	    const bool unlocked_only = pcmk__is_set(controld_globals.flags,
208  	                                            controld_shutdown_lock_enabled);
209  	
210  	    // Purge node's transient attributes (from attribute manager and CIB)
211  	    if (should_purge_attributes(node)) {
212  	        controld_purge_node_attrs(node->name, true);
213  	    }
214  	
215  	    controld_delete_node_history(node->name, unlocked_only, call_opt);
216  	}
217  	
218  	/*!
219  	 * \internal
220  	 * \brief Handle cluster communication related to pacemaker_remote node joining
221  	 *
222  	 * \param[in] node_name  Name of newly integrated pacemaker_remote node
223  	 */
224  	static void
225  	remote_node_up(const char *node_name)
226  	{
227  	    int call_opt;
228  	    xmlNode *update, *state;
229  	    pcmk__node_status_t *node = NULL;
230  	    lrm_state_t *connection_rsc = NULL;
231  	
232  	    CRM_CHECK(node_name != NULL, return);
233  	    pcmk__info("Announcing Pacemaker Remote node %s", node_name);
234  	
235  	    call_opt = crmd_cib_smart_opt();
236  	
237  	    /* Delete node's CRM_OP_PROBED attribute. Deleting any attribute ensures
238  	     * that the attribute manager learns the node is remote. Deletion of this
239  	     * specfic attribute is a holdover from when it had special meaning.
240  	     *
241  	     * @COMPAT Find another way to tell attrd that the node is remote, without
242  	     * risking deletion or overwrite of an arbitrary attribute. Then work on
243  	     * deprecating CRM_OP_PROBED.
244  	     */
245  	    update_attrd(node_name, CRM_OP_PROBED, NULL, true);
246  	
247  	    /* Ensure node is in the remote peer cache with member status */
248  	    node = pcmk__cluster_lookup_remote_node(node_name);
249  	    CRM_CHECK((node != NULL) && (node->name != NULL), return);
250  	
251  	    purge_remote_node_attrs(call_opt, node);
252  	    pcmk__update_peer_state(__func__, node, PCMK_VALUE_MEMBER, 0);
253  	
254  	    /* Apply any start state that we were given from the environment on the
255  	     * remote node.
256  	     */
257  	    connection_rsc = controld_get_executor_state(node->name, false);
258  	
259  	    if (connection_rsc != NULL) {
260  	        const char *start_state = lrmd__node_start_state(connection_rsc->conn);
261  	
262  	        if (start_state) {
263  	            set_join_state(start_state, node->name, node->xml_id, true);
264  	        }
265  	    }
266  	
267  	    /* pacemaker_remote nodes don't participate in the membership layer,
268  	     * so cluster nodes don't automatically get notified when they come and go.
269  	     * We send a cluster message to the DC, and update the CIB node state entry,
270  	     * so the DC will get it sooner (via message) or later (via CIB refresh),
271  	     * and any other interested parties can query the CIB.
272  	     */
273  	    broadcast_remote_state_message(node_name, true);
274  	
275  	    update = pcmk__xe_create(NULL, PCMK_XE_STATUS);
276  	    state = create_node_state_update(node, controld_node_update_cluster, update,
277  	                                     __func__);
278  	
279  	    /* Clear the PCMK__XA_NODE_FENCED flag in the node state. If the node ever
280  	     * needs to be fenced, this flag will allow various actions to determine
281  	     * whether the fencing has happened yet.
282  	     */
283  	    pcmk__xe_set(state, PCMK__XA_NODE_FENCED, "0");
284  	
285  	    /* TODO: If the remote connection drops, and this (async) CIB update either
286  	     * failed or has not yet completed, later actions could mistakenly think the
287  	     * node has already been fenced (if the PCMK__XA_NODE_FENCED attribute was
288  	     * previously set, because it won't have been cleared). This could prevent
289  	     * actual fencing or allow recurring monitor failures to be cleared too
290  	     * soon. Ideally, we wouldn't rely on the CIB for the fenced status.
291  	     */
292  	    controld_update_cib(PCMK_XE_STATUS, update, call_opt, NULL);
293  	    pcmk__xml_free(update);
294  	}
295  	
296  	/*!
297  	 * \internal
298  	 * \brief Handle cluster communication related to pacemaker_remote node leaving
299  	 *
300  	 * \param[in] node_name  Name of lost node
301  	 * \param[in] erase_lrm  If \c true, erase the LRM history
302  	 */
303  	static void
304  	remote_node_down(const char *node_name, bool erase_lrm)
305  	{
306  	    xmlNode *update;
307  	    int call_opt = crmd_cib_smart_opt();
308  	    pcmk__node_status_t *node = NULL;
309  	
310  	    // Purge node's transient attributes (from attribute manager and CIB)
311  	    controld_purge_node_attrs(node_name, true);
312  	
313  	    /* Normally, the resource history should be kept until the node comes back
314  	     * up. However, after a successful fence, clear the history so we don't
315  	     * think resources are still running on the node.
316  	     */
317  	    if (erase_lrm) {
318  	        controld_delete_node_history(node_name, false, call_opt);
319  	    }
320  	
321  	    /* Ensure node is in the remote peer cache with lost state */
322  	    node = pcmk__cluster_lookup_remote_node(node_name);
323  	    CRM_CHECK(node != NULL, return);
324  	    pcmk__update_peer_state(__func__, node, PCMK__VALUE_LOST, 0);
325  	
326  	    /* Notify DC */
327  	    broadcast_remote_state_message(node_name, false);
328  	
329  	    /* Update CIB node state */
330  	    update = pcmk__xe_create(NULL, PCMK_XE_STATUS);
331  	    create_node_state_update(node, controld_node_update_cluster, update,
332  	                             __func__);
333  	    controld_update_cib(PCMK_XE_STATUS, update, call_opt, NULL);
334  	    pcmk__xml_free(update);
335  	}
336  	
337  	/*!
338  	 * \internal
339  	 * \brief Handle effects of a remote RA command on node state
340  	 *
341  	 * \param[in] cmd  Completed remote RA command
342  	 */
343  	static void
344  	check_remote_node_state(const remote_ra_cmd_t *cmd)
345  	{
346  	    /* Only successful actions can change node state */
347  	    if (!pcmk__result_ok(&(cmd->result))) {
348  	        return;
349  	    }
350  	
351  	    if (pcmk__str_eq(cmd->action, PCMK_ACTION_START, pcmk__str_casei)) {
352  	        remote_node_up(cmd->rsc_id);
353  	
354  	    } else if (pcmk__str_eq(cmd->action, PCMK_ACTION_MIGRATE_FROM,
355  	                            pcmk__str_casei)) {
356  	        /* After a successful migration, we don't need to do remote_node_up()
357  	         * because the DC already knows the node is up, and we don't want to
358  	         * clear LRM history etc. We do need to add the remote node to this
359  	         * host's remote peer cache, because (unless it happens to be DC)
360  	         * it hasn't been tracking the remote node, and other code relies on
361  	         * the cache to distinguish remote nodes from unseen cluster nodes.
362  	         */
363  	        pcmk__node_status_t *node =
364  	            pcmk__cluster_lookup_remote_node(cmd->rsc_id);
365  	
366  	        CRM_CHECK(node != NULL, return);
367  	        pcmk__update_peer_state(__func__, node, PCMK_VALUE_MEMBER, 0);
368  	
369  	    } else if (pcmk__str_eq(cmd->action, PCMK_ACTION_STOP, pcmk__str_casei)) {
370  	        lrm_state_t *lrm_state = controld_get_executor_state(cmd->rsc_id,
371  	                                                             false);
372  	        remote_ra_data_t *ra_data = lrm_state? lrm_state->remote_ra_data : NULL;
373  	
374  	        if (ra_data) {
375  	            if (!pcmk__is_set(ra_data->status, takeover_complete)) {
376  	                /* Stop means down if we didn't successfully migrate elsewhere */
377  	                remote_node_down(cmd->rsc_id, false);
378  	            } else if (AM_I_DC == FALSE) {
379  	                /* Only the connection host and DC track node state,
380  	                 * so if the connection migrated elsewhere and we aren't DC,
381  	                 * un-cache the node, so we don't have stale info
382  	                 */
383  	                pcmk__cluster_forget_remote_node(cmd->rsc_id);
384  	            }
385  	        }
386  	    }
387  	
388  	    /* We don't do anything for successful monitors, which is correct for
389  	     * routine recurring monitors, and for monitors on nodes where the
390  	     * connection isn't supposed to be (the cluster will stop the connection in
391  	     * that case). However, if the initial probe finds the connection already
392  	     * active on the node where we want it, we probably should do
393  	     * remote_node_up(). Unfortunately, we can't distinguish that case here.
394  	     * Given that connections have to be initiated by the cluster, the chance of
395  	     * that should be close to zero.
396  	     */
397  	}
398  	
/*!
 * \internal
 * \brief Report a remote RA command's result via the normal LRM event path
 *
 * Updates peer/node state as appropriate for the completed action, builds a
 * synthetic executor event from the command, and hands it to
 * lrm_op_callback().
 *
 * \param[in,out] cmd  Completed remote RA command
 */
static void
report_remote_ra_result(remote_ra_cmd_t * cmd)
{
    lrmd_event_data_t op = { 0, };

    check_remote_node_state(cmd);

    // Build a synthetic executor event from the command's details
    op.type = lrmd_event_exec_complete;
    op.rsc_id = cmd->rsc_id;
    op.op_type = cmd->action;
    op.user_data = cmd->userdata;
    op.timeout = cmd->timeout;
    op.interval_ms = cmd->interval_ms;
    op.t_run = cmd->start_time;
    op.t_rcchange = cmd->start_time;

    lrmd__set_result(&op, cmd->result.exit_status, cmd->result.execution_status,
                     cmd->result.exit_reason);

    if (pcmk__is_set(cmd->status, cmd_reported_success)
        && !pcmk__result_ok(&(cmd->result))) {

        // A previously successful recurring command has now failed
        op.t_rcchange = time(NULL);
        /* This edge case will likely never ever occur, but if it does the
         * result is that a failure will not be processed correctly. This is only
         * remotely possible because we are able to detect a connection resource's tcp
         * connection has failed at any moment after start has completed. The actual
         * recurring operation is just a connectivity ping.
         *
         * basically, we are not guaranteed that the first successful monitor op and
         * a subsequent failed monitor op will not occur in the same timestamp. We have to
         * make it look like the operations occurred at separate times though. */
        if (op.t_rcchange == op.t_run) {
            op.t_rcchange++;
        }
    }

    if (cmd->params) {
        lrmd_key_value_t *tmp;

        // Copy the command's parameters into the event as a hash table
        op.params = pcmk__strkey_table(free, free);
        for (tmp = cmd->params; tmp; tmp = tmp->next) {
            pcmk__insert_dup(op.params, tmp->key, tmp->value);
        }

    }
    op.call_id = cmd->call_id;
    op.remote_nodename = cmd->owner;

    lrm_op_callback(&op);

    // The event owns copies only; free what we allocated for it
    g_clear_pointer(&op.params, g_hash_table_destroy);
    lrmd__reset_result(&op);
}
453  	
454  	/*!
455  	 * \internal
456  	 * \brief Return a remote command's remaining timeout in seconds
457  	 *
458  	 * \param[in] cmd  Remote command to check
459  	 *
460  	 * \return Command's remaining timeout in seconds
461  	 */
462  	static int
463  	remaining_timeout_sec(const remote_ra_cmd_t *cmd)
464  	{
465  	    return pcmk__timeout_ms2s(cmd->timeout) - (time(NULL) - cmd->start_time);
466  	}
467  	
468  	static gboolean
469  	retry_start_cmd_cb(gpointer data)
470  	{
471  	    lrm_state_t *lrm_state = data;
472  	    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
473  	    remote_ra_cmd_t *cmd = NULL;
474  	    int rc = ETIME;
475  	    int remaining = 0;
476  	
477  	    if (!ra_data || !ra_data->cur_cmd) {
478  	        return FALSE;
479  	    }
480  	    cmd = ra_data->cur_cmd;
481  	    if (!pcmk__is_up_action(cmd->action)) {
482  	        return FALSE;
483  	    }
484  	
485  	    remaining = remaining_timeout_sec(cmd);
486  	    if (remaining > 0) {
487  	        rc = handle_remote_ra_start(lrm_state, cmd, remaining * 1000);
488  	    } else {
489  	        pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
490  	                         PCMK_EXEC_TIMEOUT,
491  	                         "Not enough time remains to retry remote connection");
492  	    }
493  	
494  	    if (rc != pcmk_rc_ok) {
495  	        report_remote_ra_result(cmd);
496  	
497  	        if (ra_data->cmds) {
498  	            mainloop_set_trigger(ra_data->work);
499  	        }
500  	        ra_data->cur_cmd = NULL;
501  	        free_cmd(cmd);
502  	    } else {
503  	        /* wait for connection event */
504  	    }
505  	
506  	    return FALSE;
507  	}
508  	
509  	
510  	static gboolean
511  	connection_takeover_timeout_cb(gpointer data)
512  	{
513  	    lrm_state_t *lrm_state = NULL;
514  	    remote_ra_cmd_t *cmd = data;
515  	
516  	    pcmk__info("takeover event timed out for node %s", cmd->rsc_id);
517  	    cmd->takeover_timeout_id = 0;
518  	
519  	    lrm_state = controld_get_executor_state(cmd->rsc_id, false);
520  	
521  	    handle_remote_ra_stop(lrm_state, cmd);
522  	    free_cmd(cmd);
523  	
524  	    return FALSE;
525  	}
526  	
/*!
 * \internal
 * \brief Timer callback: the remote executor did not answer a monitor poke
 *
 * Fails the monitor command with a timeout result and disconnects from the
 * remote executor.
 *
 * \param[in,out] data  Monitor command (remote_ra_cmd_t *)
 *
 * \return FALSE (do not reschedule this one-shot timer)
 */
static gboolean
monitor_timeout_cb(gpointer data)
{
    lrm_state_t *lrm_state = NULL;
    remote_ra_cmd_t *cmd = data;

    lrm_state = controld_get_executor_state(cmd->rsc_id, false);

    pcmk__info("Timed out waiting for remote poke response from %s%s",
               cmd->rsc_id, ((lrm_state != NULL)? "" : " (no LRM state)"));
    cmd->monitor_timeout_id = 0;    // This timer has fired, so no longer pending
    pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_TIMEOUT,
                     "Remote executor did not respond");

    if (lrm_state && lrm_state->remote_ra_data) {
        remote_ra_data_t *ra_data = lrm_state->remote_ra_data;

        // This command is finished; let any queued commands be dispatched
        if (ra_data->cur_cmd == cmd) {
            ra_data->cur_cmd = NULL;
        }
        if (ra_data->cmds) {
            mainloop_set_trigger(ra_data->work);
        }
    }

    report_remote_ra_result(cmd);
    free_cmd(cmd);

    if(lrm_state) {
        // @TODO Should we move this before reporting the result above?
        lrm_state_disconnect(lrm_state);
    }
    return FALSE;
}
561  	
562  	static void
563  	synthesize_lrmd_success(lrm_state_t *lrm_state, const char *rsc_id, const char *op_type)
564  	{
565  	    lrmd_event_data_t op = { 0, };
566  	
567  	    if (lrm_state == NULL) {
568  	        /* if lrm_state not given assume local */
569  	        lrm_state = controld_get_executor_state(NULL, false);
570  	    }
571  	    pcmk__assert(lrm_state != NULL);
572  	
573  	    op.type = lrmd_event_exec_complete;
574  	    op.rsc_id = rsc_id;
575  	    op.op_type = op_type;
576  	    op.t_run = time(NULL);
577  	    op.t_rcchange = op.t_run;
578  	    op.call_id = generate_callid();
579  	    lrmd__set_result(&op, PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
580  	    process_lrm_event(lrm_state, &op, NULL, NULL);
581  	}
582  	
/*!
 * \internal
 * \brief Process an executor event for a remote connection
 *
 * Dispatches on the event type: new-client (takeover) events, completed
 * executions, disconnections, and connect/poke events that complete the
 * currently in-flight remote RA command.
 *
 * \param[in,out] op  Executor event (must have remote_nodename set)
 */
void
remote_lrm_op_callback(lrmd_event_data_t * op)
{
    gboolean cmd_handled = FALSE;
    lrm_state_t *lrm_state = NULL;
    remote_ra_data_t *ra_data = NULL;
    remote_ra_cmd_t *cmd = NULL;

    CRM_CHECK((op != NULL) && (op->remote_nodename != NULL), return);

    pcmk__debug("Processing '%s%s%s' event on remote connection to %s: %s "
                "(%d) status=%s (%d)",
                pcmk__s(op->op_type, ""), ((op->op_type != NULL)? " " : ""),
                lrmd_event_type2str(op->type), op->remote_nodename,
                crm_exit_str((crm_exit_t) op->rc), op->rc,
                pcmk_exec_status_str(op->op_status), op->op_status);

    lrm_state = controld_get_executor_state(op->remote_nodename, false);
    if (!lrm_state || !lrm_state->remote_ra_data) {
        pcmk__debug("No state information found for remote connection event");
        return;
    }
    ra_data = lrm_state->remote_ra_data;

    if (op->type == lrmd_event_new_client) {
        // Another client has connected to the remote daemon

        if (pcmk__is_set(ra_data->status, expect_takeover)) {
            // Great, we knew this was coming
            lrm_remote_clear_flags(lrm_state, expect_takeover);
            lrm_remote_set_flags(lrm_state, takeover_complete);

        } else {
            pcmk__err("Disconnecting from Pacemaker Remote node %s due to "
                      "unexpected client takeover",
                      op->remote_nodename);
            /* In this case, lrmd_tls_connection_destroy() will be called under the control of mainloop. */
            /* Do not free lrm_state->conn yet. */
            /* It'll be freed in the following stop action. */
            lrm_state_disconnect_only(lrm_state);
        }
        return;
    }

    /* filter all EXEC events up */
    if (op->type == lrmd_event_exec_complete) {
        if (pcmk__is_set(ra_data->status, takeover_complete)) {
            pcmk__debug("Ignoring event, this connection is taken over by "
                        "another node");
        } else {
            lrm_op_callback(op);
        }
        return;
    }

    /* Handle a disconnection that arrives while no command is in flight
     * (an in-flight command's disconnection is matched against it below)
     */
    if ((op->type == lrmd_event_disconnect) && (ra_data->cur_cmd == NULL)) {

        if (!pcmk__is_set(ra_data->status, remote_active)) {
            // We initiated the disconnection, so nothing more to do
            pcmk__debug("Disconnection from Pacemaker Remote node %s complete",
                        lrm_state->node_name);

        } else if (!remote_ra_is_in_maintenance(lrm_state)) {
            // Unexpected loss of a managed connection: fail all monitors
            pcmk__err("Lost connection to Pacemaker Remote node %s",
                      lrm_state->node_name);
            ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
            ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);

        } else {
            pcmk__notice("Unmanaged Pacemaker Remote node %s disconnected",
                         lrm_state->node_name);
            /* Do roughly what a 'stop' on the remote-resource would do */
            handle_remote_ra_stop(lrm_state, NULL);
            remote_node_down(lrm_state->node_name, false);
            /* now fake the reply of a successful 'stop' */
            synthesize_lrmd_success(NULL, lrm_state->node_name,
                                    PCMK_ACTION_STOP);
        }
        return;
    }

    if (!ra_data->cur_cmd) {
        pcmk__debug("No event to match");
        return;
    }

    cmd = ra_data->cur_cmd;

    /* Start actions and migrate from actions complete after connection
     * comes back to us. */
    if ((op->type == lrmd_event_connect) && pcmk__is_up_action(cmd->action)) {
        if (op->connection_rc < 0) {
            int remaining = remaining_timeout_sec(cmd);

            if ((op->connection_rc == -ENOKEY)
                || (op->connection_rc == -EKEYREJECTED)) {
                // Hard error, don't retry
                pcmk__set_result(&(cmd->result), PCMK_OCF_INVALID_PARAM,
                                 PCMK_EXEC_ERROR,
                                 pcmk_strerror(op->connection_rc));

            } else if (remaining > 3) {
                pcmk__trace("Rescheduling start (%ds remains before timeout)",
                            remaining);
                pcmk__create_timer(1000, retry_start_cmd_cb, lrm_state);
                return;

            } else {
                pcmk__trace("Not enough time before timeout (%ds) to "
                            "reschedule start",
                            remaining);
                pcmk__format_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
                                    PCMK_EXEC_TIMEOUT,
                                    "%s without enough time to retry",
                                    pcmk_strerror(op->connection_rc));
            }

        } else {
            // Connected successfully: start with fresh state tables
            lrm_state_reset_tables(lrm_state, TRUE);
            pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
            lrm_remote_set_flags(lrm_state, remote_active);
        }

        pcmk__debug("Remote connection event matched %s action", cmd->action);
        report_remote_ra_result(cmd);
        cmd_handled = TRUE;

    } else if ((op->type == lrmd_event_poke)
               && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
                               pcmk__str_casei)) {

        // The poke was answered, so cancel the pending timeout timer
        if (cmd->monitor_timeout_id) {
            g_source_remove(cmd->monitor_timeout_id);
            cmd->monitor_timeout_id = 0;
        }

        /* Only report success the first time, after that only worry about failures.
         * For this function, if we get the poke pack, it is always a success. Pokes
         * only fail if the send fails, or the response times out. */
        if (!pcmk__is_set(cmd->status, cmd_reported_success)) {
            pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
            report_remote_ra_result(cmd);
            cmd_set_flags(cmd, cmd_reported_success);
        }

        pcmk__debug("Remote poke event matched %s action", cmd->action);

        /* success, keep rescheduling if interval is present. */
        if (cmd->interval_ms && !pcmk__is_set(cmd->status, cmd_cancel)) {
            ra_data->recurring_cmds = g_list_append(ra_data->recurring_cmds, cmd);
            cmd->interval_id = pcmk__create_timer(cmd->interval_ms,
                                                  recurring_helper, cmd);
            cmd = NULL;         /* prevent free */
        }
        cmd_handled = TRUE;

    } else if ((op->type == lrmd_event_disconnect)
               && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
                               pcmk__str_casei)) {
        // Connection dropped while a monitor was in flight
        if (pcmk__is_set(ra_data->status, remote_active)
            && !pcmk__is_set(cmd->status, cmd_cancel)) {

            pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
                             PCMK_EXEC_ERROR,
                             "Remote connection unexpectedly dropped "
                             "during monitor");
            report_remote_ra_result(cmd);
            pcmk__err("Remote connection to %s unexpectedly dropped during "
                      "monitor",
                      lrm_state->node_name);
        }
        cmd_handled = TRUE;

    } else {
        pcmk__debug("Event did not match %s action", ra_data->cur_cmd->action);
    }

    if (cmd_handled) {
        // Current command is done; dispatch any queued commands
        ra_data->cur_cmd = NULL;
        if (ra_data->cmds) {
            mainloop_set_trigger(ra_data->work);
        }
        free_cmd(cmd);
    }
}
767  	
/*!
 * \internal
 * \brief Handle a stop of the remote connection resource
 *
 * Tears down connection state, frees all queued commands, and (if a stop
 * command is given) reports it as successful.
 *
 * \param[in,out] lrm_state  Connection state (must not be NULL)
 * \param[in,out] cmd        Stop command to report, or NULL when a stop is
 *                           being synthesized (e.g. unmanaged disconnect)
 */
static void
handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd)
{
    pcmk__assert(lrm_state != NULL);

    if (!pcmk__is_set(lrm_state->remote_ra_data->status, takeover_complete)) {
        /* delete pending ops when ever the remote connection is intentionally stopped */
        g_hash_table_remove_all(lrm_state->active_ops);
    } else {
        /* we no longer hold the history if this connection has been migrated,
         * however, we keep metadata cache for future use */
        lrm_state_reset_tables(lrm_state, FALSE);
    }

    // Mark the connection inactive before actually disconnecting
    lrm_remote_clear_flags(lrm_state, remote_active);
    lrm_state_disconnect(lrm_state);

    // Discard all queued and recurring commands; none can complete now
    g_list_free_full(lrm_state->remote_ra_data->cmds, free_cmd);
    lrm_state->remote_ra_data->cmds = NULL;

    g_list_free_full(lrm_state->remote_ra_data->recurring_cmds, free_cmd);
    lrm_state->remote_ra_data->recurring_cmds = NULL;

    lrm_state->remote_ra_data->cur_cmd = NULL;

    if (cmd) {
        // Report the stop itself as successful
        pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
        report_remote_ra_result(cmd);
    }
}
798  	
799  	// \return Standard Pacemaker return code
800  	static int
801  	handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms)
802  	{
803  	    const char *server = NULL;
804  	    lrmd_key_value_t *tmp = NULL;
805  	    int port = 0;
806  	    int timeout_used = timeout_ms > MAX_START_TIMEOUT_MS ? MAX_START_TIMEOUT_MS : timeout_ms;
807  	    int rc = pcmk_rc_ok;
808  	
809  	    for (tmp = cmd->params; tmp; tmp = tmp->next) {
810  	        if (pcmk__strcase_any_of(tmp->key,
811  	                                 PCMK_REMOTE_RA_ADDR, PCMK_REMOTE_RA_SERVER,
812  	                                 NULL)) {
813  	            server = tmp->value;
814  	
815  	        } else if (pcmk__str_eq(tmp->key, PCMK_REMOTE_RA_PORT,
816  	                                pcmk__str_none)) {
817  	            port = atoi(tmp->value);
818  	
819  	        } else if (pcmk__str_eq(tmp->key, CRM_META "_" PCMK__META_CONTAINER,
820  	                                pcmk__str_none)) {
821  	            lrm_remote_set_flags(lrm_state, controlling_guest);
822  	        }
823  	    }
824  	
825  	    rc = controld_connect_remote_executor(lrm_state, server, port,
826  	                                          timeout_used);
827  	    if (rc != pcmk_rc_ok) {
828  	        pcmk__format_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
829  	                            PCMK_EXEC_ERROR,
830  	                            "Could not connect to Pacemaker Remote node %s: %s",
831  	                            lrm_state->node_name, pcmk_rc_str(rc));
832  	    }
833  	    return rc;
834  	}
835  	
/*!
 * \internal
 * \brief Mainloop trigger callback: execute queued remote connection commands
 *
 * Dequeues and dispatches commands one at a time. Commands that complete
 * asynchronously (start/migrate_from connection attempts, monitor pokes,
 * stops waiting on an expected takeover) become cur_cmd and pause the loop
 * until their event arrives; all other commands are reported and freed
 * immediately.
 *
 * \param[in] user_data  Executor state object (lrm_state_t *)
 *
 * \return Always TRUE (keeps the mainloop trigger registered)
 */
static gboolean
handle_remote_ra_exec(gpointer user_data)
{
    int rc = 0;
    lrm_state_t *lrm_state = user_data;
    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
    remote_ra_cmd_t *cmd;
    GList *first = NULL;

    if (ra_data->cur_cmd) {
        /* still waiting on previous cmd */
        return TRUE;
    }

    while (ra_data->cmds) {
        first = ra_data->cmds;
        cmd = first->data;
        if (cmd->delay_id) {
            /* still waiting for start delay timer to trip */
            return TRUE;
        }

        // Dequeue the command; from here on this function owns cmd
        ra_data->cmds = g_list_remove_link(ra_data->cmds, first);
        g_list_free_1(first);

        if (pcmk__str_any_of(cmd->action, PCMK_ACTION_START,
                             PCMK_ACTION_MIGRATE_FROM, NULL)) {
            // A new connection attempt invalidates any prior takeover state
            lrm_remote_clear_flags(lrm_state, expect_takeover | takeover_complete);
            if (handle_remote_ra_start(lrm_state, cmd,
                                       cmd->timeout) == pcmk_rc_ok) {
                /* take care of this later when we get async connection result */
                pcmk__debug("Initiated async remote connection, %s action will "
                            "complete after connect event",
                            cmd->action);
                ra_data->cur_cmd = cmd;
                return TRUE;
            }
            report_remote_ra_result(cmd);

        } else if (!strcmp(cmd->action, PCMK_ACTION_MONITOR)) {

            if (lrm_state_is_connected(lrm_state) == TRUE) {
                // Poke the remote; the reply arrives as an async event
                rc = lrm_state_poke_connection(lrm_state);
                if (rc < 0) {
                    pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
                                     PCMK_EXEC_ERROR, pcmk_strerror(rc));
                }
            } else {
                rc = -1;
                pcmk__set_result(&(cmd->result), PCMK_OCF_NOT_RUNNING,
                                 PCMK_EXEC_DONE, "Remote connection inactive");
            }

            if (rc == 0) {
                // Poke succeeded: wait for the async response or the timeout
                pcmk__debug("Poked Pacemaker Remote at node %s, waiting for "
                            "async response",
                            cmd->rsc_id);
                ra_data->cur_cmd = cmd;
                cmd->monitor_timeout_id = pcmk__create_timer(cmd->timeout, monitor_timeout_cb, cmd);
                return TRUE;
            }
            report_remote_ra_result(cmd);

        } else if (!strcmp(cmd->action, PCMK_ACTION_STOP)) {

            if (pcmk__is_set(ra_data->status, expect_takeover)) {
                /* Briefly wait on stop for an expected takeover to occur. If
                 * the takeover does not occur during the wait, that's fine; it
                 * just means that the remote node's resource history will be
                 * cleared, which will require probing all resources on the
                 * remote node. If the takeover does occur successfully, then we
                 * can leave the status section intact.
                 */
                cmd->takeover_timeout_id = pcmk__create_timer((cmd->timeout/2),
                                                              connection_takeover_timeout_cb,
                                                              cmd);
                ra_data->cur_cmd = cmd;
                return TRUE;
            }

            handle_remote_ra_stop(lrm_state, cmd);

        } else if (strcmp(cmd->action, PCMK_ACTION_MIGRATE_TO) == 0) {
            // Migration source side: just record that a takeover is expected
            lrm_remote_clear_flags(lrm_state, takeover_complete);
            lrm_remote_set_flags(lrm_state, expect_takeover);
            pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
            report_remote_ra_result(cmd);

        } else if (pcmk__str_any_of(cmd->action, PCMK_ACTION_RELOAD,
                                    PCMK_ACTION_RELOAD_AGENT, NULL))  {
            /* Currently the only reloadable parameter is
             * PCMK_REMOTE_RA_RECONNECT_INTERVAL, which is only used by the
             * scheduler via the CIB, so reloads are a no-op.
             *
             * @COMPAT DC <2.1.0: We only need to check for "reload" in case
             * we're in a rolling upgrade with a DC scheduling "reload" instead
             * of "reload-agent". An OCF 1.1 "reload" would be a no-op anyway,
             * so this would work for that purpose as well.
             */
            pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
            report_remote_ra_result(cmd);
        }

        free_cmd(cmd);
    }

    return TRUE;
}
944  	
945  	static void
946  	remote_ra_data_init(lrm_state_t * lrm_state)
947  	{
948  	    remote_ra_data_t *ra_data = NULL;
949  	
950  	    if (lrm_state->remote_ra_data) {
951  	        return;
952  	    }
953  	
954  	    ra_data = pcmk__assert_alloc(1, sizeof(remote_ra_data_t));
955  	    ra_data->work = mainloop_add_trigger(G_PRIORITY_HIGH, handle_remote_ra_exec, lrm_state);
956  	    lrm_state->remote_ra_data = ra_data;
957  	}
958  	
959  	void
960  	remote_ra_cleanup(lrm_state_t * lrm_state)
961  	{
(1) Event path: Condition "lrm_state->remote_ra_data == NULL", taking false branch.
962  	    if (lrm_state->remote_ra_data == NULL) {
963  	        return;
964  	    }
965  	
966  	    g_list_free_full(lrm_state->remote_ra_data->cmds, free_cmd);
967  	    g_list_free_full(lrm_state->remote_ra_data->recurring_cmds, free_cmd);
968  	    mainloop_destroy_trigger(lrm_state->remote_ra_data->work);
CID (unavailable; MK=b3d91fdb66bde8c1fec6247bf82d8f84) (#1 of 1): Inconsistent C union access (INCONSISTENT_UNION_ACCESS):
(2) Event assign_union_field: The union field "in" of "_pp" is written.
(3) Event inconsistent_union_field_access: In "_pp.out", the union field used: "out" is inconsistent with the field most recently stored: "in".
969  	    g_clear_pointer(&lrm_state->remote_ra_data, free);
970  	}
971  	
972  	gboolean
973  	is_remote_lrmd_ra(const char *agent, const char *provider, const char *id)
974  	{
975  	    if (agent && provider && !strcmp(agent, REMOTE_LRMD_RA) && !strcmp(provider, "pacemaker")) {
976  	        return TRUE;
977  	    }
978  	    return (id != NULL) && (controld_get_executor_state(id, false) != NULL)
979  	           && !controld_is_local_node(id);
980  	}
981  	
982  	lrmd_rsc_info_t *
983  	remote_ra_get_rsc_info(lrm_state_t * lrm_state, const char *rsc_id)
984  	{
985  	    lrmd_rsc_info_t *info = NULL;
986  	
987  	    CRM_CHECK(rsc_id != NULL, return NULL);
988  	
989  	    if (controld_get_executor_state(rsc_id, false) != NULL) {
990  	        info = pcmk__assert_alloc(1, sizeof(lrmd_rsc_info_t));
991  	
992  	        info->id = pcmk__str_copy(rsc_id);
993  	        info->type = pcmk__str_copy(REMOTE_LRMD_RA);
994  	        info->standard = pcmk__str_copy(PCMK_RESOURCE_CLASS_OCF);
995  	        info->provider = pcmk__str_copy("pacemaker");
996  	    }
997  	
998  	    return info;
999  	}
1000 	
1001 	static gboolean
1002 	is_remote_ra_supported_action(const char *action)
1003 	{
1004 	    return pcmk__str_any_of(action,
1005 	                            PCMK_ACTION_START,
1006 	                            PCMK_ACTION_STOP,
1007 	                            PCMK_ACTION_MONITOR,
1008 	                            PCMK_ACTION_MIGRATE_TO,
1009 	                            PCMK_ACTION_MIGRATE_FROM,
1010 	                            PCMK_ACTION_RELOAD_AGENT,
1011 	                            PCMK_ACTION_RELOAD,
1012 	                            NULL);
1013 	}
1014 	
1015 	static GList *
1016 	fail_all_monitor_cmds(GList * list)
1017 	{
1018 	    GList *rm_list = NULL;
1019 	    remote_ra_cmd_t *cmd = NULL;
1020 	    GList *gIter = NULL;
1021 	
1022 	    for (gIter = list; gIter != NULL; gIter = gIter->next) {
1023 	        cmd = gIter->data;
1024 	        if ((cmd->interval_ms > 0)
1025 	            && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
1026 	                            pcmk__str_casei)) {
1027 	            rm_list = g_list_append(rm_list, cmd);
1028 	        }
1029 	    }
1030 	
1031 	    for (gIter = rm_list; gIter != NULL; gIter = gIter->next) {
1032 	        cmd = gIter->data;
1033 	
1034 	        pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
1035 	                         PCMK_EXEC_ERROR, "Lost connection to remote executor");
1036 	        pcmk__trace("Pre-emptively failing %s %s (interval=%u, %s)",
1037 	                    cmd->action, cmd->rsc_id, cmd->interval_ms, cmd->userdata);
1038 	        report_remote_ra_result(cmd);
1039 	
1040 	        list = g_list_remove(list, cmd);
1041 	        free_cmd(cmd);
1042 	    }
1043 	
1044 	    /* frees only the list data, not the cmds */
1045 	    g_list_free(rm_list);
1046 	    return list;
1047 	}
1048 	
1049 	static GList *
1050 	remove_cmd(GList * list, const char *action, guint interval_ms)
1051 	{
1052 	    remote_ra_cmd_t *cmd = NULL;
1053 	    GList *gIter = NULL;
1054 	
1055 	    for (gIter = list; gIter != NULL; gIter = gIter->next) {
1056 	        cmd = gIter->data;
1057 	        if ((cmd->interval_ms == interval_ms)
1058 	            && pcmk__str_eq(cmd->action, action, pcmk__str_casei)) {
1059 	            break;
1060 	        }
1061 	        cmd = NULL;
1062 	    }
1063 	    if (cmd) {
1064 	        list = g_list_remove(list, cmd);
1065 	        free_cmd(cmd);
1066 	    }
1067 	    return list;
1068 	}
1069 	
1070 	int
1071 	remote_ra_cancel(lrm_state_t *lrm_state, const char *rsc_id,
1072 	                 const char *action, guint interval_ms)
1073 	{
1074 	    lrm_state_t *connection_rsc = NULL;
1075 	    remote_ra_data_t *ra_data = NULL;
1076 	
1077 	    CRM_CHECK(rsc_id != NULL, return -EINVAL);
1078 	
1079 	    connection_rsc = controld_get_executor_state(rsc_id, false);
1080 	    if (!connection_rsc || !connection_rsc->remote_ra_data) {
1081 	        return -EINVAL;
1082 	    }
1083 	
1084 	    ra_data = connection_rsc->remote_ra_data;
1085 	    ra_data->cmds = remove_cmd(ra_data->cmds, action, interval_ms);
1086 	    ra_data->recurring_cmds = remove_cmd(ra_data->recurring_cmds, action,
1087 	                                         interval_ms);
1088 	    if (ra_data->cur_cmd &&
1089 	        (ra_data->cur_cmd->interval_ms == interval_ms) &&
1090 	        (pcmk__str_eq(ra_data->cur_cmd->action, action, pcmk__str_casei))) {
1091 	
1092 	        cmd_set_flags(ra_data->cur_cmd, cmd_cancel);
1093 	    }
1094 	
1095 	    return 0;
1096 	}
1097 	
1098 	static remote_ra_cmd_t *
1099 	handle_dup_monitor(remote_ra_data_t *ra_data, guint interval_ms,
1100 	                   const char *userdata)
1101 	{
1102 	    GList *gIter = NULL;
1103 	    remote_ra_cmd_t *cmd = NULL;
1104 	
1105 	    /* there are 3 places a potential duplicate monitor operation
1106 	     * could exist.
1107 	     * 1. recurring_cmds list. where the op is waiting for its next interval
1108 	     * 2. cmds list, where the op is queued to get executed immediately
1109 	     * 3. cur_cmd, which means the monitor op is in flight right now.
1110 	     */
1111 	    if (interval_ms == 0) {
1112 	        return NULL;
1113 	    }
1114 	
1115 	    if (ra_data->cur_cmd &&
1116 	        !pcmk__is_set(ra_data->cur_cmd->status, cmd_cancel)
1117 	        && (ra_data->cur_cmd->interval_ms == interval_ms)
1118 	        && pcmk__str_eq(ra_data->cur_cmd->action, PCMK_ACTION_MONITOR,
1119 	                        pcmk__str_casei)) {
1120 	
1121 	        cmd = ra_data->cur_cmd;
1122 	        goto handle_dup;
1123 	    }
1124 	
1125 	    for (gIter = ra_data->recurring_cmds; gIter != NULL; gIter = gIter->next) {
1126 	        cmd = gIter->data;
1127 	        if ((cmd->interval_ms == interval_ms)
1128 	            && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
1129 	                            pcmk__str_casei)) {
1130 	            goto handle_dup;
1131 	        }
1132 	    }
1133 	
1134 	    for (gIter = ra_data->cmds; gIter != NULL; gIter = gIter->next) {
1135 	        cmd = gIter->data;
1136 	        if ((cmd->interval_ms == interval_ms)
1137 	            && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
1138 	                            pcmk__str_casei)) {
1139 	            goto handle_dup;
1140 	        }
1141 	    }
1142 	
1143 	    return NULL;
1144 	
1145 	handle_dup:
1146 	
1147 	    pcmk__trace("merging duplicate monitor cmd " PCMK__OP_FMT, cmd->rsc_id,
1148 	                PCMK_ACTION_MONITOR, interval_ms);
1149 	
1150 	    /* update the userdata */
1151 	    if (userdata) {
1152 	       free(cmd->userdata);
1153 	       cmd->userdata = pcmk__str_copy(userdata);
1154 	    }
1155 	
1156 	    /* if we've already reported success, generate a new call id */
1157 	    if (pcmk__is_set(cmd->status, cmd_reported_success)) {
1158 	        cmd->start_time = time(NULL);
1159 	        cmd->call_id = generate_callid();
1160 	        cmd_clear_flags(cmd, cmd_reported_success);
1161 	    }
1162 	
1163 	    /* if we have an interval_id set, that means we are in the process of
1164 	     * waiting for this cmd's next interval. instead of waiting, cancel
1165 	     * the timer and execute the action immediately */
1166 	    if (cmd->interval_id) {
1167 	        g_source_remove(cmd->interval_id);
1168 	        cmd->interval_id = 0;
1169 	        recurring_helper(cmd);
1170 	    }
1171 	
1172 	    return cmd;
1173 	}
1174 	
1175 	/*!
1176 	 * \internal
1177 	 * \brief Execute an action using the (internal) ocf:pacemaker:remote agent
1178 	 *
1179 	 * \param[in]     lrm_state      Executor state object for remote connection
1180 	 * \param[in]     rsc_id         Connection resource ID
1181 	 * \param[in]     action         Action to execute
1182 	 * \param[in]     userdata       String to copy and pass to execution callback
1183 	 * \param[in]     interval_ms    Action interval (in milliseconds)
1184 	 * \param[in]     timeout_ms     Action timeout (in milliseconds)
1185 	 * \param[in]     start_delay_ms Delay (in milliseconds) before executing action
1186 	 * \param[in,out] params         Connection resource parameters
1187 	 * \param[out]    call_id        Where to store call ID on success
1188 	 *
1189 	 * \return Standard Pacemaker return code
1190 	 * \note This takes ownership of \p params, which should not be used or freed
1191 	 *       after calling this function.
1192 	 */
1193 	int
1194 	controld_execute_remote_agent(const lrm_state_t *lrm_state, const char *rsc_id,
1195 	                              const char *action, const char *userdata,
1196 	                              guint interval_ms, int timeout_ms,
1197 	                              int start_delay_ms, lrmd_key_value_t *params,
1198 	                              int *call_id)
1199 	{
1200 	    lrm_state_t *connection_rsc = NULL;
1201 	    remote_ra_cmd_t *cmd = NULL;
1202 	    remote_ra_data_t *ra_data = NULL;
1203 	
1204 	    *call_id = 0;
1205 	
1206 	    CRM_CHECK((lrm_state != NULL) && (rsc_id != NULL) && (action != NULL)
1207 	              && (userdata != NULL) && (call_id != NULL),
1208 	              lrmd_key_value_freeall(params); return EINVAL);
1209 	
1210 	    if (!is_remote_ra_supported_action(action)) {
1211 	        lrmd_key_value_freeall(params);
1212 	        return EOPNOTSUPP;
1213 	    }
1214 	
1215 	    connection_rsc = controld_get_executor_state(rsc_id, false);
1216 	    if (connection_rsc == NULL) {
1217 	        lrmd_key_value_freeall(params);
1218 	        return ENOTCONN;
1219 	    }
1220 	
1221 	    remote_ra_data_init(connection_rsc);
1222 	    ra_data = connection_rsc->remote_ra_data;
1223 	
1224 	    cmd = handle_dup_monitor(ra_data, interval_ms, userdata);
1225 	    if (cmd) {
1226 	        *call_id = cmd->call_id;
1227 	        lrmd_key_value_freeall(params);
1228 	        return pcmk_rc_ok;
1229 	    }
1230 	
1231 	    cmd = pcmk__assert_alloc(1, sizeof(remote_ra_cmd_t));
1232 	
1233 	    cmd->owner = pcmk__str_copy(lrm_state->node_name);
1234 	    cmd->rsc_id = pcmk__str_copy(rsc_id);
1235 	    cmd->action = pcmk__str_copy(action);
1236 	    cmd->userdata = pcmk__str_copy(userdata);
1237 	    cmd->interval_ms = interval_ms;
1238 	    cmd->timeout = timeout_ms;
1239 	    cmd->start_delay = start_delay_ms;
1240 	    cmd->params = params;
1241 	    cmd->start_time = time(NULL);
1242 	
1243 	    cmd->call_id = generate_callid();
1244 	
1245 	    if (cmd->start_delay) {
1246 	        cmd->delay_id = pcmk__create_timer(cmd->start_delay, start_delay_helper, cmd);
1247 	    }
1248 	
1249 	    ra_data->cmds = g_list_append(ra_data->cmds, cmd);
1250 	    mainloop_set_trigger(ra_data->work);
1251 	
1252 	    *call_id = cmd->call_id;
1253 	    return pcmk_rc_ok;
1254 	}
1255 	
1256 	/*!
1257 	 * \internal
1258 	 * \brief Immediately fail all monitors of a remote node, if proxied here
1259 	 *
1260 	 * \param[in] node_name  Name of pacemaker_remote node
1261 	 */
1262 	void
1263 	remote_ra_fail(const char *node_name)
1264 	{
1265 	    lrm_state_t *lrm_state = NULL;
1266 	
1267 	    CRM_CHECK(node_name != NULL, return);
1268 	
1269 	    lrm_state = controld_get_executor_state(node_name, false);
1270 	    if (lrm_state && lrm_state_is_connected(lrm_state)) {
1271 	        remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1272 	
1273 	        pcmk__info("Failing monitors on Pacemaker Remote node %s", node_name);
1274 	        ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
1275 	        ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);
1276 	    }
1277 	}
1278 	
1279 	/* A guest node fencing implied by host fencing looks like:
1280 	 *
1281 	 *  <pseudo_event id="103" operation="stonith" operation_key="stonith-lxc1-off"
1282 	 *                on_node="lxc1" on_node_uuid="lxc1">
1283 	 *     <attributes CRM_meta_on_node="lxc1" CRM_meta_on_node_uuid="lxc1"
1284 	 *                 CRM_meta_stonith_action="off" crm_feature_set="3.0.12"/>
1285 	 *     <downed>
1286 	 *       <node id="lxc1"/>
1287 	 *     </downed>
1288 	 *  </pseudo_event>
1289 	 */
1290 	#define XPATH_PSEUDO_FENCE "/" PCMK__XE_PSEUDO_EVENT \
1291 	    "[@" PCMK_XA_OPERATION "='stonith']/" PCMK__XE_DOWNED "/" PCMK_XE_NODE
1292 	
1293 	/*!
1294 	 * \internal
1295 	 * \brief Check a pseudo-action for Pacemaker Remote node side effects
1296 	 *
1297 	 * \param[in,out] xml  XML of pseudo-action to check
1298 	 */
1299 	void
1300 	remote_ra_process_pseudo(xmlNode *xml)
1301 	{
1302 	    xmlXPathObject *search = pcmk__xpath_search(xml->doc, XPATH_PSEUDO_FENCE);
1303 	
1304 	    if (pcmk__xpath_num_results(search) == 1) {
1305 	        xmlNode *result = pcmk__xpath_result(search, 0);
1306 	
1307 	        /* Normally, we handle the necessary side effects of a guest node stop
1308 	         * action when reporting the remote agent's result. However, if the stop
1309 	         * is implied due to fencing, it will be a fencing pseudo-event, and
1310 	         * there won't be a result to report. Handle that case here.
1311 	         *
1312 	         * This will result in a duplicate call to remote_node_down() if the
1313 	         * guest stop was real instead of implied, but that shouldn't hurt.
1314 	         *
1315 	         * There is still one corner case that isn't handled: if a guest node
1316 	         * isn't running any resources when its host is fenced, it will appear
1317 	         * to be cleanly stopped, so there will be no pseudo-fence, and our
1318 	         * peer cache state will be incorrect unless and until the guest is
1319 	         * recovered.
1320 	         */
1321 	        if (result != NULL) {
1322 	            const char *remote = pcmk__xe_id(result);
1323 	
1324 	            if (remote != NULL) {
1325 	                remote_node_down(remote, true);
1326 	            }
1327 	        }
1328 	    }
1329 	    xmlXPathFreeObject(search);
1330 	}
1331 	
1332 	static void
1333 	remote_ra_maintenance(lrm_state_t * lrm_state, gboolean maintenance)
1334 	{
1335 	    xmlNode *update, *state;
1336 	    int call_opt;
1337 	    pcmk__node_status_t *node = NULL;
1338 	
1339 	    call_opt = crmd_cib_smart_opt();
1340 	    node = pcmk__cluster_lookup_remote_node(lrm_state->node_name);
1341 	    CRM_CHECK(node != NULL, return);
1342 	    update = pcmk__xe_create(NULL, PCMK_XE_STATUS);
1343 	    state = create_node_state_update(node, controld_node_update_none, update,
1344 	                                     __func__);
1345 	    pcmk__xe_set(state, PCMK__XA_NODE_IN_MAINTENANCE, (maintenance? "1" : "0"));
1346 	    if (controld_update_cib(PCMK_XE_STATUS, update, call_opt,
1347 	                            NULL) == pcmk_rc_ok) {
1348 	        /* TODO: still not 100% sure that async update will succeed ... */
1349 	        if (maintenance) {
1350 	            lrm_remote_set_flags(lrm_state, remote_in_maint);
1351 	        } else {
1352 	            lrm_remote_clear_flags(lrm_state, remote_in_maint);
1353 	        }
1354 	    }
1355 	    pcmk__xml_free(update);
1356 	}
1357 	
1358 	#define XPATH_PSEUDO_MAINTENANCE "//" PCMK__XE_PSEUDO_EVENT         \
1359 	    "[@" PCMK_XA_OPERATION "='" PCMK_ACTION_MAINTENANCE_NODES "']/" \
1360 	    PCMK__XE_MAINTENANCE
1361 	
1362 	/*!
1363 	 * \internal
1364 	 * \brief Check a pseudo-action holding updates for maintenance state
1365 	 *
1366 	 * \param[in,out] xml  XML of pseudo-action to check
1367 	 */
1368 	void
1369 	remote_ra_process_maintenance_nodes(xmlNode *xml)
1370 	{
1371 	    xmlNode *maint = pcmk__xpath_find_one(xml->doc, XPATH_PSEUDO_MAINTENANCE,
1372 	                                          PCMK__LOG_NEVER);
1373 	
1374 	    for (xmlNode *node = pcmk__xe_first_child(maint, PCMK_XE_NODE, NULL, NULL);
1375 	         node != NULL; node = pcmk__xe_next(node, PCMK_XE_NODE)) {
1376 	
1377 	        lrm_state_t *lrm_state = NULL;
1378 	        const char *id = pcmk__xe_id(node);
1379 	
1380 	        if (id == NULL) {
1381 	            CRM_LOG_ASSERT(id != NULL);
1382 	            continue;
1383 	        }
1384 	
1385 	        lrm_state = controld_get_executor_state(id, false);
1386 	
1387 	        if ((lrm_state != NULL) && (lrm_state->remote_ra_data != NULL)
1388 	            && pcmk__is_set(lrm_state->remote_ra_data->status, remote_active)) {
1389 	
1390 	            const bool in_maint =
1391 	                pcmk__xe_attr_is_true(node, PCMK__XA_NODE_IN_MAINTENANCE);
1392 	
1393 	            remote_ra_maintenance(lrm_state, in_maint);
1394 	        }
1395 	    }
1396 	}
1397 	
1398 	gboolean
1399 	remote_ra_is_in_maintenance(lrm_state_t * lrm_state)
1400 	{
1401 	    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1402 	    return pcmk__is_set(ra_data->status, remote_in_maint);
1403 	}
1404 	
1405 	gboolean
1406 	remote_ra_controlling_guest(lrm_state_t * lrm_state)
1407 	{
1408 	    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1409 	    return pcmk__is_set(ra_data->status, controlling_guest);
1410 	}
1411