1 /*
2 * Copyright 2013-2026 the Pacemaker project contributors
3 *
4 * The version control history for this file may have further details.
5 *
6 * This source code is licensed under the GNU General Public License version 2
7 * or later (GPLv2+) WITHOUT ANY WARRANTY.
8 */
9
10 #include <crm_internal.h>
11
12 #include <stdbool.h>
13 #include <stdint.h> // UINT32_C
14
15 #include <crm/crm.h>
16 #include <crm/common/xml.h>
17 #include <crm/lrmd.h>
18 #include <crm/lrmd_internal.h>
19 #include <crm/services.h>
20
21 #include <libxml/xpath.h> // xmlXPathObject, etc.
22
23 #include <pacemaker-controld.h>
24
25 #define REMOTE_LRMD_RA "remote"
26
27 /* The max start timeout before cmd retry */
28 #define MAX_START_TIMEOUT_MS 10000
29
30 #define cmd_set_flags(cmd, flags_to_set) do { \
31 (cmd)->status = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, \
32 "Remote command", (cmd)->rsc_id, (cmd)->status, \
33 (flags_to_set), #flags_to_set); \
34 } while (0)
35
36 #define cmd_clear_flags(cmd, flags_to_clear) do { \
37 (cmd)->status = pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE, \
38 "Remote command", (cmd)->rsc_id, (cmd)->status, \
39 (flags_to_clear), #flags_to_clear); \
40 } while (0)
41
42 enum remote_cmd_status {
43 cmd_reported_success = (UINT32_C(1) << 0),
44 cmd_cancel = (UINT32_C(1) << 1),
45 };
46
47 #define lrm_remote_set_flags(lrm_state, flags_to_set) do { \
48 lrm_state_t *lrm = (lrm_state); \
49 remote_ra_data_t *ra = lrm->remote_ra_data; \
50 ra->status = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, "Remote", \
51 lrm->node_name, ra->status, \
52 (flags_to_set), #flags_to_set); \
53 } while (0)
54
55 #define lrm_remote_clear_flags(lrm_state, flags_to_clear) do { \
56 lrm_state_t *lrm = (lrm_state); \
57 remote_ra_data_t *ra = lrm->remote_ra_data; \
58 ra->status = pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE, "Remote", \
59 lrm->node_name, ra->status, \
60 (flags_to_clear), #flags_to_clear); \
61 } while (0)
62
63 enum remote_status {
64 expect_takeover = (UINT32_C(1) << 0),
65 takeover_complete = (UINT32_C(1) << 1),
66 remote_active = (UINT32_C(1) << 2),
67 /* Maintenance mode is difficult to determine from the controller's context,
68 * so we have it signalled back with the transition from the scheduler.
69 */
70 remote_in_maint = (UINT32_C(1) << 3),
71 /* Similar for whether we are controlling a guest node or remote node.
72 * Fortunately there is a meta-attribute in the transition already and
73 * as the situation doesn't change over time we can use the
74 * resource start for noting down the information for later use when
75 * the attributes aren't at hand.
76 */
77 controlling_guest = (UINT32_C(1) << 4),
78 };
79
80 static int handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms);
81 static void handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd);
82 static GList *fail_all_monitor_cmds(GList * list);
83
84 static void
85 free_cmd(gpointer user_data)
86 {
87 remote_ra_cmd_t *cmd = user_data;
88
89 if (!cmd) {
90 return;
91 }
92 if (cmd->delay_id) {
93 g_source_remove(cmd->delay_id);
94 }
95 if (cmd->interval_id) {
96 g_source_remove(cmd->interval_id);
97 }
98 if (cmd->monitor_timeout_id) {
99 g_source_remove(cmd->monitor_timeout_id);
100 }
101 if (cmd->takeover_timeout_id) {
102 g_source_remove(cmd->takeover_timeout_id);
103 }
104 free(cmd->owner);
105 free(cmd->rsc_id);
106 free(cmd->action);
107 free(cmd->userdata);
108 pcmk__reset_result(&(cmd->result));
109 lrmd_key_value_freeall(cmd->params);
110 free(cmd);
111 }
112
/*!
 * \internal
 * \brief Generate a call ID for a synthesized executor event
 *
 * \return Next call ID (always positive; wraps back to 1 after \c INT_MAX)
 */
static int
generate_callid(void)
{
    static int remote_ra_callid = 0;

    /* Wrap before incrementing: incrementing past INT_MAX would be signed
     * integer overflow (undefined behavior). The observable sequence is
     * unchanged — 1, 2, ..., INT_MAX, 1, ...
     */
    if (remote_ra_callid >= INT_MAX) {
        remote_ra_callid = 0;
    }
    remote_ra_callid++;

    return remote_ra_callid;
}
125
126 static gboolean
127 recurring_helper(gpointer data)
128 {
129 remote_ra_cmd_t *cmd = data;
130 lrm_state_t *connection_rsc = NULL;
131
132 cmd->interval_id = 0;
133 connection_rsc = controld_get_executor_state(cmd->rsc_id, false);
134 if (connection_rsc && connection_rsc->remote_ra_data) {
135 remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
136
137 ra_data->recurring_cmds = g_list_remove(ra_data->recurring_cmds, cmd);
138
139 ra_data->cmds = g_list_append(ra_data->cmds, cmd);
140 mainloop_set_trigger(ra_data->work);
141 }
142 return FALSE;
143 }
144
145 static gboolean
146 start_delay_helper(gpointer data)
147 {
148 remote_ra_cmd_t *cmd = data;
149 lrm_state_t *connection_rsc = NULL;
150
151 cmd->delay_id = 0;
152 connection_rsc = controld_get_executor_state(cmd->rsc_id, false);
153 if (connection_rsc && connection_rsc->remote_ra_data) {
154 remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
155
156 mainloop_set_trigger(ra_data->work);
157 }
158 return FALSE;
159 }
160
/*!
 * \internal
 * \brief Check whether a remote node's transient attributes should be purged
 *
 * Purging is the default. We keep the attributes only when the connection
 * resource for the node has been running since before the moment its host was
 * observed to be lost (with some fuzz), meaning the connection likely never
 * actually went down.
 *
 * \param[in] node  Remote node being reintegrated
 *
 * \return \c true if the node's transient attributes should be purged,
 *         otherwise \c false
 */
static bool
should_purge_attributes(pcmk__node_status_t *node)
{
    pcmk__node_status_t *conn_node = NULL;
    lrm_state_t *connection_rsc = NULL;

    // Without a connection host or node name we can't prove anything: purge
    if ((node->conn_host == NULL) || (node->name == NULL)) {
        return true;
    }

    /* Get the node that was hosting the remote connection resource from the
     * peer cache. That's the one we really care about here.
     */
    conn_node = pcmk__get_node(0, node->conn_host, NULL,
                               pcmk__node_search_cluster_member);
    if (conn_node == NULL) {
        return true;
    }

    /* Check the uptime of connection_rsc. If it hasn't been running long
     * enough, set purge=true. "Long enough" means it started running earlier
     * than the timestamp when we noticed it went away in the first place.
     */
    connection_rsc = controld_get_executor_state(node->name, false);

    if (connection_rsc != NULL) {
        lrmd_t *lrm = connection_rsc->conn;
        time_t uptime = lrmd__uptime(lrm);
        time_t now = time(NULL);

        /* Add 20s of fuzziness to give corosync a while to notice the remote
         * host is gone. On various error conditions (failure to get uptime,
         * peer_lost isn't set) we default to purging.
         */
        if (uptime > 0 &&
            conn_node->peer_lost > 0 &&
            uptime + 20 >= now - conn_node->peer_lost) {
            return false;
        }
    }

    return true;
}
204
205 static void
206 purge_remote_node_attrs(int call_opt, pcmk__node_status_t *node)
207 {
208 const bool unlocked_only = pcmk__is_set(controld_globals.flags,
209 controld_shutdown_lock_enabled);
210
211 // Purge node's transient attributes (from attribute manager and CIB)
212 if (should_purge_attributes(node)) {
213 controld_purge_node_attrs(node->name, true);
214 }
215
216 controld_delete_node_history(node->name, unlocked_only, call_opt);
217 }
218
219 /*!
220 * \internal
221 * \brief Handle cluster communication related to pacemaker_remote node joining
222 *
223 * \param[in] node_name Name of newly integrated pacemaker_remote node
224 */
225 static void
226 remote_node_up(const char *node_name)
227 {
228 int call_opt;
229 xmlNode *update, *state;
230 pcmk__node_status_t *node = NULL;
231 lrm_state_t *connection_rsc = NULL;
232
233 CRM_CHECK(node_name != NULL, return);
234 pcmk__info("Announcing Pacemaker Remote node %s", node_name);
235
236 call_opt = crmd_cib_smart_opt();
237
238 /* Delete node's CRM_OP_PROBED attribute. Deleting any attribute ensures
239 * that the attribute manager learns the node is remote. Deletion of this
240 * specfic attribute is a holdover from when it had special meaning.
241 *
242 * @COMPAT Find another way to tell attrd that the node is remote, without
243 * risking deletion or overwrite of an arbitrary attribute. Then work on
244 * deprecating CRM_OP_PROBED.
245 */
246 update_attrd(node_name, CRM_OP_PROBED, NULL, true);
247
248 /* Ensure node is in the remote peer cache with member status */
249 node = pcmk__cluster_lookup_remote_node(node_name);
250 CRM_CHECK((node != NULL) && (node->name != NULL), return);
251
252 purge_remote_node_attrs(call_opt, node);
253 pcmk__update_peer_state(__func__, node, PCMK_VALUE_MEMBER, 0);
254
255 /* Apply any start state that we were given from the environment on the
256 * remote node.
257 */
258 connection_rsc = controld_get_executor_state(node->name, false);
259
260 if (connection_rsc != NULL) {
261 lrmd_t *lrm = connection_rsc->conn;
262 const char *start_state = lrmd__node_start_state(lrm);
263
264 if (start_state) {
265 set_join_state(start_state, node->name, node->xml_id, true);
266 }
267 }
268
269 /* pacemaker_remote nodes don't participate in the membership layer,
270 * so cluster nodes don't automatically get notified when they come and go.
271 * We send a cluster message to the DC, and update the CIB node state entry,
272 * so the DC will get it sooner (via message) or later (via CIB refresh),
273 * and any other interested parties can query the CIB.
274 */
275 broadcast_remote_state_message(node_name, true);
276
277 update = pcmk__xe_create(NULL, PCMK_XE_STATUS);
278 state = create_node_state_update(node, controld_node_update_cluster, update,
279 __func__);
280
281 /* Clear the PCMK__XA_NODE_FENCED flag in the node state. If the node ever
282 * needs to be fenced, this flag will allow various actions to determine
283 * whether the fencing has happened yet.
284 */
285 pcmk__xe_set(state, PCMK__XA_NODE_FENCED, "0");
286
287 /* TODO: If the remote connection drops, and this (async) CIB update either
288 * failed or has not yet completed, later actions could mistakenly think the
289 * node has already been fenced (if the PCMK__XA_NODE_FENCED attribute was
290 * previously set, because it won't have been cleared). This could prevent
291 * actual fencing or allow recurring monitor failures to be cleared too
292 * soon. Ideally, we wouldn't rely on the CIB for the fenced status.
293 */
294 controld_update_cib(PCMK_XE_STATUS, update, call_opt, NULL);
295 pcmk__xml_free(update);
296 }
297
298 /*!
299 * \internal
300 * \brief Handle cluster communication related to pacemaker_remote node leaving
301 *
302 * \param[in] node_name Name of lost node
303 * \param[in] erase_lrm If \c true, erase the LRM history
304 */
305 static void
306 remote_node_down(const char *node_name, bool erase_lrm)
307 {
308 xmlNode *update;
309 int call_opt = crmd_cib_smart_opt();
310 pcmk__node_status_t *node = NULL;
311
312 // Purge node's transient attributes (from attribute manager and CIB)
313 controld_purge_node_attrs(node_name, true);
314
315 /* Normally, the resource history should be kept until the node comes back
316 * up. However, after a successful fence, clear the history so we don't
317 * think resources are still running on the node.
318 */
319 if (erase_lrm) {
320 controld_delete_node_history(node_name, false, call_opt);
321 }
322
323 /* Ensure node is in the remote peer cache with lost state */
324 node = pcmk__cluster_lookup_remote_node(node_name);
325 CRM_CHECK(node != NULL, return);
326 pcmk__update_peer_state(__func__, node, PCMK__VALUE_LOST, 0);
327
328 /* Notify DC */
329 broadcast_remote_state_message(node_name, false);
330
331 /* Update CIB node state */
332 update = pcmk__xe_create(NULL, PCMK_XE_STATUS);
333 create_node_state_update(node, controld_node_update_cluster, update,
334 __func__);
335 controld_update_cib(PCMK_XE_STATUS, update, call_opt, NULL);
336 pcmk__xml_free(update);
337 }
338
339 /*!
340 * \internal
341 * \brief Handle effects of a remote RA command on node state
342 *
343 * \param[in] cmd Completed remote RA command
344 */
345 static void
346 check_remote_node_state(const remote_ra_cmd_t *cmd)
347 {
348 /* Only successful actions can change node state */
349 if (!pcmk__result_ok(&(cmd->result))) {
350 return;
351 }
352
353 if (pcmk__str_eq(cmd->action, PCMK_ACTION_START, pcmk__str_casei)) {
354 remote_node_up(cmd->rsc_id);
355
356 } else if (pcmk__str_eq(cmd->action, PCMK_ACTION_MIGRATE_FROM,
357 pcmk__str_casei)) {
358 /* After a successful migration, we don't need to do remote_node_up()
359 * because the DC already knows the node is up, and we don't want to
360 * clear LRM history etc. We do need to add the remote node to this
361 * host's remote peer cache, because (unless it happens to be DC)
362 * it hasn't been tracking the remote node, and other code relies on
363 * the cache to distinguish remote nodes from unseen cluster nodes.
364 */
365 pcmk__node_status_t *node =
366 pcmk__cluster_lookup_remote_node(cmd->rsc_id);
367
368 CRM_CHECK(node != NULL, return);
369 pcmk__update_peer_state(__func__, node, PCMK_VALUE_MEMBER, 0);
370
371 } else if (pcmk__str_eq(cmd->action, PCMK_ACTION_STOP, pcmk__str_casei)) {
372 lrm_state_t *lrm_state = controld_get_executor_state(cmd->rsc_id,
373 false);
374 remote_ra_data_t *ra_data = lrm_state? lrm_state->remote_ra_data : NULL;
375
376 if (ra_data) {
377 if (!pcmk__is_set(ra_data->status, takeover_complete)) {
378 /* Stop means down if we didn't successfully migrate elsewhere */
379 remote_node_down(cmd->rsc_id, false);
380 } else if (AM_I_DC == FALSE) {
381 /* Only the connection host and DC track node state,
382 * so if the connection migrated elsewhere and we aren't DC,
383 * un-cache the node, so we don't have stale info
384 */
385 pcmk__cluster_forget_remote_node(cmd->rsc_id);
386 }
387 }
388 }
389
390 /* We don't do anything for successful monitors, which is correct for
391 * routine recurring monitors, and for monitors on nodes where the
392 * connection isn't supposed to be (the cluster will stop the connection in
393 * that case). However, if the initial probe finds the connection already
394 * active on the node where we want it, we probably should do
395 * remote_node_up(). Unfortunately, we can't distinguish that case here.
396 * Given that connections have to be initiated by the cluster, the chance of
397 * that should be close to zero.
398 */
399 }
400
401 static void
402 report_remote_ra_result(remote_ra_cmd_t * cmd)
403 {
404 lrmd_event_data_t op = { 0, };
405
406 check_remote_node_state(cmd);
407
408 op.type = lrmd_event_exec_complete;
409 op.rsc_id = cmd->rsc_id;
410 op.op_type = cmd->action;
411 op.user_data = cmd->userdata;
412 op.timeout = cmd->timeout;
413 op.interval_ms = cmd->interval_ms;
414 op.t_run = cmd->start_time;
415 op.t_rcchange = cmd->start_time;
416
417 lrmd__set_result(&op, cmd->result.exit_status, cmd->result.execution_status,
418 cmd->result.exit_reason);
419
|
(1) Event path: |
Condition "pcmk__is_set(cmd->status, cmd_reported_success)", taking true branch. |
|
(2) Event path: |
Condition "!pcmk__result_ok(&cmd->result)", taking false branch. |
420 if (pcmk__is_set(cmd->status, cmd_reported_success)
421 && !pcmk__result_ok(&(cmd->result))) {
422
423 op.t_rcchange = time(NULL);
424 /* This edge case will likely never ever occur, but if it does the
425 * result is that a failure will not be processed correctly. This is only
426 * remotely possible because we are able to detect a connection resource's tcp
427 * connection has failed at any moment after start has completed. The actual
428 * recurring operation is just a connectivity ping.
429 *
430 * basically, we are not guaranteed that the first successful monitor op and
431 * a subsequent failed monitor op will not occur in the same timestamp. We have to
432 * make it look like the operations occurred at separate times though. */
433 if (op.t_rcchange == op.t_run) {
434 op.t_rcchange++;
435 }
436 }
437
|
(3) Event path: |
Condition "cmd->params", taking true branch. |
438 if (cmd->params) {
439 lrmd_key_value_t *tmp;
440
441 op.params = pcmk__strkey_table(free, free);
|
(4) Event path: |
Condition "tmp", taking true branch. |
|
(6) Event path: |
Condition "tmp", taking false branch. |
442 for (tmp = cmd->params; tmp; tmp = tmp->next) {
443 pcmk__insert_dup(op.params, tmp->key, tmp->value);
|
(5) Event path: |
Jumping back to the beginning of the loop. |
444 }
445
446 }
447 op.call_id = cmd->call_id;
448 op.remote_nodename = cmd->owner;
449
450 lrm_op_callback(&op);
451
|
CID (unavailable; MK=2581508de773dadfc07268f26719d327) (#1 of 1): Inconsistent C union access (INCONSISTENT_UNION_ACCESS): |
|
(7) Event assign_union_field: |
The union field "in" of "_pp" is written. |
|
(8) Event inconsistent_union_field_access: |
In "_pp.out", the union field used: "out" is inconsistent with the field most recently stored: "in". |
452 g_clear_pointer(&op.params, g_hash_table_destroy);
453 lrmd__reset_result(&op);
454 }
455
456 /*!
457 * \internal
458 * \brief Return a remote command's remaining timeout in seconds
459 *
460 * \param[in] cmd Remote command to check
461 *
462 * \return Command's remaining timeout in seconds
463 */
464 static int
465 remaining_timeout_sec(const remote_ra_cmd_t *cmd)
466 {
467 return pcmk__timeout_ms2s(cmd->timeout) - (time(NULL) - cmd->start_time);
468 }
469
470 static gboolean
471 retry_start_cmd_cb(gpointer data)
472 {
473 lrm_state_t *lrm_state = data;
474 remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
475 remote_ra_cmd_t *cmd = NULL;
476 int rc = ETIME;
477 int remaining = 0;
478
479 if (!ra_data || !ra_data->cur_cmd) {
480 return FALSE;
481 }
482 cmd = ra_data->cur_cmd;
483 if (!pcmk__is_up_action(cmd->action)) {
484 return FALSE;
485 }
486
487 remaining = remaining_timeout_sec(cmd);
488 if (remaining > 0) {
489 rc = handle_remote_ra_start(lrm_state, cmd, remaining * 1000);
490 } else {
491 pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
492 PCMK_EXEC_TIMEOUT,
493 "Not enough time remains to retry remote connection");
494 }
495
496 if (rc != pcmk_rc_ok) {
497 report_remote_ra_result(cmd);
498
499 if (ra_data->cmds) {
500 mainloop_set_trigger(ra_data->work);
501 }
502 ra_data->cur_cmd = NULL;
503 free_cmd(cmd);
504 } else {
505 /* wait for connection event */
506 }
507
508 return FALSE;
509 }
510
511
/*!
 * \internal
 * \brief Timer callback: stop waiting for an expected connection takeover
 *
 * Fires when a stop command has waited long enough for another node to take
 * over the connection; proceeds with a normal local stop.
 *
 * \param[in,out] data  Pending stop command (\c remote_ra_cmd_t *)
 *
 * \return FALSE (one-shot timer; do not reschedule)
 */
static gboolean
connection_takeover_timeout_cb(gpointer data)
{
    lrm_state_t *lrm_state = NULL;
    remote_ra_cmd_t *cmd = data;

    pcmk__info("takeover event timed out for node %s", cmd->rsc_id);
    cmd->takeover_timeout_id = 0;

    lrm_state = controld_get_executor_state(cmd->rsc_id, false);

    /* NOTE(review): controld_get_executor_state() can return NULL, and
     * handle_remote_ra_stop() asserts a non-NULL state — confirm the executor
     * state is guaranteed to exist while this timer is pending.
     */
    handle_remote_ra_stop(lrm_state, cmd);
    free_cmd(cmd);

    return FALSE;
}
528
529 static gboolean
530 monitor_timeout_cb(gpointer data)
531 {
532 lrm_state_t *lrm_state = NULL;
533 remote_ra_cmd_t *cmd = data;
534
535 lrm_state = controld_get_executor_state(cmd->rsc_id, false);
536
537 pcmk__info("Timed out waiting for remote poke response from %s%s",
538 cmd->rsc_id, ((lrm_state != NULL)? "" : " (no LRM state)"));
539 cmd->monitor_timeout_id = 0;
540 pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_TIMEOUT,
541 "Remote executor did not respond");
542
543 if (lrm_state && lrm_state->remote_ra_data) {
544 remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
545
546 if (ra_data->cur_cmd == cmd) {
547 ra_data->cur_cmd = NULL;
548 }
549 if (ra_data->cmds) {
550 mainloop_set_trigger(ra_data->work);
551 }
552 }
553
554 report_remote_ra_result(cmd);
555 free_cmd(cmd);
556
557 if(lrm_state) {
558 // @TODO Should we move this before reporting the result above?
559 lrm_state_disconnect(lrm_state);
560 }
561 return FALSE;
562 }
563
564 static void
565 synthesize_lrmd_success(lrm_state_t *lrm_state, const char *rsc_id, const char *op_type)
566 {
567 lrmd_event_data_t op = { 0, };
568
569 if (lrm_state == NULL) {
570 /* if lrm_state not given assume local */
571 lrm_state = controld_get_executor_state(NULL, false);
572 }
573 pcmk__assert(lrm_state != NULL);
574
575 op.type = lrmd_event_exec_complete;
576 op.rsc_id = rsc_id;
577 op.op_type = op_type;
578 op.t_run = time(NULL);
579 op.t_rcchange = op.t_run;
580 op.call_id = generate_callid();
581 lrmd__set_result(&op, PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
582 process_lrm_event(lrm_state, &op, NULL, NULL);
583 }
584
/*!
 * \internal
 * \brief Handle an event on a remote executor connection
 *
 * Dispatches events for a Pacemaker Remote connection: new-client takeovers,
 * completed executions (forwarded up), unsolicited disconnections, and
 * connect/poke/disconnect events that complete the currently pending
 * remote RA command.
 *
 * \param[in,out] op  Event to process (must have \c remote_nodename set)
 */
void
remote_lrm_op_callback(lrmd_event_data_t * op)
{
    gboolean cmd_handled = FALSE;
    lrm_state_t *lrm_state = NULL;
    remote_ra_data_t *ra_data = NULL;
    remote_ra_cmd_t *cmd = NULL;

    CRM_CHECK((op != NULL) && (op->remote_nodename != NULL), return);

    pcmk__debug("Processing '%s%s%s' event on remote connection to %s: %s "
                "(%d) status=%s (%d)",
                pcmk__s(op->op_type, ""), ((op->op_type != NULL)? " " : ""),
                lrmd_event_type2str(op->type), op->remote_nodename,
                crm_exit_str((crm_exit_t) op->rc), op->rc,
                pcmk_exec_status_str(op->op_status), op->op_status);

    lrm_state = controld_get_executor_state(op->remote_nodename, false);
    if (!lrm_state || !lrm_state->remote_ra_data) {
        pcmk__debug("No state information found for remote connection event");
        return;
    }
    ra_data = lrm_state->remote_ra_data;

    if (op->type == lrmd_event_new_client) {
        // Another client has connected to the remote daemon

        if (pcmk__is_set(ra_data->status, expect_takeover)) {
            // Great, we knew this was coming
            lrm_remote_clear_flags(lrm_state, expect_takeover);
            lrm_remote_set_flags(lrm_state, takeover_complete);

        } else {
            pcmk__err("Disconnecting from Pacemaker Remote node %s due to "
                      "unexpected client takeover",
                      op->remote_nodename);
            /* In this case, lrmd_tls_connection_destroy() will be called under the control of mainloop. */
            /* Do not free lrm_state->conn yet. */
            /* It'll be freed in the following stop action. */
            lrm_state_disconnect_only(lrm_state);
        }
        return;
    }

    /* filter all EXEC events up */
    if (op->type == lrmd_event_exec_complete) {
        if (pcmk__is_set(ra_data->status, takeover_complete)) {
            pcmk__debug("Ignoring event, this connection is taken over by "
                        "another node");
        } else {
            lrm_op_callback(op);
        }
        return;
    }

    /* Unsolicited disconnect (no command pending): decide whether this is a
     * clean shutdown, a lost connection, or an unmanaged node going away.
     */
    if ((op->type == lrmd_event_disconnect) && (ra_data->cur_cmd == NULL)) {

        if (!pcmk__is_set(ra_data->status, remote_active)) {
            pcmk__debug("Disconnection from Pacemaker Remote node %s complete",
                        lrm_state->node_name);

        } else if (!remote_ra_is_in_maintenance(lrm_state)) {
            pcmk__err("Lost connection to Pacemaker Remote node %s",
                      lrm_state->node_name);
            ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
            ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);

        } else {
            pcmk__notice("Unmanaged Pacemaker Remote node %s disconnected",
                         lrm_state->node_name);
            /* Do roughly what a 'stop' on the remote-resource would do */
            handle_remote_ra_stop(lrm_state, NULL);
            remote_node_down(lrm_state->node_name, false);
            /* now fake the reply of a successful 'stop' */
            synthesize_lrmd_success(NULL, lrm_state->node_name,
                                    PCMK_ACTION_STOP);
        }
        return;
    }

    if (!ra_data->cur_cmd) {
        pcmk__debug("No event to match");
        return;
    }

    cmd = ra_data->cur_cmd;

    /* Start actions and migrate from actions complete after connection
     * comes back to us. */
    if ((op->type == lrmd_event_connect) && pcmk__is_up_action(cmd->action)) {
        if (op->connection_rc < 0) {
            int remaining = remaining_timeout_sec(cmd);

            if ((op->connection_rc == -ENOKEY)
                || (op->connection_rc == -EKEYREJECTED)) {
                // Hard error, don't retry
                pcmk__set_result(&(cmd->result), PCMK_OCF_INVALID_PARAM,
                                 PCMK_EXEC_ERROR,
                                 pcmk_strerror(op->connection_rc));

            } else if (remaining > 3) {
                pcmk__trace("Rescheduling start (%ds remains before timeout)",
                            remaining);
                pcmk__create_timer(1000, retry_start_cmd_cb, lrm_state);
                return;

            } else {
                pcmk__trace("Not enough time before timeout (%ds) to "
                            "reschedule start",
                            remaining);
                pcmk__format_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
                                    PCMK_EXEC_TIMEOUT,
                                    "%s without enough time to retry",
                                    pcmk_strerror(op->connection_rc));
            }

        } else {
            lrm_state_reset_tables(lrm_state, TRUE);
            pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
            lrm_remote_set_flags(lrm_state, remote_active);
        }

        pcmk__debug("Remote connection event matched %s action", cmd->action);
        report_remote_ra_result(cmd);
        cmd_handled = TRUE;

    } else if ((op->type == lrmd_event_poke)
               && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
                               pcmk__str_casei)) {

        if (cmd->monitor_timeout_id) {
            g_source_remove(cmd->monitor_timeout_id);
            cmd->monitor_timeout_id = 0;
        }

        /* Only report success the first time, after that only worry about failures.
         * For this function, if we get the poke pack, it is always a success. Pokes
         * only fail if the send fails, or the response times out. */
        if (!pcmk__is_set(cmd->status, cmd_reported_success)) {
            pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
            report_remote_ra_result(cmd);
            cmd_set_flags(cmd, cmd_reported_success);
        }

        pcmk__debug("Remote poke event matched %s action", cmd->action);

        /* success, keep rescheduling if interval is present. */
        if (cmd->interval_ms && !pcmk__is_set(cmd->status, cmd_cancel)) {
            ra_data->recurring_cmds = g_list_append(ra_data->recurring_cmds, cmd);
            cmd->interval_id = pcmk__create_timer(cmd->interval_ms,
                                                  recurring_helper, cmd);
            cmd = NULL; /* prevent free */
        }
        cmd_handled = TRUE;

    } else if ((op->type == lrmd_event_disconnect)
               && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
                               pcmk__str_casei)) {
        if (pcmk__is_set(ra_data->status, remote_active)
            && !pcmk__is_set(cmd->status, cmd_cancel)) {

            pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
                             PCMK_EXEC_ERROR,
                             "Remote connection unexpectedly dropped "
                             "during monitor");
            report_remote_ra_result(cmd);
            pcmk__err("Remote connection to %s unexpectedly dropped during "
                      "monitor",
                      lrm_state->node_name);
        }
        cmd_handled = TRUE;

    } else {
        pcmk__debug("Event did not match %s action", ra_data->cur_cmd->action);
    }

    if (cmd_handled) {
        // Command finished: clear it, wake the dispatcher for queued work
        ra_data->cur_cmd = NULL;
        if (ra_data->cmds) {
            mainloop_set_trigger(ra_data->work);
        }
        free_cmd(cmd);
    }
}
769
/*!
 * \internal
 * \brief Handle a stop of a remote connection resource locally
 *
 * Clears pending operations (or just resource history if the connection was
 * taken over), disconnects, frees all queued commands, and (if \p cmd is
 * given) reports the stop as successful.
 *
 * \param[in,out] lrm_state  Executor state for the connection (must not be NULL)
 * \param[in,out] cmd        Stop command to report, or NULL if none
 */
static void
handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd)
{
    pcmk__assert(lrm_state != NULL);

    if (!pcmk__is_set(lrm_state->remote_ra_data->status, takeover_complete)) {
        /* delete pending ops when ever the remote connection is intentionally stopped */
        g_hash_table_remove_all(lrm_state->active_ops);
    } else {
        /* we no longer hold the history if this connection has been migrated,
         * however, we keep metadata cache for future use */
        lrm_state_reset_tables(lrm_state, FALSE);
    }

    lrm_remote_clear_flags(lrm_state, remote_active);
    lrm_state_disconnect(lrm_state);

    // Drop every command still queued for this connection
    g_list_free_full(lrm_state->remote_ra_data->cmds, free_cmd);
    lrm_state->remote_ra_data->cmds = NULL;

    g_list_free_full(lrm_state->remote_ra_data->recurring_cmds, free_cmd);
    lrm_state->remote_ra_data->recurring_cmds = NULL;

    lrm_state->remote_ra_data->cur_cmd = NULL;

    if (cmd) {
        pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
        report_remote_ra_result(cmd);
    }
}
800
801 // \return Standard Pacemaker return code
802 static int
803 handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms)
804 {
805 const char *server = NULL;
806 lrmd_key_value_t *tmp = NULL;
807 int port = 0;
808 int timeout_used = timeout_ms > MAX_START_TIMEOUT_MS ? MAX_START_TIMEOUT_MS : timeout_ms;
809 int rc = pcmk_rc_ok;
810
811 for (tmp = cmd->params; tmp; tmp = tmp->next) {
812 if (pcmk__strcase_any_of(tmp->key,
813 PCMK_REMOTE_RA_ADDR, PCMK_REMOTE_RA_SERVER,
814 NULL)) {
815 server = tmp->value;
816
817 } else if (pcmk__str_eq(tmp->key, PCMK_REMOTE_RA_PORT,
818 pcmk__str_none)) {
819 port = atoi(tmp->value);
820
821 } else if (pcmk__str_eq(tmp->key, CRM_META "_" PCMK__META_CONTAINER,
822 pcmk__str_none)) {
823 lrm_remote_set_flags(lrm_state, controlling_guest);
824 }
825 }
826
827 rc = controld_connect_remote_executor(lrm_state, server, port,
828 timeout_used);
829 if (rc != pcmk_rc_ok) {
830 pcmk__format_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
831 PCMK_EXEC_ERROR,
832 "Could not connect to Pacemaker Remote node %s: %s",
833 lrm_state->node_name, pcmk_rc_str(rc));
834 }
835 return rc;
836 }
837
/*!
 * \internal
 * \brief Main-loop trigger: execute queued remote RA commands
 *
 * Processes commands from the connection's work queue until one becomes
 * asynchronous (start/monitor/stop-waiting-for-takeover) or the queue empties.
 *
 * \param[in,out] user_data  Connection's \c lrm_state_t
 *
 * \return TRUE (keep the trigger source)
 */
static gboolean
handle_remote_ra_exec(gpointer user_data)
{
    int rc = 0;
    lrm_state_t *lrm_state = user_data;
    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
    remote_ra_cmd_t *cmd;
    GList *first = NULL;

    if (ra_data->cur_cmd) {
        /* still waiting on previous cmd */
        return TRUE;
    }

    while (ra_data->cmds) {
        // Pop the next queued command
        first = ra_data->cmds;
        cmd = first->data;
        if (cmd->delay_id) {
            /* still waiting for start delay timer to trip */
            return TRUE;
        }

        ra_data->cmds = g_list_remove_link(ra_data->cmds, first);
        g_list_free_1(first);

        if (pcmk__str_any_of(cmd->action, PCMK_ACTION_START,
                             PCMK_ACTION_MIGRATE_FROM, NULL)) {
            lrm_remote_clear_flags(lrm_state, expect_takeover | takeover_complete);
            if (handle_remote_ra_start(lrm_state, cmd,
                                       cmd->timeout) == pcmk_rc_ok) {
                /* take care of this later when we get async connection result */
                pcmk__debug("Initiated async remote connection, %s action will "
                            "complete after connect event",
                            cmd->action);
                ra_data->cur_cmd = cmd;
                return TRUE;
            }
            report_remote_ra_result(cmd);

        } else if (!strcmp(cmd->action, PCMK_ACTION_MONITOR)) {

            if (lrm_state_is_connected(lrm_state) == TRUE) {
                rc = lrm_state_poke_connection(lrm_state);
                if (rc < 0) {
                    pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
                                     PCMK_EXEC_ERROR, pcmk_strerror(rc));
                }
            } else {
                rc = -1;
                pcmk__set_result(&(cmd->result), PCMK_OCF_NOT_RUNNING,
                                 PCMK_EXEC_DONE, "Remote connection inactive");
            }

            if (rc == 0) {
                // Poke sent; the poke (or disconnect) event completes this
                pcmk__debug("Poked Pacemaker Remote at node %s, waiting for "
                            "async response",
                            cmd->rsc_id);
                ra_data->cur_cmd = cmd;
                cmd->monitor_timeout_id = pcmk__create_timer(cmd->timeout, monitor_timeout_cb, cmd);
                return TRUE;
            }
            report_remote_ra_result(cmd);

        } else if (!strcmp(cmd->action, PCMK_ACTION_STOP)) {

            if (pcmk__is_set(ra_data->status, expect_takeover)) {
                /* Briefly wait on stop for an expected takeover to occur. If
                 * the takeover does not occur during the wait, that's fine; it
                 * just means that the remote node's resource history will be
                 * cleared, which will require probing all resources on the
                 * remote node. If the takeover does occur successfully, then we
                 * can leave the status section intact.
                 */
                cmd->takeover_timeout_id = pcmk__create_timer((cmd->timeout/2),
                                                              connection_takeover_timeout_cb,
                                                              cmd);
                ra_data->cur_cmd = cmd;
                return TRUE;
            }

            handle_remote_ra_stop(lrm_state, cmd);

        } else if (strcmp(cmd->action, PCMK_ACTION_MIGRATE_TO) == 0) {
            lrm_remote_clear_flags(lrm_state, takeover_complete);
            lrm_remote_set_flags(lrm_state, expect_takeover);
            pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
            report_remote_ra_result(cmd);

        } else if (pcmk__str_any_of(cmd->action, PCMK_ACTION_RELOAD,
                                    PCMK_ACTION_RELOAD_AGENT, NULL)) {
            /* Currently the only reloadable parameter is
             * PCMK_REMOTE_RA_RECONNECT_INTERVAL, which is only used by the
             * scheduler via the CIB, so reloads are a no-op.
             *
             * @COMPAT DC <2.1.0: We only need to check for "reload" in case
             * we're in a rolling upgrade with a DC scheduling "reload" instead
             * of "reload-agent". An OCF 1.1 "reload" would be a no-op anyway,
             * so this would work for that purpose as well.
             */
            pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
            report_remote_ra_result(cmd);
        }

        free_cmd(cmd);
    }

    return TRUE;
}
946
947 static void
948 remote_ra_data_init(lrm_state_t * lrm_state)
949 {
950 remote_ra_data_t *ra_data = NULL;
951
952 if (lrm_state->remote_ra_data) {
953 return;
954 }
955
956 ra_data = pcmk__assert_alloc(1, sizeof(remote_ra_data_t));
957 ra_data->work = mainloop_add_trigger(G_PRIORITY_HIGH, handle_remote_ra_exec, lrm_state);
958 lrm_state->remote_ra_data = ra_data;
959 }
960
961 void
962 remote_ra_cleanup(lrm_state_t * lrm_state)
963 {
964 if (lrm_state->remote_ra_data == NULL) {
965 return;
966 }
967
968 g_list_free_full(lrm_state->remote_ra_data->cmds, free_cmd);
969 g_list_free_full(lrm_state->remote_ra_data->recurring_cmds, free_cmd);
970 mainloop_destroy_trigger(lrm_state->remote_ra_data->work);
971 g_clear_pointer(&lrm_state->remote_ra_data, free);
972 }
973
974 gboolean
975 is_remote_lrmd_ra(const char *agent, const char *provider, const char *id)
976 {
977 if (agent && provider && !strcmp(agent, REMOTE_LRMD_RA) && !strcmp(provider, "pacemaker")) {
978 return TRUE;
979 }
980 return (id != NULL) && (controld_get_executor_state(id, false) != NULL)
981 && !controld_is_local_node(id);
982 }
983
984 lrmd_rsc_info_t *
985 remote_ra_get_rsc_info(lrm_state_t * lrm_state, const char *rsc_id)
986 {
987 lrmd_rsc_info_t *info = NULL;
988
989 CRM_CHECK(rsc_id != NULL, return NULL);
990
991 if (controld_get_executor_state(rsc_id, false) != NULL) {
992 info = pcmk__assert_alloc(1, sizeof(lrmd_rsc_info_t));
993
994 info->id = pcmk__str_copy(rsc_id);
995 info->type = pcmk__str_copy(REMOTE_LRMD_RA);
996 info->standard = pcmk__str_copy(PCMK_RESOURCE_CLASS_OCF);
997 info->provider = pcmk__str_copy("pacemaker");
998 }
999
1000 return info;
1001 }
1002
1003 static gboolean
1004 is_remote_ra_supported_action(const char *action)
1005 {
1006 return pcmk__str_any_of(action,
1007 PCMK_ACTION_START,
1008 PCMK_ACTION_STOP,
1009 PCMK_ACTION_MONITOR,
1010 PCMK_ACTION_MIGRATE_TO,
1011 PCMK_ACTION_MIGRATE_FROM,
1012 PCMK_ACTION_RELOAD_AGENT,
1013 PCMK_ACTION_RELOAD,
1014 NULL);
1015 }
1016
1017 static GList *
1018 fail_all_monitor_cmds(GList * list)
1019 {
1020 GList *rm_list = NULL;
1021 remote_ra_cmd_t *cmd = NULL;
1022 GList *gIter = NULL;
1023
1024 for (gIter = list; gIter != NULL; gIter = gIter->next) {
1025 cmd = gIter->data;
1026 if ((cmd->interval_ms > 0)
1027 && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
1028 pcmk__str_casei)) {
1029 rm_list = g_list_append(rm_list, cmd);
1030 }
1031 }
1032
1033 for (gIter = rm_list; gIter != NULL; gIter = gIter->next) {
1034 cmd = gIter->data;
1035
1036 pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
1037 PCMK_EXEC_ERROR, "Lost connection to remote executor");
1038 pcmk__trace("Pre-emptively failing %s %s (interval=%u, %s)",
1039 cmd->action, cmd->rsc_id, cmd->interval_ms, cmd->userdata);
1040 report_remote_ra_result(cmd);
1041
1042 list = g_list_remove(list, cmd);
1043 free_cmd(cmd);
1044 }
1045
1046 /* frees only the list data, not the cmds */
1047 g_list_free(rm_list);
1048 return list;
1049 }
1050
1051 static GList *
1052 remove_cmd(GList * list, const char *action, guint interval_ms)
1053 {
1054 remote_ra_cmd_t *cmd = NULL;
1055 GList *gIter = NULL;
1056
1057 for (gIter = list; gIter != NULL; gIter = gIter->next) {
1058 cmd = gIter->data;
1059 if ((cmd->interval_ms == interval_ms)
1060 && pcmk__str_eq(cmd->action, action, pcmk__str_casei)) {
1061 break;
1062 }
1063 cmd = NULL;
1064 }
1065 if (cmd) {
1066 list = g_list_remove(list, cmd);
1067 free_cmd(cmd);
1068 }
1069 return list;
1070 }
1071
1072 int
1073 remote_ra_cancel(lrm_state_t *lrm_state, const char *rsc_id,
1074 const char *action, guint interval_ms)
1075 {
1076 lrm_state_t *connection_rsc = NULL;
1077 remote_ra_data_t *ra_data = NULL;
1078
1079 CRM_CHECK(rsc_id != NULL, return -EINVAL);
1080
1081 connection_rsc = controld_get_executor_state(rsc_id, false);
1082 if (!connection_rsc || !connection_rsc->remote_ra_data) {
1083 return -EINVAL;
1084 }
1085
1086 ra_data = connection_rsc->remote_ra_data;
1087 ra_data->cmds = remove_cmd(ra_data->cmds, action, interval_ms);
1088 ra_data->recurring_cmds = remove_cmd(ra_data->recurring_cmds, action,
1089 interval_ms);
1090 if (ra_data->cur_cmd &&
1091 (ra_data->cur_cmd->interval_ms == interval_ms) &&
1092 (pcmk__str_eq(ra_data->cur_cmd->action, action, pcmk__str_casei))) {
1093
1094 cmd_set_flags(ra_data->cur_cmd, cmd_cancel);
1095 }
1096
1097 return 0;
1098 }
1099
1100 static remote_ra_cmd_t *
1101 handle_dup_monitor(remote_ra_data_t *ra_data, guint interval_ms,
1102 const char *userdata)
1103 {
1104 GList *gIter = NULL;
1105 remote_ra_cmd_t *cmd = NULL;
1106
1107 /* there are 3 places a potential duplicate monitor operation
1108 * could exist.
1109 * 1. recurring_cmds list. where the op is waiting for its next interval
1110 * 2. cmds list, where the op is queued to get executed immediately
1111 * 3. cur_cmd, which means the monitor op is in flight right now.
1112 */
1113 if (interval_ms == 0) {
1114 return NULL;
1115 }
1116
1117 if (ra_data->cur_cmd &&
1118 !pcmk__is_set(ra_data->cur_cmd->status, cmd_cancel)
1119 && (ra_data->cur_cmd->interval_ms == interval_ms)
1120 && pcmk__str_eq(ra_data->cur_cmd->action, PCMK_ACTION_MONITOR,
1121 pcmk__str_casei)) {
1122
1123 cmd = ra_data->cur_cmd;
1124 goto handle_dup;
1125 }
1126
1127 for (gIter = ra_data->recurring_cmds; gIter != NULL; gIter = gIter->next) {
1128 cmd = gIter->data;
1129 if ((cmd->interval_ms == interval_ms)
1130 && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
1131 pcmk__str_casei)) {
1132 goto handle_dup;
1133 }
1134 }
1135
1136 for (gIter = ra_data->cmds; gIter != NULL; gIter = gIter->next) {
1137 cmd = gIter->data;
1138 if ((cmd->interval_ms == interval_ms)
1139 && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
1140 pcmk__str_casei)) {
1141 goto handle_dup;
1142 }
1143 }
1144
1145 return NULL;
1146
1147 handle_dup:
1148
1149 pcmk__trace("merging duplicate monitor cmd " PCMK__OP_FMT, cmd->rsc_id,
1150 PCMK_ACTION_MONITOR, interval_ms);
1151
1152 /* update the userdata */
1153 if (userdata) {
1154 free(cmd->userdata);
1155 cmd->userdata = pcmk__str_copy(userdata);
1156 }
1157
1158 /* if we've already reported success, generate a new call id */
1159 if (pcmk__is_set(cmd->status, cmd_reported_success)) {
1160 cmd->start_time = time(NULL);
1161 cmd->call_id = generate_callid();
1162 cmd_clear_flags(cmd, cmd_reported_success);
1163 }
1164
1165 /* if we have an interval_id set, that means we are in the process of
1166 * waiting for this cmd's next interval. instead of waiting, cancel
1167 * the timer and execute the action immediately */
1168 if (cmd->interval_id) {
1169 g_source_remove(cmd->interval_id);
1170 cmd->interval_id = 0;
1171 recurring_helper(cmd);
1172 }
1173
1174 return cmd;
1175 }
1176
1177 /*!
1178 * \internal
1179 * \brief Execute an action using the (internal) ocf:pacemaker:remote agent
1180 *
1181 * \param[in] lrm_state Executor state object for remote connection
1182 * \param[in] rsc_id Connection resource ID
1183 * \param[in] action Action to execute
1184 * \param[in] userdata String to copy and pass to execution callback
1185 * \param[in] interval_ms Action interval (in milliseconds)
1186 * \param[in] timeout_ms Action timeout (in milliseconds)
1187 * \param[in] start_delay_ms Delay (in milliseconds) before executing action
1188 * \param[in,out] params Connection resource parameters
1189 * \param[out] call_id Where to store call ID on success
1190 *
1191 * \return Standard Pacemaker return code
1192 * \note This takes ownership of \p params, which should not be used or freed
1193 * after calling this function.
1194 */
1195 int
1196 controld_execute_remote_agent(const lrm_state_t *lrm_state, const char *rsc_id,
1197 const char *action, const char *userdata,
1198 guint interval_ms, int timeout_ms,
1199 int start_delay_ms, lrmd_key_value_t *params,
1200 int *call_id)
1201 {
1202 lrm_state_t *connection_rsc = NULL;
1203 remote_ra_cmd_t *cmd = NULL;
1204 remote_ra_data_t *ra_data = NULL;
1205
1206 *call_id = 0;
1207
1208 CRM_CHECK((lrm_state != NULL) && (rsc_id != NULL) && (action != NULL)
1209 && (userdata != NULL) && (call_id != NULL),
1210 lrmd_key_value_freeall(params); return EINVAL);
1211
1212 if (!is_remote_ra_supported_action(action)) {
1213 lrmd_key_value_freeall(params);
1214 return EOPNOTSUPP;
1215 }
1216
1217 connection_rsc = controld_get_executor_state(rsc_id, false);
1218 if (connection_rsc == NULL) {
1219 lrmd_key_value_freeall(params);
1220 return ENOTCONN;
1221 }
1222
1223 remote_ra_data_init(connection_rsc);
1224 ra_data = connection_rsc->remote_ra_data;
1225
1226 cmd = handle_dup_monitor(ra_data, interval_ms, userdata);
1227 if (cmd) {
1228 *call_id = cmd->call_id;
1229 lrmd_key_value_freeall(params);
1230 return pcmk_rc_ok;
1231 }
1232
1233 cmd = pcmk__assert_alloc(1, sizeof(remote_ra_cmd_t));
1234
1235 cmd->owner = pcmk__str_copy(lrm_state->node_name);
1236 cmd->rsc_id = pcmk__str_copy(rsc_id);
1237 cmd->action = pcmk__str_copy(action);
1238 cmd->userdata = pcmk__str_copy(userdata);
1239 cmd->interval_ms = interval_ms;
1240 cmd->timeout = timeout_ms;
1241 cmd->start_delay = start_delay_ms;
1242 cmd->params = params;
1243 cmd->start_time = time(NULL);
1244
1245 cmd->call_id = generate_callid();
1246
1247 if (cmd->start_delay) {
1248 cmd->delay_id = pcmk__create_timer(cmd->start_delay, start_delay_helper, cmd);
1249 }
1250
1251 ra_data->cmds = g_list_append(ra_data->cmds, cmd);
1252 mainloop_set_trigger(ra_data->work);
1253
1254 *call_id = cmd->call_id;
1255 return pcmk_rc_ok;
1256 }
1257
1258 /*!
1259 * \internal
1260 * \brief Immediately fail all monitors of a remote node, if proxied here
1261 *
1262 * \param[in] node_name Name of pacemaker_remote node
1263 */
1264 void
1265 remote_ra_fail(const char *node_name)
1266 {
1267 lrm_state_t *lrm_state = NULL;
1268
1269 CRM_CHECK(node_name != NULL, return);
1270
1271 lrm_state = controld_get_executor_state(node_name, false);
1272 if (lrm_state && lrm_state_is_connected(lrm_state)) {
1273 remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1274
1275 pcmk__info("Failing monitors on Pacemaker Remote node %s", node_name);
1276 ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
1277 ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);
1278 }
1279 }
1280
1281 /* A guest node fencing implied by host fencing looks like:
1282 *
1283 * <pseudo_event id="103" operation="stonith" operation_key="stonith-lxc1-off"
1284 * on_node="lxc1" on_node_uuid="lxc1">
1285 * <attributes CRM_meta_on_node="lxc1" CRM_meta_on_node_uuid="lxc1"
1286 * CRM_meta_stonith_action="off" crm_feature_set="3.0.12"/>
1287 * <downed>
1288 * <node id="lxc1"/>
1289 * </downed>
1290 * </pseudo_event>
1291 */
1292 #define XPATH_PSEUDO_FENCE "/" PCMK__XE_PSEUDO_EVENT \
1293 "[@" PCMK_XA_OPERATION "='stonith']/" PCMK__XE_DOWNED "/" PCMK_XE_NODE
1294
1295 /*!
1296 * \internal
1297 * \brief Check a pseudo-action for Pacemaker Remote node side effects
1298 *
1299 * \param[in,out] xml XML of pseudo-action to check
1300 */
1301 void
1302 remote_ra_process_pseudo(xmlNode *xml)
1303 {
1304 xmlXPathObject *search = pcmk__xpath_search(xml->doc, XPATH_PSEUDO_FENCE);
1305
1306 if (pcmk__xpath_num_results(search) == 1) {
1307 xmlNode *result = pcmk__xpath_result(search, 0);
1308
1309 /* Normally, we handle the necessary side effects of a guest node stop
1310 * action when reporting the remote agent's result. However, if the stop
1311 * is implied due to fencing, it will be a fencing pseudo-event, and
1312 * there won't be a result to report. Handle that case here.
1313 *
1314 * This will result in a duplicate call to remote_node_down() if the
1315 * guest stop was real instead of implied, but that shouldn't hurt.
1316 *
1317 * There is still one corner case that isn't handled: if a guest node
1318 * isn't running any resources when its host is fenced, it will appear
1319 * to be cleanly stopped, so there will be no pseudo-fence, and our
1320 * peer cache state will be incorrect unless and until the guest is
1321 * recovered.
1322 */
1323 if (result != NULL) {
1324 const char *remote = pcmk__xe_id(result);
1325
1326 if (remote != NULL) {
1327 remote_node_down(remote, true);
1328 }
1329 }
1330 }
1331 xmlXPathFreeObject(search);
1332 }
1333
1334 static void
1335 remote_ra_maintenance(lrm_state_t * lrm_state, gboolean maintenance)
1336 {
1337 xmlNode *update, *state;
1338 int call_opt;
1339 pcmk__node_status_t *node = NULL;
1340
1341 call_opt = crmd_cib_smart_opt();
1342 node = pcmk__cluster_lookup_remote_node(lrm_state->node_name);
1343 CRM_CHECK(node != NULL, return);
1344 update = pcmk__xe_create(NULL, PCMK_XE_STATUS);
1345 state = create_node_state_update(node, controld_node_update_none, update,
1346 __func__);
1347 pcmk__xe_set(state, PCMK__XA_NODE_IN_MAINTENANCE, (maintenance? "1" : "0"));
1348 if (controld_update_cib(PCMK_XE_STATUS, update, call_opt,
1349 NULL) == pcmk_rc_ok) {
1350 /* TODO: still not 100% sure that async update will succeed ... */
1351 if (maintenance) {
1352 lrm_remote_set_flags(lrm_state, remote_in_maint);
1353 } else {
1354 lrm_remote_clear_flags(lrm_state, remote_in_maint);
1355 }
1356 }
1357 pcmk__xml_free(update);
1358 }
1359
1360 #define XPATH_PSEUDO_MAINTENANCE "//" PCMK__XE_PSEUDO_EVENT \
1361 "[@" PCMK_XA_OPERATION "='" PCMK_ACTION_MAINTENANCE_NODES "']/" \
1362 PCMK__XE_MAINTENANCE
1363
1364 /*!
1365 * \internal
1366 * \brief Check a pseudo-action holding updates for maintenance state
1367 *
1368 * \param[in,out] xml XML of pseudo-action to check
1369 */
1370 void
1371 remote_ra_process_maintenance_nodes(xmlNode *xml)
1372 {
1373 xmlNode *maint = pcmk__xpath_find_one(xml->doc, XPATH_PSEUDO_MAINTENANCE,
1374 PCMK__LOG_NEVER);
1375
1376 for (xmlNode *node = pcmk__xe_first_child(maint, PCMK_XE_NODE, NULL, NULL);
1377 node != NULL; node = pcmk__xe_next(node, PCMK_XE_NODE)) {
1378
1379 lrm_state_t *lrm_state = NULL;
1380 const char *id = pcmk__xe_id(node);
1381
1382 if (id == NULL) {
1383 CRM_LOG_ASSERT(id != NULL);
1384 continue;
1385 }
1386
1387 lrm_state = controld_get_executor_state(id, false);
1388
1389 if ((lrm_state != NULL) && (lrm_state->remote_ra_data != NULL)
1390 && pcmk__is_set(lrm_state->remote_ra_data->status, remote_active)) {
1391
1392 const bool in_maint =
1393 pcmk__xe_attr_is_true(node, PCMK__XA_NODE_IN_MAINTENANCE);
1394
1395 remote_ra_maintenance(lrm_state, in_maint);
1396 }
1397 }
1398 }
1399
1400 gboolean
1401 remote_ra_is_in_maintenance(lrm_state_t * lrm_state)
1402 {
1403 remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1404 return pcmk__is_set(ra_data->status, remote_in_maint);
1405 }
1406
1407 gboolean
1408 remote_ra_controlling_guest(lrm_state_t * lrm_state)
1409 {
1410 remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1411 return pcmk__is_set(ra_data->status, controlling_guest);
1412 }
1413