1    	/*
2    	 * Copyright 2004-2026 the Pacemaker project contributors
3    	 *
4    	 * The version control history for this file may have further details.
5    	 *
6    	 * This source code is licensed under the GNU General Public License version 2
7    	 * or later (GPLv2+) WITHOUT ANY WARRANTY.
8    	 */
9    	
10   	#include <crm_internal.h>
11   	
12   	#include <stdbool.h>
13   	#include <unistd.h>  /* pid_t, sleep, ssize_t */
14   	
15   	#include <crm/cib.h>
16   	#include <crm/cluster.h>
17   	#include <crm/common/xml.h>
18   	#include <crm/crm.h>
19   	#include <crm/common/ipc.h>
20   	#include <crm/common/ipc_schedulerd.h>
21   	
22   	#include <libxml/xpath.h>               // xmlXPathObject, etc.
23   	
24   	#include <pacemaker-controld.h>
25   	
26   	static pcmk_ipc_api_t *schedulerd_api = NULL;
27   	
28   	static mainloop_timer_t *controld_cib_retry_timer = NULL;
29   	
30   	/*!
31   	 * \internal
32   	 * \brief Save CIB query result to file, raising FSA error
33   	 *
34   	 * \param[in] msg        Ignored
35   	 * \param[in] call_id    Call ID of CIB query
36   	 * \param[in] rc         Return code of CIB query
37   	 * \param[in] output     Result of CIB query
38   	 * \param[in] user_data  Unique identifier for filename
39   	 *
40   	 * \note This is intended to be called after a scheduler connection fails.
41   	 */
42   	static void
43   	save_cib_contents(xmlNode *msg, int call_id, int rc, xmlNode *output,
44   	                  void *user_data)
45   	{
46   	    const char *id = user_data;
47   	
48   	    register_fsa_error(I_ERROR, NULL);
49   	    CRM_CHECK(id != NULL, return);
50   	
51   	    if (rc == pcmk_ok) {
52   	        char *filename = pcmk__assert_asprintf(PCMK_SCHEDULER_INPUT_DIR
53   	                                               "/pe-core-%s.bz2",
54   	                                               id);
55   	
56   	        if (pcmk__xml_write_file(output, filename, true) != pcmk_rc_ok) {
57   	            pcmk__err("Could not save CIB to %s after scheduler crash",
58   	                      filename);
59   	        } else {
60   	            pcmk__notice("Saved CIB to %s after scheduler crash", filename);
61   	        }
62   	        free(filename);
63   	    }
64   	}
65   	
66   	/*!
67   	 * \internal
68   	 * \brief Respond to scheduler connection failure
69   	 */
70   	static void
71   	handle_disconnect(void)
72   	{
73   	    // If we aren't connected to the scheduler, we can't expect a reply
74   	    controld_expect_sched_reply(NULL);
75   	
76   	    if (pcmk__is_set(controld_globals.fsa_input_register, R_PE_REQUIRED)) {
77   	        int rc = pcmk_ok;
78   	        char *uuid_str = pcmk__generate_uuid();
79   	
80   	        pcmk__crit("Lost connection to the scheduler "
81   	                   QB_XS " CIB will be saved to "
82   	                   PCMK_SCHEDULER_INPUT_DIR "/pe-core-%s.bz2",
83   	                   uuid_str);
84   	
85   	        /* Save the current CIB so that we have a chance of figuring out what
86   	         * killed the scheduler.
87   	         *
88   	         * Delay registering an I_ERROR until the query completes or times out.
89   	         */
90   	        rc = controld_globals.cib_conn->cmds->query(controld_globals.cib_conn,
91   	                                                    NULL, NULL, cib_none);
92   	        fsa_register_cib_callback(rc, uuid_str, save_cib_contents);
93   	    }
94   	
95   	    controld_clear_fsa_input_flags(R_PE_CONNECTED);
96   	    controld_trigger_fsa();
97   	}
98   	
99   	static void
100  	handle_reply(pcmk_schedulerd_api_reply_t *reply)
101  	{
102  	    const char *msg_ref = NULL;
103  	
104  	    if (!AM_I_DC) {
105  	        return;
106  	    }
107  	
108  	    pcmk__assert(reply != NULL);
109  	    msg_ref = reply->data.graph.reference;
110  	
111  	    if (msg_ref == NULL) {
112  	        pcmk__err(CRM_OP_PECALC " - Ignoring calculation with no reference");
113  	
114  	    } else if (pcmk__str_eq(msg_ref, controld_globals.fsa_pe_ref,
115  	                            pcmk__str_none)) {
116  	        ha_msg_input_t fsa_input = { NULL, NULL };
117  	        xmlNode *crm_data_node = NULL;
118  	
119  	        controld_stop_sched_timer();
120  	
121  	        /* do_te_invoke() (which will eventually process the fsa_input we are
122  	         * constructing here) requires that fsa_input.xml be non-NULL. That will
123  	         * happen only if copy_ha_msg_input() (which is called by
124  	         * register_fsa_input_adv()) sees the fsa_input.msg that it is
125  	         * expecting. The scheduler's IPC dispatch function gave us the values
126  	         * we need, so we just need to put them into XML.
127  	         *
128  	         * The name of the top-level element here is irrelevant. Nothing checks
129  	         * it.
130  	         */
131  	        fsa_input.msg = pcmk__xe_create(NULL, "dummy-reply");
132  	        pcmk__xe_set(fsa_input.msg, PCMK_XA_REFERENCE, msg_ref);
133  	        pcmk__xe_set(fsa_input.msg, PCMK__XA_CRM_TGRAPH_IN,
134  	                     reply->data.graph.input);
135  	
136  	        crm_data_node = pcmk__xe_create(fsa_input.msg, PCMK__XE_CRM_XML);
137  	        pcmk__xml_copy(crm_data_node, reply->data.graph.tgraph);
138  	        controld_fsa_append(C_IPC_MESSAGE, I_PE_SUCCESS, &fsa_input);
139  	
140  	        pcmk__xml_free(fsa_input.msg);
141  	
142  	    } else {
143  	        pcmk__info("%s calculation %s is obsolete", CRM_OP_PECALC, msg_ref);
144  	    }
145  	}
146  	
147  	static void
148  	scheduler_event_callback(pcmk_ipc_api_t *api, enum pcmk_ipc_event event_type,
149  	                         crm_exit_t status, void *event_data, void *user_data)
150  	{
151  	    switch (event_type) {
152  	        case pcmk_ipc_event_disconnect:
153  	            handle_disconnect();
154  	            break;
155  	
156  	        case pcmk_ipc_event_reply:
157  	            handle_reply((pcmk_schedulerd_api_reply_t *) event_data);
158  	            break;
159  	
160  	        default:
161  	            break;
162  	    }
163  	}
164  	
165  	static bool
166  	new_schedulerd_ipc_connection(void)
167  	{
168  	    int rc = pcmk_rc_ok;
169  	
170  	    controld_set_fsa_input_flags(R_PE_REQUIRED);
171  	
172  	    if (schedulerd_api == NULL) {
173  	        rc = pcmk_new_ipc_api(&schedulerd_api, pcmk_ipc_schedulerd);
174  	
175  	        if (rc != pcmk_rc_ok) {
176  	            pcmk__err("Error connecting to the scheduler: %s", pcmk_rc_str(rc));
177  	            return false;
178  	        }
179  	    }
180  	
181  	    pcmk_register_ipc_callback(schedulerd_api, scheduler_event_callback, NULL);
182  	
183  	    rc = pcmk__connect_ipc_retry_conrefused(schedulerd_api,
184  	                                            pcmk_ipc_dispatch_main, 3);
185  	    if (rc != pcmk_rc_ok) {
186  	        pcmk__err("Error connecting to %s: %s",
187  	                  pcmk_ipc_name(schedulerd_api, true), pcmk_rc_str(rc));
188  	        return false;
189  	    }
190  	
191  	    controld_set_fsa_input_flags(R_PE_CONNECTED);
192  	    return true;
193  	}
194  	
195  	/*!
196  	 * \internal
197  	 * \brief Close any scheduler connection and free associated memory
198  	 */
199  	void
200  	controld_shutdown_schedulerd_ipc(void)
201  	{
202  	    controld_clear_fsa_input_flags(R_PE_REQUIRED);
203  	    pcmk_disconnect_ipc(schedulerd_api);
204  	    handle_disconnect();
205  	    g_clear_pointer(&schedulerd_api, pcmk_free_ipc_api);
206  	}
207  	
208  	static void do_pe_invoke_callback(xmlNode *msg, int call_id, int rc,
209  	                                  xmlNode *output, void *user_data);
210  	
211  	// A_PE_START, A_PE_STOP, O_PE_RESTART
212  	void
213  	do_pe_control(long long action, enum crmd_fsa_cause cause,
214  	              enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input,
215  	              fsa_data_t *msg_data)
216  	{
217  	    if (pcmk__is_set(action, A_PE_STOP)) {
218  	        controld_clear_fsa_input_flags(R_PE_REQUIRED);
219  	        pcmk_disconnect_ipc(schedulerd_api);
220  	        handle_disconnect();
221  	    }
222  	
223  	    if (pcmk__is_set(action, A_PE_START)
224  	        && !pcmk__is_set(controld_globals.fsa_input_register, R_PE_CONNECTED)) {
225  	
226  	        if (cur_state == S_STOPPING) {
227  	            pcmk__info("Ignoring request to connect to scheduler while "
228  	                       "shutting down");
229  	
230  	        } else if (!new_schedulerd_ipc_connection()) {
231  	            pcmk__warn("Could not connect to scheduler");
232  	            register_fsa_error(I_FAIL, msg_data);
233  	        }
234  	    }
235  	}
236  	
237  	static int fsa_pe_query = 0;
238  	static mainloop_timer_t *controld_sched_timer = NULL;
239  	
240  	// @TODO Make this a configurable cluster option if there's demand for it
241  	#define SCHED_TIMEOUT_MS (120000)
242  	
243  	/*!
244  	 * \internal
245  	 * \brief Handle a timeout waiting for scheduler reply
246  	 *
247  	 * \param[in] user_data  Ignored
248  	 *
249  	 * \return FALSE (indicating that timer should not be restarted)
250  	 */
251  	static gboolean
252  	controld_sched_timeout(gpointer user_data)
253  	{
254  	    if (AM_I_DC) {
255  	        /* If this node is the DC but can't communicate with the scheduler, just
256  	         * exit (and likely get fenced) so this node doesn't interfere with any
257  	         * further DC elections.
258  	         *
259  	         * @TODO We could try something less drastic first, like disconnecting
260  	         * and reconnecting to the scheduler, but something is likely going
261  	         * seriously wrong, so perhaps it's better to just fail as quickly as
262  	         * possible.
263  	         */
264  	        crmd_exit(CRM_EX_FATAL);
265  	    }
266  	    return FALSE;
267  	}
268  	
269  	void
270  	controld_stop_sched_timer(void)
271  	{
272  	    if ((controld_sched_timer != NULL)
273  	        && (controld_globals.fsa_pe_ref != NULL)) {
274  	        pcmk__trace("Stopping timer for scheduler reply %s",
275  	                    controld_globals.fsa_pe_ref);
276  	    }
277  	    mainloop_timer_stop(controld_sched_timer);
278  	}
279  	
280  	/*!
281  	 * \internal
282  	 * \brief Set the scheduler request currently being waited on
283  	 *
284  	 * \param[in] ref  Request to expect reply to (or NULL for none)
285  	 *
286  	 * \note This function takes ownership of \p ref.
287  	 */
288  	void
289  	controld_expect_sched_reply(char *ref)
290  	{
291  	    if (ref) {
292  	        if (controld_sched_timer == NULL) {
293  	            controld_sched_timer = mainloop_timer_add("scheduler_reply_timer",
294  	                                                      SCHED_TIMEOUT_MS, FALSE,
295  	                                                      controld_sched_timeout,
296  	                                                      NULL);
297  	        }
298  	        mainloop_timer_start(controld_sched_timer);
299  	    } else {
300  	        controld_stop_sched_timer();
301  	    }
302  	    free(controld_globals.fsa_pe_ref);
303  	    controld_globals.fsa_pe_ref = ref;
304  	}
305  	
306  	/*!
307  	 * \internal
308  	 * \brief Free the scheduler reply timer
309  	 */
310  	void
311  	controld_free_sched_timer(void)
312  	{
CID (unavailable; MK=bfc207524bf2ec5ae9792502022cd658) (#1 of 1): Inconsistent C union access (INCONSISTENT_UNION_ACCESS):
(1) Event assign_union_field: The union field "in" of "_pp" is written.
(2) Event inconsistent_union_field_access: In "_pp.out", the union field used: "out" is inconsistent with the field most recently stored: "in".
313  	    g_clear_pointer(&controld_sched_timer, mainloop_timer_del);
314  	}
315  	
316  	// A_PE_INVOKE
317  	void
318  	do_pe_invoke(long long action, enum crmd_fsa_cause cause,
319  	             enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input,
320  	             fsa_data_t *msg_data)
321  	{
322  	    cib_t *cib_conn = controld_globals.cib_conn;
323  	
324  	    if (!AM_I_DC) {
325  	        pcmk__err("Not invoking scheduler because not DC: %s",
326  	                  fsa_action2string(action));
327  	        return;
328  	    }
329  	
330  	    if (!pcmk__is_set(controld_globals.fsa_input_register, R_PE_CONNECTED)) {
331  	        if (pcmk__is_set(controld_globals.fsa_input_register, R_SHUTDOWN)) {
332  	            pcmk__err("Cannot shut down gracefully without the scheduler");
333  	            controld_fsa_prepend(C_FSA_INTERNAL, I_TERMINATE, NULL);
334  	
335  	        } else {
336  	            pcmk__info("Waiting for the scheduler to connect");
337  	            controld_fsa_stall(msg_data, action);
338  	            controld_set_fsa_action_flags(A_PE_START);
339  	            controld_trigger_fsa();
340  	        }
341  	        return;
342  	    }
343  	
344  	    if (cur_state != S_POLICY_ENGINE) {
345  	        pcmk__notice("Not invoking scheduler because in state %s",
346  	                     fsa_state2string(cur_state));
347  	        return;
348  	    }
349  	
350  	    if (!pcmk__is_set(controld_globals.fsa_input_register, R_HAVE_CIB)) {
351  	        pcmk__err("Attempted to invoke scheduler without consistent CIB");
352  	
353  	        // Start the join from scratch
354  	        controld_fsa_prepend(C_FSA_INTERNAL, I_ELECTION, NULL);
355  	        return;
356  	    }
357  	
358  	    if (controld_cib_retry_timer != NULL) {
359  	        pcmk__debug("Not invoking scheduler until CIB retry timer expires");
360  	        return;
361  	    }
362  	
363  	    fsa_pe_query = cib_conn->cmds->query(cib_conn, NULL, NULL, cib_none);
364  	
365  	    pcmk__debug("Query %d: Requesting the current CIB: %s", fsa_pe_query,
366  	                fsa_state2string(controld_globals.fsa_state));
367  	
368  	    controld_expect_sched_reply(NULL);
369  	    fsa_register_cib_callback(fsa_pe_query, NULL, do_pe_invoke_callback);
370  	}
371  	
372  	static void
373  	force_local_option(xmlNode *xml, const char *attr_name, const char *attr_value)
374  	{
375  	    int max = 0;
376  	    int lpc = 0;
377  	    const char *xpath_base = NULL;
378  	    char *xpath_string = NULL;
379  	    xmlXPathObject *xpathObj = NULL;
380  	
381  	    xpath_base = pcmk_cib_xpath_for(PCMK_XE_CRM_CONFIG);
382  	    if (xpath_base == NULL) {
383  	        pcmk__err(PCMK_XE_CRM_CONFIG " CIB element not known (bug?)");
384  	        return;
385  	    }
386  	
387  	    xpath_string = pcmk__assert_asprintf("%s//%s//nvpair[@name='%s']",
388  	                                         xpath_base,
389  	                                         PCMK_XE_CLUSTER_PROPERTY_SET,
390  	                                         attr_name);
391  	    xpathObj = pcmk__xpath_search(xml->doc, xpath_string);
392  	    max = pcmk__xpath_num_results(xpathObj);
393  	    free(xpath_string);
394  	
395  	    for (lpc = 0; lpc < max; lpc++) {
396  	        xmlNode *match = pcmk__xpath_result(xpathObj, lpc);
397  	
398  	        if (match == NULL) {
399  	            continue;
400  	        }
401  	        pcmk__trace("Forcing %s/%s = %s", pcmk__xe_id(match), attr_name,
402  	                    attr_value);
403  	        pcmk__xe_set(match, PCMK_XA_VALUE, attr_value);
404  	    }
405  	
406  	    if(max == 0) {
407  	        xmlNode *configuration = NULL;
408  	        xmlNode *crm_config = NULL;
409  	        xmlNode *cluster_property_set = NULL;
410  	
411  	        pcmk__trace("Creating " PCMK_VALUE_CIB_BOOTSTRAP_OPTIONS "-%s for "
412  	                    "%s=%s",
413  	                    attr_name, attr_name, attr_value);
414  	
415  	        configuration = pcmk__xe_first_child(xml, PCMK_XE_CONFIGURATION, NULL,
416  	                                             NULL);
417  	        if (configuration == NULL) {
418  	            configuration = pcmk__xe_create(xml, PCMK_XE_CONFIGURATION);
419  	        }
420  	
421  	        crm_config = pcmk__xe_first_child(configuration, PCMK_XE_CRM_CONFIG,
422  	                                          NULL, NULL);
423  	        if (crm_config == NULL) {
424  	            crm_config = pcmk__xe_create(configuration, PCMK_XE_CRM_CONFIG);
425  	        }
426  	
427  	        cluster_property_set =
428  	            pcmk__xe_first_child(crm_config, PCMK_XE_CLUSTER_PROPERTY_SET, NULL,
429  	                                 NULL);
430  	        if (cluster_property_set == NULL) {
431  	            cluster_property_set =
432  	                pcmk__xe_create(crm_config, PCMK_XE_CLUSTER_PROPERTY_SET);
433  	            pcmk__xe_set(cluster_property_set, PCMK_XA_ID,
434  	                         PCMK_VALUE_CIB_BOOTSTRAP_OPTIONS);
435  	        }
436  	
437  	        xml = pcmk__xe_create(cluster_property_set, PCMK_XE_NVPAIR);
438  	
439  	        pcmk__xe_set_id(xml, "%s-%s",
440  	                        PCMK_VALUE_CIB_BOOTSTRAP_OPTIONS, attr_name);
441  	        pcmk__xe_set(xml, PCMK_XA_NAME, attr_name);
442  	        pcmk__xe_set(xml, PCMK_XA_VALUE, attr_value);
443  	    }
444  	    xmlXPathFreeObject(xpathObj);
445  	}
446  	
447  	static gboolean
448  	sleep_timer(gpointer data)
449  	{
450  	    controld_set_fsa_action_flags(A_PE_INVOKE);
451  	    controld_trigger_fsa();
452  	    g_clear_pointer(&controld_cib_retry_timer, mainloop_timer_del);
453  	    return G_SOURCE_REMOVE;
454  	}
455  	
456  	static void
457  	do_pe_invoke_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
458  	{
459  	    char *ref = NULL;
460  	    pid_t watchdog = pcmk__locate_sbd();
461  	
462  	    if (rc != pcmk_ok) {
463  	        pcmk__err("Could not retrieve the CIB: %s " QB_XS " rc=%d call=%d",
464  	                  pcmk_strerror(rc), rc, call_id);
465  	        register_fsa_error(I_ERROR, NULL);
466  	        return;
467  	
468  	    } else if (call_id != fsa_pe_query) {
469  	        pcmk__trace("Skipping superseded CIB query: %d (current=%d)", call_id,
470  	                    fsa_pe_query);
471  	        return;
472  	
473  	    } else if (!AM_I_DC
474  	               || !pcmk__is_set(controld_globals.fsa_input_register,
475  	                                R_PE_CONNECTED)) {
476  	        pcmk__debug("No need to invoke the scheduler anymore");
477  	        return;
478  	
479  	    } else if (controld_globals.fsa_state != S_POLICY_ENGINE) {
480  	        pcmk__debug("Discarding scheduler request in state: %s",
481  	                    fsa_state2string(controld_globals.fsa_state));
482  	        return;
483  	
484  	    /* this callback counts as 1 */
485  	    } else if (num_cib_op_callbacks() > 1) {
486  	        pcmk__debug("Re-asking for the CIB: %d other peer updates still "
487  	                    "pending", (num_cib_op_callbacks() - 1));
488  	
489  	        controld_cib_retry_timer = mainloop_timer_add("cib_retry", 1000, false,
490  	                                                      sleep_timer, NULL);
491  	        mainloop_timer_start(controld_cib_retry_timer);
492  	        return;
493  	    }
494  	
495  	    CRM_LOG_ASSERT(output != NULL);
496  	
497  	    /* Refresh the remote node cache and the known node cache when the
498  	     * scheduler is invoked */
499  	    pcmk__refresh_node_caches_from_cib(output);
500  	
501  	    pcmk__xe_set(output, PCMK_XA_DC_UUID, controld_globals.our_uuid);
502  	    pcmk__xe_set_bool(output, PCMK_XA_HAVE_QUORUM,
503  	                      pcmk__is_set(controld_globals.flags,
504  	                                   controld_has_quorum));
505  	
506  	    force_local_option(output, PCMK_OPT_HAVE_WATCHDOG, pcmk__btoa(watchdog));
507  	
508  	    if (pcmk__is_set(controld_globals.flags, controld_ever_had_quorum)
509  	        && !pcmk__cluster_has_quorum()) {
510  	
511  	        pcmk__xe_set_int(output, PCMK_XA_NO_QUORUM_PANIC, 1);
512  	    }
513  	
514  	    rc = pcmk_schedulerd_api_graph(schedulerd_api, output, &ref);
515  	    if (rc != pcmk_rc_ok) {
516  	        free(ref);
517  	        pcmk__err("Could not contact the scheduler: %s " QB_XS " rc=%d",
518  	                  pcmk_rc_str(rc), rc);
519  	        register_fsa_error(I_ERROR, NULL);
520  	
521  	    } else {
522  	        pcmk__assert(ref != NULL);
523  	        controld_expect_sched_reply(ref);
524  	        pcmk__debug("Invoking the scheduler: query=%d, ref=%s, seq=%llu, "
525  	                    "quorate=%s",
526  	                    fsa_pe_query, controld_globals.fsa_pe_ref,
527  	                    controld_globals.peer_seq,
528  	                    pcmk__flag_text(controld_globals.flags,
529  	                                    controld_has_quorum));
530  	    }
531  	}
532