1    	/*
2    	 * Copyright 2013-2026 the Pacemaker project contributors
3    	 *
4    	 * The version control history for this file may have further details.
5    	 *
6    	 * This source code is licensed under the GNU General Public License version 2
7    	 * or later (GPLv2+) WITHOUT ANY WARRANTY.
8    	 */
9    	
10   	#include <crm_internal.h>
11   	
12   	#include <sys/types.h>
13   	#include <sys/stat.h>
14   	
15   	#include <unistd.h>
16   	#include <ctype.h>
17   	#include <dirent.h>
18   	
19   	#include <crm/crm.h>
20   	#include <crm/common/xml.h>
21   	#include <crm/cluster.h>
22   	
23   	#include <pacemaker-controld.h>
24   	
/* Load levels that a node can report to its peers
 *
 * The numeric values are exchanged between nodes in throttle messages, so
 * although they don't need to be bits, these particular values must be kept
 * for backward compatibility during rolling upgrades.
 */
enum throttle_state_e {
    throttle_none    = 0x0000,  // Negligible load
    throttle_low     = 0x0001,  // Low load
    throttle_med     = 0x0010,  // Medium load
    throttle_high    = 0x0100,  // High load
    throttle_extreme = 0x1000,  // Extreme load
};
35   	
36   	struct throttle_record_s {
37   	    int max;
38   	    enum throttle_state_e mode;
39   	    char *node;
40   	};
41   	
42   	static int throttle_job_max = 0;
43   	static float throttle_load_target = 0.0;
44   	
45   	#define THROTTLE_FACTOR_LOW    1.2
46   	#define THROTTLE_FACTOR_MEDIUM 1.6
47   	#define THROTTLE_FACTOR_HIGH   2.0
48   	
49   	static GHashTable *throttle_records = NULL;
50   	static mainloop_timer_t *throttle_timer = NULL;
51   	
52   	static const char *
53   	load2str(enum throttle_state_e mode)
54   	{
55   	    switch (mode) {
56   	        case throttle_extreme:  return "extreme";
57   	        case throttle_high:     return "high";
58   	        case throttle_med:      return "medium";
59   	        case throttle_low:      return "low";
60   	        case throttle_none:     return "negligible";
61   	        default:                return "undetermined";
62   	    }
63   	}
64   	
65   	/*!
66   	 * \internal
67   	 * \brief Check a load value against throttling thresholds
68   	 *
69   	 * \param[in] load        Load value to check
70   	 * \param[in] desc        Description of metric (for logging)
71   	 * \param[in] thresholds  Low/medium/high/extreme thresholds
72   	 *
73   	 * \return Throttle mode corresponding to load value
74   	 */
75   	static enum throttle_state_e
76   	throttle_check_thresholds(float load, const char *desc,
77   	                          const float thresholds[4])
78   	{
79   	    if (load > thresholds[3]) {
80   	        pcmk__notice("Extreme %s detected: %f", desc, load);
81   	        return throttle_extreme;
82   	
83   	    } else if (load > thresholds[2]) {
84   	        pcmk__notice("High %s detected: %f", desc, load);
85   	        return throttle_high;
86   	
87   	    } else if (load > thresholds[1]) {
88   	        pcmk__info("Moderate %s detected: %f", desc, load);
89   	        return throttle_med;
90   	
91   	    } else if (load > thresholds[0]) {
92   	        pcmk__debug("Noticeable %s detected: %f", desc, load);
93   	        return throttle_low;
94   	    }
95   	
96   	    pcmk__trace("Negligible %s detected: %f", desc, load);
97   	    return throttle_none;
98   	}
99   	
100  	static enum throttle_state_e
101  	throttle_handle_load(float load, const char *desc, int cores)
102  	{
103  	    float normalize;
104  	    float thresholds[4];
105  	
106  	    if (cores == 1) {
107  	        /* On a single core machine, a load of 1.0 is already too high */
108  	        normalize = 0.6;
109  	
110  	    } else {
111  	        /* Normalize the load to be per-core */
112  	        normalize = cores;
113  	    }
114  	    thresholds[0] = throttle_load_target * normalize * THROTTLE_FACTOR_LOW;
115  	    thresholds[1] = throttle_load_target * normalize * THROTTLE_FACTOR_MEDIUM;
116  	    thresholds[2] = throttle_load_target * normalize * THROTTLE_FACTOR_HIGH;
117  	    thresholds[3] = load + 1.0; /* never extreme */
118  	
119  	    return throttle_check_thresholds(load, desc, thresholds);
120  	}
121  	
122  	static enum throttle_state_e
123  	throttle_mode(void)
124  	{
125  	    enum throttle_state_e mode = throttle_none;
126  	
127  	    unsigned int cores = pcmk__procfs_num_cores();
128  	    float load;
129  	    float thresholds[4];
130  	
131  	    if (pcmk__throttle_cib_load(PCMK__SERVER_BASED, &load)) {
132  	        float cib_max_cpu = 0.95;
133  	
134  	        /* The CIB is a single-threaded task and thus cannot consume more
135  	         * than 100% of a CPU (and 1/cores of the overall system load).
136  	         *
137  	         * On a many-cored system, the CIB might therefore be maxed out (causing
138  	         * operations to fail or appear to fail) even though the overall system
139  	         * load is still reasonable.
140  	         *
141  	         * Therefore, the 'normal' thresholds can not apply here, and we need a
142  	         * special case.
143  	         */
144  	        if (cores == 1) {
145  	            cib_max_cpu = 0.4;
146  	        }
147  	        if ((throttle_load_target > 0.0) && (throttle_load_target < cib_max_cpu)) {
148  	            cib_max_cpu = throttle_load_target;
149  	        }
150  	
151  	        thresholds[0] = cib_max_cpu * 0.8;
152  	        thresholds[1] = cib_max_cpu * 0.9;
153  	        thresholds[2] = cib_max_cpu;
154  	        /* Can only happen on machines with a low number of cores */
155  	        thresholds[3] = cib_max_cpu * 1.5;
156  	
157  	        mode = throttle_check_thresholds(load, "CIB load", thresholds);
158  	    }
159  	
160  	    if (throttle_load_target <= 0) {
161  	        /* If we ever make this a valid value, the cluster will at least behave
162  	         * as expected
163  	         */
164  	        return mode;
165  	    }
166  	
167  	    if (pcmk__throttle_load_avg(&load)) {
168  	        enum throttle_state_e cpu_load;
169  	
170  	        cpu_load = throttle_handle_load(load, "CPU load", cores);
171  	        if (cpu_load > mode) {
172  	            mode = cpu_load;
173  	        }
174  	        pcmk__debug("Current load is %f across %u core(s)", load, cores);
175  	    }
176  	
177  	    return mode;
178  	}
179  	
180  	static void
181  	throttle_send_command(enum throttle_state_e mode)
182  	{
183  	    xmlNode *xml = NULL;
184  	    static enum throttle_state_e last = -1;
185  	
186  	    if(mode != last) {
187  	        pcmk__info("New throttle mode: %s load (was %s)", load2str(mode),
188  	                   load2str(last));
189  	        last = mode;
190  	
191  	        xml = pcmk__new_request(pcmk_ipc_controld, CRM_SYSTEM_CRMD, NULL,
192  	                                CRM_SYSTEM_CRMD, CRM_OP_THROTTLE, NULL);
193  	        pcmk__xe_set_int(xml, PCMK__XA_CRM_LIMIT_MODE, mode);
194  	        pcmk__xe_set_int(xml, PCMK__XA_CRM_LIMIT_MAX, throttle_job_max);
195  	
196  	        pcmk__cluster_send_message(NULL, pcmk_ipc_controld, xml);
197  	        pcmk__xml_free(xml);
198  	    }
199  	}
200  	
201  	static gboolean
202  	throttle_timer_cb(gpointer data)
203  	{
204  	    throttle_send_command(throttle_mode());
205  	    return TRUE;
206  	}
207  	
208  	static void
209  	throttle_record_free(gpointer p)
210  	{
211  	    struct throttle_record_s *r = p;
212  	    free(r->node);
213  	    free(r);
214  	}
215  	
216  	static void
217  	throttle_set_load_target(float target)
218  	{
219  	    throttle_load_target = target;
220  	}
221  	
222  	/*!
223  	 * \internal
224  	 * \brief Update the maximum number of simultaneous jobs
225  	 *
226  	 * \param[in] preference  Cluster-wide \c PCMK_OPT_NODE_ACTION_LIMIT from the
227  	 *                        CIB
228  	 */
229  	static void
230  	throttle_update_job_max(const char *preference)
231  	{
232  	    long long max = 0LL;
233  	
234  	    // Per-node override
235  	    const char *env_limit = pcmk__env_option(PCMK__ENV_NODE_ACTION_LIMIT);
236  	
237  	    if (env_limit != NULL) {
238  	        int rc = pcmk__scan_ll(env_limit, &max, 0LL);
239  	
240  	        if (rc != pcmk_rc_ok) {
241  	            pcmk__warn("Ignoring local option PCMK_" PCMK__ENV_NODE_ACTION_LIMIT
242  	                       " because '%s' is not a valid value: %s",
243  	                       env_limit, pcmk_rc_str(rc));
244  	            env_limit = NULL;
245  	        }
246  	    }
247  	    if (env_limit == NULL) {
248  	        // Option validator should prevent invalid values
249  	        CRM_LOG_ASSERT(pcmk__scan_ll(preference, &max, 0LL) == pcmk_rc_ok);
250  	    }
251  	
252  	    if (max > 0) {
253  	        throttle_job_max = (max >= INT_MAX)? INT_MAX : (int) max;
254  	    } else {
255  	        // Default is based on the number of cores detected
256  	        throttle_job_max = 2 * pcmk__procfs_num_cores();
257  	    }
258  	}
259  	
260  	void
261  	throttle_init(void)
262  	{
263  	    if(throttle_records == NULL) {
264  	        throttle_records = pcmk__strkey_table(NULL, throttle_record_free);
265  	        throttle_timer = mainloop_timer_add("throttle", 30 * 1000, TRUE, throttle_timer_cb, NULL);
266  	    }
267  	
268  	    throttle_update_job_max(NULL);
269  	    mainloop_timer_start(throttle_timer);
270  	}
271  	
272  	/*!
273  	 * \internal
274  	 * \brief Configure throttle options based on the CIB
275  	 *
276  	 * \param[in,out] options  Name/value pairs for configured options
277  	 */
278  	void
279  	controld_configure_throttle(GHashTable *options)
280  	{
281  	    const char *value = g_hash_table_lookup(options, PCMK_OPT_LOAD_THRESHOLD);
282  	
283  	    if (value != NULL) {
284  	        throttle_set_load_target(strtof(value, NULL) / 100.0);
285  	    }
286  	
287  	    value = g_hash_table_lookup(options, PCMK_OPT_NODE_ACTION_LIMIT);
288  	    throttle_update_job_max(value);
289  	}
290  	
291  	void
292  	throttle_fini(void)
293  	{
(1) Event path: Condition "_p", taking true branch.
294  	    g_clear_pointer(&throttle_timer, mainloop_timer_del);
CID (unavailable; MK=879dab1d1d2fdadc73872eb5064eb42a) (#2 of 2): Inconsistent C union access (INCONSISTENT_UNION_ACCESS):
(2) Event assign_union_field: The union field "in" of "_pp" is written.
(3) Event inconsistent_union_field_access: In "_pp.out", the union field used: "out" is inconsistent with the field most recently stored: "in".
295  	    g_clear_pointer(&throttle_records, g_hash_table_destroy);
296  	}
297  	
298  	int
299  	throttle_get_total_job_limit(int l)
300  	{
301  	    /* Cluster-wide limit */
302  	    GHashTableIter iter;
303  	    int limit = l;
304  	    int peers = pcmk__cluster_num_active_nodes();
305  	    struct throttle_record_s *r = NULL;
306  	
307  	    g_hash_table_iter_init(&iter, throttle_records);
308  	
309  	    while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &r)) {
310  	        switch(r->mode) {
311  	
312  	            case throttle_extreme:
313  	                if(limit == 0 || limit > peers/4) {
314  	                    limit = QB_MAX(1, peers/4);
315  	                }
316  	                break;
317  	
318  	            case throttle_high:
319  	                if(limit == 0 || limit > peers/2) {
320  	                    limit = QB_MAX(1, peers/2);
321  	                }
322  	                break;
323  	            default:
324  	                break;
325  	        }
326  	    }
327  	    if(limit == l) {
328  	
329  	    } else if(l == 0) {
330  	        pcmk__trace("Using " PCMK_OPT_BATCH_LIMIT "=%d", limit);
331  	
332  	    } else {
333  	        pcmk__trace("Using " PCMK_OPT_BATCH_LIMIT "=%d instead of %d", limit,
334  	                    l);
335  	    }
336  	    return limit;
337  	}
338  	
339  	int
340  	throttle_get_job_limit(const char *node)
341  	{
342  	    int jobs = 1;
343  	    struct throttle_record_s *r = NULL;
344  	
345  	    r = g_hash_table_lookup(throttle_records, node);
346  	    if(r == NULL) {
347  	        r = pcmk__assert_alloc(1, sizeof(struct throttle_record_s));
348  	        r->node = pcmk__str_copy(node);
349  	        r->mode = throttle_low;
350  	        r->max = throttle_job_max;
351  	        pcmk__trace("Defaulting to local values for unknown node %s", node);
352  	
353  	        g_hash_table_insert(throttle_records, r->node, r);
354  	    }
355  	
356  	    switch(r->mode) {
357  	        case throttle_extreme:
358  	        case throttle_high:
359  	            jobs = 1; /* At least one job must always be allowed */
360  	            break;
361  	        case throttle_med:
362  	            jobs = QB_MAX(1, r->max / 4);
363  	            break;
364  	        case throttle_low:
365  	            jobs = QB_MAX(1, r->max / 2);
366  	            break;
367  	        case throttle_none:
368  	            jobs = QB_MAX(1, r->max);
369  	            break;
370  	        default:
371  	            pcmk__err("Unknown throttle mode %.4x on %s", r->mode, node);
372  	            break;
373  	    }
374  	    return jobs;
375  	}
376  	
377  	void
378  	throttle_update(xmlNode *xml)
379  	{
380  	    int max = 0;
381  	    int mode = 0;
382  	    struct throttle_record_s *r = NULL;
383  	    const char *from = pcmk__xe_get(xml, PCMK__XA_SRC);
384  	
385  	    pcmk__xe_get_int(xml, PCMK__XA_CRM_LIMIT_MODE, &mode);
386  	    pcmk__xe_get_int(xml, PCMK__XA_CRM_LIMIT_MAX, &max);
387  	
388  	    r = g_hash_table_lookup(throttle_records, from);
389  	
390  	    if(r == NULL) {
391  	        r = pcmk__assert_alloc(1, sizeof(struct throttle_record_s));
392  	        r->node = pcmk__str_copy(from);
393  	        g_hash_table_insert(throttle_records, r->node, r);
394  	    }
395  	
396  	    r->max = max;
397  	    r->mode = (enum throttle_state_e) mode;
398  	
399  	    pcmk__debug("Node %s has %s load and supports at most %d jobs; new job "
400  	                "limit %d",
401  	                from, load2str((enum throttle_state_e) mode), max,
402  	                throttle_get_job_limit(from));
403  	}
404