1    	/*
2    	 * Copyright 2004-2026 the Pacemaker project contributors
3    	 *
4    	 * The version control history for this file may have further details.
5    	 *
6    	 * This source code is licensed under the GNU Lesser General Public License
7    	 * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
8    	 */
9    	
10   	#include <crm_internal.h>
11   	
12   	#include <stdbool.h>
13   	#include <sys/time.h>
14   	#include <sys/resource.h>
15   	
16   	#include <crm/crm.h>
17   	#include <crm/common/mainloop.h>
18   	#include <crm/common/xml.h>
19   	
20   	#include <crm/cluster/internal.h>
21   	#include <crm/cluster/election_internal.h>
22   	#include "crmcluster_private.h"
23   	
/* Interval, in seconds, used both as the election storm detection window (see
 * election_count_vote()) and as the lifetime of the CPU usage value cached by
 * get_uptime()
 */
#define STORM_INTERVAL 2
25   	
26   	struct pcmk__election {
27   	    enum election_result state;     // Current state of election
28   	    guint count;                    // How many times local node has voted
29   	    void (*cb)(pcmk_cluster_t *);   // Function to call if election is won
30   	    GHashTable *voted;  // Key = node name, value = how node voted
31   	    mainloop_timer_t *timeout; // When to abort if all votes not received
32   	    int election_wins;         // Track wins, for storm detection
33   	    bool wrote_blackbox;       // Write a storm blackbox at most once
34   	    time_t expires;            // When storm detection period ends
35   	    time_t last_election_loss; // When dampening period ends
36   	};
37   	
38   	static void
39   	election_complete(pcmk_cluster_t *cluster)
40   	{
41   	    pcmk__assert((cluster != NULL) && (cluster->priv->election != NULL));
42   	    cluster->priv->election->state = election_won;
43   	    if (cluster->priv->election->cb != NULL) {
44   	        cluster->priv->election->cb(cluster);
45   	    }
46   	    election_reset(cluster);
47   	}
48   	
49   	static gboolean
50   	election_timer_cb(gpointer user_data)
51   	{
52   	    pcmk_cluster_t *cluster = user_data;
53   	
54   	    pcmk__info("Declaring local node as winner after election timed out");
55   	    election_complete(cluster);
56   	    return FALSE;
57   	}
58   	
59   	/*!
60   	 * \internal
61   	 * \brief Get current state of an election
62   	 *
63   	 * \param[in] cluster  Cluster with election
64   	 *
65   	 * \return Current state of \e
66   	 */
67   	enum election_result
68   	election_state(const pcmk_cluster_t *cluster)
69   	{
70   	    if ((cluster == NULL) || (cluster->priv->election == NULL)) {
71   	        return election_error;
72   	    }
73   	    return cluster->priv->election->state;
74   	}
75   	
/* Initial election timer period, in milliseconds. If missing votes are not
 * received within this time, the local node is declared the winner. The value
 * is chosen to be the same as the default for the election-timeout cluster
 * option. election_timeout_set_period() can change the period later.
 */
#define ELECTION_TIMEOUT_MS 120000
81   	
82   	/*!
83   	 * \internal
84   	 * \brief Track election state in a cluster
85   	 *
86   	 * Every node that wishes to participate in an election must initialize the
87   	 * election once, typically at start-up.
88   	 *
89   	 * \param[in] cluster    Cluster that election is for
90   	 * \param[in] cb         Function to call if local node wins election
91   	 */
92   	void
93   	election_init(pcmk_cluster_t *cluster, void (*cb)(pcmk_cluster_t *))
94   	{
95   	    const char *name = pcmk__s(crm_system_name, "election");
96   	
97   	    CRM_CHECK(cluster->priv->election == NULL, return);
98   	
CID (unavailable; MK=bf77ad6f79598a6146f5533ef1330eae) (#1 of 1): Resource not released (INCOMPLETE_DEALLOCATOR):
(1) Event allocation: Memory is allocated. [details]
(2) Event allocation: The field "cluster->priv->election" is allocated, but not released in the identified deallocator.
Also see events: [deallocator]
99   	    cluster->priv->election = pcmk__assert_alloc(1, sizeof(pcmk__election_t));
100  	    cluster->priv->election->cb = cb;
101  	    cluster->priv->election->timeout = mainloop_timer_add(name,
102  	                                                          ELECTION_TIMEOUT_MS,
103  	                                                          FALSE,
104  	                                                          election_timer_cb,
105  	                                                          cluster);
106  	}
107  	
108  	/*!
109  	 * \internal
110  	 * \brief Disregard any previous vote by specified peer
111  	 *
112  	 * This discards any recorded vote from a specified peer. Election users should
113  	 * call this whenever a voting peer becomes inactive.
114  	 *
115  	 * \param[in,out] cluster  Cluster with election
116  	 * \param[in]     uname    Name of peer to disregard
117  	 */
118  	void
119  	election_remove(pcmk_cluster_t *cluster, const char *uname)
120  	{
121  	    if ((cluster != NULL) && (cluster->priv->election != NULL)
122  	        && (uname != NULL) && (cluster->priv->election->voted != NULL)) {
123  	        pcmk__trace("Discarding (no-)vote from lost peer %s", uname);
124  	        g_hash_table_remove(cluster->priv->election->voted, uname);
125  	    }
126  	}
127  	
128  	/*!
129  	 * \internal
130  	 * \brief Stop election timer and disregard all votes
131  	 *
132  	 * \param[in,out] cluster  Cluster with election
133  	 */
134  	void
135  	election_reset(pcmk_cluster_t *cluster)
136  	{
137  	    if ((cluster != NULL) && (cluster->priv->election != NULL)) {
138  	        pcmk__trace("Resetting election");
139  	        mainloop_timer_stop(cluster->priv->election->timeout);
140  	        g_clear_pointer(&cluster->priv->election->voted, g_hash_table_destroy);
141  	    }
142  	}
143  	
144  	/*!
145  	 * \internal
146  	 * \brief Free an election object
147  	 *
148  	 * Free all memory associated with an election object, stopping its
149  	 * election timer (if running).
150  	 *
151  	 * \param[in,out] cluster  Cluster with election
152  	 */
153  	void
(3) Event deallocator: Deallocator for "struct pcmk__cluster".
Also see events: [allocation][allocation]
154  	election_fini(pcmk_cluster_t *cluster)
155  	{
156  	    if ((cluster != NULL) && (cluster->priv->election != NULL)) {
157  	        election_reset(cluster);
158  	        pcmk__trace("Destroying election");
159  	        mainloop_timer_del(cluster->priv->election->timeout);
160  	        g_clear_pointer(&cluster->priv->election, free);
161  	    }
162  	}
163  	
164  	static void
165  	election_timeout_start(pcmk_cluster_t *cluster)
166  	{
167  	    mainloop_timer_start(cluster->priv->election->timeout);
168  	}
169  	
170  	/*!
171  	 * \internal
172  	 * \brief Stop an election's timer, if running
173  	 *
174  	 * \param[in,out] cluster  Cluster with election
175  	 */
176  	void
177  	election_timeout_stop(pcmk_cluster_t *cluster)
178  	{
179  	    if ((cluster != NULL) && (cluster->priv->election != NULL)) {
180  	        mainloop_timer_stop(cluster->priv->election->timeout);
181  	    }
182  	}
183  	
184  	/*!
185  	 * \internal
186  	 * \brief Change an election's timeout (restarting timer if running)
187  	 *
188  	 * \param[in,out] cluster  Cluster with election
189  	 * \param[in]     period   New timeout
190  	 */
191  	void
192  	election_timeout_set_period(pcmk_cluster_t *cluster, guint period)
193  	{
194  	    CRM_CHECK((cluster != NULL) && (cluster->priv->election != NULL), return);
195  	    mainloop_timer_set_period(cluster->priv->election->timeout, period);
196  	}
197  	
198  	static int
199  	get_uptime(struct timeval *output)
200  	{
201  	    static time_t expires = 0;
202  	    static struct rusage info;
203  	
204  	    time_t tm_now = time(NULL);
205  	
206  	    if (expires < tm_now) {
207  	        int rc = 0;
208  	
209  	        output->tv_sec = 0;
210  	        output->tv_usec = 0;
211  	
212  	        info.ru_utime.tv_sec = 0;
213  	        info.ru_utime.tv_usec = 0;
214  	
215  	        rc = getrusage(RUSAGE_SELF, &info);
216  	        if (rc < 0) {
217  	            pcmk__err("Could not calculate the current uptime: %s",
218  	                      strerror(errno));
219  	            expires = 0;
220  	            return -1;
221  	        }
222  	
223  	        pcmk__debug("Current CPU usage is: %llds, %lldus",
224  	                    (long long) info.ru_utime.tv_sec,
225  	                    (long long) info.ru_utime.tv_usec);
226  	    }
227  	
228  	    expires = tm_now + STORM_INTERVAL;  /* N seconds after the last _access_ */
229  	    output->tv_sec = info.ru_utime.tv_sec;
230  	    output->tv_usec = info.ru_utime.tv_usec;
231  	
232  	    return 1;
233  	}
234  	
/*!
 * \internal
 * \brief Compare the local node's uptime against a peer's
 *
 * Seconds are compared first, and microseconds only if the seconds are equal.
 *
 * \param[in] your_age  Peer's uptime
 *
 * \return 1 if the local node's uptime is greater (local node wins), -1 if it
 *         is smaller (local node loses), or 0 if the two are equal
 */
static int
compare_age(struct timeval your_age)
{
    struct timeval our_age;

    // If an error occurred, our_age will be compared as {0,0}
    get_uptime(&our_age);

    if (our_age.tv_sec != your_age.tv_sec) {
        if (our_age.tv_sec > your_age.tv_sec) {
            pcmk__debug("Win: %lld vs %lld (seconds)",
                        (long long) our_age.tv_sec,
                        (long long) your_age.tv_sec);
            return 1;
        }
        pcmk__debug("Lose: %lld vs %lld (seconds)",
                    (long long) our_age.tv_sec, (long long) your_age.tv_sec);
        return -1;
    }

    if (our_age.tv_usec == your_age.tv_usec) {
        return 0;
    }

    if (our_age.tv_usec > your_age.tv_usec) {
        pcmk__debug("Win: %lld.%06lld vs %lld.%06lld (usec)",
                    (long long) our_age.tv_sec, (long long) our_age.tv_usec,
                    (long long) your_age.tv_sec, (long long) your_age.tv_usec);
        return 1;
    }

    pcmk__debug("Lose: %lld.%06lld vs %lld.%06lld (usec)",
                (long long) our_age.tv_sec, (long long) our_age.tv_usec,
                (long long) your_age.tv_sec, (long long) your_age.tv_usec);
    return -1;
}
264  	
265  	/*!
266  	 * \internal
267  	 * \brief Start a new election by offering local node's candidacy
268  	 *
269  	 * Broadcast a "vote" election message containing the local node's ID,
270  	 * (incremented) election counter, and uptime, and start the election timer.
271  	 *
272  	 * \param[in,out] cluster  Cluster with election
273  	 *
274  	 * \note Any nodes agreeing to the candidacy will send a "no-vote" reply, and if
275  	 *       all active peers do so, or if the election times out, the local node
276  	 *       wins the election. (If we lose to any peer vote, we will stop the
277  	 *       timer, so a timeout means we did not lose -- either some peer did not
278  	 *       vote, or we did not call election_check() in time.)
279  	 */
280  	void
281  	election_vote(pcmk_cluster_t *cluster)
282  	{
283  	    struct timeval age;
284  	    xmlNode *vote = NULL;
285  	    pcmk__node_status_t *our_node = NULL;
286  	    const char *message_type = NULL;
287  	
288  	    CRM_CHECK((cluster != NULL) && (cluster->priv->election != NULL), return);
289  	
290  	    if (cluster->priv->node_name == NULL) {
291  	        pcmk__err("Cannot start an election: Local node name unknown");
292  	        return;
293  	    }
294  	
295  	    our_node = pcmk__get_node(0, cluster->priv->node_name, NULL,
296  	                              pcmk__node_search_cluster_member);
297  	    if (!pcmk__cluster_is_node_active(our_node)) {
298  	        pcmk__trace("Cannot vote yet: local node not connected to cluster");
299  	        return;
300  	    }
301  	
302  	    election_reset(cluster);
303  	    cluster->priv->election->state = election_in_progress;
304  	    message_type = pcmk__server_message_type(cluster->priv->server);
305  	
306  	    /* @COMPAT We use message_type as the sender and recipient system for
307  	     * backward compatibility (see T566).
308  	     */
309  	    vote = pcmk__new_request(cluster->priv->server, message_type,
310  	                             NULL, message_type, CRM_OP_VOTE, NULL);
311  	
312  	    cluster->priv->election->count++;
313  	    pcmk__xe_set(vote, PCMK__XA_ELECTION_OWNER,
314  	                 pcmk__cluster_get_xml_id(our_node));
315  	    pcmk__xe_set_int(vote, PCMK__XA_ELECTION_ID,
316  	                     cluster->priv->election->count);
317  	
318  	    // Warning: PCMK__XA_ELECTION_AGE_NANO_SEC value is actually microseconds
319  	    get_uptime(&age);
320  	    pcmk__xe_set_timeval(vote, PCMK__XA_ELECTION_AGE_SEC,
321  	                         PCMK__XA_ELECTION_AGE_NANO_SEC, &age);
322  	
323  	    pcmk__cluster_send_message(NULL, cluster->priv->server, vote);
324  	    pcmk__xml_free(vote);
325  	
326  	    pcmk__debug("Started election round %u", cluster->priv->election->count);
327  	    election_timeout_start(cluster);
328  	}
329  	
330  	/*!
331  	 * \internal
332  	 * \brief Check whether local node has won an election
333  	 *
334  	 * If all known peers have sent no-vote messages, stop the election timer, set
335  	 * the election state to won, and call any registered win callback.
336  	 *
337  	 * \param[in,out] cluster  Cluster with election
338  	 *
339  	 * \return TRUE if local node has won, FALSE otherwise
340  	 * \note If all known peers have sent no-vote messages, but the election owner
341  	 *       does not call this function, the election will not be won (and the
342  	 *       callback will not be called) until the election times out.
343  	 * \note This should be called when election_count_vote() returns
344  	 *       \c election_in_progress.
345  	 */
346  	bool
347  	election_check(pcmk_cluster_t *cluster)
348  	{
349  	    int voted_size = 0;
350  	    int num_members = 0;
351  	
352  	    CRM_CHECK((cluster != NULL) && (cluster->priv->election != NULL),
353  	              return false);
354  	
355  	    if (cluster->priv->election->voted == NULL) {
356  	        pcmk__trace("Election check requested, but no votes received yet");
357  	        return FALSE;
358  	    }
359  	
360  	    voted_size = g_hash_table_size(cluster->priv->election->voted);
361  	    num_members = pcmk__cluster_num_active_nodes();
362  	
363  	    /* in the case of #voted > #members, it is better to
364  	     *   wait for the timeout and give the cluster time to
365  	     *   stabilize
366  	     */
367  	    if (voted_size >= num_members) {
368  	        /* we won and everyone has voted */
369  	        election_timeout_stop(cluster);
370  	        if (voted_size > num_members) {
371  	            GHashTableIter gIter;
372  	            const pcmk__node_status_t *node = NULL;
373  	            char *key = NULL;
374  	
375  	            pcmk__warn("Received too many votes in election");
376  	            g_hash_table_iter_init(&gIter, pcmk__peer_cache);
377  	            while (g_hash_table_iter_next(&gIter, NULL, (gpointer *) & node)) {
378  	                if (pcmk__cluster_is_node_active(node)) {
379  	                    pcmk__warn("* expected vote: %s", node->name);
380  	                }
381  	            }
382  	
383  	            g_hash_table_iter_init(&gIter, cluster->priv->election->voted);
384  	            while (g_hash_table_iter_next(&gIter, (gpointer *) & key, NULL)) {
385  	                pcmk__warn("* actual vote: %s", key);
386  	            }
387  	
388  	        }
389  	
390  	        pcmk__info("Election won by local node");
391  	        election_complete(cluster);
392  	        return TRUE;
393  	
394  	    } else {
395  	        pcmk__debug("Election still waiting on %d of %d vote%s",
396  	                    (num_members - voted_size), num_members,
397  	                    pcmk__plural_s(num_members));
398  	    }
399  	
400  	    return FALSE;
401  	}
402  	
/* Dampening period, in seconds: after losing an election, the local node will
 * not start a new one until more than this much time has passed (see
 * election_count_vote())
 */
#define LOSS_DAMPEN 2
404  	
/* Fields unpacked from an election message by parse_election_message(). The
 * string members point into the message XML rather than being copies.
 */
struct vote {
    // Message operation (value of PCMK__XA_CRM_TASK)
    const char *op;

    // Name of the node that sent the message (value of PCMK__XA_SRC)
    const char *from;

    // Sender's version string (value of PCMK_XA_VERSION)
    const char *version;

    // ID of the node that started the election (PCMK__XA_ELECTION_OWNER)
    const char *election_owner;

    // Election round number (PCMK__XA_ELECTION_ID), or -1 if not parsed
    int election_id;

    // Sender's uptime; only parsed for vote ops, otherwise {-1, -1}
    struct timeval age;
};
413  	
414  	/*!
415  	 * \internal
416  	 * \brief Unpack an election message
417  	 *
418  	 * \param[in] message  Election message XML
419  	 * \param[out] vote    Parsed fields from message
420  	 *
421  	 * \return TRUE if election message and election are valid, FALSE otherwise
422  	 * \note The parsed struct's pointer members are valid only for the lifetime of
423  	 *       the message argument.
424  	 */
425  	static bool
426  	parse_election_message(const xmlNode *message, struct vote *vote)
427  	{
428  	    CRM_CHECK(message && vote, return FALSE);
429  	
430  	    vote->election_id = -1;
431  	    vote->age.tv_sec = -1;
432  	    vote->age.tv_usec = -1;
433  	
434  	    vote->op = pcmk__xe_get(message, PCMK__XA_CRM_TASK);
435  	    vote->from = pcmk__xe_get(message, PCMK__XA_SRC);
436  	    vote->version = pcmk__xe_get(message, PCMK_XA_VERSION);
437  	    vote->election_owner = pcmk__xe_get(message, PCMK__XA_ELECTION_OWNER);
438  	
439  	    pcmk__xe_get_int(message, PCMK__XA_ELECTION_ID, &(vote->election_id));
440  	
441  	    if ((vote->op == NULL) || (vote->from == NULL) || (vote->version == NULL)
442  	        || (vote->election_owner == NULL) || (vote->election_id < 0)) {
443  	
444  	        pcmk__warn("Invalid %s message from %s", pcmk__s(vote->op, "election"),
445  	                   pcmk__s(vote->from, "unspecified node"));
446  	        pcmk__log_xml_trace(message, "bad-vote");
447  	        return FALSE;
448  	    }
449  	
450  	    // Op-specific validation
451  	
452  	    if (pcmk__str_eq(vote->op, CRM_OP_VOTE, pcmk__str_none)) {
453  	        /* Only vote ops have uptime.
454  	           Warning: PCMK__XA_ELECTION_AGE_NANO_SEC value is in microseconds.
455  	         */
456  	        if ((pcmk__xe_get_timeval(message, PCMK__XA_ELECTION_AGE_SEC,
457  	                                  PCMK__XA_ELECTION_AGE_NANO_SEC,
458  	                                  &(vote->age)) != pcmk_rc_ok)
459  	            || (vote->age.tv_sec < 0) || (vote->age.tv_usec < 0)) {
460  	
461  	            pcmk__warn("Cannot count election %s from %s because uptime is "
462  	                       "missing or invalid",
463  	                       vote->op, vote->from);
464  	            return FALSE;
465  	        }
466  	
467  	    } else if (!pcmk__str_eq(vote->op, CRM_OP_NOVOTE, pcmk__str_none)) {
468  	        pcmk__info("Cannot process election message from %s because %s is not "
469  	                   "a known election op",
470  	                   vote->from, vote->op);
471  	        return FALSE;
472  	    }
473  	
474  	    /* If the membership cache is NULL, we REALLY shouldn't be voting --
475  	     * the question is how we managed to get here.
476  	     */
477  	    if (pcmk__peer_cache == NULL) {
478  	        pcmk__info("Cannot count election %s from %s becasue no peer "
479  	                   "information available",
480  	                   vote->op, vote->from);
481  	        return FALSE;
482  	    }
483  	    return TRUE;
484  	}
485  	
486  	static void
487  	record_vote(pcmk_cluster_t *cluster, struct vote *vote)
488  	{
489  	    pcmk__assert((vote->from != NULL) && (vote->op != NULL));
490  	
491  	    if (cluster->priv->election->voted == NULL) {
492  	        cluster->priv->election->voted = pcmk__strkey_table(free, free);
493  	    }
494  	    pcmk__insert_dup(cluster->priv->election->voted, vote->from, vote->op);
495  	}
496  	
497  	static void
498  	send_no_vote(pcmk_cluster_t *cluster, pcmk__node_status_t *peer,
499  	             struct vote *vote)
500  	{
501  	    const char *message_type = NULL;
502  	    xmlNode *novote = NULL;
503  	
504  	    message_type = pcmk__server_message_type(cluster->priv->server);
505  	    novote = pcmk__new_request(cluster->priv->server, message_type,
506  	                               vote->from, message_type, CRM_OP_NOVOTE, NULL);
507  	    pcmk__xe_set(novote, PCMK__XA_ELECTION_OWNER, vote->election_owner);
508  	    pcmk__xe_set_int(novote, PCMK__XA_ELECTION_ID, vote->election_id);
509  	
510  	    pcmk__cluster_send_message(peer, cluster->priv->server, novote);
511  	    pcmk__xml_free(novote);
512  	}
513  	
514  	/*!
515  	 * \internal
516  	 * \brief Process an election message (vote or no-vote) from a peer
517  	 *
518  	 * \param[in,out] cluster  Cluster with election
519  	 * \param[in]     message  Election message XML from peer
520  	 * \param[in]     can_win  Whether local node is eligible to win
521  	 *
522  	 * \return Election state after new vote is considered
523  	 * \note If the peer message is a vote, and we prefer the peer to win, this will
524  	 *       send a no-vote reply to the peer.
525  	 * \note The situations "we lost to this vote" from "this is a late no-vote
526  	 *       after we've already lost" both return election_lost. If a caller needs
527  	 *       to distinguish them, it should save the current state before calling
528  	 *       this function, and then compare the result.
529  	 */
530  	enum election_result
531  	election_count_vote(pcmk_cluster_t *cluster, const xmlNode *message,
532  	                    bool can_win)
533  	{
534  	    int log_level = LOG_INFO;
535  	    gboolean done = FALSE;
536  	    gboolean we_lose = FALSE;
537  	    const char *reason = NULL;
538  	    bool we_are_owner = FALSE;
539  	    pcmk__node_status_t *our_node = NULL;
540  	    pcmk__node_status_t *your_node = NULL;
541  	    time_t tm_now = time(NULL);
542  	    struct vote vote;
543  	
544  	    CRM_CHECK((cluster != NULL) && (cluster->priv->election != NULL)
545  	              && (message != NULL) && (cluster->priv->node_name != NULL),
546  	              return election_error);
547  	
548  	    if (!parse_election_message(message, &vote)) {
549  	        return election_error;
550  	    }
551  	
552  	    your_node = pcmk__get_node(0, vote.from, NULL,
553  	                               pcmk__node_search_cluster_member);
554  	    our_node = pcmk__get_node(0, cluster->priv->node_name, NULL,
555  	                              pcmk__node_search_cluster_member);
556  	    we_are_owner = (our_node != NULL)
557  	                   && pcmk__str_eq(pcmk__cluster_get_xml_id(our_node),
558  	                                   vote.election_owner, pcmk__str_none);
559  	
560  	    if (!can_win) {
561  	        reason = "Not eligible";
562  	        we_lose = TRUE;
563  	
564  	    } else if (!pcmk__cluster_is_node_active(our_node)) {
565  	        reason = "We are not part of the cluster";
566  	        log_level = LOG_ERR;
567  	        we_lose = TRUE;
568  	
569  	    } else if (we_are_owner
570  	               && (vote.election_id != cluster->priv->election->count)) {
571  	        log_level = LOG_TRACE;
572  	        reason = "Superseded";
573  	        done = TRUE;
574  	
575  	    } else if (!pcmk__cluster_is_node_active(your_node)) {
576  	        /* Possibly we cached the message in the FSA queue at a point that it wasn't */
577  	        reason = "Peer is not part of our cluster";
578  	        log_level = LOG_WARNING;
579  	        done = TRUE;
580  	
581  	    } else if (pcmk__str_eq(vote.op, CRM_OP_NOVOTE, pcmk__str_none)
582  	               || pcmk__str_eq(vote.from, cluster->priv->node_name,
583  	                               pcmk__str_casei)) {
584  	        /* Receiving our own broadcast vote, or a no-vote from peer, is a vote
585  	         * for us to win
586  	         */
587  	        if (!we_are_owner) {
588  	            pcmk__warn("Cannot count election round %d %s from %s because we "
589  	                       "did not start election (node ID %s did)",
590  	                       vote.election_id, vote.op, vote.from,
591  	                       vote.election_owner);
592  	            return election_error;
593  	        }
594  	        if (cluster->priv->election->state != election_in_progress) {
595  	            // Should only happen if we already lost
596  	            pcmk__debug("Not counting election round %d %s from %s because no "
597  	                        "election in progress",
598  	                        vote.election_id, vote.op, vote.from);
599  	            return cluster->priv->election->state;
600  	        }
601  	        record_vote(cluster, &vote);
602  	        reason = "Recorded";
603  	        done = TRUE;
604  	
605  	    } else {
606  	        // A peer vote requires a comparison to determine which node is better
607  	        int age_result = compare_age(vote.age);
608  	        int version_result = pcmk__compare_versions(vote.version,
609  	                                                    CRM_FEATURE_SET);
610  	
611  	        if (version_result < 0) {
612  	            reason = "Version";
613  	            we_lose = TRUE;
614  	
615  	        } else if (version_result > 0) {
616  	            reason = "Version";
617  	
618  	        } else if (age_result < 0) {
619  	            reason = "Uptime";
620  	            we_lose = TRUE;
621  	
622  	        } else if (age_result > 0) {
623  	            reason = "Uptime";
624  	
625  	        } else if (strcasecmp(cluster->priv->node_name, vote.from) > 0) {
626  	            reason = "Host name";
627  	            we_lose = TRUE;
628  	
629  	        } else {
630  	            reason = "Host name";
631  	        }
632  	    }
633  	
634  	    if (cluster->priv->election->expires < tm_now) {
635  	        cluster->priv->election->election_wins = 0;
636  	        cluster->priv->election->expires = tm_now + STORM_INTERVAL;
637  	
638  	    } else if (done == FALSE && we_lose == FALSE) {
639  	        int peers = 1 + g_hash_table_size(pcmk__peer_cache);
640  	
641  	        /* If every node has to vote down every other node, thats N*(N-1) total elections
642  	         * Allow some leeway before _really_ complaining
643  	         */
644  	        cluster->priv->election->election_wins++;
645  	        if (cluster->priv->election->election_wins > (peers * peers)) {
646  	            pcmk__warn("Election storm detected: %d wins in %d seconds",
647  	                       cluster->priv->election->election_wins, STORM_INTERVAL);
648  	            cluster->priv->election->election_wins = 0;
649  	            cluster->priv->election->expires = tm_now + STORM_INTERVAL;
650  	            if (!(cluster->priv->election->wrote_blackbox)) {
651  	                /* It's questionable whether a black box (from every node in the
652  	                 * cluster) would be truly helpful in diagnosing an election
653  	                 * storm. It's also highly doubtful a production environment
654  	                 * would get multiple election storms from distinct causes, so
655  	                 * saving one blackbox per process lifetime should be
656  	                 * sufficient. Alternatives would be to save a timestamp of the
657  	                 * last blackbox write instead of a boolean, and write a new one
658  	                 * if some amount of time has passed; or to save a storm count,
659  	                 * write a blackbox on every Nth occurrence.
660  	                 */
661  	                crm_write_blackbox(0, NULL);
662  	                cluster->priv->election->wrote_blackbox = true;
663  	            }
664  	        }
665  	    }
666  	
667  	    if (done) {
668  	        do_crm_log(log_level + 1,
669  	                   "Processed election round %u %s (current round %d) "
670  	                   "from %s (%s)",
671  	                   vote.election_id, vote.op, cluster->priv->election->count,
672  	                   vote.from, reason);
673  	        return cluster->priv->election->state;
674  	
675  	    } else if (we_lose == FALSE) {
676  	        /* We track the time of the last election loss to implement an election
677  	         * dampening period, reducing the likelihood of an election storm. If
678  	         * this node has lost within the dampening period, don't start a new
679  	         * election, even if we win against a peer's vote -- the peer we lost to
680  	         * should win again.
681  	         *
682  	         * @TODO This has a problem case: if an election winner immediately
683  	         * leaves the cluster, and a new election is immediately called, all
684  	         * nodes could lose, with no new winner elected. The ideal solution
685  	         * would be to tie the election structure with the peer caches, which
686  	         * would allow us to clear the dampening when the previous winner
687  	         * leaves (and would allow other improvements as well).
688  	         */
689  	        if ((cluster->priv->election->last_election_loss == 0)
690  	            || ((tm_now - cluster->priv->election->last_election_loss)
691  	                > (time_t) LOSS_DAMPEN)) {
692  	
693  	            do_crm_log(log_level,
694  	                       "Election round %d (started by node ID %s) pass: "
695  	                       "%s from %s (%s)",
696  	                       vote.election_id, vote.election_owner, vote.op,
697  	                       vote.from, reason);
698  	
699  	            cluster->priv->election->last_election_loss = 0;
700  	            election_timeout_stop(cluster);
701  	
702  	            /* Start a new election by voting down this, and other, peers */
703  	            cluster->priv->election->state = election_start;
704  	            return cluster->priv->election->state;
705  	        } else {
706  	            char *loss_time = NULL;
707  	
708  	            loss_time = ctime(&(cluster->priv->election->last_election_loss));
709  	            if (loss_time) {
710  	                // Show only HH:MM:SS
711  	                loss_time += 11;
712  	                loss_time[8] = '\0';
713  	            }
714  	            pcmk__info("Ignoring election round %d (started by node ID %s) "
715  	                       "pass vs %s because we lost less than %ds ago at %s",
716  	                       vote.election_id, vote.election_owner, vote.from,
717  	                       LOSS_DAMPEN, pcmk__s(loss_time, "unknown"));
718  	        }
719  	    }
720  	
721  	    cluster->priv->election->last_election_loss = tm_now;
722  	
723  	    do_crm_log(log_level,
724  	               "Election round %d (started by node ID %s) lost: "
725  	               "%s from %s (%s)",
726  	               vote.election_id, vote.election_owner, vote.op,
727  	               vote.from, reason);
728  	
729  	    election_reset(cluster);
730  	    send_no_vote(cluster, your_node, &vote);
731  	    cluster->priv->election->state = election_lost;
732  	    return cluster->priv->election->state;
733  	}
734  	
735  	/*!
736  	 * \internal
737  	 * \brief Reset any election dampening currently in effect
738  	 *
739  	 * \param[in,out] cluster  Cluster with election
740  	 */
741  	void
742  	election_clear_dampening(pcmk_cluster_t *cluster)
743  	{
744  	    if ((cluster != NULL) && (cluster->priv->election != NULL)) {
745  	        cluster->priv->election->last_election_loss = 0;
746  	    }
747  	}
748