1    	/*
2    	 * Copyright 2004-2024 the Pacemaker project contributors
3    	 *
4    	 * The version control history for this file may have further details.
5    	 *
6    	 * This source code is licensed under the GNU Lesser General Public License
7    	 * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
8    	 */
9    	
10   	#include <crm_internal.h>
11   	
12   	#include <sys/time.h>
13   	#include <sys/resource.h>
14   	
15   	#include <crm/crm.h>
16   	#include <crm/common/mainloop.h>
17   	#include <crm/common/xml.h>
18   	
19   	#include <crm/cluster/internal.h>
20   	#include <crm/cluster/election_internal.h>
21   	#include "crmcluster_private.h"
22   	
/* Length (in seconds) of the window over which election wins are counted for
 * storm detection in election_count_vote(). get_uptime() also uses it as the
 * lifetime of its cached CPU-usage value.
 */
#define STORM_INTERVAL   2      /* in seconds */
24   	
/* State of the local node's participation in cluster elections. One object is
 * allocated per cluster by election_init() and freed by election_fini().
 */
struct pcmk__election {
    enum election_result state;     // Current state of election

    // How many times local node has voted (sent as the election ID of a vote)
    guint count;

    void (*cb)(pcmk_cluster_t *);   // Function to call if election is won

    /* Key = node name, value = how node voted (the message's operation).
     * Created on demand when the first vote is recorded, and destroyed (set
     * back to NULL) whenever the election is reset.
     */
    GHashTable *voted;

    mainloop_timer_t *timeout; // When to abort if all votes not received
    int election_wins;         // Track wins, for storm detection
    bool wrote_blackbox;       // Write a storm blackbox at most once
    time_t expires;            // When storm detection period ends

    /* Time of the most recent election loss, or 0 if there is none to dampen.
     * Dampening lasts for LOSS_DAMPEN seconds after this time.
     */
    time_t last_election_loss;
};
36   	
37   	static void
38   	election_complete(pcmk_cluster_t *cluster)
39   	{
40   	    pcmk__assert((cluster != NULL) && (cluster->priv->election != NULL));
41   	    cluster->priv->election->state = election_won;
42   	    if (cluster->priv->election->cb != NULL) {
43   	        cluster->priv->election->cb(cluster);
44   	    }
45   	    election_reset(cluster);
46   	}
47   	
48   	static gboolean
49   	election_timer_cb(gpointer user_data)
50   	{
51   	    pcmk_cluster_t *cluster = user_data;
52   	
53   	    crm_info("Declaring local node as winner after election timed out");
54   	    election_complete(cluster);
55   	    return FALSE;
56   	}
57   	
58   	/*!
59   	 * \internal
60   	 * \brief Get current state of an election
61   	 *
62   	 * \param[in] cluster  Cluster with election
63   	 *
64   	 * \return Current state of \e
65   	 */
66   	enum election_result
67   	election_state(const pcmk_cluster_t *cluster)
68   	{
69   	    if ((cluster == NULL) || (cluster->priv->election == NULL)) {
70   	        return election_error;
71   	    }
72   	    return cluster->priv->election->state;
73   	}
74   	
/* The local node will be declared the winner if missing votes are not received
 * within this time (in milliseconds). The value is chosen to be the same as
 * the default for the election-timeout cluster option.
 *
 * election_init() creates the election timer with this period; it can be
 * changed afterward with election_timeout_set_period().
 */
#define ELECTION_TIMEOUT_MS 120000
80   	
81   	/*!
82   	 * \internal
83   	 * \brief Track election state in a cluster
84   	 *
85   	 * Every node that wishes to participate in an election must initialize the
86   	 * election once, typically at start-up.
87   	 *
88   	 * \param[in] cluster    Cluster that election is for
89   	 * \param[in] cb         Function to call if local node wins election
90   	 */
91   	void
92   	election_init(pcmk_cluster_t *cluster, void (*cb)(pcmk_cluster_t *))
93   	{
94   	    const char *name = pcmk__s(crm_system_name, "election");
95   	
96   	    CRM_CHECK(cluster->priv->election == NULL, return);
97   	
CID (unavailable; MK=bf77ad6f79598a6146f5533ef1330eae) (#1 of 1): Resource not released (INCOMPLETE_DEALLOCATOR):
(1) Event allocation: Memory is allocated. [details]
(2) Event allocation: The field "cluster->priv->election" is allocated, but not released in the identified deallocator.
Also see events: [deallocator]
98   	    cluster->priv->election = pcmk__assert_alloc(1, sizeof(pcmk__election_t));
99   	    cluster->priv->election->cb = cb;
100  	    cluster->priv->election->timeout = mainloop_timer_add(name,
101  	                                                          ELECTION_TIMEOUT_MS,
102  	                                                          FALSE,
103  	                                                          election_timer_cb,
104  	                                                          cluster);
105  	}
106  	
107  	/*!
108  	 * \internal
109  	 * \brief Disregard any previous vote by specified peer
110  	 *
111  	 * This discards any recorded vote from a specified peer. Election users should
112  	 * call this whenever a voting peer becomes inactive.
113  	 *
114  	 * \param[in,out] cluster  Cluster with election
115  	 * \param[in]     uname    Name of peer to disregard
116  	 */
117  	void
118  	election_remove(pcmk_cluster_t *cluster, const char *uname)
119  	{
120  	    if ((cluster != NULL) && (cluster->priv->election != NULL)
121  	        && (uname != NULL) && (cluster->priv->election->voted != NULL)) {
122  	        crm_trace("Discarding (no-)vote from lost peer %s", uname);
123  	        g_hash_table_remove(cluster->priv->election->voted, uname);
124  	    }
125  	}
126  	
127  	/*!
128  	 * \internal
129  	 * \brief Stop election timer and disregard all votes
130  	 *
131  	 * \param[in,out] cluster  Cluster with election
132  	 */
133  	void
134  	election_reset(pcmk_cluster_t *cluster)
135  	{
136  	    if ((cluster != NULL) && (cluster->priv->election != NULL)) {
137  	        crm_trace("Resetting election");
138  	        mainloop_timer_stop(cluster->priv->election->timeout);
139  	        if (cluster->priv->election->voted != NULL) {
140  	            g_hash_table_destroy(cluster->priv->election->voted);
141  	            cluster->priv->election->voted = NULL;
142  	        }
143  	    }
144  	}
145  	
146  	/*!
147  	 * \internal
148  	 * \brief Free an election object
149  	 *
150  	 * Free all memory associated with an election object, stopping its
151  	 * election timer (if running).
152  	 *
153  	 * \param[in,out] cluster  Cluster with election
154  	 */
155  	void
156  	election_fini(pcmk_cluster_t *cluster)
157  	{
158  	    if ((cluster != NULL) && (cluster->priv->election != NULL)) {
159  	        election_reset(cluster);
160  	        crm_trace("Destroying election");
161  	        mainloop_timer_del(cluster->priv->election->timeout);
162  	        free(cluster->priv->election);
163  	        cluster->priv->election = NULL;
164  	    }
165  	}
166  	
167  	static void
168  	election_timeout_start(pcmk_cluster_t *cluster)
169  	{
170  	    mainloop_timer_start(cluster->priv->election->timeout);
171  	}
172  	
173  	/*!
174  	 * \internal
175  	 * \brief Stop an election's timer, if running
176  	 *
177  	 * \param[in,out] cluster  Cluster with election
178  	 */
179  	void
180  	election_timeout_stop(pcmk_cluster_t *cluster)
181  	{
182  	    if ((cluster != NULL) && (cluster->priv->election != NULL)) {
183  	        mainloop_timer_stop(cluster->priv->election->timeout);
184  	    }
185  	}
186  	
187  	/*!
188  	 * \internal
189  	 * \brief Change an election's timeout (restarting timer if running)
190  	 *
191  	 * \param[in,out] cluster  Cluster with election
192  	 * \param[in]     period   New timeout
193  	 */
194  	void
195  	election_timeout_set_period(pcmk_cluster_t *cluster, guint period)
196  	{
197  	    CRM_CHECK((cluster != NULL) && (cluster->priv->election != NULL), return);
198  	    mainloop_timer_set_period(cluster->priv->election->timeout, period);
199  	}
200  	
201  	static int
202  	get_uptime(struct timeval *output)
203  	{
204  	    static time_t expires = 0;
205  	    static struct rusage info;
206  	
207  	    time_t tm_now = time(NULL);
208  	
209  	    if (expires < tm_now) {
210  	        int rc = 0;
211  	
212  	        info.ru_utime.tv_sec = 0;
213  	        info.ru_utime.tv_usec = 0;
214  	        rc = getrusage(RUSAGE_SELF, &info);
215  	
216  	        output->tv_sec = 0;
217  	        output->tv_usec = 0;
218  	
219  	        if (rc < 0) {
220  	            crm_perror(LOG_ERR, "Could not calculate the current uptime");
221  	            expires = 0;
222  	            return -1;
223  	        }
224  	
225  	        crm_debug("Current CPU usage is: %lds, %ldus", (long)info.ru_utime.tv_sec,
226  	                  (long)info.ru_utime.tv_usec);
227  	    }
228  	
229  	    expires = tm_now + STORM_INTERVAL;  /* N seconds after the last _access_ */
230  	    output->tv_sec = info.ru_utime.tv_sec;
231  	    output->tv_usec = info.ru_utime.tv_usec;
232  	
233  	    return 1;
234  	}
235  	
/*!
 * \internal
 * \brief Compare the local node's age (per get_uptime()) against a peer's
 *
 * Seconds are compared first; microseconds are used only to break a tie.
 *
 * \param[in] your_age  Age reported by the peer
 *
 * \return 1 if the local node is older (wins), -1 if the peer is older (local
 *         node loses), or 0 if the ages are identical
 */
static int
compare_age(struct timeval your_age)
{
    struct timeval our_age;

    get_uptime(&our_age); /* If an error occurred, our_age will be compared as {0,0} */

    if (our_age.tv_sec != your_age.tv_sec) {
        if (our_age.tv_sec > your_age.tv_sec) {
            crm_debug("Win: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec);
            return 1;
        }
        crm_debug("Lose: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec);
        return -1;
    }

    if (our_age.tv_usec != your_age.tv_usec) {
        if (our_age.tv_usec > your_age.tv_usec) {
            crm_debug("Win: %ld.%06ld vs %ld.%06ld (usec)",
                      (long)our_age.tv_sec, (long)our_age.tv_usec, (long)your_age.tv_sec, (long)your_age.tv_usec);
            return 1;
        }
        crm_debug("Lose: %ld.%06ld vs %ld.%06ld (usec)",
                  (long)our_age.tv_sec, (long)our_age.tv_usec, (long)your_age.tv_sec, (long)your_age.tv_usec);
        return -1;
    }

    return 0;
}
261  	
262  	/*!
263  	 * \internal
264  	 * \brief Start a new election by offering local node's candidacy
265  	 *
266  	 * Broadcast a "vote" election message containing the local node's ID,
267  	 * (incremented) election counter, and uptime, and start the election timer.
268  	 *
269  	 * \param[in,out] cluster  Cluster with election
270  	 *
271  	 * \note Any nodes agreeing to the candidacy will send a "no-vote" reply, and if
272  	 *       all active peers do so, or if the election times out, the local node
273  	 *       wins the election. (If we lose to any peer vote, we will stop the
274  	 *       timer, so a timeout means we did not lose -- either some peer did not
275  	 *       vote, or we did not call election_check() in time.)
276  	 */
277  	void
278  	election_vote(pcmk_cluster_t *cluster)
279  	{
280  	    struct timeval age;
281  	    xmlNode *vote = NULL;
282  	    pcmk__node_status_t *our_node = NULL;
283  	    const char *message_type = NULL;
284  	
285  	    CRM_CHECK((cluster != NULL) && (cluster->priv->election != NULL), return);
286  	
287  	    if (cluster->priv->node_name == NULL) {
288  	        crm_err("Cannot start an election: Local node name unknown");
289  	        return;
290  	    }
291  	
292  	    our_node = pcmk__get_node(0, cluster->priv->node_name, NULL,
293  	                              pcmk__node_search_cluster_member);
294  	    if (!pcmk__cluster_is_node_active(our_node)) {
295  	        crm_trace("Cannot vote yet: local node not connected to cluster");
296  	        return;
297  	    }
298  	
299  	    election_reset(cluster);
300  	    cluster->priv->election->state = election_in_progress;
301  	    message_type = pcmk__server_message_type(cluster->priv->server);
302  	
303  	    /* @COMPAT We use message_type as the sender and recipient system for
304  	     * backward compatibility (see T566).
305  	     */
306  	    vote = pcmk__new_request(cluster->priv->server, message_type,
307  	                             NULL, message_type, CRM_OP_VOTE, NULL);
308  	
309  	    cluster->priv->election->count++;
310  	    crm_xml_add(vote, PCMK__XA_ELECTION_OWNER,
311  	                pcmk__cluster_get_xml_id(our_node));
312  	    crm_xml_add_int(vote, PCMK__XA_ELECTION_ID, cluster->priv->election->count);
313  	
314  	    // Warning: PCMK__XA_ELECTION_AGE_NANO_SEC value is actually microseconds
315  	    get_uptime(&age);
316  	    crm_xml_add_timeval(vote, PCMK__XA_ELECTION_AGE_SEC,
317  	                        PCMK__XA_ELECTION_AGE_NANO_SEC, &age);
318  	
319  	    pcmk__cluster_send_message(NULL, cluster->priv->server, vote);
320  	    pcmk__xml_free(vote);
321  	
322  	    crm_debug("Started election round %u", cluster->priv->election->count);
323  	    election_timeout_start(cluster);
324  	    return;
325  	}
326  	
327  	/*!
328  	 * \internal
329  	 * \brief Check whether local node has won an election
330  	 *
331  	 * If all known peers have sent no-vote messages, stop the election timer, set
332  	 * the election state to won, and call any registered win callback.
333  	 *
334  	 * \param[in,out] cluster  Cluster with election
335  	 *
336  	 * \return TRUE if local node has won, FALSE otherwise
337  	 * \note If all known peers have sent no-vote messages, but the election owner
338  	 *       does not call this function, the election will not be won (and the
339  	 *       callback will not be called) until the election times out.
340  	 * \note This should be called when election_count_vote() returns
341  	 *       \c election_in_progress.
342  	 */
343  	bool
344  	election_check(pcmk_cluster_t *cluster)
345  	{
346  	    int voted_size = 0;
347  	    int num_members = 0;
348  	
349  	    CRM_CHECK((cluster != NULL) && (cluster->priv->election != NULL),
350  	              return false);
351  	
352  	    if (cluster->priv->election->voted == NULL) {
353  	        crm_trace("Election check requested, but no votes received yet");
354  	        return FALSE;
355  	    }
356  	
357  	    voted_size = g_hash_table_size(cluster->priv->election->voted);
358  	    num_members = pcmk__cluster_num_active_nodes();
359  	
360  	    /* in the case of #voted > #members, it is better to
361  	     *   wait for the timeout and give the cluster time to
362  	     *   stabilize
363  	     */
364  	    if (voted_size >= num_members) {
365  	        /* we won and everyone has voted */
366  	        election_timeout_stop(cluster);
367  	        if (voted_size > num_members) {
368  	            GHashTableIter gIter;
369  	            const pcmk__node_status_t *node = NULL;
370  	            char *key = NULL;
371  	
372  	            crm_warn("Received too many votes in election");
373  	            g_hash_table_iter_init(&gIter, pcmk__peer_cache);
374  	            while (g_hash_table_iter_next(&gIter, NULL, (gpointer *) & node)) {
375  	                if (pcmk__cluster_is_node_active(node)) {
376  	                    crm_warn("* expected vote: %s", node->name);
377  	                }
378  	            }
379  	
380  	            g_hash_table_iter_init(&gIter, cluster->priv->election->voted);
381  	            while (g_hash_table_iter_next(&gIter, (gpointer *) & key, NULL)) {
382  	                crm_warn("* actual vote: %s", key);
383  	            }
384  	
385  	        }
386  	
387  	        crm_info("Election won by local node");
388  	        election_complete(cluster);
389  	        return TRUE;
390  	
391  	    } else {
392  	        crm_debug("Election still waiting on %d of %d vote%s",
393  	                  num_members - voted_size, num_members,
394  	                  pcmk__plural_s(num_members));
395  	    }
396  	
397  	    return FALSE;
398  	}
399  	
/* Dampening period after an election loss: election_count_vote() will not
 * start a new election for a vote that arrives within this many seconds of
 * the local node's last recorded loss.
 */
#define LOSS_DAMPEN 2           /* in seconds */
401  	
/* Fields of interest parsed from an election message by
 * parse_election_message(). The string members point into the message XML and
 * are valid only as long as that message is.
 */
struct vote {
    const char *op;             // Message operation (vote or no-vote)
    const char *from;           // Name of node that sent the message
    const char *version;        // Version given in the message
    const char *election_owner; // ID of node that started the election
    int election_id;            // Election round number (-1 if not parsed)
    struct timeval age;         // Sender's uptime (parsed for vote ops only)
};
410  	
411  	/*!
412  	 * \internal
413  	 * \brief Unpack an election message
414  	 *
415  	 * \param[in] message  Election message XML
416  	 * \param[out] vote    Parsed fields from message
417  	 *
418  	 * \return TRUE if election message and election are valid, FALSE otherwise
419  	 * \note The parsed struct's pointer members are valid only for the lifetime of
420  	 *       the message argument.
421  	 */
422  	static bool
423  	parse_election_message(const xmlNode *message, struct vote *vote)
424  	{
425  	    CRM_CHECK(message && vote, return FALSE);
426  	
427  	    vote->election_id = -1;
428  	    vote->age.tv_sec = -1;
429  	    vote->age.tv_usec = -1;
430  	
431  	    vote->op = crm_element_value(message, PCMK__XA_CRM_TASK);
432  	    vote->from = crm_element_value(message, PCMK__XA_SRC);
433  	    vote->version = crm_element_value(message, PCMK_XA_VERSION);
434  	    vote->election_owner = crm_element_value(message, PCMK__XA_ELECTION_OWNER);
435  	
436  	    crm_element_value_int(message, PCMK__XA_ELECTION_ID, &(vote->election_id));
437  	
438  	    if ((vote->op == NULL) || (vote->from == NULL) || (vote->version == NULL)
439  	        || (vote->election_owner == NULL) || (vote->election_id < 0)) {
440  	
441  	        crm_warn("Invalid %s message from %s",
442  	                 pcmk__s(vote->op, "election"),
443  	                 pcmk__s(vote->from, "unspecified node"));
444  	        crm_log_xml_trace(message, "bad-vote");
445  	        return FALSE;
446  	    }
447  	
448  	    // Op-specific validation
449  	
450  	    if (pcmk__str_eq(vote->op, CRM_OP_VOTE, pcmk__str_none)) {
451  	        /* Only vote ops have uptime.
452  	           Warning: PCMK__XA_ELECTION_AGE_NANO_SEC value is in microseconds.
453  	         */
454  	        crm_element_value_timeval(message, PCMK__XA_ELECTION_AGE_SEC,
455  	                                  PCMK__XA_ELECTION_AGE_NANO_SEC, &(vote->age));
456  	        if ((vote->age.tv_sec < 0) || (vote->age.tv_usec < 0)) {
457  	            crm_warn("Cannot count election %s from %s "
458  	                     "because it is missing uptime", vote->op, vote->from);
459  	            return FALSE;
460  	        }
461  	
462  	    } else if (!pcmk__str_eq(vote->op, CRM_OP_NOVOTE, pcmk__str_none)) {
463  	        crm_info("Cannot process election message from %s "
464  	                 "because %s is not a known election op", vote->from, vote->op);
465  	        return FALSE;
466  	    }
467  	
468  	    /* If the membership cache is NULL, we REALLY shouldn't be voting --
469  	     * the question is how we managed to get here.
470  	     */
471  	    if (pcmk__peer_cache == NULL) {
472  	        crm_info("Cannot count election %s from %s "
473  	                 "because no peer information available", vote->op, vote->from);
474  	        return FALSE;
475  	    }
476  	    return TRUE;
477  	}
478  	
479  	static void
480  	record_vote(pcmk_cluster_t *cluster, struct vote *vote)
481  	{
482  	    pcmk__assert((vote->from != NULL) && (vote->op != NULL));
483  	
484  	    if (cluster->priv->election->voted == NULL) {
485  	        cluster->priv->election->voted = pcmk__strkey_table(free, free);
486  	    }
487  	    pcmk__insert_dup(cluster->priv->election->voted, vote->from, vote->op);
488  	}
489  	
490  	static void
491  	send_no_vote(pcmk_cluster_t *cluster, pcmk__node_status_t *peer,
492  	             struct vote *vote)
493  	{
494  	    const char *message_type = NULL;
495  	    xmlNode *novote = NULL;
496  	
497  	    message_type = pcmk__server_message_type(cluster->priv->server);
498  	    novote = pcmk__new_request(cluster->priv->server, message_type,
499  	                               vote->from, message_type, CRM_OP_NOVOTE, NULL);
500  	    crm_xml_add(novote, PCMK__XA_ELECTION_OWNER, vote->election_owner);
501  	    crm_xml_add_int(novote, PCMK__XA_ELECTION_ID, vote->election_id);
502  	
503  	    pcmk__cluster_send_message(peer, cluster->priv->server, novote);
504  	    pcmk__xml_free(novote);
505  	}
506  	
507  	/*!
508  	 * \internal
509  	 * \brief Process an election message (vote or no-vote) from a peer
510  	 *
511  	 * \param[in,out] cluster  Cluster with election
512  	 * \param[in]     message  Election message XML from peer
513  	 * \param[in]     can_win  Whether local node is eligible to win
514  	 *
515  	 * \return Election state after new vote is considered
516  	 * \note If the peer message is a vote, and we prefer the peer to win, this will
517  	 *       send a no-vote reply to the peer.
518  	 * \note The situations "we lost to this vote" from "this is a late no-vote
519  	 *       after we've already lost" both return election_lost. If a caller needs
520  	 *       to distinguish them, it should save the current state before calling
521  	 *       this function, and then compare the result.
522  	 */
523  	enum election_result
524  	election_count_vote(pcmk_cluster_t *cluster, const xmlNode *message,
525  	                    bool can_win)
526  	{
527  	    int log_level = LOG_INFO;
528  	    gboolean done = FALSE;
529  	    gboolean we_lose = FALSE;
530  	    const char *reason = "unknown";
531  	    bool we_are_owner = FALSE;
532  	    pcmk__node_status_t *our_node = NULL;
533  	    pcmk__node_status_t *your_node = NULL;
534  	    time_t tm_now = time(NULL);
535  	    struct vote vote;
536  	
537  	    CRM_CHECK((cluster != NULL) && (cluster->priv->election != NULL)
538  	              && (message != NULL) && (cluster->priv->node_name != NULL),
539  	              return election_error);
540  	
541  	    if (!parse_election_message(message, &vote)) {
542  	        return election_error;
543  	    }
544  	
545  	    your_node = pcmk__get_node(0, vote.from, NULL,
546  	                               pcmk__node_search_cluster_member);
547  	    our_node = pcmk__get_node(0, cluster->priv->node_name, NULL,
548  	                              pcmk__node_search_cluster_member);
549  	    we_are_owner = (our_node != NULL)
550  	                   && pcmk__str_eq(pcmk__cluster_get_xml_id(our_node),
551  	                                   vote.election_owner, pcmk__str_none);
552  	
553  	    if (!can_win) {
554  	        reason = "Not eligible";
555  	        we_lose = TRUE;
556  	
557  	    } else if (!pcmk__cluster_is_node_active(our_node)) {
558  	        reason = "We are not part of the cluster";
559  	        log_level = LOG_ERR;
560  	        we_lose = TRUE;
561  	
562  	    } else if (we_are_owner
563  	               && (vote.election_id != cluster->priv->election->count)) {
564  	        log_level = LOG_TRACE;
565  	        reason = "Superseded";
566  	        done = TRUE;
567  	
568  	    } else if (!pcmk__cluster_is_node_active(your_node)) {
569  	        /* Possibly we cached the message in the FSA queue at a point that it wasn't */
570  	        reason = "Peer is not part of our cluster";
571  	        log_level = LOG_WARNING;
572  	        done = TRUE;
573  	
574  	    } else if (pcmk__str_eq(vote.op, CRM_OP_NOVOTE, pcmk__str_none)
575  	               || pcmk__str_eq(vote.from, cluster->priv->node_name,
576  	                               pcmk__str_casei)) {
577  	        /* Receiving our own broadcast vote, or a no-vote from peer, is a vote
578  	         * for us to win
579  	         */
580  	        if (!we_are_owner) {
581  	            crm_warn("Cannot count election round %d %s from %s "
582  	                     "because we did not start election (node ID %s did)",
583  	                     vote.election_id, vote.op, vote.from,
584  	                     vote.election_owner);
585  	            return election_error;
586  	        }
587  	        if (cluster->priv->election->state != election_in_progress) {
588  	            // Should only happen if we already lost
589  	            crm_debug("Not counting election round %d %s from %s "
590  	                      "because no election in progress",
591  	                      vote.election_id, vote.op, vote.from);
592  	            return cluster->priv->election->state;
593  	        }
594  	        record_vote(cluster, &vote);
595  	        reason = "Recorded";
596  	        done = TRUE;
597  	
598  	    } else {
599  	        // A peer vote requires a comparison to determine which node is better
600  	        int age_result = compare_age(vote.age);
601  	        int version_result = compare_version(vote.version, CRM_FEATURE_SET);
602  	
603  	        if (version_result < 0) {
604  	            reason = "Version";
605  	            we_lose = TRUE;
606  	
607  	        } else if (version_result > 0) {
608  	            reason = "Version";
609  	
610  	        } else if (age_result < 0) {
611  	            reason = "Uptime";
612  	            we_lose = TRUE;
613  	
614  	        } else if (age_result > 0) {
615  	            reason = "Uptime";
616  	
617  	        } else if (strcasecmp(cluster->priv->node_name, vote.from) > 0) {
618  	            reason = "Host name";
619  	            we_lose = TRUE;
620  	
621  	        } else {
622  	            reason = "Host name";
623  	        }
624  	    }
625  	
626  	    if (cluster->priv->election->expires < tm_now) {
627  	        cluster->priv->election->election_wins = 0;
628  	        cluster->priv->election->expires = tm_now + STORM_INTERVAL;
629  	
630  	    } else if (done == FALSE && we_lose == FALSE) {
631  	        int peers = 1 + g_hash_table_size(pcmk__peer_cache);
632  	
633  	        /* If every node has to vote down every other node, thats N*(N-1) total elections
634  	         * Allow some leeway before _really_ complaining
635  	         */
636  	        cluster->priv->election->election_wins++;
637  	        if (cluster->priv->election->election_wins > (peers * peers)) {
638  	            crm_warn("Election storm detected: %d wins in %d seconds",
639  	                     cluster->priv->election->election_wins, STORM_INTERVAL);
640  	            cluster->priv->election->election_wins = 0;
641  	            cluster->priv->election->expires = tm_now + STORM_INTERVAL;
642  	            if (!(cluster->priv->election->wrote_blackbox)) {
643  	                /* It's questionable whether a black box (from every node in the
644  	                 * cluster) would be truly helpful in diagnosing an election
645  	                 * storm. It's also highly doubtful a production environment
646  	                 * would get multiple election storms from distinct causes, so
647  	                 * saving one blackbox per process lifetime should be
648  	                 * sufficient. Alternatives would be to save a timestamp of the
649  	                 * last blackbox write instead of a boolean, and write a new one
650  	                 * if some amount of time has passed; or to save a storm count,
651  	                 * write a blackbox on every Nth occurrence.
652  	                 */
653  	                crm_write_blackbox(0, NULL);
654  	                cluster->priv->election->wrote_blackbox = true;
655  	            }
656  	        }
657  	    }
658  	
659  	    if (done) {
660  	        do_crm_log(log_level + 1,
661  	                   "Processed election round %u %s (current round %d) "
662  	                   "from %s (%s)",
663  	                   vote.election_id, vote.op, cluster->priv->election->count,
664  	                   vote.from, reason);
665  	        return cluster->priv->election->state;
666  	
667  	    } else if (we_lose == FALSE) {
668  	        /* We track the time of the last election loss to implement an election
669  	         * dampening period, reducing the likelihood of an election storm. If
670  	         * this node has lost within the dampening period, don't start a new
671  	         * election, even if we win against a peer's vote -- the peer we lost to
672  	         * should win again.
673  	         *
674  	         * @TODO This has a problem case: if an election winner immediately
675  	         * leaves the cluster, and a new election is immediately called, all
676  	         * nodes could lose, with no new winner elected. The ideal solution
677  	         * would be to tie the election structure with the peer caches, which
678  	         * would allow us to clear the dampening when the previous winner
679  	         * leaves (and would allow other improvements as well).
680  	         */
681  	        if ((cluster->priv->election->last_election_loss == 0)
682  	            || ((tm_now - cluster->priv->election->last_election_loss)
683  	                > (time_t) LOSS_DAMPEN)) {
684  	
685  	            do_crm_log(log_level,
686  	                       "Election round %d (started by node ID %s) pass: "
687  	                       "%s from %s (%s)",
688  	                       vote.election_id, vote.election_owner, vote.op,
689  	                       vote.from, reason);
690  	
691  	            cluster->priv->election->last_election_loss = 0;
692  	            election_timeout_stop(cluster);
693  	
694  	            /* Start a new election by voting down this, and other, peers */
695  	            cluster->priv->election->state = election_start;
696  	            return cluster->priv->election->state;
697  	        } else {
698  	            char *loss_time = NULL;
699  	
700  	            loss_time = ctime(&(cluster->priv->election->last_election_loss));
701  	            if (loss_time) {
702  	                // Show only HH:MM:SS
703  	                loss_time += 11;
704  	                loss_time[8] = '\0';
705  	            }
706  	            crm_info("Ignoring election round %d (started by node ID %s) pass "
707  	                     "vs %s because we lost less than %ds ago at %s",
708  	                     vote.election_id, vote.election_owner, vote.from,
709  	                     LOSS_DAMPEN, (loss_time? loss_time : "unknown"));
710  	        }
711  	    }
712  	
713  	    cluster->priv->election->last_election_loss = tm_now;
714  	
715  	    do_crm_log(log_level,
716  	               "Election round %d (started by node ID %s) lost: "
717  	               "%s from %s (%s)",
718  	               vote.election_id, vote.election_owner, vote.op,
719  	               vote.from, reason);
720  	
721  	    election_reset(cluster);
722  	    send_no_vote(cluster, your_node, &vote);
723  	    cluster->priv->election->state = election_lost;
724  	    return cluster->priv->election->state;
725  	}
726  	
727  	/*!
728  	 * \internal
729  	 * \brief Reset any election dampening currently in effect
730  	 *
731  	 * \param[in,out] cluster  Cluster with election
732  	 */
733  	void
734  	election_clear_dampening(pcmk_cluster_t *cluster)
735  	{
736  	    if ((cluster != NULL) && (cluster->priv->election != NULL)) {
737  	        cluster->priv->election->last_election_loss = 0;
738  	    }
739  	}
740