1 /*
2 * Copyright 2004-2012 Red Hat, Inc.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License v2 or (at your option) any later version.
7 */
8
9 #include "dlm_daemon.h"
10
/* protocol_version flags */
#define PV_STATEFUL 0x0001

/* retries are once a second */
/* Rate-limit logging for repeated retries: debug level for the first
   ~minute of retries, one error when the count reaches 60, then one
   error per 3600 retries (~hourly) after that.  Uses the GNU
   statement-expression and named-variadic-macro (args...) extensions. */
#define log_retry(cur_count, fmt, args...) ({ \
	if (cur_count < 60) \
		log_debug(fmt, ##args); \
	else if (cur_count == 60) \
		log_error(fmt, ##args); \
	else if (!(cur_count % 3600)) \
		log_error(fmt, ##args); \
})
23
/* One daemon/kernel protocol version.  All four fields are 16-bit so
   the struct overlays the uint16_t[4] arrays in struct protocol below. */
struct protocol_version {
	uint16_t major;
	uint16_t minor;
	uint16_t patch;
	uint16_t flags;		/* PV_* flags above, e.g. PV_STATEFUL */
};
30
/* Protocol state for a node: max supported and currently running
   versions, for both the daemon and kernel sides.  Each member is an
   anonymous union so the same bytes can be viewed either as a
   struct protocol_version or as uint16_t[4] (convenient for wire
   marshalling and comparisons, see daemon_run[] uses below). */
struct protocol {
	union {
		struct protocol_version dm_ver;
		uint16_t daemon_max[4];		/* max daemon protocol node supports */
	};
	union {
		struct protocol_version km_ver;
		uint16_t kernel_max[4];		/* max kernel protocol node supports */
	};
	union {
		struct protocol_version dr_ver;
		uint16_t daemon_run[4];		/* daemon protocol in use (stamped into msg headers) */
	};
	union {
		struct protocol_version kr_ver;
		uint16_t kernel_run[4];		/* kernel protocol in use */
	};
};
49
/* fence_result flags */
#define FR_FIPU 0x00000001		/* sender has fence_in_progress_unknown set (see case C in fipu comment) */
#define FR_CLEAR_STARTUP 0x00000002	/* receiver should clear its startup_nodes list */
#define FR_CLEAR_FIPU 0x00000004	/* receiver should clear fence_in_progress_unknown */

/* Wire body for fence result/clear messages (sent via
   send_fence_result/send_fence_clear). */
struct fence_result {
	uint32_t version;
	uint32_t flags;			/* FR_* flags above */
	uint32_t nodeid;		/* node the result refers to */
	uint32_t result;		/* fence op result code */
	uint64_t fence_walltime;
	char unused[1000];		/* padding, presumably reserved for future wire extension -- confirm */
};
63
/* Per-node state this daemon tracks for every node it has seen. */
struct node_daemon {
	struct list_head list;		/* entry on daemon_nodes (or startup_nodes) */
	int nodeid;
	int killed;			/* we already asked the cluster to kick this node */
	int daemon_member;		/* currently a member of the daemon cpg */
	int left_reason;		/* CPG_REASON_* or REASON_STARTUP_FENCING */
	int recover_setup;
	int fence_in_progress_unknown;	/* node reported fipu set (FR_FIPU) */
	int need_fence_clear;
	int need_fencing;		/* node failed and must be fenced */
	int delay_fencing;		/* fencing postponed (post_join_delay or another agent running) */
	int stateful_merge;		/* member rejoined with stateful merge; must be cleared */
	int fence_pid;			/* pid of fence agent running for this node (0 if none) */
	int fence_pid_wait;		/* waiting on the fence agent pid */
	int fence_result_wait;		/* result sent, waiting for it to be received/processed */
	int fence_actor_done; /* for status/debug */
	int fence_actor_last; /* for status/debug */
	int fence_actors[MAX_NODES];	/* nodeids that saw the failure and may request fencing */
	int fence_actors_orig[MAX_NODES];	/* original actor set, restored to retry if all fail */

	struct protocol proto;		/* protocol versions accepted from this node */
	struct fence_config fence_config;	/* per-node fence agent configuration */

	uint64_t daemon_add_time;
	uint64_t daemon_rem_time;
	uint64_t fail_walltime;		/* wallclock when the node failed */
	uint64_t fail_monotime;		/* monotonic clock when the node failed */
	uint64_t fence_walltime;	/* wallclock when fencing completed */
	uint64_t fence_monotime;	/* monotonic clock when fencing completed */
};
94
/* Synthetic left_reason for nodes fenced at daemon startup, where no
   cpg CPG_REASON_* value applies. */
#define REASON_STARTUP_FENCING -1

static cpg_handle_t cpg_handle_daemon;		/* cpg connection for the daemon group */
static int cpg_fd_daemon;			/* fd behind cpg_handle_daemon */
static struct protocol our_protocol;		/* our own protocol versions */
static struct list_head daemon_nodes;		/* all known nodes (struct node_daemon) */
static struct list_head startup_nodes;		/* nodes pending startup fencing decisions */
static struct cpg_address daemon_member[MAX_NODES];	/* current daemon cpg membership */
static struct cpg_address daemon_joined[MAX_NODES];	/* joined list from last confchg */
static struct cpg_address daemon_remove[MAX_NODES];	/* left list from last confchg -- presumed, set elsewhere */
static int daemon_member_count;
static int daemon_joined_count;
static int daemon_remove_count;
static int daemon_ringid_wait;			/* saw nodedown confchg, waiting for ringid callback */
static struct cpg_ring_id daemon_ringid;
static int daemon_fence_pid;			/* pid of currently running fence agent (0 if none) */
static uint32_t last_join_seq;
static uint32_t send_fipu_seq;
static int wait_clear_fipu;
static int fence_in_progress_unknown = 1;	/* "fipu": set until we learn no fencing is pending */

#define MAX_ZOMBIES 16
static int zombie_pids[MAX_ZOMBIES];		/* fence agent pids not yet reaped */
static int zombie_count;

static int fence_result_pid;			/* agent pid currently being polled for a result */
static unsigned int fence_result_try;		/* poll count, used to rate-limit log_retry */
static int stateful_merge_wait; /* cluster is stuck in waiting for manual intervention */

static void send_fence_result(int nodeid, int result, uint32_t flags, uint64_t walltime);
static void send_fence_clear(int nodeid, int result, uint32_t flags, uint64_t walltime);
126
127 void log_config(const struct cpg_name *group_name,
128 const struct cpg_address *member_list,
129 size_t member_list_entries,
130 const struct cpg_address *left_list,
131 size_t left_list_entries,
132 const struct cpg_address *joined_list,
133 size_t joined_list_entries)
134 {
135 char m_buf[128];
136 char j_buf[32];
137 char l_buf[32];
138 size_t i, len, pos;
139 int ret;
140
141 memset(m_buf, 0, sizeof(m_buf));
142 memset(j_buf, 0, sizeof(j_buf));
143 memset(l_buf, 0, sizeof(l_buf));
144
145 len = sizeof(m_buf);
146 pos = 0;
147 for (i = 0; i < member_list_entries; i++) {
148 ret = snprintf(m_buf + pos, len - pos, " %d",
149 member_list[i].nodeid);
150 if (ret >= len - pos)
151 break;
152 pos += ret;
153 }
154
155 len = sizeof(j_buf);
156 pos = 0;
157 for (i = 0; i < joined_list_entries; i++) {
158 ret = snprintf(j_buf + pos, len - pos, " %d",
159 joined_list[i].nodeid);
160 if (ret >= len - pos)
161 break;
162 pos += ret;
163 }
164
165 len = sizeof(l_buf);
166 pos = 0;
167 for (i = 0; i < left_list_entries; i++) {
168 ret = snprintf(l_buf + pos, len - pos, " %d",
169 left_list[i].nodeid);
170 if (ret >= len - pos)
171 break;
172 pos += ret;
173 }
174
175 log_debug("%s conf %zu %zu %zu memb%s join%s left%s", group_name->value,
176 member_list_entries, joined_list_entries, left_list_entries,
177 strlen(m_buf) ? m_buf : " 0", strlen(j_buf) ? j_buf : " 0",
178 strlen(l_buf) ? l_buf : " 0");
179 }
180
181 void log_ringid(const char *name,
182 struct cpg_ring_id *ringid,
183 const uint32_t *member_list,
184 size_t member_list_entries)
185 {
186 char m_buf[128];
187 size_t i, len, pos;
188 int ret;
189
190 memset(m_buf, 0, sizeof(m_buf));
191
192 len = sizeof(m_buf);
193 pos = 0;
194 for (i = 0; i < member_list_entries; i++) {
195 ret = snprintf(m_buf + pos, len - pos, " %u",
196 member_list[i]);
197 if (ret >= len - pos)
198 break;
199 pos += ret;
200 }
201
202 log_debug("%s ring %u:%llu %zu memb%s",
203 name, ringid->nodeid, (unsigned long long)ringid->seq,
204 member_list_entries, m_buf);
205 }
206
207 const char *reason_str(int reason)
208 {
209 switch (reason) {
210 case REASON_STARTUP_FENCING:
211 return "startup";
212 case CPG_REASON_JOIN:
213 return "join";
214 case CPG_REASON_LEAVE:
215 return "leave";
216 case CPG_REASON_NODEDOWN:
217 return "nodedown";
218 case CPG_REASON_NODEUP:
219 return "nodeup";
220 case CPG_REASON_PROCDOWN:
221 return "procdown";
222 default:
223 return "unknown";
224 };
225 }
226
227 const char *msg_name(int type)
228 {
229 switch (type) {
230 case DLM_MSG_PROTOCOL:
231 return "protocol";
232 case DLM_MSG_FENCE_RESULT:
233 return "fence_result";
234 case DLM_MSG_FENCE_CLEAR:
235 return "fence_clear";
236
237 case DLM_MSG_START:
238 return "start";
239 case DLM_MSG_PLOCK:
240 return "plock";
241 case DLM_MSG_PLOCK_OWN:
242 return "plock_own";
243 case DLM_MSG_PLOCK_DROP:
244 return "plock_drop";
245 case DLM_MSG_PLOCK_SYNC_LOCK:
246 return "plock_sync_lock";
247 case DLM_MSG_PLOCK_SYNC_WAITER:
248 return "plock_sync_waiter";
249 case DLM_MSG_PLOCKS_DATA:
250 return "plocks_data";
251 case DLM_MSG_PLOCKS_DONE:
252 return "plocks_done";
253 case DLM_MSG_DEADLK_CYCLE_START:
254 return "deadlk_cycle_start";
255 case DLM_MSG_DEADLK_CYCLE_END:
256 return "deadlk_cycle_end";
257 case DLM_MSG_DEADLK_CHECKPOINT_READY:
258 return "deadlk_checkpoint_ready";
259 case DLM_MSG_DEADLK_CANCEL_LOCK:
260 return "deadlk_cancel_lock";
261 default:
262 return "unknown";
263 }
264 }
265
266 static int _send_message(cpg_handle_t h, void *buf, int len, int type)
267 {
268 struct iovec iov;
269 cs_error_t error;
270 int retries = 0;
271
272 iov.iov_base = buf;
273 iov.iov_len = len;
274
275 retry:
276 error = cpg_mcast_joined(h, CPG_TYPE_AGREED, &iov, 1);
277 if (error == CS_ERR_TRY_AGAIN) {
278 retries++;
279 usleep(1000);
280 if (!(retries % 100))
281 log_error("cpg_mcast_joined retry %d %s",
282 retries, msg_name(type));
283 goto retry;
284 }
285 if (error != CS_OK) {
286 log_error("cpg_mcast_joined error %d handle %llx %s",
287 error, (unsigned long long)h, msg_name(type));
288 return -1;
289 }
290
291 if (retries)
292 log_debug("cpg_mcast_joined retried %d %s",
293 retries, msg_name(type));
294
295 return 0;
296 }
297
298 /* header fields caller needs to set: type, to_nodeid, flags, msgdata */
299
300 void dlm_send_message(struct lockspace *ls, char *buf, int len)
301 {
302 struct dlm_header *hd = (struct dlm_header *) buf;
303 int type = hd->type;
304
305 hd->version[0] = cpu_to_le16(our_protocol.daemon_run[0]);
306 hd->version[1] = cpu_to_le16(our_protocol.daemon_run[1]);
307 hd->version[2] = cpu_to_le16(our_protocol.daemon_run[2]);
308 hd->type = cpu_to_le16(hd->type);
309 hd->nodeid = cpu_to_le32(our_nodeid);
310 hd->to_nodeid = cpu_to_le32(hd->to_nodeid);
311 hd->global_id = cpu_to_le32(ls->global_id);
312 hd->flags = cpu_to_le32(hd->flags);
313 hd->msgdata = cpu_to_le32(hd->msgdata);
314 hd->msgdata2 = cpu_to_le32(hd->msgdata2);
315
316 _send_message(ls->cpg_handle, buf, len, type);
317 }
318
319 int dlm_send_message_daemon(char *buf, int len)
320 {
321 struct dlm_header *hd = (struct dlm_header *) buf;
322 int type = hd->type;
323
324 hd->version[0] = cpu_to_le16(our_protocol.daemon_run[0]);
325 hd->version[1] = cpu_to_le16(our_protocol.daemon_run[1]);
326 hd->version[2] = cpu_to_le16(our_protocol.daemon_run[2]);
327 hd->type = cpu_to_le16(hd->type);
328 hd->nodeid = cpu_to_le32(our_nodeid);
329 hd->to_nodeid = cpu_to_le32(hd->to_nodeid);
330 hd->flags = cpu_to_le32(hd->flags);
331 hd->msgdata = cpu_to_le32(hd->msgdata);
332 hd->msgdata2 = cpu_to_le32(hd->msgdata2);
333
334 return _send_message(cpg_handle_daemon, buf, len, type);
335 }
336
337 void dlm_header_in(struct dlm_header *hd)
338 {
339 hd->version[0] = le16_to_cpu(hd->version[0]);
340 hd->version[1] = le16_to_cpu(hd->version[1]);
341 hd->version[2] = le16_to_cpu(hd->version[2]);
342 hd->type = le16_to_cpu(hd->type);
343 hd->nodeid = le32_to_cpu(hd->nodeid);
344 hd->to_nodeid = le32_to_cpu(hd->to_nodeid);
345 hd->global_id = le32_to_cpu(hd->global_id);
346 hd->flags = le32_to_cpu(hd->flags);
347 hd->msgdata = le32_to_cpu(hd->msgdata);
348 hd->msgdata2 = le32_to_cpu(hd->msgdata2);
349 }
350
/* Convert a run_info to little-endian wire order before sending. */
static void run_info_out(struct run_info *info)
{
	info->dest_nodeid = cpu_to_le32(info->dest_nodeid);
	info->start_nodeid = cpu_to_le32(info->start_nodeid);
	info->local_pid = cpu_to_le32(info->local_pid);
	info->local_result = cpu_to_le32(info->local_result);
	info->need_replies = cpu_to_le32(info->need_replies);
	info->reply_count = cpu_to_le32(info->reply_count);
	info->fail_count = cpu_to_le32(info->fail_count);
	info->flags = cpu_to_le32(info->flags);
}

/* Convert a received run_info from little-endian wire order to host order. */
static void run_info_in(struct run_info *info)
{
	info->dest_nodeid = le32_to_cpu(info->dest_nodeid);
	info->start_nodeid = le32_to_cpu(info->start_nodeid);
	info->local_pid = le32_to_cpu(info->local_pid);
	info->local_result = le32_to_cpu(info->local_result);
	info->need_replies = le32_to_cpu(info->need_replies);
	info->reply_count = le32_to_cpu(info->reply_count);
	info->fail_count = le32_to_cpu(info->fail_count);
	info->flags = le32_to_cpu(info->flags);
}

/* Byte-order helpers for run_request/run_reply, which embed a run_info. */

static void run_request_out(struct run_request *req)
{
	run_info_out(&req->info);
}

static void run_request_in(struct run_request *req)
{
	run_info_in(&req->info);
}

static void run_reply_out(struct run_reply *rep)
{
	run_info_out(&rep->info);
}

static void run_reply_in(struct run_reply *rep)
{
	run_info_in(&rep->info);
}
394
395 int dlm_header_validate(struct dlm_header *hd, int nodeid)
396 {
397 if (hd->version[0] != our_protocol.daemon_run[0] ||
398 hd->version[1] != our_protocol.daemon_run[1]) {
399 log_error("reject message from %d version %u.%u.%u vs %u.%u.%u",
400 nodeid, hd->version[0], hd->version[1],
401 hd->version[2], our_protocol.daemon_run[0],
402 our_protocol.daemon_run[1],
403 our_protocol.daemon_run[2]);
404 return -1;
405 }
406
407 if (hd->nodeid != nodeid) {
408 log_error("bad msg nodeid %d %d", hd->nodeid, nodeid);
409 return -1;
410 }
411
412 return 0;
413 }
414
415 static struct node_daemon *get_node_daemon(int nodeid)
416 {
417 struct node_daemon *node;
418
419 list_for_each_entry(node, &daemon_nodes, list) {
420 if (node->nodeid == nodeid)
421 return node;
422 }
423 return NULL;
424 }
425
426 static int nodes_need_fencing(void)
427 {
428 struct node_daemon *node;
429
430 list_for_each_entry(node, &daemon_nodes, list) {
431 if (node->need_fencing)
432 return 1;
433 }
434 return 0;
435 }
436
437 static int nodeid_needs_fencing(int nodeid)
438 {
439 struct node_daemon *node;
440
441 node = get_node_daemon(nodeid);
442 if (!node) {
443 log_error("nodeid_needs_fencing %d not found", nodeid);
444 return 0;
445 }
446 return node->need_fencing;
447 }
448
449 static int all_daemon_members_fipu(void)
450 {
451 struct node_daemon *node;
452
453 list_for_each_entry(node, &daemon_nodes, list) {
454 if (!node->daemon_member)
455 continue;
456 if (!node->fence_in_progress_unknown)
457 return 0;
458 }
459
460 list_for_each_entry(node, &daemon_nodes, list) {
461 if (!node->daemon_member)
462 continue;
463 node->fence_in_progress_unknown = 0;
464 }
465
466 return 1;
467 }
468
469 int fence_node_time(int nodeid, uint64_t *last_fenced)
470 {
471 struct node_daemon *node;
472
473 node = get_node_daemon(nodeid);
474 if (!node)
475 return -1;
476
477 *last_fenced = node->fence_monotime;
478 return 0;
479 }
480
481 int fence_in_progress(int *in_progress)
482 {
483 if (fence_in_progress_unknown) {
484 *in_progress = 1;
485 } else if (!list_empty(&startup_nodes)) {
486 *in_progress = 2;
487 } else if (nodes_need_fencing()) {
488 *in_progress = 3;
489 } else {
490 *in_progress = 0;
491 }
492 return 0;
493 }
494
495 void add_startup_node(int nodeid)
496 {
497 struct node_daemon *node;
498
499 node = malloc(sizeof(struct node_daemon));
500 if (!node) {
501 log_error("add_startup_node no mem");
502 return;
503 }
504 memset(node, 0, sizeof(struct node_daemon));
505 node->nodeid = nodeid;
506 list_add_tail(&node->list, &startup_nodes);
507 }
508
509 static int clear_startup_node(int nodeid, int all)
510 {
511 struct node_daemon *node, *safe;
512 int count = 0;
513
514 list_for_each_entry_safe(node, safe, &startup_nodes, list) {
515 if (all || node->nodeid == nodeid) {
516 list_del(&node->list);
517 free(node);
518 count++;
519 }
520 }
521 return count;
522 }
523
524 static struct node_daemon *add_node_daemon(int nodeid)
525 {
526 struct node_daemon *node;
527 struct fence_config *fc;
528 int rv;
529
530 node = get_node_daemon(nodeid);
531 if (node)
532 return node;
533
534 node = malloc(sizeof(struct node_daemon));
535 if (!node) {
536 log_error("add_node_daemon no mem");
537 return NULL;
538 }
539 memset(node, 0, sizeof(struct node_daemon));
540 node->nodeid = nodeid;
541 list_add_tail(&node->list, &daemon_nodes);
542
543 /* TODO: allow the config to be reread */
544
545 fc = &node->fence_config;
546 fc->nodeid = nodeid;
547
548 /* explicit config file setting */
549
550 rv = fence_config_init(fc, (unsigned int)nodeid, (char *)CONF_FILE_PATH);
551 if (!rv)
552 goto out;
553
554 /* no config file setting, so use default */
555
556 if (rv == -ENOENT) {
557 fc->dev[0] = &fence_all_device;
558 goto out;
559 }
560
561 log_error("fence config %d error %d", nodeid, rv);
562 out:
563 return node;
564 }
565
/* A clean daemon member is a node that has joined the daemon cpg
   from a "clean state", i.e. not a stateful merge. It would not
   have joined the daemon cpg if it found uncontrolled dlm kernel
   state (check_uncontrolled_lockspaces). We would not have
   accepted and saved its protocol in node->proto.daemon if it
   was a stateful merge. */
572
573 static int is_clean_daemon_member(int nodeid)
574 {
575 struct node_daemon *node;
576
577 node = get_node_daemon(nodeid);
578 if (node && node->daemon_member && node->proto.daemon_max[0])
579 return 1;
580 return 0;
581 }
582
583 static int in_daemon_list(int nodeid, struct cpg_address *daemon_list, int count)
584 {
585 int i;
586
587 for (i = 0; i < count; i++) {
588 if (daemon_list[i].nodeid == nodeid)
589 return 1;
590 }
591 return 0;
592 }
593
594 /* save in node->fence_actors[] any nodeid present when the node
595 failed which therefore saw it fail, knows it needs fencing, and
596 can request fencing for it if it becomes the low actor. A node
597 added in the same change with the removed node does not qualify. */
598
599 static int set_fence_actors(struct node_daemon *node, int all_memb)
600 {
601 int i, nodeid, count = 0, low = 0;
602
603 memset(node->fence_actors, 0, sizeof(node->fence_actors));
604 memset(node->fence_actors_orig, 0, sizeof(node->fence_actors_orig));
605
606 for (i = 0; i < daemon_member_count; i++) {
607 nodeid = daemon_member[i].nodeid;
608
609 if (!all_memb && in_daemon_list(nodeid, daemon_joined, daemon_joined_count))
610 continue;
611
612 node->fence_actors[count++] = nodeid;
613
614 if (!low || nodeid < low)
615 low = nodeid;
616 }
617
618 /* keep a copy of the original set so they can be retried if all fail */
619 memcpy(node->fence_actors_orig, node->fence_actors, sizeof(node->fence_actors));
620
621 log_debug("set_fence_actors for %d low %d count %d",
622 node->nodeid, low, count);
623 return low;
624 }
625
626 static int get_fence_actor(struct node_daemon *node)
627 {
628 int i, low, low_i;
629
630 retry:
631 low = 0;
632
633 for (i = 0; i < MAX_NODES; i++) {
634 if (!node->fence_actors[i])
635 continue;
636
637 if (!low || node->fence_actors[i] < low) {
638 low = node->fence_actors[i];
639 low_i = i;
640 }
641 }
642
643 if (low && !in_daemon_list(low, daemon_member, daemon_member_count)) {
644 log_debug("get_fence_actor for %d low actor %d is gone",
645 node->nodeid, low);
646
647 node->fence_actors[low_i] = 0;
648 goto retry;
649 }
650
651 node->fence_actor_last = low;
652
653 return low;
654 }
655
656 /* if an actor fails to fence, it will send that result, and others
657 will clear it from the actors, which will cause the next lowest
658 actor to try */
659
660 static void clear_fence_actor(int nodeid, int actor)
661 {
662 struct node_daemon *node;
663 int remaining = 0;
664 int i;
665
666 node = get_node_daemon(nodeid);
667 if (!node)
668 return;
669
670 for (i = 0; i < MAX_NODES; i++) {
671 if (node->fence_actors[i] == actor)
672 node->fence_actors[i] = 0;
673 else if (node->fence_actors[i])
674 remaining++;
675 }
676
677 if (!remaining && opt(repeat_failed_fencing_ind)) {
678 log_debug("clear_fence_actor %d restoring original actors to retry", actor);
679 memcpy(node->fence_actors, node->fence_actors_orig, sizeof(node->fence_actors));
680 }
681 }
682
683 static void clear_zombies(void)
684 {
685 int i, rv, result = 0;
686
687 for (i = 0; i < MAX_ZOMBIES; i++) {
688 if (!zombie_count)
689 break;
690 if (!zombie_pids[i])
691 continue;
692
693 rv = fence_result(-1, zombie_pids[i], &result);
694 if (rv == -EAGAIN)
695 continue;
696
697 log_debug("cleared zombie %d rv %d result %d",
698 zombie_pids[i], rv, result);
699
700 zombie_pids[i] = 0;
701 zombie_count--;
702 }
703 }
704
705 static void add_zombie(int pid)
706 {
707 int i;
708
709 for (i = 0; i < MAX_ZOMBIES; i++) {
710 if (!zombie_pids[i]) {
711 zombie_pids[i] = pid;
712 zombie_count++;
713 return;
714 }
715 }
716 }
717
/* Kill an outstanding fence agent for nodeid, wait briefly, then try
   to reap it; if it has not exited yet it is queued as a zombie for
   clear_zombies() to reap later. */
static void fence_pid_cancel(int nodeid, int pid)
{
	int err, status = 0;

	log_debug("fence_pid_cancel nodeid %d pid %d sigkill", nodeid, pid);

	kill(pid, SIGKILL);
	usleep(500000);		/* give the agent half a second to die */

	err = fence_result(nodeid, pid, &status);
	if (err == -EAGAIN)
		add_zombie(pid);

	log_debug("fence_pid_cancel nodeid %d pid %d rv %d result %d",
		  nodeid, pid, err, status);
}
734
735 static void kick_stateful_merge_members(void)
736 {
737 struct node_daemon *node;
738
739 list_for_each_entry(node, &daemon_nodes, list) {
740 if (!node->killed && node->stateful_merge) {
741 log_error("daemon node %d kill stateful merge member",
742 node->nodeid);
743 kick_node_from_cluster(node->nodeid);
744 node->killed = 1;
745 }
746 }
747 }
748
749 /*
750 * fence_in_progress_unknown (fipu)
751 *
752 * If current daemon members are fencing someone, and a new node
753 * joins, that new node needs to wait for the previous members to
754 * finish any fencing they're doing before it can start a lockspace.
755 *
756 * The previous members may be fencing the last node that was using
757 * the lockspace the new node is going to use, so if it doesn't wait,
758 * it could start using a lockspace with an unfenced user.
759 *
760 * So, the daemon starts with fence_in_progress_unknown set to
761 * indicate that other nodes may be fencing someone, and it won't
762 * start any lockspaces until it is clear.
763 *
764 * A node starts with fence_in_progress_unknown set and won't
765 * start any lockspaces until it's clear.
766 *
767 * When using startup_fencing:
768 *
769 * . When all nodes start up together, all have fipu set,
770 * and will go through startup fencing, which will eventually
771 * result in all nodes either being clean daemon members or fenced,
772 * so everyone will clear fipu by seeing that.
773 *
774 * . The more common case is when a new node joins other previously
775 * running nodes. The new node needs to be told that the others
776 * have no outstanding fencing ops before it can clear fipu.
777 * A previous member does send_fence_clear(0) to a new node once
778 * all fencing is complete. The two flags in send_fence_clear are
 * usually sent together but may sometimes be in separate messages:
780 * send_fence_clear(0, CLEAR_STARTUP) to clear startup_nodes right away
781 * send_fence_clear(0, CLEAR_FIPU) to clear fipu once all fencing is done
782 *
783 * When not using startup_fencing:
784 *
785 * . When all nodes start up together, all have fipu set, and all
786 * will be waiting to receive_fence_clear from a previous node
787 * in order to clear it. The nodes need to detect this situation,
788 * and when they do, they will know that everyone is in startup,
789 * so there can be no pending fencing on a previous node, so all
790 * can clear fipu. To detect this case, when a node starts up
791 * with !startup_fence, it sends a special send_fence_clear(-ENODATA, FIPU)
792 * message about itself to indicate it has fipu set and needs it cleared.
793 * After sending this, it checks to see if all present nodes have sent
794 * this same message about themselves. If so, then this startup
 * case has been detected, and all will clear fipu.
796 *
797 * . New nodes that join after this startup initialization will be
798 * handled the same as when startup_fencing is set (above).
799 *
800 *
801 * startup_fencing
802 * ---------------
803 *
804 * case A
805 * all nodes start up,
806 * all have fipu set,
807 * all wait for startup_nodes to be empty, (joined or moved to need_fencing)
808 * all wait for no daemon_nodes to need_fencing, (joined or were fenced)
809 * all clear fipu
810 *
811 * later,
812 *
813 * case B
814 * new node starts,
815 * new node has fipu set,
816 * cur node sees need_fence_clear on new node
817 * cur node sees no pending fencing ops,
818 * cur node send_fence_clear(0) to new node,
819 * new node clears startup_nodes and fipu
820 *
821 * !startup_fencing
822 * ----------------
823 *
824 * case C
825 * all nodes start up,
826 * all have fipu set,
827 * all send_fence_clear(-ENODATA,FIPU),
828 * all receive_fence_clear(-ENODATA,FIPU) from everyone,
829 * all_daemon_members_fipu() is 1,
830 * all clear fipu
831 *
832 * later same as case B above
833 */
834
835 static void daemon_fence_work(void)
836 {
837 struct node_daemon *node, *safe;
838 int gone_count = 0, part_count = 0, merge_count = 0, clean_count = 0;
839 int rv, nodeid, pid, need, low = 0, actor, result;
840 int retry = 0;
841 uint32_t flags;
842
843 if (!daemon_fence_allow)
844 return;
845
846 if (daemon_ringid_wait) {
847 /* We've seen a nodedown confchg callback, but not the
848 corresponding ringid callback. */
849 log_retry(retry_fencing, "fence work wait for cpg ringid");
850 retry = 1;
851 goto out;
852 }
853
854 if (cluster_ringid_seq != daemon_ringid.seq) {
855 /* wait for ringids to be in sync */
856 log_retry(retry_fencing, "fence work wait for cluster ringid");
857 retry = 1;
858 goto out;
859 }
860
861 if (opt(enable_quorum_fencing_ind) && !cluster_quorate) {
862 /* wait for quorum before doing any fencing, but if there
863 is none, send_fence_clear below can unblock new nodes */
864 log_retry(retry_fencing, "fence work wait for quorum");
865 retry = 1;
866 goto out_fipu;
867 }
868
869 /*
870 * Count different types of nodes
871 * gone: node not a member
872 * part: member we've not received a proto message from
873 * merge: member we received a stateful proto message from
874 * clean: member we received a clean/new proto message from
875 *
876 * A node always views itself as a clean member, not a merge member.
877 */
878
879 list_for_each_entry(node, &daemon_nodes, list) {
880 if (!node->daemon_member) {
881 gone_count++;
882 } else {
883 if (!low || node->nodeid < low)
884 low = node->nodeid;
885
886 if (node->stateful_merge)
887 merge_count++;
888 else if (!node->proto.daemon_max[0])
889 part_count++;
890 else
891 clean_count++;
892 }
893 }
894
895 /*
896 * Wait for stateful merged members to be removed before moving
897 * on to fencing. Kill stateful merged members to clear them.
898 * This section is only relevant to non-two-node, even splits.
899 *
900 * With two node splits, they race to fence each other and
901 * whichever fences successfully then kills corosync on the other
902 * (in the case where corosync is still running on the fenced node).
903 *
904 * With an odd split, the partition that maintained quorum will
905 * kill stateful merged nodes when their proto message is received.
906 *
907 * With an even split, e.g. 2/2, we don't want both sets to
908 * be fencing each other right after merge, when both sides
909 * have quorum again and see the other side as statefully merged.
910 * So, delay fencing until the stateful nodes are cleared on one
911 * side (by way of the low nodeid killing stateful merged members).
912 *
913 * When there are 3 or more partitions that merge, none may see
914 * enough clean nodes, so the cluster would be stuck here waiting
915 * for someone to manually reset/restart enough nodes to produce
916 * sufficient clean nodes (>= merged).
917 */
918
919 if (!cluster_two_node && merge_count) {
920 log_retry(retry_fencing, "fence work wait to clear merge %d clean %d part %d gone %d",
921 merge_count, clean_count, part_count, gone_count);
922
923 if ((clean_count >= merge_count) && !part_count && (low == our_nodeid))
924 kick_stateful_merge_members();
925 if ((clean_count < merge_count) && !part_count)
926 stateful_merge_wait = 1;
927
928 retry = 1;
929 goto out;
930 }
931 if (stateful_merge_wait)
932 stateful_merge_wait = 0;
933
934 /*
935 * startup fencing
936 */
937
938 list_for_each_entry_safe(node, safe, &startup_nodes, list) {
939 if (is_clean_daemon_member(node->nodeid)) {
940 log_debug("fence startup %d skip member", node->nodeid);
941 list_del(&node->list);
942 free(node);
943 continue;
944 }
945
946 if (!opt(enable_startup_fencing_ind))
947 continue;
948
949 if (!fence_delay_begin) {
950 log_debug("fence startup %d wait for initial delay", node->nodeid);
951 continue;
952 }
953
954 if (monotime() - fence_delay_begin < opt(post_join_delay_ind)) {
955 log_debug("fence startup %d delay %d from %llu",
956 node->nodeid, opt(post_join_delay_ind),
957 (unsigned long long)fence_delay_begin);
958 retry = 1;
959 continue;
960 }
961
962 /* clear this entry and create a daemon_nodes entry with
963 need_fencing and the fence loops below will handle it */
964
965 nodeid = node->nodeid;
966 list_del(&node->list);
967 free(node);
968
969 node = add_node_daemon(nodeid);
970 if (!node) {
971 log_debug("fence startup %d add failed", nodeid);
972 continue;
973 }
974 if (node->need_fencing) {
975 /* don't think this should happen? */
976 log_error("fence startup %d already set", nodeid);
977 continue;
978 }
979 node->need_fencing = 1;
980 node->delay_fencing = 0;
981 node->fence_monotime = 0;
982 node->fence_walltime = 0;
983 node->fence_actor_last = 0;
984 node->fence_actor_done = 0;
985 node->fence_pid_wait = 0;
986 node->fence_pid = 0;
987 node->fence_result_wait = 0;
988 node->fence_config.pos = 0;
989 node->left_reason = REASON_STARTUP_FENCING;
990 node->fail_monotime = cluster_joined_monotime - 1;
991 node->fail_walltime = cluster_joined_walltime - 1;
992 low = set_fence_actors(node, 1);
993
994 log_debug("fence startup nodeid %d act %d", node->nodeid, low);
995 }
996
997 /*
998 * request fencing
999 */
1000
1001 list_for_each_entry(node, &daemon_nodes, list) {
1002 if (!node->need_fencing)
1003 continue;
1004
1005 if (node->fence_pid_wait)
1006 continue;
1007
1008 if (node->fence_result_wait) {
1009 log_debug("fence request %d result_wait", node->nodeid);
1010 continue;
1011 }
1012
1013 if (is_clean_daemon_member(node->nodeid)) {
1014 /*
1015 * node has rejoined in clean state
1016 */
1017 log_debug("fence request %d skip for is_clean_daemon_member", node->nodeid);
1018
1019 node->need_fencing = 0;
1020 node->delay_fencing = 0;
1021 node->fence_walltime = time(NULL);
1022 node->fence_monotime = monotime();
1023 node->fence_actor_done = node->nodeid;
1024 continue;
1025 }
1026
1027 if (!opt(enable_concurrent_fencing_ind) && daemon_fence_pid) {
1028 /* run one agent at a time in case they need the same switch */
1029 log_retry(retry_fencing, "fence request %d delay for other pid %d",
1030 node->nodeid, daemon_fence_pid);
1031 node->delay_fencing = 1;
1032 retry = 1;
1033 continue;
1034 }
1035
1036 /* use post_join_delay to avoid fencing a node in the short
1037 time between it joining the cluster (giving cluster quorum)
1038 and joining the daemon cpg, which allows it to bypass fencing */
1039
1040 if (monotime() - fence_delay_begin < opt(post_join_delay_ind)) {
1041 log_debug("fence request %d delay %d from %llu",
1042 node->nodeid, opt(post_join_delay_ind),
1043 (unsigned long long)fence_delay_begin);
1044 node->delay_fencing = 1;
1045 retry = 1;
1046 continue;
1047 }
1048 node->delay_fencing = 0;
1049
1050 /* get_fence_actor picks the low nodeid that existed
1051 when node failed and is still around. if the current
1052 actor fails, get_fence_actor will not find it in the
1053 members list, will clear it, and return the next actor */
1054
1055 actor = get_fence_actor(node);
1056
1057 if (!actor) {
1058 log_error("fence request %d no actor", node->nodeid);
1059 continue;
1060 }
1061
1062 if (actor != our_nodeid) {
1063 log_debug("fence request %d defer to %d",
1064 node->nodeid, actor);
1065 continue;
1066 }
1067
1068 log_debug("fence request %d pos %d",
1069 node->nodeid, node->fence_config.pos);
1070
1071 rv = fence_request(node->nodeid,
1072 node->fail_walltime,
1073 node->fail_monotime,
1074 &node->fence_config,
1075 node->left_reason,
1076 &pid);
1077 if (rv < 0) {
1078 send_fence_result(node->nodeid, rv, 0, time(NULL));
1079 node->fence_result_wait = 1;
1080 continue;
1081 }
1082
1083 node->fence_pid_wait = 1;
1084 node->fence_pid = pid;
1085 daemon_fence_pid = pid;
1086 }
1087
1088 /*
1089 * check outstanding fence requests
1090 */
1091
1092 list_for_each_entry(node, &daemon_nodes, list) {
1093 if (!node->need_fencing)
1094 continue;
1095
1096 if (node->delay_fencing)
1097 continue;
1098
1099 if (node->fence_result_wait) {
1100 log_debug("fence wait %d result_wait", node->nodeid);
1101 continue;
1102 }
1103
1104 if (!node->fence_pid_wait) {
1105 /* another node is the actor */
1106 log_debug("fence wait %d for done", node->nodeid);
1107 continue;
1108 }
1109
1110 if (!node->fence_pid) {
1111 /* shouldn't happen */
1112 log_error("fence wait %d zero pid", node->nodeid);
1113 node->fence_pid_wait = 0;
1114 continue;
1115 }
1116
1117 nodeid = node->nodeid;
1118 pid = node->fence_pid;
1119
1120 if (is_clean_daemon_member(nodeid)) {
1121 /*
1122 * node has rejoined in clean state so we can
1123 * abort outstanding fence op for it. all nodes
1124 * will see and do this, so we don't need to send
1125 * a fence result.
1126 */
1127 log_debug("fence wait %d pid %d skip for is_clean_daemon_member", nodeid, pid);
1128
1129 node->need_fencing = 0;
1130 node->delay_fencing = 0;
1131 node->fence_walltime = time(NULL);
1132 node->fence_monotime = monotime();
1133 node->fence_actor_done = nodeid;
1134
1135 node->fence_pid_wait = 0;
1136 node->fence_pid = 0;
1137 daemon_fence_pid = 0;
1138
1139 fence_pid_cancel(nodeid, pid);
1140 continue;
1141 }
1142
1143 retry = 1;
1144
1145 rv = fence_result(nodeid, pid, &result);
1146 if (rv == -EAGAIN) {
1147 /* agent pid is still running */
1148
1149 if (fence_result_pid != pid) {
1150 fence_result_try = 0;
1151 fence_result_pid = pid;
1152 }
1153 fence_result_try++;
1154
1155 log_retry(fence_result_try, "fence wait %d pid %d running", nodeid, pid);
1156 continue;
1157 }
1158
1159 node->fence_pid_wait = 0;
1160 node->fence_pid = 0;
1161 daemon_fence_pid = 0;
1162
1163 if (rv < 0) {
1164 /* shouldn't happen */
1165 log_error("fence wait %d pid %d error %d", nodeid, pid, rv);
1166 continue;
1167 }
1168
1169 log_debug("fence wait %d pid %d result %d", nodeid, pid, result);
1170
1171 if (!result) {
1172 /* agent exit 0, if there's another agent to run in
1173 parallel, set it to run next, otherwise success */
1174
1175 rv = fence_config_next_parallel(&node->fence_config);
1176 if (rv < 0) {
1177 send_fence_result(nodeid, 0, 0, time(NULL));
1178 node->fence_result_wait = 1;
1179 }
1180 } else {
1181 /* agent exit 1, if there's another agent to run at
1182 next priority, set it to run next, otherwise fail */
1183
1184 rv = fence_config_next_priority(&node->fence_config);
1185 if (rv < 0) {
1186 send_fence_result(nodeid, result, 0, time(NULL));
1187 node->fence_result_wait = 1;
1188 }
1189 }
1190 }
1191
1192 /*
1193 * clear fence_in_progress_unknown
1194 */
1195 out_fipu:
1196 if (opt(enable_startup_fencing_ind) &&
1197 fence_in_progress_unknown &&
1198 list_empty(&startup_nodes) &&
1199 !wait_clear_fipu &&
1200 !nodes_need_fencing()) {
1201 /*
1202 * case A in comment above
1203 * all nodes are starting and have fipu set, they all do
1204 * startup fencing, and eventually see unknown nodes become
1205 * members or get fenced, so all clear fipu for themselves.
1206 */
1207 fence_in_progress_unknown = 0;
1208 log_debug("fence_in_progress_unknown 0 startup");
1209 }
1210
1211 if (!fence_in_progress_unknown) {
1212 /*
1213 * case B in comment above
1214 * some cur nodes have fipu clear, new nodes have fipu set.
1215 * A current node needs to send_fence_clear to the new nodes
1216 * once all fencing is done so they clear fipu.
1217 */
1218 low = 0;
1219 need = 0;
1220
1221 list_for_each_entry(node, &daemon_nodes, list) {
1222 if (node->need_fencing)
1223 need++;
1224 if (!node->daemon_member || node->need_fence_clear)
1225 continue;
1226 if (!low || node->nodeid < low)
1227 low = node->nodeid;
1228 }
1229
1230 list_for_each_entry(node, &daemon_nodes, list) {
1231 if (!node->daemon_member || !node->need_fence_clear)
1232 continue;
1233 if (node->nodeid == our_nodeid) {
1234 node->need_fence_clear = 0;
1235 continue;
1236 }
1237 if (low != our_nodeid)
1238 continue;
1239
1240 flags = 0;
1241
1242 if (node->need_fence_clear & FR_CLEAR_STARTUP) {
1243 flags |= FR_CLEAR_STARTUP;
1244 node->need_fence_clear &= ~FR_CLEAR_STARTUP;
1245 }
1246
1247 if ((node->need_fence_clear & FR_CLEAR_FIPU) && !need) {
1248 flags |= FR_CLEAR_FIPU;
1249 node->need_fence_clear &= ~FR_CLEAR_FIPU;
1250 }
1251
1252 if (!flags)
1253 continue;
1254
1255 send_fence_clear(node->nodeid, 0, flags, 0);
1256 }
1257 }
1258
1259 if (!opt(enable_startup_fencing_ind) && fence_in_progress_unknown) {
1260 /*
1261 * case C in comment above
1262 * all nodes are starting and have fipu set. All expect a
1263 * previous node to send_fence_clear so they can clear fipu.
1264 * But there are no previous nodes. They need to detect this
1265 * condition. Each node does send_fence_clear(ENODATA,FIPU).
1266 * When all have received this from all, condition is
1267 * detected and all clear fipu.
1268 */
1269 if (all_daemon_members_fipu()) {
1270 fence_in_progress_unknown = 0;
1271 log_debug("fence_in_progress_unknown 0 all_fipu");
1272 } else if (last_join_seq > send_fipu_seq) {
1273 /* the seq numbers keep us from spamming this msg */
1274 send_fence_clear(our_nodeid, -ENODATA, FR_FIPU, 0);
1275 log_debug("send_fence_clear %d fipu", our_nodeid);
1276 send_fipu_seq = last_join_seq;
1277 }
1278 }
1279
1280 /*
1281 * clean up a zombie pid from an agent we killed
1282 */
1283
1284 if (zombie_count)
1285 clear_zombies();
1286
1287 /*
1288 * setting retry_fencing will cause the main daemon poll loop
1289 * to timeout in 1 second and call this function again.
1290 */
1291 out:
1292 if (retry)
1293 retry_fencing++;
1294 else
1295 retry_fencing = 0;
1296 }
1297
/*
 * Public entry point invoked when fencing-related state may have
 * changed; runs one pass of daemon_fence_work().
 */
void process_fencing_changes(void)
{
	daemon_fence_work();
}
1302
1303 static void receive_fence_clear(struct dlm_header *hd, int len)
1304 {
1305 struct fence_result *fr;
1306 struct node_daemon *node;
1307 int count;
1308
1309 fr = (struct fence_result *)((char *)hd + sizeof(struct dlm_header));
1310
1311 fr->flags = le32_to_cpu(fr->flags);
1312 fr->nodeid = le32_to_cpu(fr->nodeid);
1313 fr->result = le32_to_cpu(fr->result);
1314 fr->fence_walltime = le64_to_cpu(fr->fence_walltime);
1315
1316 if (len < sizeof(struct dlm_header) + sizeof(struct fence_result)) {
1317 log_error("receive_fence_clear invalid len %d from %d",
1318 len, hd->nodeid);
1319 return;
1320 }
1321
1322 node = get_node_daemon(fr->nodeid);
1323 if (!node) {
1324 log_error("receive_fence_clear from %d no daemon node %d",
1325 hd->nodeid, fr->nodeid);
1326 return;
1327 }
1328
1329 log_debug("receive_fence_clear from %d for %d result %d flags %x",
1330 hd->nodeid, fr->nodeid, fr->result, fr->flags);
1331
1332 /*
1333 * A node sends this message about itself indicating that it's in
1334 * startup with fipu set. The only time we care about node->fipu
1335 * is when all nodes are fipu in startup. node->need_fence_clear
1336 * and node->fipu are not related, they address different cases.
1337 */
1338 if ((fr->result == -ENODATA) && (fr->flags & FR_FIPU)) {
1339 if (!fence_in_progress_unknown)
1340 return;
1341
1342 node->fence_in_progress_unknown = 1;
1343 return;
1344 }
1345
1346 /*
1347 * An previous member sends this to new members to tell them that
1348 * they can clear startup_nodes and clear fipu. These two flags
1349 * may come in separate messages if there is a pending fencing op
1350 * when the new member joins (CLEAR_STARTUP will come right away,
1351 * but CLEAR_FIPU will come once the fencing op is done.)
1352 *
1353 * We need wait_clear_fipu after emptying startup_nodes to avoid
1354 * thinking we've finished startup fencing in case A below, and
1355 * clearing fipu ourselves.
1356 */
1357 if (!fr->result && (node->nodeid == our_nodeid)) {
1358 if ((fr->flags & FR_CLEAR_STARTUP) && !list_empty(&startup_nodes)) {
1359 count = clear_startup_node(0, 1);
1360 log_debug("clear_startup_nodes %d", count);
1361 wait_clear_fipu = 1;
1362 }
1363
1364 if ((fr->flags & FR_CLEAR_FIPU) && fence_in_progress_unknown) {
1365 fence_in_progress_unknown = 0;
1366 log_debug("fence_in_progress_unknown 0 recv");
1367 wait_clear_fipu = 0;
1368 }
1369 }
1370
1371 /* this node doesn't need these flags any more */
1372 if (!fr->result) {
1373 if (fr->flags & FR_CLEAR_STARTUP)
1374 node->need_fence_clear &= ~FR_CLEAR_STARTUP;
1375 if (fr->flags & FR_CLEAR_FIPU)
1376 node->need_fence_clear &= ~FR_CLEAR_FIPU;
1377 }
1378 }
1379
1380 static void send_fence_clear(int nodeid, int result, uint32_t flags, uint64_t walltime)
1381 {
1382 struct dlm_header *hd;
1383 struct fence_result *fr;
1384 char *buf;
1385 int len;
1386
1387 len = sizeof(struct dlm_header) + sizeof(struct fence_result);
1388 buf = malloc(len);
1389 if (!buf) {
1390 log_error("send_fence_clear no mem %d", len);
1391 return;
1392 }
1393 memset(buf, 0, len);
1394
1395 hd = (struct dlm_header *)buf;
1396 fr = (struct fence_result *)(buf + sizeof(*hd));
1397
1398 hd->type = cpu_to_le16(DLM_MSG_FENCE_CLEAR);
1399 hd->nodeid = cpu_to_le32(our_nodeid);
1400
1401 fr->flags = cpu_to_le32(flags);
1402 fr->nodeid = cpu_to_le32(nodeid);
1403 fr->result = cpu_to_le32(result);
1404 fr->fence_walltime = cpu_to_le64(walltime);
1405
1406 _send_message(cpg_handle_daemon, buf, len, DLM_MSG_FENCE_CLEAR);
1407 }
1408
1409 static void receive_fence_result(struct dlm_header *hd, int len)
1410 {
1411 struct fence_result *fr;
1412 struct node_daemon *node;
1413 uint64_t now;
1414 int count;
1415
1416 fr = (struct fence_result *)((char *)hd + sizeof(struct dlm_header));
1417
1418 fr->flags = le32_to_cpu(fr->flags);
1419 fr->nodeid = le32_to_cpu(fr->nodeid);
1420 fr->result = le32_to_cpu(fr->result);
1421 fr->fence_walltime = le64_to_cpu(fr->fence_walltime);
1422
1423 if (len < sizeof(struct dlm_header) + sizeof(struct fence_result)) {
1424 log_error("receive_fence_result invalid len %d from %d",
1425 len, hd->nodeid);
1426 return;
1427 }
1428
1429 count = clear_startup_node(fr->nodeid, 0);
1430 if (count) {
1431 log_debug("receive_fence_result %d from %d clear startup",
1432 fr->nodeid, hd->nodeid);
1433 return;
1434 }
1435
1436 node = get_node_daemon(fr->nodeid);
1437 if (!node) {
1438 log_error("receive_fence_result %d from %d result %d no daemon node",
1439 fr->nodeid, hd->nodeid, fr->result);
1440 return;
1441 }
1442
1443 if (!node->need_fencing) {
1444 /* should never happen ... will happen if a manual fence_ack is
1445 done for a node that doesn't need it */
1446 log_error("receive_fence_result %d from %d result %d no need_fencing",
1447 fr->nodeid, hd->nodeid, fr->result);
1448 return;
1449 }
1450
1451 if ((hd->nodeid == our_nodeid) && !node->fence_result_wait && (fr->result != -ECANCELED)) {
1452 /* should never happen */
1453 log_error("receive_fence_result %d from %d result %d no fence_result_wait",
1454 fr->nodeid, hd->nodeid, fr->result);
1455 /* should we ignore and return here? */
1456 }
1457
1458 if (node->daemon_member &&
1459 (!fr->result || (fr->result == -ECANCELED))) {
1460
1461 /*
1462 * The node was successfully fenced, but is still a member.
1463 * This will happen when there is a partition, storage fencing
1464 * is started, a merge causes the node to become a member
1465 * again, and storage fencing completes successfully. If we
1466 * received a proto message from the node after the merge, then
1467 * we will have detected a stateful merge, and we may have
1468 * already killed it.
1469 */
1470
1471 log_error("receive_fence_result %d from %d result %d node is daemon_member",
1472 fr->nodeid, hd->nodeid, fr->result);
1473
1474 kick_node_from_cluster(fr->nodeid);
1475 }
1476
1477 if ((hd->nodeid == our_nodeid) && (fr->result != -ECANCELED))
1478 node->fence_result_wait = 0;
1479
1480 now = monotime();
1481
1482 log_error("fence status %d receive %d from %d walltime %llu local %llu",
1483 fr->nodeid, fr->result, hd->nodeid,
1484 (unsigned long long)fr->fence_walltime,
1485 (unsigned long long)now);
1486
1487 if (!fr->result || (fr->result == -ECANCELED)) {
1488 node->need_fencing = 0;
1489 node->delay_fencing = 0;
1490 node->fence_walltime = fr->fence_walltime;
1491 node->fence_monotime = now;
1492 node->fence_actor_done = hd->nodeid;
1493 } else {
1494 /* causes the next lowest nodeid to request fencing */
1495 clear_fence_actor(fr->nodeid, hd->nodeid);
1496 }
1497
1498 if ((fr->result == -ECANCELED) && node->fence_pid_wait && node->fence_pid) {
1499 fence_pid_cancel(node->nodeid, node->fence_pid);
1500
1501 node->fence_pid_wait = 0;
1502 node->fence_pid = 0;
1503 daemon_fence_pid = 0;
1504 }
1505 }
1506
1507 static void send_fence_result(int nodeid, int result, uint32_t flags, uint64_t walltime)
1508 {
1509 struct dlm_header *hd;
1510 struct fence_result *fr;
1511 char *buf;
1512 int len;
1513
1514 len = sizeof(struct dlm_header) + sizeof(struct fence_result);
1515 buf = malloc(len);
1516 if (!buf) {
1517 log_error("send_fence_result no mem %d", len);
1518 return;
1519 }
1520 memset(buf, 0, len);
1521
1522 hd = (struct dlm_header *)buf;
1523 fr = (struct fence_result *)(buf + sizeof(*hd));
1524
1525 hd->type = cpu_to_le16(DLM_MSG_FENCE_RESULT);
1526 hd->nodeid = cpu_to_le32(our_nodeid);
1527
1528 fr->flags = cpu_to_le32(flags);
1529 fr->nodeid = cpu_to_le32(nodeid);
1530 fr->result = cpu_to_le32(result);
1531 fr->fence_walltime = cpu_to_le64(walltime);
1532
1533 _send_message(cpg_handle_daemon, buf, len, DLM_MSG_FENCE_RESULT);
1534 }
1535
/*
 * Manually acknowledge fencing of nodeid: broadcast a fence result of
 * -ECANCELED with the current wall time.  receive_fence_result()
 * treats -ECANCELED like success (clears need_fencing) and cancels
 * any fence agent still running for the node.
 */
void fence_ack_node(int nodeid)
{
	send_fence_result(nodeid, -ECANCELED, 0, time(NULL));
}
1540
/*
 * Set the PV_STATEFUL flag in our running daemon protocol version.
 * The flag travels in proto messages and is used by receive_protocol()
 * to detect a stateful merge after a partition.
 */
void set_protocol_stateful(void)
{
	our_protocol.dr_ver.flags |= PV_STATEFUL;
}
1545
1546 static void pv_in(struct protocol_version *pv)
1547 {
1548 pv->major = le16_to_cpu(pv->major);
1549 pv->minor = le16_to_cpu(pv->minor);
1550 pv->patch = le16_to_cpu(pv->patch);
1551 pv->flags = le16_to_cpu(pv->flags);
1552 }
1553
1554 static void pv_out(struct protocol_version *pv)
1555 {
1556 pv->major = cpu_to_le16(pv->major);
1557 pv->minor = cpu_to_le16(pv->minor);
1558 pv->patch = cpu_to_le16(pv->patch);
1559 pv->flags = cpu_to_le16(pv->flags);
1560 }
1561
1562 static void protocol_in(struct protocol *proto)
1563 {
1564 pv_in(&proto->dm_ver);
1565 pv_in(&proto->km_ver);
1566 pv_in(&proto->dr_ver);
1567 pv_in(&proto->kr_ver);
1568 }
1569
1570 static void protocol_out(struct protocol *proto)
1571 {
1572 pv_out(&proto->dm_ver);
1573 pv_out(&proto->km_ver);
1574 pv_out(&proto->dr_ver);
1575 pv_out(&proto->kr_ver);
1576 }
1577
1578 /* go through member list saved in last confchg, see if we have received a
1579 proto message from each */
1580
1581 static int all_protocol_messages(void)
1582 {
1583 struct node_daemon *node;
1584 int i;
1585
1586 if (!daemon_member_count)
1587 return 0;
1588
1589 for (i = 0; i < daemon_member_count; i++) {
1590 node = get_node_daemon(daemon_member[i].nodeid);
1591 if (!node) {
1592 log_error("all_protocol_messages no node %d",
1593 daemon_member[i].nodeid);
1594 return 0;
1595 }
1596
1597 if (!node->proto.daemon_max[0])
1598 return 0;
1599 }
1600 return 1;
1601 }
1602
/*
 * Select the lowest common protocol versions across all daemon cpg
 * members, separately for the daemon (mind) and kernel (mink)
 * versions: pick the minimum major, then the minimum minor among
 * nodes at that major, then the minimum patch among nodes at that
 * major.minor.  The chosen versions are written to proto->daemon_run
 * and proto->kernel_run.  Returns 0 on success, -1 if a member has no
 * node struct or any chosen version component resolves to zero
 * (zero is an invalid version value).
 */
static int pick_min_protocol(struct protocol *proto)
{
	uint16_t mind[4];
	uint16_t mink[4];
	struct node_daemon *node;
	int i;

	memset(&mind, 0, sizeof(mind));
	memset(&mink, 0, sizeof(mink));

	/* first choose the minimum major */

	for (i = 0; i < daemon_member_count; i++) {
		node = get_node_daemon(daemon_member[i].nodeid);
		if (!node) {
			log_error("pick_min_protocol no node %d",
				  daemon_member[i].nodeid);
			return -1;
		}

		/* zero means "not set yet", so take any value first */
		if (!mind[0] || node->proto.daemon_max[0] < mind[0])
			mind[0] = node->proto.daemon_max[0];

		if (!mink[0] || node->proto.kernel_max[0] < mink[0])
			mink[0] = node->proto.kernel_max[0];
	}

	if (!mind[0] || !mink[0]) {
		log_error("pick_min_protocol zero major number");
		return -1;
	}

	/* second pick the minimum minor with the chosen major */

	for (i = 0; i < daemon_member_count; i++) {
		node = get_node_daemon(daemon_member[i].nodeid);
		/* defensive: the first pass already returned -1 if a
		   member's node struct was missing */
		if (!node)
			continue;

		if (mind[0] == node->proto.daemon_max[0]) {
			if (!mind[1] || node->proto.daemon_max[1] < mind[1])
				mind[1] = node->proto.daemon_max[1];
		}

		if (mink[0] == node->proto.kernel_max[0]) {
			if (!mink[1] || node->proto.kernel_max[1] < mink[1])
				mink[1] = node->proto.kernel_max[1];
		}
	}

	if (!mind[1] || !mink[1]) {
		log_error("pick_min_protocol zero minor number");
		return -1;
	}

	/* third pick the minimum patch with the chosen major.minor */

	for (i = 0; i < daemon_member_count; i++) {
		node = get_node_daemon(daemon_member[i].nodeid);
		if (!node)
			continue;

		if (mind[0] == node->proto.daemon_max[0] &&
		    mind[1] == node->proto.daemon_max[1]) {
			if (!mind[2] || node->proto.daemon_max[2] < mind[2])
				mind[2] = node->proto.daemon_max[2];
		}

		if (mink[0] == node->proto.kernel_max[0] &&
		    mink[1] == node->proto.kernel_max[1]) {
			if (!mink[2] || node->proto.kernel_max[2] < mink[2])
				mink[2] = node->proto.kernel_max[2];
		}
	}

	if (!mind[2] || !mink[2]) {
		log_error("pick_min_protocol zero patch number");
		return -1;
	}

	memcpy(&proto->daemon_run, &mind, sizeof(mind));
	memcpy(&proto->kernel_run, &mink, sizeof(mink));
	return 0;
}
1687
1688 static void receive_protocol(struct dlm_header *hd, int len)
1689 {
1690 struct protocol *p;
1691 struct node_daemon *node;
1692 int new = 0;
1693
1694 p = (struct protocol *)((char *)hd + sizeof(struct dlm_header));
1695 protocol_in(p);
1696
1697 if (len < sizeof(struct dlm_header) + sizeof(struct protocol)) {
1698 log_error("receive_protocol invalid len %d from %d",
1699 len, hd->nodeid);
1700 return;
1701 }
1702
1703 /* zero is an invalid version value */
1704
1705 if (!p->daemon_max[0] || !p->daemon_max[1] || !p->daemon_max[2] ||
1706 !p->kernel_max[0] || !p->kernel_max[1] || !p->kernel_max[2]) {
1707 log_error("receive_protocol invalid max value from %d "
1708 "daemon %u.%u.%u kernel %u.%u.%u", hd->nodeid,
1709 p->daemon_max[0], p->daemon_max[1], p->daemon_max[2],
1710 p->kernel_max[0], p->kernel_max[1], p->kernel_max[2]);
1711 return;
1712 }
1713
1714 /* the run values will be zero until a version is set, after
1715 which none of the run values can be zero */
1716
1717 if (p->daemon_run[0] && (!p->daemon_run[1] || !p->daemon_run[2] ||
1718 !p->kernel_run[0] || !p->kernel_run[1] || !p->kernel_run[2])) {
1719 log_error("receive_protocol invalid run value from %d "
1720 "daemon %u.%u.%u kernel %u.%u.%u", hd->nodeid,
1721 p->daemon_run[0], p->daemon_run[1], p->daemon_run[2],
1722 p->kernel_run[0], p->kernel_run[1], p->kernel_run[2]);
1723 return;
1724 }
1725
1726 /* save this node's proto so we can tell when we've got all, and
1727 use it to select a minimum protocol from all */
1728
1729 node = get_node_daemon(hd->nodeid);
1730 if (!node) {
1731 log_error("receive_protocol no node %d", hd->nodeid);
1732 return;
1733 }
1734
1735 if (!node->daemon_member) {
1736 log_error("receive_protocol node %d not member", hd->nodeid);
1737 return;
1738 }
1739
1740 log_debug("receive_protocol %d max %u.%u.%u.%x run %u.%u.%u.%x",
1741 hd->nodeid,
1742 p->daemon_max[0], p->daemon_max[1],
1743 p->daemon_max[2], p->daemon_max[3],
1744 p->daemon_run[0], p->daemon_run[1],
1745 p->daemon_run[2], p->daemon_run[3]);
1746
1747 if (memcmp(&node->proto, p, sizeof(struct protocol))) {
1748 log_debug("daemon node %d prot max %u.%u.%u.%x run %u.%u.%u.%x",
1749 hd->nodeid,
1750 node->proto.daemon_max[0], node->proto.daemon_max[1],
1751 node->proto.daemon_max[2], node->proto.daemon_max[3],
1752 node->proto.daemon_run[0], node->proto.daemon_run[1],
1753 node->proto.daemon_run[2], node->proto.daemon_run[3]);
1754 new = 1;
1755 }
1756
1757 /* checking zero node->daemon_max[0] is a way to tell if we've received
1758 an acceptable (non-stateful) proto message from the node since we
1759 saw it join the daemon cpg */
1760
1761 if (hd->nodeid != our_nodeid &&
1762 !node->proto.daemon_max[0] &&
1763 (p->dr_ver.flags & PV_STATEFUL) &&
1764 (our_protocol.dr_ver.flags & PV_STATEFUL)) {
1765
1766 log_error("daemon node %d stateful merge", hd->nodeid);
1767 log_debug("daemon node %d join %llu left %llu local quorum %llu killed %d",
1768 hd->nodeid,
1769 (unsigned long long)node->daemon_add_time,
1770 (unsigned long long)node->daemon_rem_time,
1771 (unsigned long long)cluster_quorate_monotime,
1772 node->killed);
1773
1774 node->stateful_merge = 1;
1775
1776 if (cluster_quorate && node->daemon_rem_time &&
1777 cluster_quorate_monotime < node->daemon_rem_time) {
1778 if (!node->killed) {
1779 if (cluster_two_node) {
1780 /*
1781 * When there are two nodes and two_node mode
1782 * is used, both will have quorum throughout
1783 * the partition and subsequent stateful merge.
1784 *
1785 * - both will race to fence each other in
1786 * response to the partition
1787 *
1788 * - both can attempt to kill the cluster
1789 * on the other in response to the stateful
1790 * merge here
1791 *
1792 * - we don't want both nodes to kill the cluster
1793 * on the other, which can happen if the merge
1794 * occurs before power fencing is successful,
1795 * or can happen before/during/after storage
1796 * fencing
1797 *
1798 * - if nodeA successfully fences nodeB (due
1799 * to the partition), we want nodeA to kill
1800 * the cluster on nodeB in response to the
1801 * merge (we don't want nodeB to kill nodeA
1802 * in response to the merge).
1803 *
1804 * So, a node that has successfully fenced the
1805 * other will kill the cluster on it. If fencing
1806 * is still running, we wait until it's
1807 * successfull to kill the cluster on the node
1808 * being fenced.
1809 */
1810 if (nodeid_needs_fencing(hd->nodeid)) {
1811 /* when fencing completes successfully,
1812 we'll see the node is a daemon member
1813 and kill it */
1814 log_debug("daemon node %d delay kill for stateful merge", hd->nodeid);
1815 } else {
1816 log_error("daemon node %d kill due to stateful merge", hd->nodeid);
1817 kick_node_from_cluster(hd->nodeid);
1818 }
1819 } else {
1820 log_error("daemon node %d kill due to stateful merge", hd->nodeid);
1821 kick_node_from_cluster(hd->nodeid);
1822 }
1823 }
1824 node->killed = 1;
1825 }
1826
1827 /* don't save p->proto into node->proto; we need to come
1828 through here based on zero daemon_max[0] for other proto
1829 messages like this one from the same node */
1830
1831 return;
1832 }
1833
1834 if (new) {
1835 memcpy(&node->proto, p, sizeof(struct protocol));
1836
1837 log_debug("daemon node %d save max %u.%u.%u.%x run %u.%u.%u.%x",
1838 node->nodeid,
1839 node->proto.daemon_max[0], node->proto.daemon_max[1],
1840 node->proto.daemon_max[2], node->proto.daemon_max[3],
1841 node->proto.daemon_run[0], node->proto.daemon_run[1],
1842 node->proto.daemon_run[2], node->proto.daemon_run[3]);
1843 }
1844
1845 /* if we have zero run values, and this msg has non-zero run values,
1846 then adopt them as ours; otherwise save this proto message */
1847
1848 if (our_protocol.daemon_run[0])
1849 return;
1850
1851 if (p->daemon_run[0]) {
1852 our_protocol.daemon_run[0] = p->daemon_run[0];
1853 our_protocol.daemon_run[1] = p->daemon_run[1];
1854 our_protocol.daemon_run[2] = p->daemon_run[2];
1855
1856 our_protocol.kernel_run[0] = p->kernel_run[0];
1857 our_protocol.kernel_run[1] = p->kernel_run[1];
1858 our_protocol.kernel_run[2] = p->kernel_run[2];
1859
1860 log_debug("run protocol from nodeid %d", hd->nodeid);
1861 }
1862 }
1863
1864 static void send_protocol(struct protocol *proto)
1865 {
1866 struct dlm_header *hd;
1867 struct protocol *pr;
1868 char *buf;
1869 int len;
1870
1871 len = sizeof(struct dlm_header) + sizeof(struct protocol);
1872 buf = malloc(len);
1873 if (!buf) {
1874 log_error("send_protocol no mem %d", len);
1875 return;
1876 }
1877 memset(buf, 0, len);
1878
1879 hd = (struct dlm_header *)buf;
1880 pr = (struct protocol *)(buf + sizeof(*hd));
1881
1882 hd->type = cpu_to_le16(DLM_MSG_PROTOCOL);
1883 hd->nodeid = cpu_to_le32(our_nodeid);
1884
1885 memcpy(pr, proto, sizeof(struct protocol));
1886 protocol_out(pr);
1887
1888 _send_message(cpg_handle_daemon, buf, len, DLM_MSG_PROTOCOL);
1889 free(buf);
1890 }
1891
/*
 * Block until a run protocol has been agreed across the daemon cpg.
 * Once proto messages have arrived from all members, propose the
 * minimum common versions; keep dispatching daemon cpg events until
 * receive_protocol() has set our run protocol, then sanity-check it
 * against our max versions and re-broadcast our protocol.
 * Returns 0 on success, -1 on poll error, daemon_quit, or an
 * incompatible protocol.
 */
int set_protocol(void)
{
	struct protocol proto;
	struct pollfd pollfd;
	cs_error_t error;
	int sent_proposal = 0;
	int rv;

	memset(&pollfd, 0, sizeof(pollfd));
	pollfd.fd = cpg_fd_daemon;
	pollfd.events = POLLIN;

	while (1) {
		/* daemon_run[0] becomes non-zero once a run protocol has
		   been adopted (see receive_protocol) */
		if (our_protocol.daemon_run[0])
			break;

		if (!sent_proposal && all_protocol_messages()) {
			/* propose a protocol; look through info from all
			   nodes and pick the min for both daemon and kernel,
			   and propose that */

			sent_proposal = 1;

			/* copy our max values */
			memcpy(&proto, &our_protocol, sizeof(struct protocol));

			rv = pick_min_protocol(&proto);
			if (rv < 0)
				return rv;

			log_debug("set_protocol member_count %d propose "
				  "daemon %u.%u.%u kernel %u.%u.%u",
				  daemon_member_count,
				  proto.daemon_run[0], proto.daemon_run[1],
				  proto.daemon_run[2], proto.kernel_run[0],
				  proto.kernel_run[1], proto.kernel_run[2]);

			send_protocol(&proto);
		}

		/* only process messages/events from daemon cpg until protocol
		   is established */

		rv = poll(&pollfd, 1, -1);
		if (rv == -1 && errno == EINTR) {
			if (daemon_quit)
				return -1;
			continue;
		}
		if (rv < 0) {
			log_error("set_protocol poll errno %d", errno);
			return -1;
		}

		if (pollfd.revents & POLLIN) {
			/*
			 * don't use process_cpg_daemon() because we only want to
			 * dispatch one thing at a time because we only want to
			 * handling protocol related things here.
			 */

			error = cpg_dispatch(cpg_handle_daemon, CS_DISPATCH_ONE);
			if (error != CS_OK && error != CS_ERR_BAD_HANDLE)
				log_error("daemon cpg_dispatch one error %d", error);
		}
		if (pollfd.revents & (POLLERR | POLLHUP | POLLNVAL)) {
			log_error("set_protocol poll revents %u",
				  pollfd.revents);
			return -1;
		}
	}

	/* the agreed run major must equal ours, and the run minor must
	   not exceed ours, for both daemon and kernel protocols */

	if (our_protocol.daemon_run[0] != our_protocol.daemon_max[0] ||
	    our_protocol.daemon_run[1] > our_protocol.daemon_max[1]) {
		log_error("incompatible daemon protocol run %u.%u.%u max %u.%u.%u",
			  our_protocol.daemon_run[0],
			  our_protocol.daemon_run[1],
			  our_protocol.daemon_run[2],
			  our_protocol.daemon_max[0],
			  our_protocol.daemon_max[1],
			  our_protocol.daemon_max[2]);
		return -1;
	}

	if (our_protocol.kernel_run[0] != our_protocol.kernel_max[0] ||
	    our_protocol.kernel_run[1] > our_protocol.kernel_max[1]) {
		log_error("incompatible kernel protocol run %u.%u.%u max %u.%u.%u",
			  our_protocol.kernel_run[0],
			  our_protocol.kernel_run[1],
			  our_protocol.kernel_run[2],
			  our_protocol.kernel_max[0],
			  our_protocol.kernel_max[1],
			  our_protocol.kernel_max[2]);
		return -1;
	}

	log_debug("daemon run %u.%u.%u max %u.%u.%u "
		  "kernel run %u.%u.%u max %u.%u.%u",
		  our_protocol.daemon_run[0],
		  our_protocol.daemon_run[1],
		  our_protocol.daemon_run[2],
		  our_protocol.daemon_max[0],
		  our_protocol.daemon_max[1],
		  our_protocol.daemon_max[2],
		  our_protocol.kernel_run[0],
		  our_protocol.kernel_run[1],
		  our_protocol.kernel_run[2],
		  our_protocol.kernel_max[0],
		  our_protocol.kernel_max[1],
		  our_protocol.kernel_max[2]);

	send_protocol(&our_protocol);
	return 0;
}
2006
2007 static void deliver_cb_daemon(cpg_handle_t handle,
2008 const struct cpg_name *group_name,
2009 uint32_t nodeid, uint32_t pid,
2010 void *data, size_t len)
2011 {
2012 struct dlm_header *hd;
2013
2014 if (len < sizeof(*hd)) {
2015 log_error("deliver_cb short message %zd", len);
2016 return;
2017 }
2018
2019 hd = (struct dlm_header *)data;
2020 dlm_header_in(hd);
2021
2022 if (!daemon_fence_allow && hd->type != DLM_MSG_PROTOCOL) {
2023 /* don't think this will happen; if it does we may
2024 need to verify that it's correct to ignore these
2025 messages instead of saving them to process after
2026 allow is set */
2027 log_debug("deliver_cb_daemon ignore non proto msg %d", hd->type);
2028 return;
2029 }
2030
2031 switch (hd->type) {
2032 case DLM_MSG_PROTOCOL:
2033 receive_protocol(hd, len);
2034 break;
2035 case DLM_MSG_FENCE_RESULT:
2036 receive_fence_result(hd, len);
2037 break;
2038 case DLM_MSG_FENCE_CLEAR:
2039 receive_fence_clear(hd, len);
2040 break;
2041 case DLM_MSG_RUN_REQUEST:
2042 receive_run_request(hd, len);
2043 break;
2044 case DLM_MSG_RUN_REPLY:
2045 receive_run_reply(hd, len);
2046 break;
2047 default:
2048 log_error("deliver_cb_daemon unknown msg type %d", hd->type);
2049 }
2050
2051 daemon_fence_work();
2052 }
2053
/*
 * Handle a DLM_MSG_RUN_REPLY message: on the starting node, record the
 * sender's local result for the matching run operation and update the
 * reply counters.  Always returns 0.
 */
int receive_run_reply(struct dlm_header *hd, int len)
{
	struct run_reply *rep = (struct run_reply *)hd;
	struct run *run;
	int i;

	run_reply_in(rep);

	log_debug("receive_run_reply %s from %d result %d",
		  rep->uuid, hd->nodeid, rep->info.local_result);

	if (!opt(enable_helper_ind)) {
		log_debug("receive_run_reply %s helper not enabled", rep->uuid);
		return 0;
	}

	run = find_run(rep->uuid);
	if (!run) {
		log_debug("receive_run_reply no uuid %s", rep->uuid);
		return 0;
	}

	/*
	 * Only the starting node keeps track of results.
	 */
	if (run->info.start_nodeid != our_nodeid)
		return 0;

	/* a malformed reply is still counted so need_replies can reach
	   zero, but no result is recorded for the sender */
	if (len != sizeof(struct run_reply)) {
		log_debug("receive_run_reply %s bad len %d expect %zu",
			  rep->uuid, len, sizeof(struct run_reply));
		run->info.reply_count++;
		run->info.need_replies--;
		return 0;
	}

	/* record the sender's result in its node_results slot */
	for (i = 0; i < run->node_count; i++) {
		if (run->node_results[i].nodeid != hd->nodeid)
			continue;

		/* shouldn't happen? duplicate reply from this node */
		if (run->node_results[i].replied)
			break;

		run->node_results[i].result = rep->info.local_result;
		run->node_results[i].replied = 1;

		if (rep->info.local_result)
			run->info.fail_count++;

		run->info.reply_count++;
		run->info.need_replies--;

		/*
		log_debug("run reply_count %d need_replies %d fail_count %d",
			  run->info.reply_count, run->info.need_replies, run->info.fail_count);
		*/
		break;
	}

	return 0;
}
2116
/*
 * Handle a DLM_MSG_RUN_REQUEST message: on the starting node, forward
 * the request to the helper (if it asked to receive it itself);
 * on other nodes, create a run struct tracking the operation and
 * forward the request to the helper.  Always returns 0.
 */
int receive_run_request(struct dlm_header *hd, int len)
{
	struct run_request *req = (struct run_request *)hd;
	struct run *run = NULL;

	run_request_in(req);

	log_debug("receive_run_request %s from %d", req->uuid, hd->nodeid);

	if (len != sizeof(struct run_request)) {
		log_debug("receive_run_request %s bad len %d", req->uuid, len);
		/* todo: send reply with failed */
		return 0;
	}

	/* a non-zero dest_nodeid restricts the run to that one node */
	if (req->info.dest_nodeid && (req->info.dest_nodeid != our_nodeid))
		return 0;

	if (req->info.start_nodeid == our_nodeid) {
		if (!(req->info.flags & DLMC_FLAG_RUN_START_NODE_RECV)) {
			log_debug("receive_run_request ignore self");
			return 0;
		}

		if (!opt(enable_helper_ind)) {
			log_debug("receive_run_request %s helper not enabled", req->uuid);
			return 0;
		}

		run = find_run(req->uuid);
		if (!run) {
			log_debug("receive_run_request from self no uuid %s", req->uuid);
			return 0;
		}

		log_debug("receive_run_request %s to helper", req->uuid);

		send_helper_run_request(req);
		return 0;
	}

	/*
	 * NOTE(review): run is always NULL here (it is only assigned in
	 * the start_nodeid == our_nodeid branch above, which returns),
	 * so this condition can never be true and the helper-not-enabled
	 * handling below never executes on non-starting nodes — confirm
	 * the intended behavior.
	 */
	if (!opt(enable_helper_ind) && run) {
		log_debug("receive_run_request %s helper not enabled", req->uuid);
		run->info.reply_count++;
		run->info.need_replies--;
		/* todo: send reply with failed */
		return 0;
	}

	if (!(run = malloc(sizeof(struct run)))) {
		log_error("receive_run_request %s no mem", req->uuid);
		/* todo: send reply with failed */
		return 0;
	}

	memset(run, 0, sizeof(struct run));

	/* track this operation locally until our reply is sent */
	memcpy(run->uuid, req->uuid, RUN_UUID_LEN);
	memcpy(run->command, req->command, RUN_COMMAND_LEN);
	run->info.start_nodeid = req->info.start_nodeid;
	run->info.dest_nodeid = req->info.dest_nodeid;
	run->info.flags = req->info.flags;

	list_add(&run->list, &run_ops);

	log_error("run request %s %.128s", run->uuid, run->command);

	log_debug("receive_run_request %s to helper", req->uuid);

	send_helper_run_request(req);
	/* todo: if no helper, send reply with failed */

	return 0;
}
2191
2192 int send_run_request(struct run *run, struct run_request *req)
2193 {
2194 struct node_daemon *node;
2195 int i = 0;
2196 int rv;
2197
2198 list_for_each_entry(node, &daemon_nodes, list) {
2199 if (!node->daemon_member)
2200 continue;
2201
2202 /*
2203 * When this starting node does not run the command,
2204 * there is no reply for our nodeid.
2205 */
2206 if ((node->nodeid == our_nodeid) &&
2207 (run->info.flags & DLMC_FLAG_RUN_START_NODE_NONE))
2208 continue;
2209
2210 /*
2211 * The command is only run on one specific node, and
2212 * only a reply from that node is needed.
2213 */
2214 if (run->info.dest_nodeid && (node->nodeid != run->info.dest_nodeid))
2215 continue;
2216
2217 run->node_count++;
2218 run->node_results[i].nodeid = node->nodeid;
2219 i++;
2220 }
2221
2222 run->info.need_replies = run->node_count;
2223
2224 log_debug("send_run_request %s for %d nodes", req->uuid, run->node_count);
2225
2226 run_request_out(req);
2227
2228 rv = dlm_send_message_daemon((char *)req, sizeof(struct run_request));
2229
2230 return rv;
2231 }
2232
2233 int send_run_reply(struct run *run, struct run_reply *rep)
2234 {
2235 int rv;
2236
2237 log_debug("send_run_reply %s result %d", rep->uuid, rep->info.local_result);
2238
2239 run_reply_out(rep);
2240
2241 rv = dlm_send_message_daemon((char *)rep, sizeof(struct run_reply));
2242
2243 /*
2244 * If we are not the starting node, clear the run operation.
2245 */
2246 if (rep->info.start_nodeid != our_nodeid)
2247 clear_run(run);
2248
2249 return rv;
2250 }
2251
/*
 * Daemon cpg configuration-change callback.
 *
 * Records the new member/joined/left lists into the daemon_member,
 * daemon_joined and daemon_remove arrays, updates per-node daemon
 * membership state, and sets up fencing work for nodes that left
 * due to nodedown/procdown.  Ends by driving daemon_fence_work().
 */
static void confchg_cb_daemon(cpg_handle_t handle,
			      const struct cpg_name *group_name,
			      const struct cpg_address *member_list,
			      size_t member_list_entries,
			      const struct cpg_address *left_list,
			      size_t left_list_entries,
			      const struct cpg_address *joined_list,
			      size_t joined_list_entries)
{
	struct node_daemon *node;
	uint64_t now, now_wall;
	int nodedown = 0, procdown = 0, leave = 0;
	int check_joined_count = 0, check_remove_count = 0, check_member_count = 0;
	int we_joined = 0;
	int i, reason, low;

	now = monotime();
	now_wall = time(NULL);

	log_config(group_name, member_list, member_list_entries,
		   left_list, left_list_entries,
		   joined_list, joined_list_entries);

	/* snapshot the current member list */
	memset(&daemon_member, 0, sizeof(daemon_member));
	daemon_member_count = member_list_entries;

	for (i = 0; i < member_list_entries; i++) {
		daemon_member[i] = member_list[i];
		/* add struct for nodes we've not seen before */
		add_node_daemon(member_list[i].nodeid);
	}

	/* snapshot the joined list and detect our own join */
	memset(&daemon_joined, 0, sizeof(daemon_joined));
	daemon_joined_count = joined_list_entries;

	for (i = 0; i < joined_list_entries; i++) {
		daemon_joined[i] = joined_list[i];
		if (joined_list[i].nodeid == our_nodeid)
			we_joined = 1;
	}

	/* snapshot the left list and tally leave reasons */
	memset(&daemon_remove, 0, sizeof(daemon_remove));
	daemon_remove_count = left_list_entries;

	for (i = 0; i < left_list_entries; i++) {
		daemon_remove[i] = left_list[i];

		if (left_list[i].reason == CPG_REASON_NODEDOWN)
			nodedown++;
		else if (left_list[i].reason == CPG_REASON_PROCDOWN)
			procdown++;
		else if (left_list[i].reason == CPG_REASON_LEAVE)
			leave++;
	}

	if (nodedown || procdown || leave)
		log_debug("%s left reason nodedown %d procdown %d leave %d",
			  group_name->value, nodedown, procdown, leave);

	/* a node failure means we must wait for a new totem ring id
	   before fencing can proceed (cleared in totem_cb_daemon) */
	if (nodedown)
		daemon_ringid_wait = 1;

	if (joined_list_entries)
		send_protocol(&our_protocol);

	/* reconcile per-node daemon_member state with the new member list */
	list_for_each_entry(node, &daemon_nodes, list) {
		if (in_daemon_list(node->nodeid, daemon_member, daemon_member_count)) {
			if (node->daemon_member)
				continue;

			check_joined_count++;

			/* node joined daemon cpg */
			node->daemon_member = 1;
			node->daemon_add_time = now;

			fence_delay_begin = now;
			last_join_seq++;

			/* a joining node shows prev members in joined list */
			if (!we_joined)
				node->need_fence_clear = FR_CLEAR_STARTUP|FR_CLEAR_FIPU;

			if (node->need_fencing) {
				/* need_fencing will be cleared if we accept a
				   valid proto from it (is_clean_daemon_member) */
				log_error("daemon joined %d needs fencing", node->nodeid);
			} else {
				log_debug("daemon joined %d", node->nodeid);
			}
		} else {
			if (!node->daemon_member)
				continue;

			check_remove_count++;

			/* node left daemon cpg */
			node->daemon_member = 0;
			node->daemon_rem_time = now;
			node->killed = 0;
			node->stateful_merge = 0;

			/* If we never accepted a valid proto from this node,
			   then it never fully joined and there's no need to
			   recover it.  Similary, node_history_lockspace_fail
			   only sets need_fencing in the lockspace if
			   node->start_time was non-zero. */

			if (node->proto.daemon_max[0]) {
				/* tell loop below to look at this node */
				node->recover_setup = 1;
			} else {
				log_debug("daemon remove %d no proto skip recover", node->nodeid);
			}

			memset(&node->proto, 0, sizeof(struct protocol));
		}
	}

	list_for_each_entry(node, &daemon_nodes, list) {
		if (node->daemon_member)
			check_member_count++;
	}

	/* when we join, all previous members look like they are joining */
	if (!we_joined &&
	    (daemon_joined_count != check_joined_count ||
	     daemon_remove_count != check_remove_count ||
	     daemon_member_count != check_member_count)) {
		log_error("daemon counts joined %d check %d remove %d check %d member %d check %d",
			  daemon_joined_count, check_joined_count,
			  daemon_remove_count, check_remove_count,
			  daemon_member_count, check_member_count);
	}

	/* set up recovery work for nodes that just failed (recover_setup set above) */

	list_for_each_entry(node, &daemon_nodes, list) {
		if (!node->recover_setup)
			continue;

		node->recover_setup = 0;
		reason = 0;
		low = 0;

		if (!opt(enable_fencing_ind))
			continue;

		if (node->need_fencing) {
			log_error("daemon remove %d already needs fencing", node->nodeid);
			continue;
		}

		/* find why this node left the cpg */
		for (i = 0; i < left_list_entries; i++) {
			if (left_list[i].nodeid != node->nodeid)
				continue;
			reason = left_list[i].reason;
			break;
		}

		/* only an unclean exit (node or process failure) needs fencing;
		   a clean leave does not */
		if (reason == CPG_REASON_NODEDOWN || reason == CPG_REASON_PROCDOWN) {
			if (node->fence_pid_wait || node->fence_pid) {
				/* sanity check, should never happen */
				log_error("daemon remove %d pid_wait %d pid %d",
					  node->nodeid, node->fence_pid_wait, node->fence_pid);
			}

			/* reset all fencing state for a fresh fencing attempt */
			node->need_fencing = 1;
			node->delay_fencing = 0;
			node->fence_monotime = 0;
			node->fence_walltime = 0;
			node->fence_actor_last = 0;
			node->fence_actor_done = 0;
			node->fence_pid_wait = 0;
			node->fence_pid = 0;
			node->fence_result_wait = 0;
			node->fence_config.pos = 0;
			node->left_reason = reason;
			node->fail_monotime = now;
			node->fail_walltime = now_wall;
			low = set_fence_actors(node, 0);
		}

		log_debug("daemon remove %d %s need_fencing %d low %d",
			  node->nodeid, reason_str(reason), node->need_fencing, low);
	}

	daemon_fence_work();
}
2441
2442 static void totem_cb_daemon(cpg_handle_t handle,
2443 struct cpg_ring_id ring_id,
2444 uint32_t member_list_entries,
2445 const uint32_t *member_list)
2446 {
2447 daemon_ringid.nodeid = ring_id.nodeid;
2448 daemon_ringid.seq = ring_id.seq;
2449 daemon_ringid_wait = 0;
2450
2451 log_ringid("dlm:controld", &ring_id, member_list, member_list_entries);
2452
2453 daemon_fence_work();
2454 }
2455
/* cpg model v1 callbacks for the dlm:controld cpg; the initial totem
   configuration is delivered so daemon_ringid is valid from startup */
static cpg_model_v1_data_t cpg_callbacks_daemon = {
	.cpg_deliver_fn = deliver_cb_daemon,
	.cpg_confchg_fn = confchg_cb_daemon,
	.cpg_totem_confchg_fn = totem_cb_daemon,
	.flags = CPG_MODEL_V1_DELIVER_INITIAL_TOTEM_CONF,
};
2462
2463 void process_cpg_daemon(int ci)
2464 {
2465 cs_error_t error;
2466
2467 error = cpg_dispatch(cpg_handle_daemon, CS_DISPATCH_ALL);
2468 if (error != CS_OK && error != CS_ERR_BAD_HANDLE)
2469 log_error("daemon cpg_dispatch error %d", error);
2470 }
2471
2472 int setup_cpg_daemon(void)
2473 {
2474 cs_error_t error;
2475 struct cpg_name name;
2476 int i = 0;
2477
2478 /* daemon 1.1.1 was cluster3/STABLE3/RHEL6 which is incompatible
2479 with cluster4/RHEL7 */
2480
2481 memset(&our_protocol, 0, sizeof(our_protocol));
2482
2483 if (opt(enable_fscontrol_ind))
2484 our_protocol.daemon_max[0] = 2;
2485 else
2486 our_protocol.daemon_max[0] = 3;
2487
2488 our_protocol.daemon_max[1] = 1;
2489 our_protocol.daemon_max[2] = 1;
2490 our_protocol.kernel_max[0] = 1;
2491 our_protocol.kernel_max[1] = 1;
2492 our_protocol.kernel_max[2] = 1;
2493
2494 error = cpg_model_initialize(&cpg_handle_daemon, CPG_MODEL_V1,
2495 (cpg_model_data_t *)&cpg_callbacks_daemon,
2496 NULL);
2497 if (error != CS_OK) {
2498 log_error("daemon cpg_initialize error %d", error);
2499 return -1;
2500 }
2501
2502 cpg_fd_get(cpg_handle_daemon, &cpg_fd_daemon);
2503
2504 memset(&name, 0, sizeof(name));
2505 sprintf(name.value, "dlm:controld");
2506 name.length = strlen(name.value) + 1;
2507
2508 log_debug("cpg_join %s ...", name.value);
2509 retry:
2510 error = cpg_join(cpg_handle_daemon, &name);
2511 if (error == CS_ERR_TRY_AGAIN) {
2512 sleep(1);
2513 if (!(++i % 10))
2514 log_error("daemon cpg_join error retrying");
2515 goto retry;
2516 }
2517 if (error != CS_OK) {
2518 log_error("daemon cpg_join error %d", error);
2519 goto fail;
2520 }
2521
2522 log_debug("setup_cpg_daemon %d", cpg_fd_daemon);
2523 return cpg_fd_daemon;
2524
2525 fail:
2526 cpg_finalize(cpg_handle_daemon);
2527 return -1;
2528 }
2529
2530 static void stop_lockspaces(void)
2531 {
2532 struct lockspace *ls;
2533
2534 list_for_each_entry(ls, &lockspaces, list) {
2535 cpg_stop_kernel(ls);
2536 }
2537 }
2538
2539 void close_cpg_daemon(void)
2540 {
2541 struct lockspace *ls;
2542 cs_error_t error;
2543 struct cpg_name name;
2544 int i = 0;
2545
2546 if (!cpg_handle_daemon) {
2547 stop_lockspaces();
2548 return;
2549 }
2550
2551 if (cluster_down)
2552 goto fin;
2553
2554 memset(&name, 0, sizeof(name));
2555 sprintf(name.value, "dlm:controld");
2556 name.length = strlen(name.value) + 1;
2557
2558 log_debug("cpg_leave %s ...", name.value);
2559 retry:
2560 error = cpg_leave(cpg_handle_daemon, &name);
2561 if (error == CS_ERR_TRY_AGAIN) {
2562 sleep(1);
2563 if (!(++i % 10))
2564 log_error("daemon cpg_leave error retrying");
2565 goto retry;
2566 }
2567 if (error != CS_OK)
2568 log_error("daemon cpg_leave error %d", error);
2569 fin:
2570 list_for_each_entry(ls, &lockspaces, list) {
2571 /* stop kernel ls lock activity before configfs cleanup */
2572 cpg_stop_kernel(ls);
2573 if (ls->cpg_handle)
2574 cpg_finalize(ls->cpg_handle);
2575 }
2576 cpg_finalize(cpg_handle_daemon);
2577 }
2578
2579 void init_daemon(void)
2580 {
2581 INIT_LIST_HEAD(&daemon_nodes);
2582 INIT_LIST_HEAD(&startup_nodes);
2583
2584 }
2585
/*
 * Format one node's daemon state as a space-separated key=value string
 * into str (caller provides a DLMC_STATE_MAXSTR buffer).  Returns the
 * string length including the terminating NUL, for use as str_len in
 * the dlmc_state header.
 */
static int print_state_daemon_node(struct node_daemon *node, char *str)
{
	snprintf(str, DLMC_STATE_MAXSTR-1,
		 "member=%d "
		 "killed=%d "
		 "left_reason=%s "
		 "need_fencing=%d "
		 "delay_fencing=%d "
		 "fence_pid=%d "
		 "fence_pid_wait=%d "
		 "fence_result_wait=%d "
		 "fence_actor_last=%d "
		 "fence_actor_done=%d "
		 "add_time=%llu "
		 "rem_time=%llu "
		 "fail_walltime=%llu "
		 "fail_monotime=%llu "
		 "fence_walltime=%llu "
		 "fence_monotime=%llu ",
		 node->daemon_member,
		 node->killed,
		 reason_str(node->left_reason),
		 node->need_fencing,
		 node->delay_fencing,
		 node->fence_pid,
		 node->fence_pid_wait,
		 node->fence_result_wait,
		 node->fence_actor_last,
		 node->fence_actor_done,
		 (unsigned long long)node->daemon_add_time,
		 (unsigned long long)node->daemon_rem_time,
		 (unsigned long long)node->fail_walltime,
		 (unsigned long long)node->fail_monotime,
		 (unsigned long long)node->fence_walltime,
		 (unsigned long long)node->fence_monotime);

	return strlen(str) + 1;
}
2624
2625 void send_state_daemon_nodes(int fd)
2626 {
2627 struct node_daemon *node;
2628 struct dlmc_state st;
2629 char str[DLMC_STATE_MAXSTR];
2630 int str_len;
2631
2632 list_for_each_entry(node, &daemon_nodes, list) {
2633 memset(&st, 0, sizeof(st));
2634 st.type = DLMC_STATE_DAEMON_NODE;
2635 st.nodeid = node->nodeid;
2636
2637 memset(str, 0, sizeof(str));
2638 str_len = print_state_daemon_node(node, str);
2639
2640 st.str_len = str_len;
2641
2642 send(fd, &st, sizeof(st), MSG_NOSIGNAL);
2643 if (str_len)
2644 send(fd, str, str_len, MSG_NOSIGNAL);
2645 }
2646 }
2647
2648 void send_state_startup_nodes(int fd)
2649 {
2650 struct node_daemon *node;
2651 struct dlmc_state st;
2652 char str[DLMC_STATE_MAXSTR];
2653 int str_len;
2654
|
(1) Event cond_true: |
Condition "!(&node->list == &startup_nodes)", taking true branch. |
2655 list_for_each_entry(node, &startup_nodes, list) {
2656 memset(&st, 0, sizeof(st));
2657 st.type = DLMC_STATE_STARTUP_NODE;
2658 st.nodeid = node->nodeid;
2659
2660 memset(str, 0, sizeof(str));
2661 str_len = print_state_daemon_node(node, str);
2662
2663 st.str_len = str_len;
2664
|
(2) Event check_return: |
Calling "send(fd, &st, 28UL, MSG_NOSIGNAL)" without checking return value. This library function may fail and return an error code. |
2665 send(fd, &st, sizeof(st), MSG_NOSIGNAL);
2666 if (str_len)
2667 send(fd, str, str_len, MSG_NOSIGNAL);
2668 }
2669 }
2670
/*
 * Format global daemon state as a space-separated key=value string
 * into str (caller provides a DLMC_STATE_MAXSTR buffer).  Returns the
 * string length including the terminating NUL, for use as str_len in
 * the dlmc_state header.
 */
static int print_state_daemon(char *str)
{
	snprintf(str, DLMC_STATE_MAXSTR-1,
		 "member_count=%d "
		 "joined_count=%d "
		 "remove_count=%d "
		 "daemon_ringid=%llu "
		 "cluster_ringid=%llu "
		 "quorate=%d "
		 "fence_pid=%d "
		 "fence_in_progress_unknown=%d "
		 "zombie_count=%d "
		 "monotime=%llu "
		 "stateful_merge_wait=%d ",
		 daemon_member_count,
		 daemon_joined_count,
		 daemon_remove_count,
		 (unsigned long long)daemon_ringid.seq,
		 (unsigned long long)cluster_ringid_seq,
		 cluster_quorate,
		 daemon_fence_pid,
		 fence_in_progress_unknown,
		 zombie_count,
		 (unsigned long long)monotime(),
		 stateful_merge_wait);

	return strlen(str) + 1;
}
2699
2700 void send_state_daemon(int fd)
2701 {
2702 struct dlmc_state st;
2703 char str[DLMC_STATE_MAXSTR];
2704 int str_len;
2705
2706 memset(&st, 0, sizeof(st));
2707 st.type = DLMC_STATE_DAEMON;
2708 st.nodeid = our_nodeid;
2709
2710 memset(str, 0, sizeof(str));
2711 str_len = print_state_daemon(str);
2712
2713 st.str_len = str_len;
2714
2715 send(fd, &st, sizeof(st), MSG_NOSIGNAL);
2716 if (str_len)
2717 send(fd, str, str_len, MSG_NOSIGNAL);
2718 }
2719
2720