1 #!@PYTHON@ -tt
2
3 import sys, re
4 import logging
5 import atexit
6 import time
7 sys.path.append("@FENCEAGENTSLIBDIR@")
8 from fencing import *
9 from fencing import fail, fail_usage, run_delay, EC_STATUS, SyslogLibHandler
10
11 import requests
12 from requests import HTTPError
13
14 try:
15 import boto3
16 from botocore.exceptions import ConnectionError, ClientError, EndpointConnectionError, NoRegionError, ParamValidationError
17 except ImportError:
18 pass
19
# Module-wide logger: the root logger, at INFO level, with the fencing
# library's SyslogLibHandler attached.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.propagate = False
logger.addHandler(SyslogLibHandler())

# Stop records from botocore's vendored packages propagating to the root logger.
logging.getLogger('botocore.vendored').propagate = False
25
# EC2 instance-state-name -> power state reported by this agent.
status = dict((
    ("running", "on"),
    ("stopped", "off"),
    ("pending", "unknown"),
    ("stopping", "off"),
    ("shutting-down", "off"),
    ("terminated", "off"),
))
34
# IMDSv2 endpoints. The timeout is a (connect, read) pair in seconds; the fence
# path must never block on an unreachable metadata service.
_IMDS_BASE_URL = "http://169.254.169.254/latest"
IMDS_TOKEN_URL = _IMDS_BASE_URL + "/api/token"
IMDS_META_URL = _IMDS_BASE_URL + "/meta-data"
IMDS_TIMEOUT = (2, 5)
40
def _imds_fetch(path, options):
    """Fetch a single IMDSv2 metadata path.

    A session token is requested first (PUT on IMDS_TOKEN_URL) and then used
    to GET ``IMDS_META_URL/<path>``. Both requests use IMDS_TIMEOUT.

    Args:
        path: metadata path relative to IMDS_META_URL, e.g. "instance-id".
        options: agent options dict; only the presence of "--skip-race-check"
            is consulted, to pick the log level for non-HTTP failures.

    Returns:
        The response body decoded as UTF-8, or None on any failure
        (including a non-2xx HTTP status).
    """
    try:
        # NOTE(review): a static-analysis finding ("missing TLS") was attached to
        # this call. IMDS is addressed at a link-local IP over plain HTTP; there
        # is no https:// variant to switch to, so the URL is intentionally kept.
        token_response = requests.put(
            IMDS_TOKEN_URL,
            headers={"X-aws-ec2-metadata-token-ttl-seconds": "21600"},
            timeout=IMDS_TIMEOUT)
        # Without this check an error page would be used as the token.
        token_response.raise_for_status()
        token = token_response.content.decode("UTF-8")

        metadata_response = requests.get(
            "{}/{}".format(IMDS_META_URL, path),
            headers={"X-aws-ec2-metadata-token": token},
            timeout=IMDS_TIMEOUT)
        # Without this check a 4xx/5xx body would be returned as the metadata value.
        metadata_response.raise_for_status()
        return metadata_response.content.decode("UTF-8")
    except HTTPError as http_err:
        logger.error("HTTP error accessing EC2 metadata (%s): %s", path, http_err)
    except Exception as err:
        # With --skip-race-check the own instance ID is not needed for fencing,
        # so a failure here is only worth a debug message.
        if "--skip-race-check" not in options:
            logger.error("Error accessing EC2 metadata (%s): %s", path, err)
        else:
            logger.debug("Error accessing EC2 metadata (%s): %s", path, err)
    return None
60
def get_instance_id(options):
    """Return the "instance-id" value read from IMDS, or None if it cannot be fetched."""
    own_instance_id = _imds_fetch("instance-id", options)
    return own_instance_id
63
64
def get_instance_by_tag(conn, tag_name, tag_value, options, max_retries=3, retry_delay=2):
    """
    Find the EC2 instance whose tag `tag_name` has the value `tag_value`.

    Only instances in the pending, running, stopping or stopped states are
    considered. The lookup is attempted up to `max_retries` times, sleeping
    `retry_delay` seconds between attempts, both when nothing matches and
    when the call raises ClientError, EndpointConnectionError or
    ConnectionError.

    When more than one instance matches (e.g. blue/green deployments):
      1. the build_number tag of every candidate is collected and logged;
      2. the first candidate whose build_number equals
         options["my_build_number"] is selected;
      3. if there is no such candidate the function refuses to guess.

    Returns the instance ID, or None when no unambiguous instance is found.
    """
    last_error = None
    tag_filters = [
        {"Name": "tag:{}".format(tag_name), "Values": [tag_value]},
        {"Name": "instance-state-name", "Values": ["pending", "running", "stopping", "stopped"]},
    ]

    for attempt in range(1, max_retries + 1):
        is_last_attempt = attempt >= max_retries
        try:
            logger.debug("Looking up instance by tag %s=%s in region %s (attempt %d/%d)",
                         tag_name, tag_value, options.get("--region"), attempt, max_retries)

            own_build = options.get("my_build_number")
            matches = list(conn.instances.filter(Filters=tag_filters))

            if not matches:
                if is_last_attempt:
                    logger.warning("No instance found with tag %s=%s in fenceable states after %d attempts",
                                   tag_name, tag_value, max_retries)
                    return None
                logger.debug("No instance found with tag %s=%s in fenceable states (attempt %d/%d), retrying in %ds",
                             tag_name, tag_value, attempt, max_retries, retry_delay)
                time.sleep(retry_delay)
                continue

            if len(matches) == 1:
                instance_id = matches[0].id
                logger.debug("Single instance found: %s", instance_id)
            else:
                logger.warning("Multiple running instances found with tag %s=%s:", tag_name, tag_value)

                # Pair every candidate with its build_number tag (None if absent).
                candidates = []
                for inst in matches:
                    build_num = next(
                        (tag['Value'] for tag in (inst.tags or []) if tag['Key'] == 'build_number'),
                        None)
                    candidates.append((inst, build_num))
                    logger.warning("  - %s (build_number=%s, launched=%s)",
                                   inst.id, build_num or "N/A", inst.launch_time)

                chosen = None
                if own_build:
                    chosen = next(
                        (inst for inst, build_num in candidates if build_num == own_build),
                        None)
                    if chosen is not None:
                        logger.warning("Selecting instance with matching build_number=%s: %s",
                                       own_build, chosen.id)

                if not chosen:
                    logger.error("Multiple instances match tag %s=%s but none match build_number=%s. "
                                 "Refusing to guess. Manual intervention required.",
                                 tag_name, tag_value, own_build)
                    return None

                instance_id = chosen.id

            logger.debug("Selected instance %s with tag %s=%s", instance_id, tag_name, tag_value)
            return instance_id

        except (ClientError, EndpointConnectionError, ConnectionError) as e:
            last_error = e
            if is_last_attempt:
                logger.error("Failed to lookup instance by tag after %d attempts: %s", max_retries, e)
                return None
            logger.warning("AWS API error during tag lookup (attempt %d/%d): %s. Retrying in %ds",
                           attempt, max_retries, e, retry_delay)
            time.sleep(retry_delay)

    # Only reached when the loop body never ran (max_retries < 1).
    logger.error("Failed to lookup instance by tag after %d attempts: %s", max_retries, last_error)
    return None
154
155
def get_instance_by_eni(conn, eni_id, max_retries=3, retry_delay=2):
    """
    Map an ENI ID to the ID of the instance it is attached to.

    Returns a 2-tuple: (instance_id, None) on success, (None, error_msg)
    otherwise. An ENI that exists without an attachment yields an error
    message; what that means is left to the caller:
      - get_power_status treats "not attached" as OFF (safe)
      - set_power_status fails safe when the target cannot be resolved

    ClientError (other than InvalidNetworkInterfaceID.NotFound),
    EndpointConnectionError and ConnectionError are retried up to
    `max_retries` attempts with `retry_delay` seconds between them.
    """
    last_error = None

    for attempt in range(1, max_retries + 1):
        try:
            described = conn.meta.client.describe_network_interfaces(NetworkInterfaceIds=[eni_id])
            interfaces = described.get('NetworkInterfaces', [])
            if not interfaces:
                return (None, "ENI {} not found".format(eni_id))

            interface = interfaces[0]
            eni_status = interface.get('Status', 'unknown')
            attachment = interface.get('Attachment')
            if not attachment:
                return (None, "ENI {} exists but not attached (status: {})".format(eni_id, eni_status))

            attach_status = attachment.get('Status', 'unknown')
            if attach_status != 'attached' and attach_status != 'attaching':
                return (None, "ENI {} attachment in transitional state: {}".format(eni_id, attach_status))

            instance_id = attachment.get('InstanceId')
            if instance_id:
                logger.debug("ENI %s is attached to instance %s", eni_id, instance_id)
                return (instance_id, None)
            return (None, "ENI {} attached but no InstanceId in response".format(eni_id))

        except ClientError as e:
            # A non-existent ENI is a definitive answer, not a transient failure.
            if e.response.get('Error', {}).get('Code', '') == 'InvalidNetworkInterfaceID.NotFound':
                return (None, "ENI {} does not exist".format(eni_id))
            last_error = e
            if attempt >= max_retries:
                return (None, "AWS API error after {} attempts: {}".format(max_retries, e))
            logger.warning("AWS API error during ENI lookup (attempt %d/%d): %s. Retrying in %ds",
                           attempt, max_retries, e, retry_delay)
            time.sleep(retry_delay)
        except (EndpointConnectionError, ConnectionError) as e:
            last_error = e
            if attempt >= max_retries:
                return (None, "Connection error after {} attempts: {}".format(max_retries, e))
            logger.warning("Connection error during ENI lookup (attempt %d/%d): %s. Retrying in %ds",
                           attempt, max_retries, e, retry_delay)
            time.sleep(retry_delay)

    return (None, "ENI lookup failed after {} attempts: {}".format(max_retries, last_error))
216
217
def get_instance_by_volume(conn, volume_id, max_retries=3, retry_delay=2):
    """
    Map an EBS volume ID to the ID of the instance it is attached to.

    Returns a 2-tuple: (instance_id, None) on success, (None, error_msg)
    otherwise. A volume without attachments, with more than one attachment
    (multi-attach), or whose attachment is neither 'attached' nor
    'attaching' yields an error message. Caller semantics are the same as
    for ENI resolution.

    ClientError (other than InvalidVolume.NotFound), EndpointConnectionError
    and ConnectionError are retried up to `max_retries` attempts with
    `retry_delay` seconds between them.
    """
    last_error = None

    for attempt in range(1, max_retries + 1):
        try:
            described = conn.meta.client.describe_volumes(VolumeIds=[volume_id])
            found = described.get('Volumes', [])
            if not found:
                return (None, "Volume {} not found".format(volume_id))

            vol_state = found[0].get('State', 'unknown')
            attachments = found[0].get('Attachments', [])
            if not attachments:
                return (None, "Volume {} exists but not attached (state: {})".format(volume_id, vol_state))

            attachment_count = len(attachments)
            if attachment_count > 1:
                return (None, "Volume {} has {} attachments (multi-attach). Cannot determine target.".format(
                    volume_id, attachment_count))

            only_attachment = attachments[0]
            attach_status = only_attachment.get('State', 'unknown')
            if attach_status != 'attached' and attach_status != 'attaching':
                return (None, "Volume {} attachment in transitional state: {}".format(volume_id, attach_status))

            instance_id = only_attachment.get('InstanceId')
            if instance_id:
                logger.debug("Volume %s is attached to instance %s", volume_id, instance_id)
                return (instance_id, None)
            return (None, "Volume {} attached but no InstanceId in response".format(volume_id))

        except ClientError as e:
            # A non-existent volume is a definitive answer, not a transient failure.
            if e.response.get('Error', {}).get('Code', '') == 'InvalidVolume.NotFound':
                return (None, "Volume {} does not exist".format(volume_id))
            last_error = e
            if attempt >= max_retries:
                return (None, "AWS API error after {} attempts: {}".format(max_retries, e))
            logger.warning("AWS API error during EBS lookup (attempt %d/%d): %s. Retrying in %ds",
                           attempt, max_retries, e, retry_delay)
            time.sleep(retry_delay)
        except (EndpointConnectionError, ConnectionError) as e:
            last_error = e
            if attempt >= max_retries:
                return (None, "Connection error after {} attempts: {}".format(max_retries, e))
            logger.warning("Connection error during EBS lookup (attempt %d/%d): %s. Retrying in %ds",
                           attempt, max_retries, e, retry_delay)
            time.sleep(retry_delay)

    return (None, "EBS lookup failed after {} attempts: {}".format(max_retries, last_error))
281
282
def resolve_plug_to_instance_id(conn, options):
    """
    Turn the --plug value into an EC2 instance ID.

    The interpretation of --plug depends on --identity-method:
      instance-id (default): --plug already is the instance ID
      tag: --plug is a tag value (also used whenever --tag is set)
      eni: --plug is an ENI ID, resolved through get_instance_by_eni
      ebs: --plug is a volume ID, resolved through get_instance_by_volume

    A successful result is stored in options["cached_instance_id"] and
    returned directly on later calls, so the identity -> instance ID mapping
    is resolved at most once per options dict (status checks, power action
    and polling all reuse it). Failed resolutions are not cached.

    Returns the instance ID, or None when --plug is missing or cannot be
    resolved.
    """
    plug = options.get("--plug")
    method = options.get("--identity-method", "instance-id")

    if not plug:
        logger.error("No --plug parameter provided")
        return None

    cache_key = "cached_instance_id"
    remembered = options.get(cache_key)
    if remembered:
        logger.debug("Using cached instance ID %s for plug=%s", remembered, plug)
        return remembered

    # method -> (debug message, resolver returning (instance_id, error), failure message)
    attachment_lookups = {
        "eni": ("ENI-based lookup: %s", get_instance_by_eni, "ENI resolution failed: %s"),
        "ebs": ("EBS-based lookup: %s", get_instance_by_volume, "EBS resolution failed: %s"),
    }

    if method in attachment_lookups:
        debug_msg, resolver, failure_msg = attachment_lookups[method]
        logger.debug(debug_msg, plug)
        instance_id, error = resolver(conn, plug)
        if error:
            logger.error(failure_msg, error)
            return None
    elif method == "tag" or options.get("--tag"):
        tag_name = options.get("--tag", "Name")
        logger.debug("Tag-based lookup: %s=%s", tag_name, plug)
        instance_id = get_instance_by_tag(conn, tag_name, plug, options)
        if not instance_id:
            logger.error("Failed to find instance with tag %s=%s", tag_name, plug)
    else:
        logger.debug("Direct instance ID: %s", plug)
        instance_id = plug

    if instance_id:
        # Remember the mapping for the rest of this invocation.
        options[cache_key] = instance_id
        logger.debug("Resolved plug=%s to instance %s (method: %s)",
                     plug, instance_id, method)

    return instance_id
350
351
def check_tag_target_is_dead(conn, options):
    """
    Decide whether a tag-identified target that no longer resolves is dead.

    Queries every instance (in any state) carrying the tag and reports the
    target dead only when at least one such instance exists and all of them
    are terminated or shutting-down.

    The tag name is taken from --tag. When --tag is absent but
    --identity-method is "tag", "Name" is used, matching the default applied
    by resolve_plug_to_instance_id; otherwise the lookup and this check would
    disagree on which tag identifies the target.

    Returns True if the target is confirmed dead, False otherwise (no tag or
    plug configured, no instance ever carried the tag, some instance is in
    another state, or the AWS call failed).
    """
    dead_states = ("terminated", "shutting-down")

    tag_name = options.get("--tag")
    plug_value = options.get("--plug")

    if not tag_name and options.get("--identity-method", "instance-id") == "tag":
        tag_name = "Name"

    # Without a tag name or a value there is nothing to query.
    if not tag_name or not plug_value:
        return False

    try:
        states = [
            inst.state["Name"]
            for inst in conn.instances.filter(Filters=[
                {"Name": "tag:{}".format(tag_name), "Values": [plug_value]}
            ])
        ]

        if not states:
            logger.error("No instance has ever existed with tag %s=%s. This is a configuration error.",
                         tag_name, plug_value)
            return False

        live_states = [state for state in states if state not in dead_states]
        if not live_states:
            logger.info("All instances with tag %s=%s are terminated/shutting-down. Target confirmed dead.",
                        tag_name, plug_value)
            return True

        logger.error("Instances with tag %s=%s exist in unexpected states: %s. Cannot confirm target is dead.",
                     tag_name, plug_value, live_states)
        return False

    except (ClientError, EndpointConnectionError, ConnectionError) as e:
        logger.error("AWS API error during dead-target check: %s", e)
        return False
389
390
def get_nodes_list(conn, options):
    """
    List instances for the monitor/list actions.

    Args:
        conn: boto3 EC2 resource.
        options: agent options. "--filter" ("key=value") restricts the query;
            "--tag" makes the value of that tag the port name instead of the
            instance ID.

    Returns:
        dict mapping port name -> (value of the "Name" tag or "", power state).
        States missing from `status` are reported as "unknown". The dict is
        empty (or partial) when an unexpected error was logged.
    """
    logger.debug("Starting monitor operation")
    result = {}
    filters = []
    try:
        tag_name = options.get("--tag")

        if "--filter" in options:
            # Split on the first "=" only: filter values (e.g. tag values) may
            # themselves contain "=". A filter without "=" raises here and is
            # reported by the generic handler below.
            filter_key, filter_value = (
                part.strip() for part in options["--filter"].split("=", 1))
            filters = [{"Name": filter_key, "Values": [filter_value]}]
            logger.debug("Filter: {}".format(filters))

        for instance in conn.instances.filter(Filters=filters):
            tags = instance.tags or []

            instance_name = next(
                (tag["Value"] for tag in tags if tag.get("Key") == "Name"), "")

            port_name = instance.id
            if tag_name:
                for tag in tags:
                    if tag['Key'] == tag_name:
                        port_name = tag['Value']
                        logger.debug("Mapped instance %s to port name %s via tag %s",
                                     instance.id, port_name, tag_name)
                        break

            state = instance.state["Name"]
            if state in status:
                result[port_name] = (instance_name, status[state])
            else:
                # Only list-status cares about the state, so only complain there.
                if options.get("--original-action") == "list-status":
                    logger.error("Unknown status \"{}\" returned for {} ({})".format(
                        state, instance.id, instance_name))
                result[port_name] = (instance_name, "unknown")
    except ClientError:
        fail_usage("Failed: Incorrect Access Key or Secret Key.")
    except EndpointConnectionError:
        fail_usage("Failed: Incorrect Region.")
    except ConnectionError as e:
        fail_usage("Failed: Unable to connect to AWS: " + str(e))
    except Exception as e:
        logger.error("Failed to get node list: %s", e)
    else:
        # Report success only when the listing actually completed.
        logger.debug("Monitor operation OK: %s", result)
    return result
437
def check_eni_ebs_target_is_dead(conn, options):
    """
    For the eni/ebs identity methods, tell "resource exists but is not
    attached" (target dead) apart from every other outcome.

    The resource named by --plug is described once. The target counts as
    dead only if the resource is returned and has no attachment. A
    ClientError is logged (as a configuration error when the resource does
    not exist) and yields False.

    Returns True if the target is confirmed dead, False otherwise, including
    for any other identity method.
    """
    method = options.get("--identity-method", "instance-id")
    plug = options.get("--plug")

    if method == "eni":
        try:
            described = conn.meta.client.describe_network_interfaces(NetworkInterfaceIds=[plug])
            interfaces = described.get('NetworkInterfaces', [])
            if interfaces and not interfaces[0].get('Attachment'):
                logger.info("ENI %s exists but not attached. Target confirmed dead.", plug)
                return True
        except ClientError as e:
            error_code = e.response.get('Error', {}).get('Code', '')
            if error_code != 'InvalidNetworkInterfaceID.NotFound':
                logger.error("AWS API error during ENI dead-target check [%s]: %s", error_code, e)
            else:
                logger.error("ENI %s does not exist. Configuration error.", plug)
    elif method == "ebs":
        try:
            described = conn.meta.client.describe_volumes(VolumeIds=[plug])
            found = described.get('Volumes', [])
            if found and not found[0].get('Attachments'):
                logger.info("Volume %s exists but not attached. Target confirmed dead.", plug)
                return True
        except ClientError as e:
            error_code = e.response.get('Error', {}).get('Code', '')
            if error_code != 'InvalidVolume.NotFound':
                logger.error("AWS API error during EBS dead-target check [%s]: %s", error_code, e)
            else:
                logger.error("Volume %s does not exist. Configuration error.", plug)

    # Still attached, API error, or not an eni/ebs method: not confirmed dead.
    return False
482
483
def get_power_status(conn, options):
    """
    Report the power state of the instance identified by --plug.

    Returns "on", "off" or "unknown" (via the `status` table). When the plug
    does not resolve to an instance, "off" is returned only if the target is
    positively confirmed dead; otherwise fail(EC_STATUS) is called. An
    instance ID that matches no instance is reported as "off".

    ClientError and EndpointConnectionError end in fail_usage(); any other
    exception is logged and ends in fail(EC_STATUS).
    """
    logger.debug("Starting status operation")
    try:
        instance_id = resolve_plug_to_instance_id(conn, options)
        if not instance_id:
            # The fencing library learns target state only through this function
            # (fence_action pre-check and the post-off status poll). For tag/eni/ebs
            # identity a terminated instance no longer resolves, so the
            # confirmed-dead-vs-unknown decision must be made here: report OFF only
            # when the target is positively confirmed dead, otherwise fail.
            method = options.get("--identity-method", "instance-id")

            if method in ("eni", "ebs"):
                if check_eni_ebs_target_is_dead(conn, options):
                    logger.info("No fenceable instance for plug=%s — target confirmed dead (method: %s). Reporting OFF.",
                                options.get("--plug"), method)
                    return "off"
            elif check_tag_target_is_dead(conn, options):
                logger.info("No fenceable instance for plug=%s — target confirmed dead. Reporting OFF.",
                            options.get("--plug"))
                return "off"

            logger.error("No instance resolved for plug=%s and target not confirmed dead. Reporting FAILED.",
                         options.get("--plug"))
            fail(EC_STATUS)

        matches = list(conn.instances.filter(
            Filters=[{"Name": "instance-id", "Values": [instance_id]}]))
        if not matches:
            logger.debug("Instance %s not found (likely terminated). Reporting OFF.", instance_id)
            return "off"

        state = matches[0].state["Name"]
        logger.debug("Status operation for EC2 instance %s returned state: %s", instance_id, state.upper())
        if state not in status:
            logger.error("Unknown status \"{}\" returned".format(state))
            return "unknown"
        return status[state]
    except ClientError:
        fail_usage("Failed: Incorrect Access Key or Secret Key.")
    except EndpointConnectionError:
        fail_usage("Failed: Incorrect Region.")
    except IndexError:
        logger.debug("Instance not found (IndexError). Reporting OFF.")
        return "off"
    except Exception as e:
        logger.error("Failed to get power status: %s", e)
        fail(EC_STATUS)
533
def get_self_power_status(conn, instance_id):
    """
    Check whether the given instance (the fencing node itself) is running.

    Returns "ok" when its state is "running", "alert" for any other state,
    and "fail" when no instance matches `instance_id`. ClientError and
    EndpointConnectionError end in fail_usage().
    """
    try:
        matches = list(conn.instances.filter(
            Filters=[{"Name": "instance-id", "Values": [instance_id]}]))
        state = matches[0].state["Name"]
        if state != "running":
            logger.debug("Captured my (%s) state it is %s - returning Alert - Unable to fence other nodes", instance_id, state.upper())
            return "alert"
        logger.debug("Captured my (%s) state and it %s - returning OK - Proceeding with fencing", instance_id, state.upper())
        return "ok"

    except ClientError:
        fail_usage("Failed: Incorrect Access Key or Secret Key.")
    except EndpointConnectionError:
        fail_usage("Failed: Incorrect Region.")
    except IndexError:
        # Empty result list: our own instance ID matched nothing.
        return "fail"
551
def set_power_status(conn, options):
    """
    Stop ("off") or start ("on") the instance identified by --plug.

    For "off" the stop is forced, with SkipOsShutdown set according to
    --skip-os-shutdown (default "true"). Unless --skip-race-check is given,
    the stop is only issued when this node's own instance is running
    (get_self_power_status() == "ok"). If boto3 rejects SkipOsShutdown
    (ParamValidationError) the call is retried with Force only. A target
    that no longer exists or is in an incompatible state
    (InvalidInstanceID.NotFound / IncorrectInstanceState) is treated as
    already off.

    Any failure — including an unresolvable target — is logged and ends in
    fail(EC_STATUS); this function never reports success for a target it
    could not resolve.
    """
    # Bound before the try block so the error handler below can always name the
    # target, even when resolution itself raised.
    instance_id = None
    try:
        instance_id = resolve_plug_to_instance_id(conn, options)
        if not instance_id:
            # get_power_status() runs before and after this call and owns the
            # confirmed-dead-vs-unknown decision (and the resolve result is cached
            # within the invocation). If we reach here we could not resolve a target,
            # so fail safe. Never report success here — that would be a false-positive
            # fence. The status poll and Pacemaker's retry re-confirm the outcome.
            logger.error("Could not resolve instance ID for plug=%s; failing safe.",
                         options.get("--plug"))
            fail(EC_STATUS)

        skip_os_shutdown = options.get("--skip-os-shutdown", "true").lower() in ["1", "yes", "on", "true"]
        shutdown_option = {
            "SkipOsShutdown": skip_os_shutdown,
            "Force": True
        }

        action = options["--action"]
        if action == "off":
            if "--skip-race-check" in options:
                may_fence = True
            else:
                # Own instance ID is only needed for the race check; look it up
                # lazily so skipped checks never wait on the metadata service.
                my_instance = options.get("my_instance_id") or get_instance_id(options)
                may_fence = get_self_power_status(conn, my_instance) == "ok"

            if may_fence:
                try:
                    try:
                        conn.instances.filter(InstanceIds=[instance_id]).stop(**shutdown_option)
                    except ParamValidationError:
                        logger.warning("SkipOsShutdown not supported with the current boto3 version %s - falling back to graceful shutdown", boto3.__version__)
                        conn.instances.filter(InstanceIds=[instance_id]).stop(Force=True)
                    logger.info("Called StopInstance API call for %s", instance_id)
                except ClientError as e:
                    # Applies to the fallback stop as well as the first attempt.
                    error_code = e.response.get('Error', {}).get('Code', '')
                    if error_code in ('InvalidInstanceID.NotFound', 'IncorrectInstanceState'):
                        logger.info("Instance %s cannot be stopped (error: %s). Assuming already OFF.", instance_id, error_code)
                    else:
                        raise
            else:
                logger.warning("Skipping fencing as instance is not in running status")
        elif action == "on":
            conn.instances.filter(InstanceIds=[instance_id]).start()
            logger.info("Called StartInstance API call for %s", instance_id)
    except Exception as e:
        logger.error("Failed to power %s %s: %s",
                     options.get("--action"), instance_id, e)
        fail(EC_STATUS)
599
def define_new_opts():
    """Register this agent's command-line options in the shared `all_opt` table."""
    all_opt.update({
        "region": {
            "getopt": "r:",
            "longopt": "region",
            "help": "-r, --region=[region] Region, e.g. us-east-1",
            "shortdesc": "Region.",
            "required": "1",
            "order": 2,
        },
        "access_key": {
            "getopt": "a:",
            "longopt": "access-key",
            "help": "-a, --access-key=[key] Access Key",
            "shortdesc": "Access Key.",
            "required": "0",
            "order": 3,
        },
        "secret_key": {
            "getopt": "s:",
            "longopt": "secret-key",
            "help": "-s, --secret-key=[key] Secret Key",
            "shortdesc": "Secret Key.",
            "required": "0",
            "order": 4,
        },
        "filter": {
            "getopt": ":",
            "longopt": "filter",
            "help": "--filter=[key=value] Filter (e.g. vpc-id=[vpc-XXYYZZAA])",
            "shortdesc": "Filter for list-action",
            "required": "0",
            "order": 5,
        },
        "boto3_debug": {
            "getopt": "b:",
            "longopt": "boto3_debug",
            "help": "-b, --boto3_debug=[option] Boto3 and Botocore library debug logging",
            "shortdesc": "Boto Lib debug",
            "required": "0",
            "default": "False",
            "order": 6,
        },
        "skip_race_check": {
            "getopt": "",
            "longopt": "skip-race-check",
            "help": "--skip-race-check Skip race condition check",
            "shortdesc": "Skip race condition check",
            "required": "0",
            "order": 7,
        },
        "skip_os_shutdown": {
            "getopt": ":",
            "longopt": "skip-os-shutdown",
            "help": "--skip-os-shutdown=[true|false] Uses SkipOsShutdown flag",
            "shortdesc": "Use SkipOsShutdown flag to stop the EC2 instance",
            "required": "0",
            "default": "true",
            "order": 8,
        },
        "tag": {
            "getopt": ":",
            "longopt": "tag",
            "help": "--tag=[tag_name] Tag name for instance lookup (e.g. 'Name'). When specified, --plug is treated as tag value instead of instance ID",
            "shortdesc": "Tag name for instance identification",
            "required": "0",
            "order": 9,
        },
        "identity_method": {
            "getopt": ":",
            "longopt": "identity-method",
            "help": "--identity-method=[method] Identity resolution method: instance-id (default), tag, eni, ebs",
            "shortdesc": "How to resolve --plug to an instance ID. 'instance-id' treats plug as a direct instance ID, 'tag' uses EC2 tag lookup, 'eni' resolves via ENI attachment, 'ebs' resolves via EBS volume attachment.",
            "required": "0",
            "default": "instance-id",
            "order": 10,
        },
    })
676
def main():
    """
    Entry point of the fence agent.

    Registers the agent-specific options, parses input, prints metadata
    when asked to, configures logging, opens the boto3 EC2 resource, caches
    this node's own instance ID and build_number tag in `options`, and
    finally hands over to fence_action(); its result is the process exit
    code.
    """
    conn = None

    device_opt = ["port", "no_password", "region", "access_key", "secret_key", "filter", "boto3_debug", "skip_race_check", "skip_os_shutdown", "tag", "identity_method"]

    atexit.register(atexit_handler)

    define_new_opts()

    all_opt["power_timeout"]["default"] = "60"

    options = check_input(device_opt, process_input(device_opt))

    docs = {}
    docs["shortdesc"] = "Fence agent for AWS (Amazon Web Services) with multiple identity resolution methods"
    docs["longdesc"] = "fence_aws is a Power Fencing agent for AWS (Amazon Web\
Services). It uses the boto3 library to connect to AWS.\
\n.P\n\
It supports four identity resolution methods via --identity-method:\
\n.P\n\
instance-id (default): --plug is treated as a direct EC2 instance ID.\
\n.P\n\
tag: --plug is treated as a tag value. Requires --tag to specify the tag name.\
For example: --identity-method=tag --tag=Name --plug=hostname\
\n.P\n\
eni: --plug is treated as an ENI ID. The agent resolves the ENI attachment to find\
the instance. Ideal for architectures with persistent ENIs that survive instance replacement.\
For example: --identity-method=eni --plug=eni-0a1b2c3d4e5f67890\
\n.P\n\
ebs: --plug is treated as an EBS volume ID. The agent resolves the volume attachment to\
find the instance. Ideal for architectures with persistent EBS volumes.\
For example: --identity-method=ebs --plug=vol-0a1b2c3d4e5f67890\
\n.P\n\
boto3 can be configured with AWS CLI or by creating ~/.aws/credentials.\n\
For instructions see: https://boto3.readthedocs.io/en/latest/guide/quickstart.html#configuration"
    docs["vendorurl"] = "http://www.amazon.com"
    show_docs(options, docs)

    run_delay(options)

    if "--debug-file" in options:
        # Iterate over a copy: removing handlers from the list being iterated
        # would skip the element following each removed one.
        for handler in list(logger.handlers):
            if isinstance(handler, logging.FileHandler):
                logger.removeHandler(handler)
        lh = logging.FileHandler(options["--debug-file"])
        logger.addHandler(lh)
        lhf = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        lh.setFormatter(lhf)
        lh.setLevel(logging.DEBUG)

    if options["--boto3_debug"].lower() not in ["1", "yes", "on", "true"]:
        boto3.set_stream_logger('boto3', logging.INFO)
        boto3.set_stream_logger('botocore', logging.CRITICAL)
        logging.getLogger('botocore').propagate = False
        logging.getLogger('boto3').propagate = False
    else:
        # boto3 debugging requested: send both libraries' records to a dedicated file.
        log_format = logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
        logging.getLogger('botocore').propagate = False
        logging.getLogger('boto3').propagate = False
        fdh = logging.FileHandler('/var/log/fence_aws_boto3.log')
        fdh.setFormatter(log_format)
        logging.getLogger('boto3').addHandler(fdh)
        logging.getLogger('botocore').addHandler(fdh)
        logger.debug("Boto debug level is %s and sending debug info to /var/log/fence_aws_boto3.log", options["--boto3_debug"])

    region = options.get("--region")
    access_key = options.get("--access-key")
    secret_key = options.get("--secret-key")
    try:
        conn = boto3.resource('ec2', region_name=region,
                              aws_access_key_id=access_key,
                              aws_secret_access_key=secret_key)
    except Exception as e:
        # These actions do not talk to AWS, so a failed connection is tolerated for them.
        if options.get("--action", "") not in ["metadata", "manpage", "validate-all"]:
            fail_usage("Failed: Unable to connect to AWS: " + str(e))

    # Cache own instance ID and build_number at startup
    # These values never change during the instance's lifetime.
    # Caching here eliminates IMDS calls from the fencing hot path.
    options["my_instance_id"] = get_instance_id(options)
    if options.get("my_instance_id"):
        logger.debug("Cached own instance ID: %s", options["my_instance_id"])
        try:
            my_inst = list(conn.instances.filter(
                Filters=[{"Name": "instance-id", "Values": [options["my_instance_id"]]}]))
            if my_inst and my_inst[0].tags:
                for tag in my_inst[0].tags:
                    if tag['Key'] == 'build_number':
                        options["my_build_number"] = tag['Value']
                        logger.debug("Cached own build_number: %s", options["my_build_number"])
                        break
        except Exception as e:
            # Best effort only: without a build_number, tag lookups that match
            # several instances will refuse to pick one.
            logger.debug("Could not cache own build_number: %s", e)

    result = fence_action(conn, options, set_power_status, get_power_status, get_nodes_list)
    sys.exit(result)


if __name__ == "__main__":
    main()
776