1    	#!@PYTHON@ -tt
2    	
3    	import sys, re
4    	import logging
5    	import atexit
6    	import time
7    	sys.path.append("@FENCEAGENTSLIBDIR@")
8    	from fencing import *
9    	from fencing import fail, fail_usage, run_delay, EC_STATUS, SyslogLibHandler
10   	
11   	import requests
12   	from requests import HTTPError
13   	
14   	try:
15   		import boto3
16   		from botocore.exceptions import ConnectionError, ClientError, EndpointConnectionError, NoRegionError, ParamValidationError
17   	except ImportError:
18   		pass
19   	
# Agent-wide logging: the root logger emits INFO and above through the
# SyslogLibHandler imported from the fencing module.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.propagate = False
logger.addHandler(SyslogLibHandler())
# Records from botocore's vendored packages are not propagated to the root logger.
logging.getLogger("botocore.vendored").propagate = False

# EC2 instance-state name -> power state reported by the status callbacks.
# "stopping" and "shutting-down" are both reported as "off"; "pending" maps
# to "unknown". State names missing from this table are handled by callers.
status = {
	"running": "on",
	"stopped": "off",
	"pending": "unknown",
	"stopping": "off",
	"shutting-down": "off",
	"terminated": "off",
}

# Per-request (connect, read) timeout in seconds for IMDS calls, so that an
# unreachable metadata service can never block the fence path.
IMDS_TIMEOUT = (2, 5)
# IMDSv2 endpoints: metadata root and session-token URL.
IMDS_META_URL = "http://169.254.169.254/latest/meta-data"
IMDS_TOKEN_URL = "http://169.254.169.254/latest/api/token"
40   	
def _imds_fetch(path, options):
	"""Fetch a single IMDSv2 metadata path.

	A session token is requested first (PUT on IMDS_TOKEN_URL) and then
	presented on the metadata read (GET below IMDS_META_URL). Both requests
	are bounded by IMDS_TIMEOUT.

	Args:
		path: metadata path relative to IMDS_META_URL, e.g. "instance-id".
		options: parsed agent options; when "--skip-race-check" is present,
			non-HTTP failures are logged at debug level instead of error.

	Returns:
		The response body decoded as UTF-8, or None on any failure,
		including a non-2xx status from either request.
	"""
	try:
		# NOTE(review): static analysis flags these plain-HTTP URLs as
		# "missing TLS". The target is the link-local metadata address
		# (169.254.169.254); presumably the service offers no https endpoint -
		# confirm against the AWS IMDS documentation before changing the scheme.
		token_reply = requests.put(
			IMDS_TOKEN_URL,
			headers={"X-aws-ec2-metadata-token-ttl-seconds": "21600"},
			timeout=IMDS_TIMEOUT)
		# Without this check the body of an error response (e.g. 403) would be
		# used as the session token.
		token_reply.raise_for_status()
		token = token_reply.content.decode("UTF-8")

		meta_reply = requests.get(
			"{}/{}".format(IMDS_META_URL, path),
			headers={"X-aws-ec2-metadata-token": token},
			timeout=IMDS_TIMEOUT)
		# Likewise, never hand an error page back to the caller as metadata.
		meta_reply.raise_for_status()
		return meta_reply.content.decode("UTF-8")
	except HTTPError as http_err:
		logger.error("HTTP error accessing EC2 metadata (%s): %s", path, http_err)
	except Exception as err:
		# With --skip-race-check the own-instance lookup is not needed for
		# fencing, so a failure here is only worth a debug message.
		if "--skip-race-check" not in options:
			logger.error("Error accessing EC2 metadata (%s): %s", path, err)
		else:
			logger.debug("Error accessing EC2 metadata (%s): %s", path, err)
	return None
60   	
def get_instance_id(options):
	"""Return the "instance-id" value read from IMDS, or None if the read fails.

	options is passed through to _imds_fetch unchanged.
	"""
	own_instance_id = _imds_fetch("instance-id", options)
	return own_instance_id
63   	
64   	
def _instance_build_number(inst):
	"""Return the value of the instance's 'build_number' tag, or None when it has no such tag."""
	for tag in inst.tags or []:
		if tag['Key'] == 'build_number':
			return tag['Value']
	return None


def get_instance_by_tag(conn, tag_name, tag_value, options, max_retries=3, retry_delay=2):
	"""
	Look up an EC2 instance ID by tag name and value.

	Only instances in a fenceable state (pending, running, stopping, stopped)
	are considered. The lookup is retried, both when nothing is found and on
	AWS API/connection errors, to ride out API eventual consistency.

	For blue/green deployments with multiple matching instances:
	1. Take the current instance's build_number from options["my_build_number"]
	2. Keep only the candidates whose build_number tag equals it
	3. If exactly one candidate remains, select it
	4. If none or more than one remain, REFUSE to guess

	Args:
		conn: boto3 EC2 resource (only conn.instances.filter is used).
		tag_name: tag key to match.
		tag_value: tag value to match.
		options: parsed agent options; "--region" is read for logging and
			"my_build_number" for disambiguation.
		max_retries: maximum number of lookup attempts.
		retry_delay: seconds to sleep between attempts.

	Returns:
		The instance ID, or None if nothing unambiguous could be resolved.
	"""
	# These do not change between attempts, so compute them once.
	region = options.get("--region")
	my_build_number = options.get("my_build_number")
	filters = [
		{"Name": "tag:{}".format(tag_name), "Values": [tag_value]},
		{"Name": "instance-state-name", "Values": ["pending", "running", "stopping", "stopped"]}
	]
	last_error = None

	for attempt in range(1, max_retries + 1):
		try:
			logger.debug("Looking up instance by tag %s=%s in region %s (attempt %d/%d)",
				tag_name, tag_value, region, attempt, max_retries)

			instances = list(conn.instances.filter(Filters=filters))

			if not instances:
				if attempt < max_retries:
					logger.debug("No instance found with tag %s=%s in fenceable states (attempt %d/%d), retrying in %ds",
						tag_name, tag_value, attempt, max_retries, retry_delay)
					time.sleep(retry_delay)
					continue
				logger.warning("No instance found with tag %s=%s in fenceable states after %d attempts",
					tag_name, tag_value, max_retries)
				return None

			if len(instances) == 1:
				instance_id = instances[0].id
				logger.debug("Single instance found: %s", instance_id)
			else:
				logger.warning("Multiple running instances found with tag %s=%s:", tag_name, tag_value)

				matching = []
				for inst in instances:
					build_num = _instance_build_number(inst)
					logger.warning("  - %s (build_number=%s, launched=%s)",
						inst.id, build_num if build_num else "N/A", inst.launch_time)
					if my_build_number and build_num == my_build_number:
						matching.append(inst)

				if not matching:
					logger.error("Multiple instances match tag %s=%s but none match build_number=%s. "
						"Refusing to guess. Manual intervention required.",
						tag_name, tag_value, my_build_number)
					return None

				if len(matching) > 1:
					# Picking "the first one" here would fence an arbitrary
					# instance; an ambiguous target must fail safe instead.
					logger.error("Multiple instances match tag %s=%s and build_number=%s: %s. "
						"Refusing to guess. Manual intervention required.",
						tag_name, tag_value, my_build_number,
						", ".join(inst.id for inst in matching))
					return None

				instance_id = matching[0].id
				logger.warning("Selecting instance with matching build_number=%s: %s",
					my_build_number, instance_id)

			logger.debug("Selected instance %s with tag %s=%s", instance_id, tag_name, tag_value)
			return instance_id

		except (ClientError, EndpointConnectionError, ConnectionError) as e:
			last_error = e
			if attempt < max_retries:
				logger.warning("AWS API error during tag lookup (attempt %d/%d): %s. Retrying in %ds",
					attempt, max_retries, e, retry_delay)
				time.sleep(retry_delay)
				continue
			logger.error("Failed to lookup instance by tag after %d attempts: %s", max_retries, e)
			return None

	# Only reached when the loop body never ran (max_retries < 1).
	logger.error("Failed to lookup instance by tag after %d attempts: %s", max_retries, last_error)
	return None
154  	
155  	
def get_instance_by_eni(conn, eni_id, max_retries=3, retry_delay=2):
	"""
	Map an ENI ID to the ID of the instance it is attached to.

	The result is a 2-tuple: (instance_id, None) when resolved, otherwise
	(None, reason) with a human-readable reason string.

	DescribeNetworkInterfaces is attempted up to max_retries times with
	retry_delay seconds between attempts. An
	InvalidNetworkInterfaceID.NotFound error is final and is not retried.

	An ENI that exists without an attachment means the target instance is
	gone (terminated or being replaced). What that implies is left to the
	caller:
	- get_power_status treats "not attached" as OFF (safe)
	- set_power_status fails safe when the target cannot be resolved
	"""
	failure = None

	for attempt in range(1, max_retries + 1):
		out_of_attempts = attempt >= max_retries
		try:
			ec2_client = conn.meta.client
			reply = ec2_client.describe_network_interfaces(NetworkInterfaceIds=[eni_id])
			found = reply.get('NetworkInterfaces', [])
			if not found:
				return (None, "ENI {} not found".format(eni_id))

			interface = found[0]
			interface_status = interface.get('Status', 'unknown')
			attachment = interface.get('Attachment')
			if not attachment:
				return (None, "ENI {} exists but not attached (status: {})".format(eni_id, interface_status))

			# Only a settled or in-progress attachment identifies a target.
			attachment_state = attachment.get('Status', 'unknown')
			if attachment_state != 'attached' and attachment_state != 'attaching':
				return (None, "ENI {} attachment in transitional state: {}".format(eni_id, attachment_state))

			owner_id = attachment.get('InstanceId')
			if owner_id:
				logger.debug("ENI %s is attached to instance %s", eni_id, owner_id)
				return (owner_id, None)
			return (None, "ENI {} attached but no InstanceId in response".format(eni_id))

		except ClientError as api_err:
			if api_err.response.get('Error', {}).get('Code', '') == 'InvalidNetworkInterfaceID.NotFound':
				return (None, "ENI {} does not exist".format(eni_id))
			failure = api_err
			if out_of_attempts:
				return (None, "AWS API error after {} attempts: {}".format(max_retries, api_err))
			logger.warning("AWS API error during ENI lookup (attempt %d/%d): %s. Retrying in %ds",
				attempt, max_retries, api_err, retry_delay)
			time.sleep(retry_delay)
		except (EndpointConnectionError, ConnectionError) as net_err:
			failure = net_err
			if out_of_attempts:
				return (None, "Connection error after {} attempts: {}".format(max_retries, net_err))
			logger.warning("Connection error during ENI lookup (attempt %d/%d): %s. Retrying in %ds",
				attempt, max_retries, net_err, retry_delay)
			time.sleep(retry_delay)

	# Only reached when no attempt was made at all (max_retries < 1).
	return (None, "ENI lookup failed after {} attempts: {}".format(max_retries, failure))
216  	
217  	
def get_instance_by_volume(conn, volume_id, max_retries=3, retry_delay=2):
	"""
	Map an EBS volume ID to the ID of the instance it is attached to.

	The result is a 2-tuple: (instance_id, None) when resolved, otherwise
	(None, reason) with a human-readable reason string.

	DescribeVolumes is attempted up to max_retries times with retry_delay
	seconds between attempts. An InvalidVolume.NotFound error is final and is
	not retried. A volume with more than one attachment is rejected because
	the target would be ambiguous.

	A volume that exists without attachments means the target instance is
	gone (terminated or being replaced); callers apply the same semantics as
	for ENI resolution.
	"""
	failure = None

	for attempt in range(1, max_retries + 1):
		out_of_attempts = attempt >= max_retries
		try:
			ec2_client = conn.meta.client
			reply = ec2_client.describe_volumes(VolumeIds=[volume_id])
			found = reply.get('Volumes', [])
			if not found:
				return (None, "Volume {} not found".format(volume_id))

			described = found[0]
			volume_state = described.get('State', 'unknown')
			attachments = described.get('Attachments', [])
			if not attachments:
				return (None, "Volume {} exists but not attached (state: {})".format(volume_id, volume_state))

			attachment_count = len(attachments)
			if attachment_count > 1:
				return (None, "Volume {} has {} attachments (multi-attach). Cannot determine target.".format(
					volume_id, attachment_count))

			# Only a settled or in-progress attachment identifies a target.
			sole_attachment = attachments[0]
			attachment_state = sole_attachment.get('State', 'unknown')
			if attachment_state != 'attached' and attachment_state != 'attaching':
				return (None, "Volume {} attachment in transitional state: {}".format(volume_id, attachment_state))

			owner_id = sole_attachment.get('InstanceId')
			if owner_id:
				logger.debug("Volume %s is attached to instance %s", volume_id, owner_id)
				return (owner_id, None)
			return (None, "Volume {} attached but no InstanceId in response".format(volume_id))

		except ClientError as api_err:
			if api_err.response.get('Error', {}).get('Code', '') == 'InvalidVolume.NotFound':
				return (None, "Volume {} does not exist".format(volume_id))
			failure = api_err
			if out_of_attempts:
				return (None, "AWS API error after {} attempts: {}".format(max_retries, api_err))
			logger.warning("AWS API error during EBS lookup (attempt %d/%d): %s. Retrying in %ds",
				attempt, max_retries, api_err, retry_delay)
			time.sleep(retry_delay)
		except (EndpointConnectionError, ConnectionError) as net_err:
			failure = net_err
			if out_of_attempts:
				return (None, "Connection error after {} attempts: {}".format(max_retries, net_err))
			logger.warning("Connection error during EBS lookup (attempt %d/%d): %s. Retrying in %ds",
				attempt, max_retries, net_err, retry_delay)
			time.sleep(retry_delay)

	# Only reached when no attempt was made at all (max_retries < 1).
	return (None, "EBS lookup failed after {} attempts: {}".format(max_retries, failure))
281  	
282  	
def resolve_plug_to_instance_id(conn, options):
	"""
	Turn the --plug value into an EC2 instance ID.

	The interpretation of --plug depends on --identity-method:
	  instance-id (default): --plug already is the instance ID
	  tag:                   --plug is a tag value (tag name from --tag, default "Name")
	  eni:                   --plug is an ENI ID, resolved through its attachment
	  ebs:                   --plug is a volume ID, resolved through its attachment

	Tag lookup is also used whenever --tag is set and the method is neither
	eni nor ebs.

	A successful resolution is stored in options["cached_instance_id"] and
	returned directly by every later call in the same invocation, so status
	checks, power actions and polling share one lookup. Failed resolutions
	are not cached.

	Returns the instance ID, or None when --plug is missing or cannot be
	resolved.
	"""
	plug = options.get("--plug")
	method = options.get("--identity-method", "instance-id")

	if not plug:
		logger.error("No --plug parameter provided")
		return None

	remembered = options.get("cached_instance_id")
	if remembered:
		logger.debug("Using cached instance ID %s for plug=%s", remembered, plug)
		return remembered

	resolved = None

	if method == "eni":
		logger.debug("ENI-based lookup: %s", plug)
		resolved, problem = get_instance_by_eni(conn, plug)
		if problem:
			logger.error("ENI resolution failed: %s", problem)
			return None
	elif method == "ebs":
		logger.debug("EBS-based lookup: %s", plug)
		resolved, problem = get_instance_by_volume(conn, plug)
		if problem:
			logger.error("EBS resolution failed: %s", problem)
			return None
	elif method == "tag" or options.get("--tag"):
		lookup_tag = options.get("--tag", "Name")
		logger.debug("Tag-based lookup: %s=%s", lookup_tag, plug)
		resolved = get_instance_by_tag(conn, lookup_tag, plug, options)
		if not resolved:
			logger.error("Failed to find instance with tag %s=%s", lookup_tag, plug)
	else:
		logger.debug("Direct instance ID: %s", plug)
		resolved = plug

	if not resolved:
		return resolved

	# Remember the mapping for the remainder of this invocation.
	options["cached_instance_id"] = resolved
	logger.debug("Resolved plug=%s to instance %s (method: %s)",
		plug, resolved, method)
	return resolved
350  	
351  	
def check_tag_target_is_dead(conn, options):
	"""
	When tag lookup returns no fenceable instances, determine whether the
	target is genuinely dead (all terminated) or if the lookup failed for
	other reasons (wrong tag, API error, etc.).

	The tag name is taken from --tag; when --identity-method is "tag" and
	--tag is not given, "Name" is used, matching the default applied by
	resolve_plug_to_instance_id. Without a tag name or a --plug value there
	is nothing to check and the answer is False.

	Returns True only if at least one instance carries the tag and every such
	instance is terminated or shutting down; False otherwise, including on
	AWS API or connection errors.
	"""
	tag_name = options.get("--tag")
	if not tag_name and options.get("--identity-method") == "tag":
		# Same default as the tag branch of resolve_plug_to_instance_id;
		# without it a dead target could never be confirmed in this mode.
		tag_name = "Name"
	plug_value = options.get("--plug")

	if not tag_name or not plug_value:
		return False

	try:
		all_states = list(conn.instances.filter(Filters=[
			{"Name": "tag:{}".format(tag_name), "Values": [plug_value]}
		]))

		if not all_states:
			logger.error("No instance has ever existed with tag %s=%s. This is a configuration error.",
				tag_name, plug_value)
			return False

		live_states = [i.state["Name"] for i in all_states if i.state["Name"] not in ("terminated", "shutting-down")]
		if not live_states:
			logger.info("All instances with tag %s=%s are terminated/shutting-down. Target confirmed dead.",
				tag_name, plug_value)
			return True

		logger.error("Instances with tag %s=%s exist in unexpected states: %s. Cannot confirm target is dead.",
			tag_name, plug_value, live_states)
		return False

	except (ClientError, EndpointConnectionError, ConnectionError) as e:
		logger.error("AWS API error during dead-target check: %s", e)
		return False
389  	
390  	
def get_nodes_list(conn, options):
	"""
	Build the port list for the list / list-status / monitor actions.

	Every instance returned by conn.instances.filter (optionally narrowed by
	--filter=key=value) becomes one entry. The port name is the instance ID,
	or the value of the --tag tag when --tag is set and the instance carries
	that tag.

	Returns a dict mapping port name to (Name tag value or "", power state),
	where the power state comes from the module-level status table or is
	"unknown" for an unrecognised EC2 state. Credential, region and
	connection failures end the run through fail_usage; any other error is
	logged and whatever was collected so far is returned.
	"""
	logger.debug("Starting monitor operation")
	result = {}
	filters = []
	try:
		tag_name = options.get("--tag")

		if "--filter" in options:
			# Split on the first "=" only, so that a value which itself
			# contains "=" is passed to EC2 intact.
			filter_key, separator, filter_value = options["--filter"].partition("=")
			if not separator:
				raise ValueError("--filter must be in key=value form, got \"{}\"".format(options["--filter"]))
			filters = [{ "Name": filter_key.strip(), "Values": [filter_value.strip()] }]
			logger.debug("Filter: %s", filters)

		for instance in conn.instances.filter(Filters=filters):
			instance_name = ""
			for tag in instance.tags or []:
				if tag.get("Key") == "Name":
					instance_name = tag["Value"]
					break

			port_name = instance.id
			if tag_name and instance.tags:
				for tag in instance.tags:
					if tag['Key'] == tag_name:
						port_name = tag['Value']
						logger.debug("Mapped instance %s to port name %s via tag %s",
							instance.id, port_name, tag_name)
						break

			try:
				result[port_name] = (instance_name, status[instance.state["Name"]])
			except KeyError as e:
				# Unknown EC2 state: only worth an error message when the
				# caller explicitly asked for statuses.
				if options.get("--original-action") == "list-status":
					logger.error("Unknown status \"{}\" returned for {} ({})".format(
						instance.state["Name"], instance.id, instance_name))
				result[port_name] = (instance_name, "unknown")
	except ClientError:
		fail_usage("Failed: Incorrect Access Key or Secret Key.")
	except EndpointConnectionError:
		fail_usage("Failed: Incorrect Region.")
	except ConnectionError as e:
		fail_usage("Failed: Unable to connect to AWS: " + str(e))
	except Exception as e:
		logger.error("Failed to get node list: %s", e)
	logger.debug("Monitor operation OK: %s",result)
	return result
437  	
def check_eni_ebs_target_is_dead(conn, options):
	"""
	For ENI/EBS modes, when resolve returns None, check whether
	the resource exists but is simply not attached (target dead)
	vs a real error (resource doesn't exist, API failure, etc.).

	The resource ID is taken from --plug and the mode from
	--identity-method. Any other identity method, or a missing --plug,
	yields False.

	Returns True only if the ENI/volume exists and has no attachment;
	False otherwise, including on AWS API or connection errors (which are
	logged, not raised).
	"""
	identity_method = options.get("--identity-method", "instance-id")
	plug_value = options.get("--plug")

	if not plug_value:
		# Nothing to look up; never treat a missing plug as "dead".
		return False

	if identity_method == "eni":
		try:
			client = conn.meta.client
			response = client.describe_network_interfaces(NetworkInterfaceIds=[plug_value])
			enis = response.get('NetworkInterfaces', [])
			if enis and not enis[0].get('Attachment'):
				logger.info("ENI %s exists but not attached. Target confirmed dead.", plug_value)
				return True
		except ClientError as e:
			error_code = e.response.get('Error', {}).get('Code', '')
			if error_code == 'InvalidNetworkInterfaceID.NotFound':
				logger.error("ENI %s does not exist. Configuration error.", plug_value)
			else:
				logger.error("AWS API error during ENI dead-target check [%s]: %s", error_code, e)
		except (EndpointConnectionError, ConnectionError) as e:
			# An unreachable API proves nothing about the target: not dead.
			logger.error("Connection error during ENI dead-target check: %s", e)
		return False

	if identity_method == "ebs":
		try:
			client = conn.meta.client
			response = client.describe_volumes(VolumeIds=[plug_value])
			volumes = response.get('Volumes', [])
			if volumes and not volumes[0].get('Attachments'):
				logger.info("Volume %s exists but not attached. Target confirmed dead.", plug_value)
				return True
		except ClientError as e:
			error_code = e.response.get('Error', {}).get('Code', '')
			if error_code == 'InvalidVolume.NotFound':
				logger.error("Volume %s does not exist. Configuration error.", plug_value)
			else:
				logger.error("AWS API error during EBS dead-target check [%s]: %s", error_code, e)
		except (EndpointConnectionError, ConnectionError) as e:
			# An unreachable API proves nothing about the target: not dead.
			logger.error("Connection error during EBS dead-target check: %s", e)
		return False

	return False
482  	
483  	
def get_power_status(conn, options):
	"""
	Report the power state of the --plug target as "on", "off" or "unknown".

	The target is resolved with resolve_plug_to_instance_id. When it does not
	resolve, "off" is reported only if the target is positively confirmed
	dead (unattached ENI/volume for the eni/ebs methods, otherwise the tag
	check); in every other unresolved case the agent fails with EC_STATUS.

	A resolved ID that DescribeInstances no longer returns is reported as
	"off". An EC2 state missing from the status table yields "unknown".
	ClientError and EndpointConnectionError end the run through fail_usage;
	any other exception fails with EC_STATUS.
	"""
	logger.debug("Starting status operation")
	try:
		target_id = resolve_plug_to_instance_id(conn, options)
		if not target_id:
			# This function is the only place the confirmed-dead-vs-unknown
			# decision is made for an identity that no longer resolves.
			method = options.get("--identity-method", "instance-id")

			if method in ("eni", "ebs"):
				if check_eni_ebs_target_is_dead(conn, options):
					logger.info("No fenceable instance for plug=%s — target confirmed dead (method: %s). Reporting OFF.",
						options.get("--plug"), method)
					return "off"
			elif check_tag_target_is_dead(conn, options):
				logger.info("No fenceable instance for plug=%s — target confirmed dead. Reporting OFF.",
					options.get("--plug"))
				return "off"

			logger.error("No instance resolved for plug=%s and target not confirmed dead. Reporting FAILED.",
				options.get("--plug"))
			fail(EC_STATUS)

		id_filter = [{"Name": "instance-id", "Values": [target_id]}]
		matches = list(conn.instances.filter(Filters=id_filter))
		if not matches:
			logger.debug("Instance %s not found (likely terminated). Reporting OFF.", target_id)
			return "off"

		state = matches[0].state["Name"]
		logger.debug("Status operation for EC2 instance %s returned state: %s", target_id, state.upper())
		if state in status:
			return status[state]
		logger.error("Unknown status \"{}\" returned".format(state))
		return "unknown"
	except ClientError:
		fail_usage("Failed: Incorrect Access Key or Secret Key.")
	except EndpointConnectionError:
		fail_usage("Failed: Incorrect Region.")
	except IndexError:
		logger.debug("Instance not found (IndexError). Reporting OFF.")
		return "off"
	except Exception as e:
		logger.error("Failed to get power status: %s", e)
		fail(EC_STATUS)
533  	
def get_self_power_status(conn, instance_id):
	"""
	Check whether the instance identified by instance_id is running.

	Returns "ok" when its EC2 state is "running", "alert" for any other
	state, and "fail" when no instance with that ID is returned.
	ClientError and EndpointConnectionError end the run through fail_usage.
	"""
	try:
		matches = conn.instances.filter(Filters=[{"Name": "instance-id", "Values": [instance_id]}])
		state = list(matches)[0].state["Name"]
		if state != "running":
			logger.debug("Captured my (%s) state it is %s - returning Alert - Unable to fence other nodes", instance_id, state.upper())
			return "alert"
		logger.debug("Captured my (%s) state and it %s - returning OK - Proceeding with fencing", instance_id, state.upper())
		return "ok"

	except ClientError:
		fail_usage("Failed: Incorrect Access Key or Secret Key.")
	except EndpointConnectionError:
		fail_usage("Failed: Incorrect Region.")
	except IndexError:
		# Empty result set: the instance is not visible through the API.
		return "fail"
551  	
def set_power_status(conn, options):
	"""
	Power the --plug target off or on, according to options["--action"].

	off: unless --skip-race-check is given, first verify that this node's own
	     instance is running (get_self_power_status); if it is not, fencing is
	     skipped with a warning. Then call StopInstances with Force=True and
	     SkipOsShutdown taken from --skip-os-shutdown (default "true"). If the
	     installed boto3 rejects SkipOsShutdown, retry with Force=True only.
	     InvalidInstanceID.NotFound / IncorrectInstanceState are treated as
	     "already off".
	on:  call StartInstances.

	If the target cannot be resolved, or any other error occurs, the agent
	fails with EC_STATUS. Success is never reported for an unresolved target.
	"""
	# Bound before the try block so the error handler below can always log it,
	# even when resolution itself raises.
	instance_id = None
	try:
		instance_id = resolve_plug_to_instance_id(conn, options)
		if not instance_id:
			# get_power_status() runs before and after this call and owns the
			# confirmed-dead-vs-unknown decision (and the resolve result is cached
			# within the invocation). If we reach here we could not resolve a target,
			# so fail safe. Never report success here — that would be a false-positive
			# fence. The status poll and Pacemaker's retry re-confirm the outcome.
			logger.error("Could not resolve instance ID for plug=%s; failing safe.",
				options.get("--plug"))
			fail(EC_STATUS)

		action = options["--action"]
		if action == "off":
			if "--skip-race-check" not in options:
				# Own instance ID is only needed for the race check, so IMDS is
				# consulted (as a fallback to the cached value) only here.
				my_instance = options.get("my_instance_id") or get_instance_id(options)
				if get_self_power_status(conn, my_instance) != "ok":
					logger.warning("Skipping fencing as instance is not in running status")
					return

			skip_os_shutdown = options.get("--skip-os-shutdown", "true").lower() in ["1", "yes", "on", "true"]
			try:
				try:
					conn.instances.filter(InstanceIds=[instance_id]).stop(
						SkipOsShutdown=skip_os_shutdown, Force=True)
				except ParamValidationError:
					logger.warning("SkipOsShutdown not supported with the current boto3 version %s - falling back to graceful shutdown", boto3.__version__)
					conn.instances.filter(InstanceIds=[instance_id]).stop(Force=True)
				logger.info("Called StopInstance API call for %s", instance_id)
			except ClientError as e:
				# Applies to the fallback call as well as the first attempt.
				error_code = e.response.get('Error', {}).get('Code', '')
				if error_code in ('InvalidInstanceID.NotFound', 'IncorrectInstanceState'):
					logger.info("Instance %s cannot be stopped (error: %s). Assuming already OFF.", instance_id, error_code)
				else:
					raise
		elif action == "on":
			conn.instances.filter(InstanceIds=[instance_id]).start()
			logger.info("Called StartInstance API call for %s", instance_id)
	except Exception as e:
		logger.error("Failed to power %s %s: %s",
				options.get("--action"), instance_id, e)
		fail(EC_STATUS)
599  	
def define_new_opts():
	"""
	Register this agent's extra command-line options in all_opt.

	all_opt is not defined in this file (presumably it comes from the
	fencing module's star import - confirm). Each entry below is stored
	under its option key, replacing any existing entry with the same key.
	"""
	aws_options = (
		("region", {
			"getopt": "r:",
			"longopt": "region",
			"help": "-r, --region=[region]          Region, e.g. us-east-1",
			"shortdesc": "Region.",
			"required": "1",
			"order": 2,
		}),
		("access_key", {
			"getopt": "a:",
			"longopt": "access-key",
			"help": "-a, --access-key=[key]         Access Key",
			"shortdesc": "Access Key.",
			"required": "0",
			"order": 3,
		}),
		("secret_key", {
			"getopt": "s:",
			"longopt": "secret-key",
			"help": "-s, --secret-key=[key]         Secret Key",
			"shortdesc": "Secret Key.",
			"required": "0",
			"order": 4,
		}),
		("filter", {
			"getopt": ":",
			"longopt": "filter",
			"help": "--filter=[key=value]           Filter (e.g. vpc-id=[vpc-XXYYZZAA])",
			"shortdesc": "Filter for list-action",
			"required": "0",
			"order": 5,
		}),
		("boto3_debug", {
			"getopt": "b:",
			"longopt": "boto3_debug",
			"help": "-b, --boto3_debug=[option]     Boto3 and Botocore library debug logging",
			"shortdesc": "Boto Lib debug",
			"required": "0",
			"default": "False",
			"order": 6,
		}),
		("skip_race_check", {
			"getopt": "",
			"longopt": "skip-race-check",
			"help": "--skip-race-check              Skip race condition check",
			"shortdesc": "Skip race condition check",
			"required": "0",
			"order": 7,
		}),
		("skip_os_shutdown", {
			"getopt": ":",
			"longopt": "skip-os-shutdown",
			"help": "--skip-os-shutdown=[true|false]    Uses SkipOsShutdown flag",
			"shortdesc": "Use SkipOsShutdown flag to stop the EC2 instance",
			"required": "0",
			"default": "true",
			"order": 8,
		}),
		("tag", {
			"getopt": ":",
			"longopt": "tag",
			"help": "--tag=[tag_name]               Tag name for instance lookup (e.g. 'Name'). When specified, --plug is treated as tag value instead of instance ID",
			"shortdesc": "Tag name for instance identification",
			"required": "0",
			"order": 9,
		}),
		("identity_method", {
			"getopt": ":",
			"longopt": "identity-method",
			"help": "--identity-method=[method]     Identity resolution method: instance-id (default), tag, eni, ebs",
			"shortdesc": "How to resolve --plug to an instance ID. 'instance-id' treats plug as a direct instance ID, 'tag' uses EC2 tag lookup, 'eni' resolves via ENI attachment, 'ebs' resolves via EBS volume attachment.",
			"required": "0",
			"default": "instance-id",
			"order": 10,
		}),
	)

	for option_key, option_spec in aws_options:
		all_opt[option_key] = option_spec
676  	
def main():
	"""
	Entry point of the agent.

	Steps, in order: register the agent's options, parse and check the input,
	emit documentation if requested (show_docs), honour the configured delay,
	set up optional file logging and boto3 log routing, create the boto3 EC2
	resource, cache this node's own instance ID and build_number tag in
	options, then hand control to fence_action and exit with its result.
	"""
	conn = None

	device_opt = ["port", "no_password", "region", "access_key", "secret_key", "filter", "boto3_debug", "skip_race_check", "skip_os_shutdown", "tag", "identity_method"]

	atexit.register(atexit_handler)

	define_new_opts()

	all_opt["power_timeout"]["default"] = "60"

	options = check_input(device_opt, process_input(device_opt))

	docs = {}
	docs["shortdesc"] = "Fence agent for AWS (Amazon Web Services) with multiple identity resolution methods"
	# NOTE(review): the backslash-newline continuations below join lines
	# without a separating space (e.g. "Amazon Web" + "Services"). The text is
	# left byte-identical here because it is emitted as agent documentation.
	docs["longdesc"] = "fence_aws is a Power Fencing agent for AWS (Amazon Web\
Services). It uses the boto3 library to connect to AWS.\
\n.P\n\
It supports four identity resolution methods via --identity-method:\
\n.P\n\
instance-id (default): --plug is treated as a direct EC2 instance ID.\
\n.P\n\
tag: --plug is treated as a tag value. Requires --tag to specify the tag name.\
For example: --identity-method=tag --tag=Name --plug=hostname\
\n.P\n\
eni: --plug is treated as an ENI ID. The agent resolves the ENI attachment to find\
the instance. Ideal for architectures with persistent ENIs that survive instance replacement.\
For example: --identity-method=eni --plug=eni-0a1b2c3d4e5f67890\
\n.P\n\
ebs: --plug is treated as an EBS volume ID. The agent resolves the volume attachment to\
find the instance. Ideal for architectures with persistent EBS volumes.\
For example: --identity-method=ebs --plug=vol-0a1b2c3d4e5f67890\
\n.P\n\
boto3 can be configured with AWS CLI or by creating ~/.aws/credentials.\n\
For instructions see: https://boto3.readthedocs.io/en/latest/guide/quickstart.html#configuration"
	docs["vendorurl"] = "http://www.amazon.com"
	show_docs(options, docs)

	run_delay(options)

	if "--debug-file" in options:
		# Iterate over a snapshot: removeHandler() mutates logger.handlers, and
		# removing from a list while iterating it can skip entries.
		for handler in list(logger.handlers):
			if isinstance(handler, logging.FileHandler):
				logger.removeHandler(handler)
		lh = logging.FileHandler(options["--debug-file"])
		logger.addHandler(lh)
		lhf = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
		lh.setFormatter(lhf)
		lh.setLevel(logging.DEBUG)

	if options["--boto3_debug"].lower() not in ["1", "yes", "on", "true"]:
		# Default: keep boto3/botocore quiet and out of the agent's own log.
		boto3.set_stream_logger('boto3',logging.INFO)
		boto3.set_stream_logger('botocore',logging.CRITICAL)
		logging.getLogger('botocore').propagate = False
		logging.getLogger('boto3').propagate = False
	else:
		# Debug requested: route boto3/botocore records to a dedicated file.
		log_format = logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
		logging.getLogger('botocore').propagate = False
		logging.getLogger('boto3').propagate = False
		fdh = logging.FileHandler('/var/log/fence_aws_boto3.log')
		fdh.setFormatter(log_format)
		logging.getLogger('boto3').addHandler(fdh)
		logging.getLogger('botocore').addHandler(fdh)
		logging.debug("Boto debug level is %s and sending debug info to /var/log/fence_aws_boto3.log", options["--boto3_debug"])

	region = options.get("--region")
	access_key = options.get("--access-key")
	secret_key = options.get("--secret-key")
	try:
		conn = boto3.resource('ec2', region_name=region,
			aws_access_key_id=access_key,
			aws_secret_access_key=secret_key)
	except Exception as e:
		# Actions that only describe or validate the agent may run without AWS access.
		if options.get("--action", "") not in ["metadata", "manpage", "validate-all"]:
			fail_usage("Failed: Unable to connect to AWS: " + str(e))

	# Cache own instance ID and build_number at startup
	# These values never change during the instance's lifetime.
	# Caching here eliminates IMDS calls from the fencing hot path.
	options["my_instance_id"] = get_instance_id(options)
	if options.get("my_instance_id"):
		logger.debug("Cached own instance ID: %s", options["my_instance_id"])
		try:
			my_inst = list(conn.instances.filter(
				Filters=[{"Name": "instance-id", "Values": [options["my_instance_id"]]}]))
			if my_inst and my_inst[0].tags:
				for tag in my_inst[0].tags:
					if tag['Key'] == 'build_number':
						options["my_build_number"] = tag['Value']
						logger.debug("Cached own build_number: %s", options["my_build_number"])
						break
		except Exception as e:
			# Best effort: build_number is only used to disambiguate tag lookups.
			logger.debug("Could not cache own build_number: %s", e)

	result = fence_action(conn, options, set_power_status, get_power_status, get_nodes_list)
	sys.exit(result)

# Run the agent only when executed as a script, not when imported.
if __name__ == "__main__":
	main()
776